### What is pandas?
#### Pandas is a Python library used for working with data sets.

In [1]:
import pandas as pd
import numpy as np

### Data structures
1. Series -- one dimensional
2. DataFrame -- two dimensional
 

In [2]:
# Build a one-dimensional Series out of a plain Python list;
# no index is given, so pandas labels the entries 0, 1, 2, ...
data = list(range(1, 6))  # [1, 2, 3, 4, 5]
s = pd.Series(data=data)

In [3]:
s

0    1
1    2
2    3
3    4
4    5
dtype: int64

In [4]:
# Same values as before, but labelled with our own index instead of 0..4
index = list('ABCDE')     # ['A', 'B', 'C', 'D', 'E']
data = list(range(1, 6))  # [1, 2, 3, 4, 5]

s_with_index = pd.Series(data=data, index=index)


In [5]:
s_with_index

A    1
B    2
C    3
D    4
E    5
dtype: int64

In [6]:
# Create a dataframe from a dictionary of lists:
# every key becomes a column name, every list holds that column's values
countries = ['Spain', 'Germany', 'France', 'Argentina']
years = ['2010', '2014', '2018', '2022']
players = ['Iniesta', 'Klose', 'Mbappe', 'Messi']

data_wc_football = {'Country': countries, 'Year': years, 'Player': players}

df_wc_football = pd.DataFrame(data=data_wc_football)

In [7]:
df_wc_football

Unnamed: 0,Country,Year,Player
0,Spain,2010,Iniesta
1,Germany,2014,Klose
2,France,2018,Mbappe
3,Argentina,2022,Messi


In [8]:
# Create the same dataframe, this time with custom row labels
# (one label per row, instead of the default 0, 1, 2, ...)
index = list('ABCD')  # ['A', 'B', 'C', 'D']

data_wc_football = {
    'Country': ['Spain', 'Germany', 'France', 'Argentina'],
    'Year': ['2010', '2014', '2018', '2022'],
    'Player': ['Iniesta', 'Klose', 'Mbappe', 'Messi'],
}

df_wc_football_with_index = pd.DataFrame(data=data_wc_football, index=index)

In [9]:
df_wc_football_with_index

Unnamed: 0,Country,Year,Player
A,Spain,2010,Iniesta
B,Germany,2014,Klose
C,France,2018,Mbappe
D,Argentina,2022,Messi


### What is a dictionary?
#### A dictionary is a collection of key-value pairs. Each key is associated with a value.

In [10]:
# Create a dictionary using the dict constructor:
# each keyword argument becomes a key, its argument becomes the value
data_wc_football = dict(
    Country=['Spain', 'Germany', 'France', 'Argentina'],
    Year=['2010', '2014', '2018', '2022'],
    Player=['Iniesta', 'Klose', 'Mbappe', 'Messi'],
)

In [11]:
# Create a dataframe from a dictionary

# data_wc_football is the dictionary built in the previous cell; it is reused
# here instead of being copy-pasted again. Keys become column names and the
# lists become the column values.
df_from_dict = pd.DataFrame(data_wc_football)

In [12]:
df_from_dict

Unnamed: 0,Country,Year,Player
0,Spain,2010,Iniesta
1,Germany,2014,Klose
2,France,2018,Mbappe
3,Argentina,2022,Messi


In [13]:
df_wc_football_with_index.dtypes

Country    object
Year       object
Player     object
dtype: object

In [14]:
# Create a fake dataframe with random data

# shape of the dataframe
num_rows = 100
num_cols = 20

# Draw from a seeded generator so the table is identical on every
# "Restart & Run All"; an unseeded draw produces different numbers each run.
rng = np.random.default_rng(seed=0)
data = rng.standard_normal((num_rows, num_cols))  # 2D array of samples from a standard normal distribution

# Labels for the two axes: 'column_0', 'column_1', ... and 'row_0', 'row_1', ...
columns = [f'column_{i}' for i in range(num_cols)]
index = [f'row_{i}' for i in range(num_rows)]

df = pd.DataFrame(data, columns=columns, index=index)


In [15]:
df

Unnamed: 0,column_0,column_1,column_2,column_3,column_4,column_5,column_6,column_7,column_8,column_9,column_10,column_11,column_12,column_13,column_14,column_15,column_16,column_17,column_18,column_19
row_0,-0.061867,3.494840,1.366145,-0.282716,1.232646,-0.572733,-0.326738,-1.213385,0.723445,1.127667,0.994296,-0.313927,-2.422527,-1.287514,-1.023617,-1.936117,0.284309,-0.275824,-0.292722,-0.291986
row_1,-1.314013,1.432290,1.152912,1.156307,1.297124,0.693444,0.812662,2.682947,-0.296179,-0.161810,-0.691866,-0.489068,-0.186374,-0.581903,0.417727,-0.029335,-1.112440,-1.010581,-0.662007,0.908214
row_2,-1.452162,1.109068,1.246356,0.716861,-0.803672,1.130528,-0.961859,1.785122,-0.573043,-0.477888,-0.619135,0.415023,-0.455231,-1.205265,0.115913,2.056874,-0.193720,-0.569588,-0.077930,1.158553
row_3,0.767121,-1.410148,0.029519,0.472326,0.020758,-0.489636,-0.527717,0.179453,1.982950,0.561855,1.635451,-0.541245,-0.854839,-0.490948,-1.151506,0.910729,-1.401155,-0.326643,1.164274,-0.193142
row_4,0.962889,0.765808,-1.365236,1.370107,0.980378,0.609408,-1.289655,0.730033,0.089912,0.930447,0.226197,1.594845,-0.319074,-1.216257,-1.086353,-0.149670,-0.791340,-1.361137,0.500909,-1.058117
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
row_95,1.439471,1.750278,0.884833,-0.679433,-1.032590,1.247483,2.134904,0.782725,-1.464817,-0.839772,-2.079428,-0.653747,1.867005,0.632023,-2.197502,-1.450950,-1.865643,1.342990,-0.685883,0.611289
row_96,1.311461,-0.134759,0.853030,2.352509,2.903976,0.188207,0.390707,-1.117908,-0.018376,0.763738,-2.141927,0.552507,0.530554,-0.630892,0.700762,-1.998996,1.221675,0.088607,-0.559924,-1.436727
row_97,0.714377,0.043508,3.199866,0.713160,-0.326792,-0.058511,-0.039244,2.191589,-0.442348,0.510670,1.083935,-1.141513,-0.785940,1.157206,-1.140128,0.557896,0.963973,-0.548824,0.271483,0.462247
row_98,0.552253,-0.335895,1.070005,0.103915,0.588325,0.959030,0.253804,-0.371038,0.952739,0.032104,0.908743,-1.888959,0.237859,0.703237,-0.552694,1.735655,0.078898,-1.314820,-2.646178,0.524606


In [16]:
# check the indices
df.index

Index(['row_0', 'row_1', 'row_2', 'row_3', 'row_4', 'row_5', 'row_6', 'row_7',
       'row_8', 'row_9', 'row_10', 'row_11', 'row_12', 'row_13', 'row_14',
       'row_15', 'row_16', 'row_17', 'row_18', 'row_19', 'row_20', 'row_21',
       'row_22', 'row_23', 'row_24', 'row_25', 'row_26', 'row_27', 'row_28',
       'row_29', 'row_30', 'row_31', 'row_32', 'row_33', 'row_34', 'row_35',
       'row_36', 'row_37', 'row_38', 'row_39', 'row_40', 'row_41', 'row_42',
       'row_43', 'row_44', 'row_45', 'row_46', 'row_47', 'row_48', 'row_49',
       'row_50', 'row_51', 'row_52', 'row_53', 'row_54', 'row_55', 'row_56',
       'row_57', 'row_58', 'row_59', 'row_60', 'row_61', 'row_62', 'row_63',
       'row_64', 'row_65', 'row_66', 'row_67', 'row_68', 'row_69', 'row_70',
       'row_71', 'row_72', 'row_73', 'row_74', 'row_75', 'row_76', 'row_77',
       'row_78', 'row_79', 'row_80', 'row_81', 'row_82', 'row_83', 'row_84',
       'row_85', 'row_86', 'row_87', 'row_88', 'row_89', 'row_90', 'row_91',

In [17]:
# check the columns
df.columns

Index(['column_0', 'column_1', 'column_2', 'column_3', 'column_4', 'column_5',
       'column_6', 'column_7', 'column_8', 'column_9', 'column_10',
       'column_11', 'column_12', 'column_13', 'column_14', 'column_15',
       'column_16', 'column_17', 'column_18', 'column_19'],
      dtype='object')

In [18]:
# Sum the values of the dataframe row wise (axis=1 adds horizontally, across the columns)
# and store the result in a new 'sum' column.
# Only the original data columns (the `columns` list used to build df) are summed,
# so re-running this cell does not fold an already existing 'sum' column
# (or any other derived column) back into the total.

df['sum'] = df[columns].sum(axis=1)


In [19]:
df

Unnamed: 0,column_0,column_1,column_2,column_3,column_4,column_5,column_6,column_7,column_8,column_9,...,column_11,column_12,column_13,column_14,column_15,column_16,column_17,column_18,column_19,sum
row_0,-0.061867,3.494840,1.366145,-0.282716,1.232646,-0.572733,-0.326738,-1.213385,0.723445,1.127667,...,-0.313927,-2.422527,-1.287514,-1.023617,-1.936117,0.284309,-0.275824,-0.292722,-0.291986,-1.078327
row_1,-1.314013,1.432290,1.152912,1.156307,1.297124,0.693444,0.812662,2.682947,-0.296179,-0.161810,...,-0.489068,-0.186374,-0.581903,0.417727,-0.029335,-1.112440,-1.010581,-0.662007,0.908214,4.018051
row_2,-1.452162,1.109068,1.246356,0.716861,-0.803672,1.130528,-0.961859,1.785122,-0.573043,-0.477888,...,0.415023,-0.455231,-1.205265,0.115913,2.056874,-0.193720,-0.569588,-0.077930,1.158553,2.344805
row_3,0.767121,-1.410148,0.029519,0.472326,0.020758,-0.489636,-0.527717,0.179453,1.982950,0.561855,...,-0.541245,-0.854839,-0.490948,-1.151506,0.910729,-1.401155,-0.326643,1.164274,-0.193142,0.337456
row_4,0.962889,0.765808,-1.365236,1.370107,0.980378,0.609408,-1.289655,0.730033,0.089912,0.930447,...,1.594845,-0.319074,-1.216257,-1.086353,-0.149670,-0.791340,-1.361137,0.500909,-1.058117,0.124094
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
row_95,1.439471,1.750278,0.884833,-0.679433,-1.032590,1.247483,2.134904,0.782725,-1.464817,-0.839772,...,-0.653747,1.867005,0.632023,-2.197502,-1.450950,-1.865643,1.342990,-0.685883,0.611289,-0.256764
row_96,1.311461,-0.134759,0.853030,2.352509,2.903976,0.188207,0.390707,-1.117908,-0.018376,0.763738,...,0.552507,0.530554,-0.630892,0.700762,-1.998996,1.221675,0.088607,-0.559924,-1.436727,3.818223
row_97,0.714377,0.043508,3.199866,0.713160,-0.326792,-0.058511,-0.039244,2.191589,-0.442348,0.510670,...,-1.141513,-0.785940,1.157206,-1.140128,0.557896,0.963973,-0.548824,0.271483,0.462247,7.386610
row_98,0.552253,-0.335895,1.070005,0.103915,0.588325,0.959030,0.253804,-0.371038,0.952739,0.032104,...,-1.888959,0.237859,0.703237,-0.552694,1.735655,0.078898,-1.314820,-2.646178,0.524606,1.591589


In [20]:
# Selecting a column

df['column_1']

row_0     3.494840
row_1     1.432290
row_2     1.109068
row_3    -1.410148
row_4     0.765808
            ...   
row_95    1.750278
row_96   -0.134759
row_97    0.043508
row_98   -0.335895
row_99    0.091577
Name: column_1, Length: 100, dtype: float64

In [21]:
type(df['column_1'])

pandas.core.series.Series

In [22]:
# Slicing the dataframe by rows

df[0:5]

Unnamed: 0,column_0,column_1,column_2,column_3,column_4,column_5,column_6,column_7,column_8,column_9,...,column_11,column_12,column_13,column_14,column_15,column_16,column_17,column_18,column_19,sum
row_0,-0.061867,3.49484,1.366145,-0.282716,1.232646,-0.572733,-0.326738,-1.213385,0.723445,1.127667,...,-0.313927,-2.422527,-1.287514,-1.023617,-1.936117,0.284309,-0.275824,-0.292722,-0.291986,-1.078327
row_1,-1.314013,1.43229,1.152912,1.156307,1.297124,0.693444,0.812662,2.682947,-0.296179,-0.16181,...,-0.489068,-0.186374,-0.581903,0.417727,-0.029335,-1.11244,-1.010581,-0.662007,0.908214,4.018051
row_2,-1.452162,1.109068,1.246356,0.716861,-0.803672,1.130528,-0.961859,1.785122,-0.573043,-0.477888,...,0.415023,-0.455231,-1.205265,0.115913,2.056874,-0.19372,-0.569588,-0.07793,1.158553,2.344805
row_3,0.767121,-1.410148,0.029519,0.472326,0.020758,-0.489636,-0.527717,0.179453,1.98295,0.561855,...,-0.541245,-0.854839,-0.490948,-1.151506,0.910729,-1.401155,-0.326643,1.164274,-0.193142,0.337456
row_4,0.962889,0.765808,-1.365236,1.370107,0.980378,0.609408,-1.289655,0.730033,0.089912,0.930447,...,1.594845,-0.319074,-1.216257,-1.086353,-0.14967,-0.79134,-1.361137,0.500909,-1.058117,0.124094


In [23]:
# slicing the dataframe by columns

df[['column_1', 'column_2']]

Unnamed: 0,column_1,column_2
row_0,3.494840,1.366145
row_1,1.432290,1.152912
row_2,1.109068,1.246356
row_3,-1.410148,0.029519
row_4,0.765808,-1.365236
...,...,...
row_95,1.750278,0.884833
row_96,-0.134759,0.853030
row_97,0.043508,3.199866
row_98,-0.335895,1.070005


In [24]:
# slicing the dataframe by rows and columns

# See we are using a different method here - loc

df.loc[:, ['column_1', 'column_2']]

Unnamed: 0,column_1,column_2
row_0,3.494840,1.366145
row_1,1.432290,1.152912
row_2,1.109068,1.246356
row_3,-1.410148,0.029519
row_4,0.765808,-1.365236
...,...,...
row_95,1.750278,0.884833
row_96,-0.134759,0.853030
row_97,0.043508,3.199866
row_98,-0.335895,1.070005


In [25]:
# slicing the dataframe by rows and columns

# df.loc - access the rows and columns by labels
df.loc['row_1': 'row_5', ['column_1', 'column_2']]

Unnamed: 0,column_1,column_2
row_1,1.43229,1.152912
row_2,1.109068,1.246356
row_3,-1.410148,0.029519
row_4,0.765808,-1.365236
row_5,-1.311193,2.057363


In [26]:
x = "hello_world"

# Build up the first five characters of x, one character at a time
y = ''
for char in x[:5]:
    y = y + char

In [27]:
x[13:15]

''

In [28]:
y = x[0:5]

In [29]:
y

'hello'

In [30]:
x[0:-2]

'hello_wor'

In [31]:
x = np.random.uniform(size=(100, 100))

In [32]:
x[3:400]

array([[0.28472373, 0.75735875, 0.66627589, ..., 0.24338735, 0.61829506,
        0.15459168],
       [0.45907549, 0.07770266, 0.45803428, ..., 0.05095434, 0.41293472,
        0.77508856],
       [0.38054147, 0.6663018 , 0.76533986, ..., 0.00942927, 0.32254622,
        0.96326822],
       ...,
       [0.05519384, 0.55952701, 0.46437679, ..., 0.20861257, 0.58710721,
        0.16033858],
       [0.12145443, 0.94113895, 0.97535452, ..., 0.62322387, 0.02000413,
        0.76724308],
       [0.43025822, 0.26604501, 0.86128776, ..., 0.92820487, 0.30284401,
        0.51298912]])

In [33]:
# slicing the dataframe by rows and columns using iloc

# df.iloc - access the rows and columns by integer position
# remember: the end of a slice is exclusive, so 1:6 selects row_1 to row_5
# and 1:3 selects column_1 and column_2

df.iloc[1:6, 1:3]

Unnamed: 0,column_1,column_2
row_1,1.43229,1.152912
row_2,1.109068,1.246356
row_3,-1.410148,0.029519
row_4,0.765808,-1.365236
row_5,-1.311193,2.057363


#### df.loc -- label based indexing (labels / boolean)
#### df.iloc -- integer based indexing

In [34]:
# Conditional selection: keep the positive values of the dataframe.
# Entries that do not satisfy the condition (df > 0) are replaced by NaN.

postive_df = df.where(df > 0)

In [35]:
df>0

Unnamed: 0,column_0,column_1,column_2,column_3,column_4,column_5,column_6,column_7,column_8,column_9,...,column_11,column_12,column_13,column_14,column_15,column_16,column_17,column_18,column_19,sum
row_0,False,True,True,False,True,False,False,False,True,True,...,False,False,False,False,False,True,False,False,False,False
row_1,False,True,True,True,True,True,True,True,False,False,...,False,False,False,True,False,False,False,False,True,True
row_2,False,True,True,True,False,True,False,True,False,False,...,True,False,False,True,True,False,False,False,True,True
row_3,True,False,True,True,True,False,False,True,True,True,...,False,False,False,False,True,False,False,True,False,True
row_4,True,True,False,True,True,True,False,True,True,True,...,True,False,False,False,False,False,False,True,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
row_95,True,True,True,False,False,True,True,True,False,False,...,False,True,True,False,False,False,True,False,True,False
row_96,True,False,True,True,True,True,True,False,False,True,...,True,True,False,True,False,True,True,False,False,True
row_97,True,True,True,True,False,False,False,True,False,True,...,False,False,True,False,True,True,False,True,True,True
row_98,True,False,True,True,True,True,True,False,True,True,...,False,True,True,False,True,True,False,False,True,True


In [36]:
postive_df

Unnamed: 0,column_0,column_1,column_2,column_3,column_4,column_5,column_6,column_7,column_8,column_9,...,column_11,column_12,column_13,column_14,column_15,column_16,column_17,column_18,column_19,sum
row_0,,3.494840,1.366145,,1.232646,,,,0.723445,1.127667,...,,,,,,0.284309,,,,
row_1,,1.432290,1.152912,1.156307,1.297124,0.693444,0.812662,2.682947,,,...,,,,0.417727,,,,,0.908214,4.018051
row_2,,1.109068,1.246356,0.716861,,1.130528,,1.785122,,,...,0.415023,,,0.115913,2.056874,,,,1.158553,2.344805
row_3,0.767121,,0.029519,0.472326,0.020758,,,0.179453,1.982950,0.561855,...,,,,,0.910729,,,1.164274,,0.337456
row_4,0.962889,0.765808,,1.370107,0.980378,0.609408,,0.730033,0.089912,0.930447,...,1.594845,,,,,,,0.500909,,0.124094
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
row_95,1.439471,1.750278,0.884833,,,1.247483,2.134904,0.782725,,,...,,1.867005,0.632023,,,,1.342990,,0.611289,
row_96,1.311461,,0.853030,2.352509,2.903976,0.188207,0.390707,,,0.763738,...,0.552507,0.530554,,0.700762,,1.221675,0.088607,,,3.818223
row_97,0.714377,0.043508,3.199866,0.713160,,,,2.191589,,0.510670,...,,,1.157206,,0.557896,0.963973,,0.271483,0.462247,7.386610
row_98,0.552253,,1.070005,0.103915,0.588325,0.959030,0.253804,,0.952739,0.032104,...,,0.237859,0.703237,,1.735655,0.078898,,,0.524606,1.591589


In [37]:
# Replace every NaN (the entries that were not positive) with the placeholder value 99.99
postive_df = postive_df.fillna(99.99)

In [38]:
postive_df

Unnamed: 0,column_0,column_1,column_2,column_3,column_4,column_5,column_6,column_7,column_8,column_9,...,column_11,column_12,column_13,column_14,column_15,column_16,column_17,column_18,column_19,sum
row_0,99.990000,3.494840,1.366145,99.990000,1.232646,99.990000,99.990000,99.990000,0.723445,1.127667,...,99.990000,99.990000,99.990000,99.990000,99.990000,0.284309,99.990000,99.990000,99.990000,99.990000
row_1,99.990000,1.432290,1.152912,1.156307,1.297124,0.693444,0.812662,2.682947,99.990000,99.990000,...,99.990000,99.990000,99.990000,0.417727,99.990000,99.990000,99.990000,99.990000,0.908214,4.018051
row_2,99.990000,1.109068,1.246356,0.716861,99.990000,1.130528,99.990000,1.785122,99.990000,99.990000,...,0.415023,99.990000,99.990000,0.115913,2.056874,99.990000,99.990000,99.990000,1.158553,2.344805
row_3,0.767121,99.990000,0.029519,0.472326,0.020758,99.990000,99.990000,0.179453,1.982950,0.561855,...,99.990000,99.990000,99.990000,99.990000,0.910729,99.990000,99.990000,1.164274,99.990000,0.337456
row_4,0.962889,0.765808,99.990000,1.370107,0.980378,0.609408,99.990000,0.730033,0.089912,0.930447,...,1.594845,99.990000,99.990000,99.990000,99.990000,99.990000,99.990000,0.500909,99.990000,0.124094
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
row_95,1.439471,1.750278,0.884833,99.990000,99.990000,1.247483,2.134904,0.782725,99.990000,99.990000,...,99.990000,1.867005,0.632023,99.990000,99.990000,99.990000,1.342990,99.990000,0.611289,99.990000
row_96,1.311461,99.990000,0.853030,2.352509,2.903976,0.188207,0.390707,99.990000,99.990000,0.763738,...,0.552507,0.530554,99.990000,0.700762,99.990000,1.221675,0.088607,99.990000,99.990000,3.818223
row_97,0.714377,0.043508,3.199866,0.713160,99.990000,99.990000,99.990000,2.191589,99.990000,0.510670,...,99.990000,99.990000,1.157206,99.990000,0.557896,0.963973,99.990000,0.271483,0.462247,7.386610
row_98,0.552253,99.990000,1.070005,0.103915,0.588325,0.959030,0.253804,99.990000,0.952739,0.032104,...,99.990000,0.237859,0.703237,99.990000,1.735655,0.078898,99.990000,99.990000,0.524606,1.591589


In [39]:
# Apply a function on a dataframe column

def circle_area(radius):
    """Return the area of a circle with the given radius (pi * radius squared).

    The sign of ``radius`` is not checked; because the value is squared,
    a negative radius gives the same area as its absolute value.
    """
    radius_squared = radius ** 2
    return np.pi * radius_squared

# Series.apply calls circle_area on each value of 'column_1';
# the results are stored in a new 'circle_area' column.
df['circle_area'] = df['column_1'].apply(circle_area)

In [40]:
df

Unnamed: 0,column_0,column_1,column_2,column_3,column_4,column_5,column_6,column_7,column_8,column_9,...,column_12,column_13,column_14,column_15,column_16,column_17,column_18,column_19,sum,circle_area
row_0,-0.061867,3.494840,1.366145,-0.282716,1.232646,-0.572733,-0.326738,-1.213385,0.723445,1.127667,...,-2.422527,-1.287514,-1.023617,-1.936117,0.284309,-0.275824,-0.292722,-0.291986,-1.078327,38.371111
row_1,-1.314013,1.432290,1.152912,1.156307,1.297124,0.693444,0.812662,2.682947,-0.296179,-0.161810,...,-0.186374,-0.581903,0.417727,-0.029335,-1.112440,-1.010581,-0.662007,0.908214,4.018051,6.444835
row_2,-1.452162,1.109068,1.246356,0.716861,-0.803672,1.130528,-0.961859,1.785122,-0.573043,-0.477888,...,-0.455231,-1.205265,0.115913,2.056874,-0.193720,-0.569588,-0.077930,1.158553,2.344805,3.864261
row_3,0.767121,-1.410148,0.029519,0.472326,0.020758,-0.489636,-0.527717,0.179453,1.982950,0.561855,...,-0.854839,-0.490948,-1.151506,0.910729,-1.401155,-0.326643,1.164274,-0.193142,0.337456,6.247115
row_4,0.962889,0.765808,-1.365236,1.370107,0.980378,0.609408,-1.289655,0.730033,0.089912,0.930447,...,-0.319074,-1.216257,-1.086353,-0.149670,-0.791340,-1.361137,0.500909,-1.058117,0.124094,1.842424
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
row_95,1.439471,1.750278,0.884833,-0.679433,-1.032590,1.247483,2.134904,0.782725,-1.464817,-0.839772,...,1.867005,0.632023,-2.197502,-1.450950,-1.865643,1.342990,-0.685883,0.611289,-0.256764,9.624184
row_96,1.311461,-0.134759,0.853030,2.352509,2.903976,0.188207,0.390707,-1.117908,-0.018376,0.763738,...,0.530554,-0.630892,0.700762,-1.998996,1.221675,0.088607,-0.559924,-1.436727,3.818223,0.057051
row_97,0.714377,0.043508,3.199866,0.713160,-0.326792,-0.058511,-0.039244,2.191589,-0.442348,0.510670,...,-0.785940,1.157206,-1.140128,0.557896,0.963973,-0.548824,0.271483,0.462247,7.386610,0.005947
row_98,0.552253,-0.335895,1.070005,0.103915,0.588325,0.959030,0.253804,-0.371038,0.952739,0.032104,...,0.237859,0.703237,-0.552694,1.735655,0.078898,-1.314820,-2.646178,0.524606,1.591589,0.354452


In [41]:
# Spot check: recompute the area directly for a single entry of 'column_1'
# (the value at row_1) and print it.

value = df.loc['row_1', 'column_1']
print("The area of the circle is: ", circle_area(value))

The area of the circle is:  6.444834623023509


In [42]:
df.loc['row_1', 'circle_area']

6.444834623023509

In [43]:
# Pandas multi index

# Goal: build a multi-index dataframe, which makes it easy to work with multi-dimensional data

# Start from a doubly nested dictionary:
# the outer keys identify a person, the inner keys are that person's attributes
# (every person has the same five attributes: name, code, city, year, work)

tardis_dict = {
    "person1": {
        "name": "Anirban",
        "code": "tardis",
        "city": "East Lansing",
        "year": 2023,
        "work": "Gamma-rays"
    },
    "person2": {
        "name": "Jing",
        "code": "dalek",
        "city": "East Lansing",
        "year": 2023,
        "work": "Shock waves"
    },
    "person3": {
        "name": "Josh",
        "code": "stardis",
        "city": "East Lansing",
        "year": 2020,
        "work": "positrons"
    },
    "person4": {
        "name": "Andrew",
        "code": "tardis",
        "city": "East Lansing",
        "year": 2020,
        "work": "X-rays"
    },
    "person5": {
        "name": "wolfgang",
        "code": "tardis",
        "city": "New York",
        "year": 2018,
        "work": "Non-LTE"
    }
}




In [44]:
tardis_dict

{'person1': {'name': 'Anirban',
  'code': 'tardis',
  'city': 'East Lansing',
  'year': 2023,
  'work': 'Gamma-rays'},
 'person2': {'name': 'Jing',
  'code': 'dalek',
  'city': 'East Lansing',
  'year': 2023,
  'work': 'Shock waves'},
 'person3': {'name': 'Josh',
  'code': 'stardis',
  'city': 'East Lansing',
  'year': 2020,
  'work': 'positrons'},
 'person4': {'name': 'Andrew',
  'code': 'tardis',
  'city': 'East Lansing',
  'year': 2020,
  'work': 'X-rays'},
 'person5': {'name': 'wolfgang',
  'code': 'tardis',
  'city': 'New York',
  'year': 2018,
  'work': 'Non-LTE'}}

In [45]:
# Walk over the outer dictionary: one entry per person
for person, attributes in tardis_dict.items():
    # One indented "key: value" line per entry of the inner dictionary
    detail_lines = [f"  {attribute}: {value}" for attribute, value in attributes.items()]
    # Header line first, then the detail lines, each on its own line
    print(f"{person}:", *detail_lines, sep="\n")


person1:
  name: Anirban
  code: tardis
  city: East Lansing
  year: 2023
  work: Gamma-rays
person2:
  name: Jing
  code: dalek
  city: East Lansing
  year: 2023
  work: Shock waves
person3:
  name: Josh
  code: stardis
  city: East Lansing
  year: 2020
  work: positrons
person4:
  name: Andrew
  code: tardis
  city: East Lansing
  year: 2020
  work: X-rays
person5:
  name: wolfgang
  code: tardis
  city: New York
  year: 2018
  work: Non-LTE


In [46]:
# Convert the nested dict to a dataframe:
# orient='index' turns the outer keys (person1, person2, ...) into row labels
# and the inner keys (name, code, city, year, work) into columns.
# NOTE: at this point the dataframe still has an ordinary single-level index;
# a second index level is only added later with set_index(..., append=True).

df_tardis = pd.DataFrame.from_dict(tardis_dict, orient='index')

1. Efficiently store and manipulate multi-dimensional data
2. Readability
3. Operations

In [47]:
df_tardis

Unnamed: 0,name,code,city,year,work
person1,Anirban,tardis,East Lansing,2023,Gamma-rays
person2,Jing,dalek,East Lansing,2023,Shock waves
person3,Josh,stardis,East Lansing,2020,positrons
person4,Andrew,tardis,East Lansing,2020,X-rays
person5,wolfgang,tardis,New York,2018,Non-LTE


In [48]:
# set the name of the index
df_tardis.index.name = 'person'

In [49]:
df_tardis

Unnamed: 0_level_0,name,code,city,year,work
person,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
person1,Anirban,tardis,East Lansing,2023,Gamma-rays
person2,Jing,dalek,East Lansing,2023,Shock waves
person3,Josh,stardis,East Lansing,2020,positrons
person4,Andrew,tardis,East Lansing,2020,X-rays
person5,wolfgang,tardis,New York,2018,Non-LTE


In [50]:
# Attach a column of random monthly salaries, one value per row of df_tardis

np.random.seed(42)  # fixed seed -> the same salaries on every run

n_people = len(df_tardis)
df_tardis['monthly_salary'] = 5000 * np.random.random(n_people)

#### Adding complex operations to the dataframe

In [51]:
df_tardis

Unnamed: 0_level_0,name,code,city,year,work,monthly_salary
person,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
person1,Anirban,tardis,East Lansing,2023,Gamma-rays,1872.700594
person2,Jing,dalek,East Lansing,2023,Shock waves,4753.571532
person3,Josh,stardis,East Lansing,2020,positrons,3659.969709
person4,Andrew,tardis,East Lansing,2020,X-rays,2993.292421
person5,wolfgang,tardis,New York,2018,Non-LTE,780.093202


In [52]:
# Move 'year' out of the columns and append it to the existing 'person' index,
# giving a two-level (person, year) MultiIndex.
# Gotcha: this cell cannot be run twice in a row -- after the first run 'year'
# is no longer a column, so a second set_index('year') raises a KeyError.
df_tardis = df_tardis.set_index('year', append=True)

In [53]:
df_tardis

Unnamed: 0_level_0,Unnamed: 1_level_0,name,code,city,work,monthly_salary
person,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
person1,2023,Anirban,tardis,East Lansing,Gamma-rays,1872.700594
person2,2023,Jing,dalek,East Lansing,Shock waves,4753.571532
person3,2020,Josh,stardis,East Lansing,positrons,3659.969709
person4,2020,Andrew,tardis,East Lansing,X-rays,2993.292421
person5,2018,wolfgang,tardis,New York,Non-LTE,780.093202


In [54]:
# People who joined before 2020 get their monthly salary multiplied by 2.5
# (that is a 150% hike, not 25% -- a 25% hike would be a factor of 1.25).
# NOTE(review): confirm that 2.5 is the intended factor.
# Level 1 of the index is 'year'. Re-running this cell applies the factor again.
df_tardis.loc[df_tardis.index.get_level_values(1) < 2020, 'monthly_salary'] *= 2.5


In [55]:
df_tardis

Unnamed: 0_level_0,Unnamed: 1_level_0,name,code,city,work,monthly_salary
person,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
person1,2023,Anirban,tardis,East Lansing,Gamma-rays,1872.700594
person2,2023,Jing,dalek,East Lansing,Shock waves,4753.571532
person3,2020,Josh,stardis,East Lansing,positrons,3659.969709
person4,2020,Andrew,tardis,East Lansing,X-rays,2993.292421
person5,2018,wolfgang,tardis,New York,Non-LTE,1950.233006


In [140]:
# What is happening above?

# set_index - set the year as the index
# append=True - append the year to the existing index /not replacing the old one "person"
# get_level_values(1) - get the year index
# select the rows where the year is less than 2020
# multiply the monthly salary by 2.5

In [141]:
# Re-index the data: reset_index() turns the (person, year) index levels back into
# ordinary columns, then set_index() builds a new two-level index from 'city' and 'year'.
x = df_tardis.reset_index().set_index(["city", "year"])

In [142]:
x

Unnamed: 0_level_0,Unnamed: 1_level_0,person,name,code,work,monthly_salary
city,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
East Lansing,2023,person1,Anirban,tardis,Gamma-rays,1872.700594
East Lansing,2023,person2,Jing,dalek,Shock waves,4753.571532
East Lansing,2020,person3,Josh,stardis,positrons,3659.969709
East Lansing,2020,person4,Andrew,tardis,X-rays,2993.292421
New York,2018,person5,wolfgang,tardis,Non-LTE,1950.233006


In [143]:
x.loc["East Lansing"]

Unnamed: 0_level_0,person,name,code,work,monthly_salary
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2023,person1,Anirban,tardis,Gamma-rays,1872.700594
2023,person2,Jing,dalek,Shock waves,4753.571532
2020,person3,Josh,stardis,positrons,3659.969709
2020,person4,Andrew,tardis,X-rays,2993.292421


In [144]:
# Select the 'name' column with bracket notation. Attribute-style access
# (x.name) only works as long as the column name does not collide with an
# existing DataFrame attribute or method, so brackets are the safer habit.
x["name"]

city          year
East Lansing  2023     Anirban
              2023        Jing
              2020        Josh
              2020      Andrew
New York      2018    wolfgang
Name: name, dtype: object