# Pandas tutorial

In [85]:
import numpy as np
import pandas as pd

## Series object

In [51]:
series_1 = pd.Series(np.array([10, 20, 30, 40]))
series_1

0    10
1    20
2    30
3    40
dtype: int32

In [52]:
series_2 = pd.Series(data=np.arange(4), index=['A', 'B', 'C', 'D'])
series_2

A    0
B    1
C    2
D    3
dtype: int32

In [53]:
series_2.values # Getting values of the series

array([0, 1, 2, 3])

In [56]:
series_2.index # Getting indices of the series

Index(['A', 'B', 'C', 'D'], dtype='object')

In [59]:
series_2['A'] # Getting the value corresponding to label(index) 'A'

0

In [60]:
series_2[['A', 'C']] # Getting the values corresponding to several labels: 'A', 'B'

A    0
C    2
dtype: int32

In [64]:
series_2[series_2 > 2] # We can do boolean conditioning 

D    3
dtype: int32

In [65]:
np.exp(series_2) # We can pass a Series object to NumPy functions

A     1.000000
B     2.718282
C     7.389056
D    20.085537
dtype: float64

In [66]:
series_2 * 4 # We can do arithmetic with Series object

A     0
B     4
C     8
D    12
dtype: int32

We can think of Series object as a dictionary since it maps an index to a value.

In [67]:
'B' in series_2

True

In [68]:
'F' in series_2

False

We can pass a dictionary to pd.Series().

In [69]:
dict_1 = {'wellness': 'Yarishna',
          'figure': 'Rebecca',
          'bodybuilding': 'Iman',
         '212': 'Shaun'}
series_2 = pd.Series(dict_1)
series_2

wellness        Yarishna
figure           Rebecca
bodybuilding        Iman
212                Shaun
dtype: object

We can override the index by changing the index explicitly. If the index already exists as a dictionary key, then we will have a value associated with it, otherwise we receive NaN.

In [71]:
pd.Series(dict_1, index=['FBB', 'Natural FBB', 'best natural', 'Giant killer'])

FBB             NaN
Natural FBB     NaN
best natural    NaN
Giant killer    NaN
dtype: object

In [73]:
series_3 = pd.Series(dict_1, index=['figure', '212', 'best natural', 'wellness'])
series_3

figure           Rebecca
212                Shaun
best natural         NaN
wellness        Yarishna
dtype: object

Finding missing values: *isnull*, *notnull* both as pandas functions and Series methods. 

In [74]:
pd.isnull(series_3)

figure          False
212             False
best natural     True
wellness        False
dtype: bool

In [75]:
pd.notnull(series_3)

figure           True
212              True
best natural    False
wellness         True
dtype: bool

In [76]:
series_3.isnull()

figure          False
212             False
best natural     True
wellness        False
dtype: bool

In [77]:
series_3.notnull()

figure           True
212              True
best natural    False
wellness         True
dtype: bool

When we perform arithmetic operations on Series objects, the objects are aligned by index.

In [79]:
series_4 = pd.Series(data=np.random.randint(1, 5, 5), index=['C', 'D', 'E', 'F', 'G'])
series_5 = pd.Series(data=np.random.randint(1, 5, 5), index=['A', 'B', 'C', 'D', 'G'])
print(series_4)
print(series_5)

C    2
D    2
E    1
F    1
G    3
dtype: int32
A    4
B    1
C    3
D    1
G    1
dtype: int32


In [80]:
series_4 + series_5

A    NaN
B    NaN
C    5.0
D    3.0
E    NaN
F    NaN
G    4.0
dtype: float64

We can change a Series index inplace:

In [82]:
series_4.index = range(5)
series_4

0    2
1    2
2    1
3    1
4    3
dtype: int32

Both Series object and its index have name attributes:

In [84]:
series_4.name = 'Example Series'
series_4.index.name = 'myInd'
print(series_4)

myInd
0    2
1    2
2    1
3    1
4    3
Name: Example Series, dtype: int32


## DataFrame

In [87]:
dataframe_1 = pd.DataFrame(data = np.random.randn(6, 3), columns=['Books', 'Pens', 'Shoes'])
dataframe_1

Unnamed: 0,Books,Pens,Shoes
0,0.494267,0.103533,-0.237604
1,-2.831102,0.683317,-0.801478
2,-0.270498,-0.014077,2.144846
3,-0.57036,-1.171104,-1.079871
4,-1.01249,0.554717,-0.292275
5,1.070774,0.519833,0.192617


In [88]:
dataframe_1.index = ['A', 'B', 'C', 'D', 'E', 'F'] # This is how we change the index in place
dataframe_1

Unnamed: 0,Books,Pens,Shoes
A,0.494267,0.103533,-0.237604
B,-2.831102,0.683317,-0.801478
C,-0.270498,-0.014077,2.144846
D,-0.57036,-1.171104,-1.079871
E,-1.01249,0.554717,-0.292275
F,1.070774,0.519833,0.192617


In [89]:
pd.DataFrame(data = np.random.randn(6, 3), columns=['Books', 'Pens', 'Shoes'], index=['A', 'B', 'C','D','E','F'])

Unnamed: 0,Books,Pens,Shoes
A,-0.957765,0.314912,-0.034336
B,-0.8205,1.352835,-0.207851
C,0.767108,-1.607306,1.19088
D,0.021868,-0.309835,0.478874
E,-1.257675,1.775337,-2.102003
F,0.656187,0.051832,1.352361


We can also pass a dictionary to pd.DataFrame to create an object.

In [100]:
dict_2 = {'A': np.arange(10), 'B': np.random.randint(1, 7, 10), 'C': np.random.randn(10)} 
print(f"Created DataFrame using dictionary:\n{pd.DataFrame(dict_2)}") # Automatically assigns the index
print(f"\nCreated DataFrame by specifying the index:\n{pd.DataFrame(dict_2, index=list('EFGHIGKLMN'))}") # We can pass the index like this.


Created DataFrame using dictionary:
   A  B         C
0  0  6 -1.190121
1  1  6  0.400636
2  2  6  1.376592
3  3  2 -0.765050
4  4  2 -0.461140
5  5  1  0.773439
6  6  1  0.548765
7  7  1  1.242279
8  8  1 -1.282618
9  9  4 -0.731828

Created DataFrame by specifying the index:
   A  B         C
E  0  6 -1.190121
F  1  6  0.400636
G  2  6  1.376592
H  3  2 -0.765050
I  4  2 -0.461140
G  5  1  0.773439
K  6  1  0.548765
L  7  1  1.242279
M  8  1 -1.282618
N  9  4 -0.731828


In [102]:
pd.DataFrame(dict_2).head() # head() method shows the first 5 rows of a DataFrame.

Unnamed: 0,A,B,C
0,0,6,-1.190121
1,1,6,0.400636
2,2,6,1.376592
3,3,2,-0.76505
4,4,2,-0.46114


In [103]:
pd.DataFrame(dict_2, columns=['B', 'C', 'A']) # Rearranging the columns

Unnamed: 0,B,C,A
0,6,-1.190121,0
1,6,0.400636,1
2,6,1.376592,2
3,2,-0.76505,3
4,2,-0.46114,4
5,1,0.773439,5
6,1,0.548765,6
7,1,1.242279,7
8,1,-1.282618,8
9,4,-0.731828,9


In [120]:
pd.DataFrame(dict_2, columns=['A', 'B', 'C', 'D']) # Here, we don't have a key 'D' in dict_2. 
#Including 'D' in the columns will add that column and assigns NaN values to it.

Unnamed: 0,A,B,C,D
0,0,6,-1.190121,
1,1,6,0.400636,
2,2,6,1.376592,
3,3,2,-0.76505,
4,4,2,-0.46114,
5,5,1,0.773439,
6,6,1,0.548765,
7,7,1,1.242279,
8,8,1,-1.282618,
9,9,4,-0.731828,


Retrieving a column:

Either use dictionary notation or attribute: df['columnName'] or df.columnName

In [122]:
dataframe_1['Pens']

A    0.103533
B    0.683317
C   -0.014077
D   -1.171104
E    0.554717
F    0.519833
Name: Pens, dtype: float64

In [123]:
dataframe_1.Pens

A    0.103533
B    0.683317
C   -0.014077
D   -1.171104
E    0.554717
F    0.519833
Name: Pens, dtype: float64

Note that dictionary notation for getting a column works with any column name, but using attribute only works if the columnName is a valued Python variable name.

In [124]:
dataframe_1.columns

Index(['Books', 'Pens', 'Shoes'], dtype='object')

In [126]:
dataframe_1.columns = ['Books', 'Pens', 1]
dataframe_1

Unnamed: 0,Books,Pens,1
A,0.494267,0.103533,-0.237604
B,-2.831102,0.683317,-0.801478
C,-0.270498,-0.014077,2.144846
D,-0.57036,-1.171104,-1.079871
E,-1.01249,0.554717,-0.292275
F,1.070774,0.519833,0.192617


In [128]:
dataframe_1[1]

A   -0.237604
B   -0.801478
C    2.144846
D   -1.079871
E   -0.292275
F    0.192617
Name: 1, dtype: float64

In [130]:
dataframe_1.1 # We cannot retrieve column corresponding to 1!

SyntaxError: invalid syntax (492237829.py, line 1)

In [135]:
print(dataframe_1)
dataframe_1.loc['C'] # retrieves row with index 'C'

      Books      Pens         1
A  0.494267  0.103533 -0.237604
B -2.831102  0.683317 -0.801478
C -0.270498 -0.014077  2.144846
D -0.570360 -1.171104 -1.079871
E -1.012490  0.554717 -0.292275
F  1.070774  0.519833  0.192617


Books   -0.270498
Pens    -0.014077
1        2.144846
Name: C, dtype: float64

In [136]:
dataframe_1.iloc[2] # retrieves row number 3

Books   -0.270498
Pens    -0.014077
1        2.144846
Name: C, dtype: float64

In [138]:
dataframe_2 = pd.DataFrame(dict_2, columns=['A', 'B', 'C', 'D'])
print(dataframe_2)
dataframe_2['D'] = 2
print(dataframe_2)

   A  B         C    D
0  0  6 -1.190121  NaN
1  1  6  0.400636  NaN
2  2  6  1.376592  NaN
3  3  2 -0.765050  NaN
4  4  2 -0.461140  NaN
5  5  1  0.773439  NaN
6  6  1  0.548765  NaN
7  7  1  1.242279  NaN
8  8  1 -1.282618  NaN
9  9  4 -0.731828  NaN
   A  B         C  D
0  0  6 -1.190121  2
1  1  6  0.400636  2
2  2  6  1.376592  2
3  3  2 -0.765050  2
4  4  2 -0.461140  2
5  5  1  0.773439  2
6  6  1  0.548765  2
7  7  1  1.242279  2
8  8  1 -1.282618  2
9  9  4 -0.731828  2


In [142]:
dataframe_2['D'] = np.random.randn(10,1) # length of the assigned vector should match the number of rows in DF.
dataframe_2

Unnamed: 0,A,B,C,D
0,0,6,-1.190121,-1.615323
1,1,6,0.400636,-0.229937
2,2,6,1.376592,1.551206
3,3,2,-0.76505,0.038678
4,4,2,-0.46114,-0.75383
5,5,1,0.773439,0.471062
6,6,1,0.548765,-0.046366
7,7,1,1.242279,-0.861164
8,8,1,-1.282618,0.144519
9,9,4,-0.731828,-0.556329


In [155]:
series_6 = pd.Series(np.random.randint(1,5,10), index=np.arange(4, 14)) # 
dataframe_2 ['D'] = series_6 # When assigning a Series to a DF column, index are aligned.
dataframe_2

Unnamed: 0,A,B,C,D
0,0,6,-1.190121,
1,1,6,0.400636,
2,2,6,1.376592,
3,3,2,-0.76505,
4,4,2,-0.46114,1.0
5,5,1,0.773439,1.0
6,6,1,0.548765,3.0
7,7,1,1.242279,4.0
8,8,1,-1.282618,3.0
9,9,4,-0.731828,3.0


In [156]:
del dataframe_2['D'] # del keyword deletes a column
dataframe_2

Unnamed: 0,A,B,C
0,0,6,-1.190121
1,1,6,0.400636
2,2,6,1.376592
3,3,2,-0.76505
4,4,2,-0.46114
5,5,1,0.773439
6,6,1,0.548765
7,7,1,1.242279
8,8,1,-1.282618
9,9,4,-0.731828


In [158]:
dataframe_1 = pd.DataFrame(np.random.randn(6,3), columns=['Books', 'Pens', 'Shoes'])
print(dataframe_1)
book = dataframe_1['Books']
book = 1
print(dataframe_1)

      Books      Pens     Shoes
0 -0.336848 -1.747893 -2.623477
1  0.429558  0.412985 -0.454417
2  0.076991  0.283844 -0.512662
3  0.864625  2.176568  0.495595
4 -0.494927  0.931763  1.603079
5  2.809987  0.002387 -0.343251
      Books      Pens     Shoes
0 -0.336848 -1.747893 -2.623477
1  0.429558  0.412985 -0.454417
2  0.076991  0.283844 -0.512662
3  0.864625  2.176568  0.495595
4 -0.494927  0.931763  1.603079
5  2.809987  0.002387 -0.343251


In [47]:
dict1 = {'color':['red', 'blue', 'green'], 'price': [10, 23, 34]}
myDF = pd.DataFrame(dict1, index=['A', 'B', 'C'])
myDF

Unnamed: 0,color,price
A,red,10
B,blue,23
C,green,34


In [6]:
dict_5 = {'Store': [1, 2, 1, 2], 'Flavor': ['Choc', 'Van', 'Straw', 'Choc'], 'Sales': [26, 12, 18, 22]}
df_5 = pd.DataFrame(dict_5)
df_5

Unnamed: 0,Store,Flavor,Sales
0,1,Choc,26
1,2,Van,12
2,1,Straw,18
3,2,Choc,22


In [12]:
byStore = df_5.groupby('Store')
byStore.mean()

Unnamed: 0_level_0,Sales
Store,Unnamed: 1_level_1
1,22.0
2,17.0


In [13]:
byStore.sum().loc[1]

Sales    44
Name: 1, dtype: int64

In [15]:
byStore.describe()

Unnamed: 0_level_0,Sales,Sales,Sales,Sales,Sales,Sales,Sales,Sales
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
Store,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
1,2.0,22.0,5.656854,18.0,20.0,22.0,24.0,26.0
2,2.0,17.0,7.071068,12.0,14.5,17.0,19.5,22.0


#When you enter the parantheses you can press shift + tab to know the keys you can set. very informative.![image.png](attachment:image.png)

In [16]:
df_5.sum()

Store                    6
Flavor    ChocVanStrawChoc
Sales                   78
dtype: object

In [18]:
df_5.describe()

Unnamed: 0,Store,Sales
count,4.0,4.0
mean,1.5,19.5
std,0.57735,5.972158
min,1.0,12.0
25%,1.0,16.5
50%,1.5,20.0
75%,2.0,23.0
max,2.0,26.0


In [19]:
df_5

Unnamed: 0,Store,Flavor,Sales
0,1,Choc,26
1,2,Van,12
2,1,Straw,18
3,2,Choc,22


In [26]:
df_5['Sales'].apply(np.sqrt)

0    5.099020
1    3.464102
2    4.242641
3    4.690416
Name: Sales, dtype: float64

In [29]:
series1 = pd.Series(('iman', 'Shafikhani'), index = ['First name', 'Last name'])
series1

First name          iman
Last name     Shafikhani
dtype: object

In [32]:
df_1 = pd.DataFrame({'Contact': ['Iman Shafikhani', 'Mohammad Binaei']})
df_1

Unnamed: 0,Contact
0,Iman Shafikhani
1,Mohammad Binaei


In [34]:
def split_names(df):
    def get_names(full_name):
        f_name, l_name = full_name.split()
        return pd.Series(
        (f_name, l_name),
        index = ['First name', 'Last name'])
    names = df['Contact'].apply(get_names)
    df[names.columns] = names
    return df

In [36]:
    def get_names(full_name):
        f_name, l_name = full_name.split()
        return pd.Series(
        (f_name, l_name),
        index = ['First name', 'Last name'])

In [39]:
names = df_1['Contact'].apply(get_names)
names

Unnamed: 0,First name,Last name
0,Iman,Shafikhani
1,Mohammad,Binaei


In [40]:
names.columns

Index(['First name', 'Last name'], dtype='object')

In [42]:
df_1[names.columns] = names

In [43]:
df_1

Unnamed: 0,Contact,First name,Last name
0,Iman Shafikhani,Iman,Shafikhani
1,Mohammad Binaei,Mohammad,Binaei
