# Introduction to Python - Part 2

In [1]:
import numpy as np

In [2]:
arr = np.arange(1,10).reshape(3,3)
arr

array([[1, 2, 3],
       [4, 5, 6],
       [7, 8, 9]])

In [3]:
arr.mean()

5.0

In [4]:
arr.std()

2.581988897471611

In [5]:
arr.var()

6.666666666666667

In [6]:
arr.max()

9

In [7]:
arr.min()

1

In [8]:
arr.sum()

45

In [9]:
# Row-wise mean: averaging along axis 1 collapses the columns
arr.mean(axis=1)

array([2., 5., 8.])

In [10]:
# Column-wise mean: averaging along axis 0 collapses the rows
arr.mean(axis=0)

array([4., 5., 6.])

In [11]:
np.random.randint(1,101,size=10)

array([30, 36, 61, 92, 25,  1, 19,  7, 45, 37])

In [12]:
from numpy.random import randint

In [13]:
randint(1,51,5)

array([23, 38, 12, 22, 12])

In [14]:
# standard normal distribution
np.random.randn(10)

array([-0.87391089,  0.56139345, -1.80068377,  0.06404249,  1.4835147 ,
       -1.42282554, -0.30973386, -0.68608974,  1.57285013, -0.0358867 ])

In [15]:
a = np.random.randint(1,100,10)
b = np.random.randint(1,100,10)

In [16]:
a

array([56, 92, 32, 94, 74, 29,  6, 11, 53, 46])

In [17]:
b

array([70, 67, 19, 82, 13, 78, 54, 41, 84, 75])

In [18]:
summ = a+b
summ

array([126, 159,  51, 176,  87, 107,  60,  52, 137, 121])

In [19]:
np.maximum(a,b)

array([70, 92, 32, 94, 74, 78, 54, 41, 84, 75])

In [20]:
np.minimum(a,b)

array([56, 67, 19, 82, 13, 29,  6, 11, 53, 46])

In [21]:
arr1 = np.array([1,2,3,4])

In [22]:
arr2 = np.array([100,200,300,400])

In [23]:
arr3 = np.array([True,False,True,False])

In [24]:
np.where(arr3,arr1,arr2)

array([  1, 200,   3, 400])

In [25]:
# Think of np.where(cond, x, y) as an element-wise ternary operator:
# take the value from arr1 where the condition holds, otherwise from arr2.
np.where(arr1<3,arr1,arr2)

array([  1,   2, 300, 400])

In [26]:
rand1 = np.random.randint(1,100,10)
rand1

array([97, 85, 17, 81, 61, 40,  5, 85, 42, 16])

In [27]:
# With only a condition, np.where returns the positions where it is True.
# Test rand1 (the array created in the previous cell), not the unrelated `a`.
np.where(rand1<40)

(array([2, 6, 9]),)

In [28]:
# Pick out the low values of rand1 (e.g. to flag outliers). The mask must be
# computed on rand1 itself, not on `a`; a boolean mask indexes the array directly.
rand1[rand1<40]

array([17,  5, 16])

### Pandas

In [29]:
import pandas as pd
from pandas import Series, DataFrame

In [30]:
s1 = Series([10,20,30,40])
s1

0    10
1    20
2    30
3    40
dtype: int64

In [31]:
list('ABCD')

['A', 'B', 'C', 'D']

In [32]:
s2 = Series([10,20,30,40],index=list('ABCD'))
s2

A    10
B    20
C    30
D    40
dtype: int64

In [33]:
s2.index

Index(['A', 'B', 'C', 'D'], dtype='object')

In [34]:
s2.values

array([10, 20, 30, 40])

In [35]:
s3 = Series({'A':'Alpha','B':'Beta'})
s3

A    Alpha
B     Beta
dtype: object

In [36]:
# Grab by position: use .iloc. Plain s3[0] on a string-labelled Series relies
# on a deprecated positional fallback and will be treated as a label lookup.
s3.iloc[0]

'Alpha'

In [37]:
# Grab by index label
s3['A']

'Alpha'

In [38]:
# Grab by positional range (the stop runs past the end, so only 'B' comes back)
s3.iloc[1:3]

B    Beta
dtype: object

In [39]:
# grab by multiple label
s3[['A','B']]

A    Alpha
B     Beta
dtype: object

In [40]:
# grab by values
s1[s1<25]

0    10
1    20
dtype: int64

In [41]:
s4 = Series(['INDIA','BHUTAN','NEPAL'],index = [1,5,10])
s4

1      INDIA
5     BHUTAN
10     NEPAL
dtype: object

In [42]:
s4.reindex(range(15))

0        NaN
1      INDIA
2        NaN
3        NaN
4        NaN
5     BHUTAN
6        NaN
7        NaN
8        NaN
9        NaN
10     NEPAL
11       NaN
12       NaN
13       NaN
14       NaN
dtype: object

In [43]:
s4.reindex(range(15), fill_value='India')

0      India
1      INDIA
2      India
3      India
4      India
5     BHUTAN
6      India
7      India
8      India
9      India
10     NEPAL
11     India
12     India
13     India
14     India
dtype: object

In [44]:
# Constant fill: same call as the previous cell — every label missing from
# s4 receives fill_value. (This is not forward filling; method='ffill' is.)
s4.reindex(range(15), fill_value='India')

0      India
1      INDIA
2      India
3      India
4      India
5     BHUTAN
6      India
7      India
8      India
9      India
10     NEPAL
11     India
12     India
13     India
14     India
dtype: object

In [45]:
# Forward fill: each new label takes the value of the closest earlier label
# in s4; label 0 has nothing before it, so it stays NaN.
s4.reindex(range(15), method='ffill')

0        NaN
1      INDIA
2      INDIA
3      INDIA
4      INDIA
5     BHUTAN
6     BHUTAN
7     BHUTAN
8     BHUTAN
9     BHUTAN
10     NEPAL
11     NEPAL
12     NEPAL
13     NEPAL
14     NEPAL
dtype: object

In [46]:
# Backward fill: each new label takes the value of the closest later label
# in s4; labels 11-14 have nothing after them, so they stay NaN.
s4.reindex(range(15), method='bfill')

0      INDIA
1      INDIA
2     BHUTAN
3     BHUTAN
4     BHUTAN
5     BHUTAN
6      NEPAL
7      NEPAL
8      NEPAL
9      NEPAL
10     NEPAL
11       NaN
12       NaN
13       NaN
14       NaN
dtype: object

In [47]:
# Nearest fill: each new label takes the value of the closest existing label,
# looking in both directions, so no NaN remains in this output.
s4.reindex(range(15), method='nearest')

0      INDIA
1      INDIA
2      INDIA
3     BHUTAN
4     BHUTAN
5     BHUTAN
6     BHUTAN
7     BHUTAN
8      NEPAL
9      NEPAL
10     NEPAL
11     NEPAL
12     NEPAL
13     NEPAL
14     NEPAL
dtype: object

In [48]:
df1 = DataFrame(np.arange(1,26).reshape(5,5),
                index=list('ABCDE'),
                columns=list('UVWXY'))
df1

Unnamed: 0,U,V,W,X,Y
A,1,2,3,4,5
B,6,7,8,9,10
C,11,12,13,14,15
D,16,17,18,19,20
E,21,22,23,24,25


In [49]:
df1[['U','X']]

Unnamed: 0,U,X
A,1,4
B,6,9
C,11,14
D,16,19
E,21,24


In [50]:
df1[1:3]

Unnamed: 0,U,V,W,X,Y
B,6,7,8,9,10
C,11,12,13,14,15


In [51]:
df1.iloc[0,:]

U    1
V    2
W    3
X    4
Y    5
Name: A, dtype: int64

In [52]:
df1.iloc[1:3,:]

Unnamed: 0,U,V,W,X,Y
B,6,7,8,9,10
C,11,12,13,14,15


In [53]:
df1.iloc[:,0]

A     1
B     6
C    11
D    16
E    21
Name: U, dtype: int64

In [54]:
df1.iloc[:,2:4]

Unnamed: 0,W,X
A,3,4
B,8,9
C,13,14
D,18,19
E,23,24


In [55]:
df1.loc['A']

U    1
V    2
W    3
X    4
Y    5
Name: A, dtype: int64

In [56]:
df1.loc[:,'X']

A     4
B     9
C    14
D    19
E    24
Name: X, dtype: int64

In [57]:
# Remove column 'Y'. Rebinding to the result of drop(..., errors='ignore')
# keeps this cell safe to re-run, whereas `del df1['Y']` raises KeyError
# the second time it is executed.
df1 = df1.drop(columns='Y', errors='ignore')

In [58]:
df1.index

Index(['A', 'B', 'C', 'D', 'E'], dtype='object')

In [59]:
df1.values

array([[ 1,  2,  3,  4],
       [ 6,  7,  8,  9],
       [11, 12, 13, 14],
       [16, 17, 18, 19],
       [21, 22, 23, 24]])

In [60]:
df1.columns[-1]

'X'

In [61]:
# (DEFAULT) AXIS = 0 means Row
df1.drop('E',axis=0)

Unnamed: 0,U,V,W,X
A,1,2,3,4
B,6,7,8,9
C,11,12,13,14
D,16,17,18,19


In [62]:
# AXIS = 1 means Column
df1.drop('X',axis=1)

Unnamed: 0,U,V,W
A,1,2,3
B,6,7,8
C,11,12,13
D,16,17,18
E,21,22,23


In [63]:
df1.drop(['X','U'], axis=1)

Unnamed: 0,V,W
A,2,3
B,7,8
C,12,13
D,17,18
E,22,23


In [64]:
df1.drop(df1.columns[-1], axis=1)

Unnamed: 0,U,V,W
A,1,2,3
B,6,7,8
C,11,12,13
D,16,17,18
E,21,22,23


In [65]:
df1.drop(df1.index[-3:])

Unnamed: 0,U,V,W,X
A,1,2,3,4
B,6,7,8,9


In [66]:
# Make different data type columns
df2 = DataFrame({'Name':list('ABCDE'),
                'Marks':[22,33,44,55,66]},index=list('12345'))
df2

Unnamed: 0,Name,Marks
1,A,22
2,B,33
3,C,44
4,D,55
5,E,66


In [67]:
df3 = DataFrame({
    'Name':['Google','Microsoft','Yahoo','Facebook','Infosys'],
    'Ticker':['GOOG','MS','YH','FB','IF'],
    'Price':np.random.randint(1,800,5)
})
df3

Unnamed: 0,Name,Ticker,Price
0,Google,GOOG,336
1,Microsoft,MS,384
2,Yahoo,YH,433
3,Facebook,FB,341
4,Infosys,IF,249


In [68]:
# Filter rows by ticker symbol. Address the column by its label: a positional
# lookup such as df3.columns[-2] silently points at a different column once
# later cells append 'Country' and 'Discount Price'.
df3[df3['Ticker']=='YH']

Unnamed: 0,Name,Ticker,Price
2,Yahoo,YH,433


In [69]:
df3[df3['Name']=='Yahoo']

Unnamed: 0,Name,Ticker,Price
2,Yahoo,YH,433


In [70]:
df3['Country'] ='India'
df3

Unnamed: 0,Name,Ticker,Price,Country
0,Google,GOOG,336,India
1,Microsoft,MS,384,India
2,Yahoo,YH,433,India
3,Facebook,FB,341,India
4,Infosys,IF,249,India


In [71]:
df3['Country']=['US','US','IN','IN','US']
df3

Unnamed: 0,Name,Ticker,Price,Country
0,Google,GOOG,336,US
1,Microsoft,MS,384,US
2,Yahoo,YH,433,IN
3,Facebook,FB,341,IN
4,Infosys,IF,249,US


In [72]:
df3['Discount Price']= df3['Price'] - df3['Price']*0.01
df3

Unnamed: 0,Name,Ticker,Price,Country,Discount Price
0,Google,GOOG,336,US,332.64
1,Microsoft,MS,384,US,380.16
2,Yahoo,YH,433,IN,428.67
3,Facebook,FB,341,IN,337.59
4,Infosys,IF,249,US,246.51


In [73]:
df3[df3['Price'] >500]

Unnamed: 0,Name,Ticker,Price,Country,Discount Price


In [74]:
 (df3['Price']>500) & (df3['Country']=='US')

0    False
1    False
2    False
3    False
4    False
dtype: bool

In [75]:
df4 = DataFrame({
    'Name':list('ABCDEFGHIJ'),
    'Marks1': np.random.randint(1,101,10),
    'Marks2': np.random.randint(1,101,10)
})
df4

Unnamed: 0,Name,Marks1,Marks2
0,A,1,16
1,B,51,25
2,C,63,74
3,D,16,29
4,E,17,6
5,F,97,71
6,G,79,83
7,H,58,21
8,I,46,40
9,J,11,17


In [76]:
df4.reindex(range(15))

Unnamed: 0,Name,Marks1,Marks2
0,A,1.0,16.0
1,B,51.0,25.0
2,C,63.0,74.0
3,D,16.0,29.0
4,E,17.0,6.0
5,F,97.0,71.0
6,G,79.0,83.0
7,H,58.0,21.0
8,I,46.0,40.0
9,J,11.0,17.0


In [77]:
df4.reindex(['Name','Marks1','Marks2','Marks3','Marks'],axis=1)

Unnamed: 0,Name,Marks1,Marks2,Marks3,Marks
0,A,1,16,,
1,B,51,25,,
2,C,63,74,,
3,D,16,29,,
4,E,17,6,,
5,F,97,71,,
6,G,79,83,,
7,H,58,21,,
8,I,46,40,,
9,J,11,17,,


In [78]:
df4

Unnamed: 0,Name,Marks1,Marks2
0,A,1,16
1,B,51,25
2,C,63,74
3,D,16,29
4,E,17,6
5,F,97,71
6,G,79,83
7,H,58,21
8,I,46,40
9,J,11,17


In [79]:
df4.iloc[3:5,1:]

Unnamed: 0,Marks1,Marks2
3,16,29
4,17,6


In [80]:
df4.iloc[2:4,1] = np.array([60,60])

In [81]:
df4

Unnamed: 0,Name,Marks1,Marks2
0,A,1,16
1,B,51,25
2,C,60,74
3,D,60,29
4,E,17,6
5,F,97,71
6,G,79,83
7,H,58,21
8,I,46,40
9,J,11,17


In [82]:
df4.sort_values('Marks1', ascending=False)

Unnamed: 0,Name,Marks1,Marks2
5,F,97,71
6,G,79,83
2,C,60,74
3,D,60,29
7,H,58,21
1,B,51,25
8,I,46,40
4,E,17,6
9,J,11,17
0,A,1,16


In [83]:
df4.sort_values(['Marks1','Marks2'], ascending=[False,True])

Unnamed: 0,Name,Marks1,Marks2
5,F,97,71
6,G,79,83
3,D,60,29
2,C,60,74
7,H,58,21
1,B,51,25
8,I,46,40
4,E,17,6
9,J,11,17
0,A,1,16


In [84]:
# inplace=True sorts df4 itself instead of returning a new frame, which is
# why this cell displays nothing and the next cell shows df4 reordered.
df4.sort_values(['Marks1','Marks2'], ascending=[False,True],inplace=True)

In [85]:
df4

Unnamed: 0,Name,Marks1,Marks2
5,F,97,71
6,G,79,83
3,D,60,29
2,C,60,74
7,H,58,21
1,B,51,25
8,I,46,40
4,E,17,6
9,J,11,17
0,A,1,16


In [86]:
# Sort by index label (in place) to bring the rows back to 0..9 order.
df4.sort_index(inplace=True)
df4

Unnamed: 0,Name,Marks1,Marks2
0,A,1,16
1,B,51,25
2,C,60,74
3,D,60,29
4,E,17,6
5,F,97,71
6,G,79,83
7,H,58,21
8,I,46,40
9,J,11,17


### Null Handling
* Count of null values
* Convert to a boolean mask (`isnull` / `notnull`)
* Replace
* Drop

In [87]:
null1 = np.array([1,2,np.nan,np.nan])
null1

array([ 1.,  2., nan, nan])

In [88]:
Series(null1)

0    1.0
1    2.0
2    NaN
3    NaN
dtype: float64

In [89]:
null2 = DataFrame({
    'A':[1,np.nan,np.nan,np.nan],
    'B':[2,5,8,np.nan],
    'C':[3,6,np.nan,np.nan]
})
null2

Unnamed: 0,A,B,C
0,1.0,2.0,3.0
1,,5.0,6.0
2,,8.0,
3,,,


In [90]:
null2.isnull()

Unnamed: 0,A,B,C
0,False,False,False
1,True,False,False
2,True,False,True
3,True,True,True


In [91]:
null2.isna()

Unnamed: 0,A,B,C
0,False,False,False
1,True,False,False
2,True,False,True
3,True,True,True


In [92]:
null2.notnull()

Unnamed: 0,A,B,C
0,True,True,True
1,False,True,True
2,False,True,False
3,False,False,False


In [93]:
null2['A'].isnull().values

array([False,  True,  True,  True])

In [94]:
null2['A'].isnull().sum()

3

In [95]:
null2.isnull().sum()

A    3
B    1
C    2
dtype: int64

In [96]:
null2.dropna()

Unnamed: 0,A,B,C
0,1.0,2.0,3.0


In [97]:
null2.dropna(axis=1)

0
1
2
3


In [98]:
null2.dropna(how='all')

Unnamed: 0,A,B,C
0,1.0,2.0,3.0
1,,5.0,6.0
2,,8.0,


In [99]:
# thresh counts NON-null values: keep only rows with at least `thresh`
# non-null entries (thresh=1 drops just the all-NaN row).
null2.dropna(thresh=1)

Unnamed: 0,A,B,C
0,1.0,2.0,3.0
1,,5.0,6.0
2,,8.0,


In [100]:
null2.dropna(thresh=2)

Unnamed: 0,A,B,C
0,1.0,2.0,3.0
1,,5.0,6.0


In [101]:
# Replace the missing values in 'B' with the column mean. Assign the result
# back: fillna(..., inplace=True) on the selection null2['B'] is chained
# assignment, which warns in pandas 2.x and leaves null2 unchanged under
# Copy-on-Write.
null2['B'] = null2['B'].fillna(null2['B'].mean())

In [102]:
null2

Unnamed: 0,A,B,C
0,1.0,2.0,3.0
1,,5.0,6.0
2,,8.0,
3,,5.0,


In [103]:
# Forward-fill the remaining NaNs down each column. fillna(method='ffill') is
# deprecated (pandas 2.1+); DataFrame.ffill() is the supported spelling.
null2.ffill()

Unnamed: 0,A,B,C
0,1.0,2.0,3.0
1,1.0,5.0,6.0
2,1.0,8.0,6.0
3,1.0,5.0,6.0


###### Thank you! 