### Introduction to pandas

Installing and using Pandas:

In [4]:
import pandas
pandas.__version__

'1.0.5'

Introducing Pandas Objects:

In [5]:
import numpy as np
import pandas as pd

Series:

In [6]:
counts = pd.Series([544, 756, 876, 753])
counts

0    544
1    756
2    876
3    753
dtype: int64

In [7]:
counts.values

array([544, 756, 876, 753], dtype=int64)

In [8]:
counts.index

RangeIndex(start=0, stop=4, step=1)

In [9]:
animals = pd.Series([5500, 84, 2800, 3900],
    index = ['Black Rhino','Amur Leopard','Sumatran Elephant','Tiger'])
animals

Black Rhino          5500
Amur Leopard           84
Sumatran Elephant    2800
Tiger                3900
dtype: int64

In [10]:
animals['Tiger']

3900

In [11]:
animals[0]

5500

In [12]:
animals.name = 'counts'
animals.index.name = 'Endangered Animals'
animals

Endangered Animals
Black Rhino          5500
Amur Leopard           84
Sumatran Elephant    2800
Tiger                3900
Name: counts, dtype: int64

In [13]:
np.log(animals)

Endangered Animals
Black Rhino          8.612503
Amur Leopard         4.430817
Sumatran Elephant    7.937375
Tiger                8.268732
Name: counts, dtype: float64

In [14]:
animals[animals>2000]

Endangered Animals
Black Rhino          5500
Sumatran Elephant    2800
Tiger                3900
Name: counts, dtype: int64

In [15]:
animals_dict = {'Black Rhino' : 5500, 'Amur Leopard' : 84, 'Sumtran Elephant' : 2800, 'Tiger' : 3900}
print(animals_dict)
pd.Series(animals_dict)

{'Black Rhino': 5500, 'Amur Leopard': 84, 'Sumtran Elephant': 2800, 'Tiger': 3900}


Black Rhino         5500
Amur Leopard          84
Sumtran Elephant    2800
Tiger               3900
dtype: int64

DataFrame: bi-dimensional Series with two (or more) indices:

In [75]:
data = {"Student": ["Jenni", "John", "James", "Joyce"],
       "Year": [2019, 2018, 2019, 2018],
       "days_present": [190, 200, 188, 170],
       "Total_days": [200, 210, 200, 210]}
print(data)
data = pd.DataFrame(data)
data

{'Student': ['Jenni', 'John', 'James', 'Joyce'], 'Year': [2019, 2018, 2019, 2018], 'days_present': [190, 200, 188, 170], 'Total_days': [200, 210, 200, 210]}


Unnamed: 0,Student,Year,days_present,Total_days
0,Jenni,2019,190,200
1,John,2018,200,210
2,James,2019,188,200
3,Joyce,2018,170,210


In [76]:
df = pd.DataFrame(data, columns=["Year", "Student", "Total_days", "days_present"])
df

Unnamed: 0,Year,Student,Total_days,days_present
0,2019,Jenni,200,190
1,2018,John,210,200
2,2019,James,200,188
3,2018,Joyce,210,170


In [77]:
df['days_absent'] = df.Total_days - df.days_present 
df

Unnamed: 0,Year,Student,Total_days,days_present,days_absent
0,2019,Jenni,200,190,10
1,2018,John,210,200,10
2,2019,James,200,188,12
3,2018,Joyce,210,170,40


In [78]:
df['Series_aligned'] = pd.Series(range(4), index=[0,1,2,3])
df

Unnamed: 0,Year,Student,Total_days,days_present,days_absent,Series_aligned
0,2019,Jenni,200,190,10,0
1,2018,John,210,200,10,1
2,2019,James,200,188,12,2
3,2018,Joyce,210,170,40,3


In [79]:
df.to_dict()

{'Year': {0: 2019, 1: 2018, 2: 2019, 3: 2018},
 'Student': {0: 'Jenni', 1: 'John', 2: 'James', 3: 'Joyce'},
 'Total_days': {0: 200, 1: 210, 2: 200, 3: 210},
 'days_present': {0: 190, 1: 200, 2: 188, 3: 170},
 'days_absent': {0: 10, 1: 10, 2: 12, 3: 40},
 'Series_aligned': {0: 0, 1: 1, 2: 2, 3: 3}}

In [80]:
pd.DataFrame(df.to_dict())

Unnamed: 0,Year,Student,Total_days,days_present,days_absent,Series_aligned
0,2019,Jenni,200,190,10,0
1,2018,John,210,200,10,1
2,2019,James,200,188,12,2
3,2018,Joyce,210,170,40,3


DataFrame as specialized dictionary:

From a list of dicts:

In [81]:
data = [{'a': i, 'b':10 * i}for i in range(6)]
print(data)
pd.DataFrame(data)

[{'a': 0, 'b': 0}, {'a': 1, 'b': 10}, {'a': 2, 'b': 20}, {'a': 3, 'b': 30}, {'a': 4, 'b': 40}, {'a': 5, 'b': 50}]


Unnamed: 0,a,b
0,0,0
1,1,10
2,2,20
3,3,30
4,4,40
5,5,50


In [82]:
pd.DataFrame([{'aa': 1, 'bb':2}, {'bb':3, 'cc': 6}])

Unnamed: 0,aa,bb,cc
0,1.0,2,
1,,3,6.0


From a two-dimensional Numpy array:

In [83]:
pd.DataFrame(np.random.randint(11, 20),
            columns=['num1', 'num2'],
            index=['a', 'b', 'c'])

Unnamed: 0,num1,num2
a,12,12
b,12,12
c,12,12


The Pandas Index Object:

In [84]:
ind = pd.Index([10, 20, 38, 7, 42])
ind

Int64Index([10, 20, 38, 7, 42], dtype='int64')

Index as immutable array:

In [85]:
ind[1]

20

In [86]:
ind[::]

Int64Index([10, 20, 38, 7, 42], dtype='int64')

In [87]:
print(ind.size, ind.shape, ind.ndim, ind.dtype)

5 (5,) 1 int64


In [88]:
ind[1] = 0

TypeError: Index does not support mutable operations

Operating on data in Pandas:

Ufuncs: Index Preservation:

In [89]:
r = np.random.RandomState(15)
s = pd.Series(r.randint(0, 10, 4))
s

0    8
1    5
2    5
3    7
dtype: int32

In [90]:
d = pd.DataFrame(r.randint(0, 10, (5, 4)),
                  columns=['A', 'B', 'C', 'D'])
d

Unnamed: 0,A,B,C,D
0,0,7,5,6
1,1,7,0,4
2,9,7,5,3
3,6,8,2,1
4,1,0,5,2


In [91]:
np.exp(s)

0    2980.957987
1     148.413159
2     148.413159
3    1096.633158
dtype: float64

In [92]:
np.sin(d*np.pi/4)

Unnamed: 0,A,B,C,D
0,0.0,-0.7071068,-0.707107,-1.0
1,0.707107,-0.7071068,0.0,1.224647e-16
2,0.707107,-0.7071068,-0.707107,0.7071068
3,-1.0,-2.449294e-16,1.0,0.7071068
4,0.707107,0.0,-0.707107,1.0


Universal Functions: index Alignment:

Index alignment in Series

In [93]:
area = pd.Series({'Karnataka': 191791, 'TamilNadu': 130058, 'Kerala': 38863}, name='area')
population= pd.Series ({'Karnataka': 67562686,'TamilNadu': 77841267, 'Kerala': 35699443, 'Andhra Pradesh': 49386799}, name='population')
print(area)
population

Karnataka    191791
TamilNadu    130058
Kerala        38863
Name: area, dtype: int64


Karnataka         67562686
TamilNadu         77841267
Kerala            35699443
Andhra Pradesh    49386799
Name: population, dtype: int64

In [94]:
population / area

Andhra Pradesh           NaN
Karnataka         352.272453
Kerala            918.597200
TamilNadu         598.511949
dtype: float64

In [95]:
area.index | population.index

Index(['Andhra Pradesh', 'Karnataka', 'Kerala', 'TamilNadu'], dtype='object')

In [96]:
A = pd.Series([2, 3, 7], index=[0, 1, 5])
B = pd.Series([1, 3, 5], index=[1, 5, 6])
print(A)
print(B)
B
A+B

0    2
1    3
5    7
dtype: int64
1    1
5    3
6    5
dtype: int64


0     NaN
1     4.0
5    10.0
6     NaN
dtype: float64

In [97]:
A.add(B, fill_value=0)

0     2.0
1     4.0
5    10.0
6     5.0
dtype: float64

Ufuncs: Operations between DataFrame and Series 

Data wrangling

Merge operations:

In [98]:
df

Unnamed: 0,Year,Student,Total_days,days_present,days_absent,Series_aligned
0,2019,Jenni,200,190,10,0
1,2018,John,210,200,10,1
2,2019,James,200,188,12,2
3,2018,Joyce,210,170,40,3


In [109]:
df2 = pd.DataFrame({"Year": [2019, 2018, 2019, 2018], "Course": ["B.Tech", "B.Com", "BCA", "B.Sc"]})
df2

Unnamed: 0,Year,Course
0,2019,B.Tech
1,2018,B.Com
2,2019,BCA
3,2018,B.Sc


In [110]:
df.merge(df2)

Unnamed: 0,Year,Student,Total_days,days_present,days_absent,Series_aligned,Course
0,2019,Jenni,200,190,10,0,B.Tech
1,2019,Jenni,200,190,10,0,BCA
2,2019,James,200,188,12,2,B.Tech
3,2019,James,200,188,12,2,BCA
4,2018,John,210,200,10,1,B.Com
5,2018,John,210,200,10,1,B.Sc
6,2018,Joyce,210,170,40,3,B.Com
7,2018,Joyce,210,170,40,3,B.Sc


In [111]:
df3 = pd.DataFrame({"Year": [2019, 2018], "Course": ["B.Tech", "BCA"]})
df3
df.merge(df3, right_on='Year', left_on="Year")

Unnamed: 0,Year,Student,Total_days,days_present,days_absent,Series_aligned,Course
0,2019,Jenni,200,190,10,0,B.Tech
1,2019,James,200,188,12,2,B.Tech
2,2018,John,210,200,10,1,BCA
3,2018,Joyce,210,170,40,3,BCA


In [112]:
df4 = pd.DataFrame({"Year": [2019, 2018, 2017], "Course": ["B.Tech", "B.com", "BBA"]})
df.merge(df4, how='outer')

Unnamed: 0,Year,Student,Total_days,days_present,days_absent,Series_aligned,Course
0,2019,Jenni,200.0,190.0,10.0,0.0,B.Tech
1,2019,James,200.0,188.0,12.0,2.0,B.Tech
2,2018,John,210.0,200.0,10.0,1.0,B.com
3,2018,Joyce,210.0,170.0,40.0,3.0,B.com
4,2017,,,,,,BBA


In [115]:
df5 = pd.DataFrame({"Year": [2019, 2018, 2019], "Course": ["B.Tech", "B.com", "BBA"]})
print(df)
df.merge(df4, how='outer')

   Year Student  Total_days  days_present  days_absent  Series_aligned
0  2019   Jenni         200           190           10               0
1  2018    John         210           200           10               1
2  2019   James         200           188           12               2
3  2018   Joyce         210           170           40               3


Unnamed: 0,Year,Student,Total_days,days_present,days_absent,Series_aligned,Course
0,2019,Jenni,200.0,190.0,10.0,0.0,B.Tech
1,2019,James,200.0,188.0,12.0,2.0,B.Tech
2,2018,John,210.0,200.0,10.0,1.0,B.com
3,2018,Joyce,210.0,170.0,40.0,3.0,B.com
4,2017,,,,,,BBA


Combining data with overlap

In [119]:
series_a = pd.Series([np.nan, 2.5, np.nan, 3.5, 4.5, np.nan],
                    index=['f', 'e', 'd', 'c', 'b', 'a'])
series_b = pd.Series(np.arange(len(series_a), dtype=np.float64),
                    index=['f', 'e', 'd', 'c', 'b', 'a'])

In [120]:
series_a

f    NaN
e    2.5
d    NaN
c    3.5
b    4.5
a    NaN
dtype: float64

In [121]:
series_b

f    0.0
e    1.0
d    2.0
c    3.0
b    4.0
a    5.0
dtype: float64

In [122]:
pd.Series(np.where(pd.isnull(series_a), series_b, series_a), index=series_a.index)

f    0.0
e    2.5
d    2.0
c    3.5
b    4.5
a    5.0
dtype: float64

In [123]:
series_a.combine_first(series_b)

f    0.0
e    2.5
d    2.0
c    3.5
b    4.5
a    5.0
dtype: float64