# Pandas

#### Essential Imports

In [1]:
import numpy as np
import pandas as pd

## Series

In [2]:
# Build a Series from a plain list, supplying custom index labels.
pd.Series(index=list("abcd"), data=[1, 2, 3, 4])

a    1
b    2
c    3
d    4
dtype: int64

In [3]:
# Build a Series from a dict: the keys become the index labels.
d = dict(a=1, b=2, c=3)
s1 = pd.Series(d)
s1

a    1
b    2
c    3
dtype: int64

In [4]:
# Position-based access goes through .iloc. Plain s1[1] on a Series whose index
# holds string labels relies on a deprecated positional fallback of [].
print( s1.iloc[1] )   # Positional indexing

print( s1['b'] )      # Label based indexing

print( s1.iloc[1:] )  # Positional slicing (end-exclusive, like a Python list)


2
2
b    2
c    3
dtype: int64


In [5]:
s1.add(s1)  # element-wise sum; operands are aligned on their index labels

a    2
b    4
c    6
dtype: int64

In [6]:
# NumPy reductions accept a Series directly; here, the mean of its values.
np.mean(s1)

2.0

## DataFrames

In [16]:
# Column data is supplied in several 1-D containers (list, ndarray, tuple);
# the dict keys become the column names.
my_dict = dict(
    name=["Joe", "Bob", "Frans", "Sammy", "Jimmy", "Tommy"],
    age=np.array([10, 15, 20, 25, 50, 40]),
    weight=(75, 123, 239, 50, 60, 70),
    height=pd.Series([4.5, 5, 6.1, 5.5, 5, 6]).values,
    siblings=[1, 0, 3, 1, 2, 1],
    gender=["M", "F", "M", "M", "M", "F"],
)

# No index is given, so the rows get the default integer labels 0..5.
df = pd.DataFrame(my_dict)
df

Unnamed: 0,name,age,weight,height,siblings,gender
0,Joe,10,75,4.5,1,M
1,Bob,15,123,5.0,0,F
2,Frans,20,239,6.1,3,M
3,Sammy,25,50,5.5,1,M
4,Jimmy,50,60,5.0,2,M
5,Tommy,40,70,6.0,1,F


In [14]:
df['height']  # item-style access to a single column (returns a Series)

0    4.5
1    5.0
2    6.1
3    5.5
4    5.0
5    6.0
Name: height, dtype: float64

In [17]:

# Rebuild the frame, this time labelling the rows with the names instead of 0..5.
df = pd.DataFrame(data=my_dict, index=my_dict["name"])
df

Unnamed: 0,name,age,weight,height,siblings,gender
Joe,Joe,10,75,4.5,1,M
Bob,Bob,15,123,5.0,0,F
Frans,Frans,20,239,6.1,3,M
Sammy,Sammy,25,50,5.5,1,M
Jimmy,Jimmy,50,60,5.0,2,M
Tommy,Tommy,40,70,6.0,1,F


In [18]:
# Attribute-style column access; the item form df['weight'] selects the same column.
df.weight

Joe       75
Bob      123
Frans    239
Sammy     50
Jimmy     60
Tommy     70
Name: weight, dtype: int64

In [19]:
# Remove two columns and add a new one in a single chained expression that
# returns a new frame instead of mutating the existing one step by step.
# Note: column deletion is only supported through item syntax (del df['name'])
# or drop(); attribute syntax such as "del df.name" does not work.
df = df.drop(columns=["name", "weight"]).assign(IQ=[85, 95, 105, 73, 84, 40])
df

Unnamed: 0,age,height,siblings,gender,IQ
Joe,10,4.5,1,M,85
Bob,15,5.0,0,F,95
Frans,20,6.1,3,M,105
Sammy,25,5.5,1,M,73
Jimmy,50,5.0,2,M,84
Tommy,40,6.0,1,F,40


In [20]:
# Assigning a scalar to a column broadcasts it to every row,
# overwriting the previous per-row gender values.
df['gender'] = 'F'
df

Unnamed: 0,age,height,siblings,gender,IQ
Joe,10,4.5,1,F,85
Bob,15,5.0,0,F,95
Frans,20,6.1,3,F,105
Sammy,25,5.5,1,F,73
Jimmy,50,5.0,2,F,84
Tommy,40,6.0,1,F,40


In [21]:
# Assigning a Series aligns on index labels: only Joe and Bob have a matching label.
df['College'] = pd.Series(['MIT', 'VIT'], index=['Joe', 'Bob'])
df  # Notice the NaN for every row without a matching label (Frans, Sammy, Jimmy, Tommy)

Unnamed: 0,age,height,siblings,gender,IQ,College
Joe,10,4.5,1,F,85,MIT
Bob,15,5.0,0,F,95,VIT
Frans,20,6.1,3,F,105,
Sammy,25,5.5,1,F,73,
Jimmy,50,5.0,2,F,84,
Tommy,40,6.0,1,F,40,


## Location-Based Indexing (`loc` and `iloc`)

#### Label Based Indexing

In [22]:
# Label slices with .loc include BOTH endpoints (Joe and Bob are returned).
df.loc['Joe':'Bob']  # df.loc[<rows>]

Unnamed: 0,age,height,siblings,gender,IQ,College
Joe,10,4.5,1,F,85,MIT
Bob,15,5.0,0,F,95,VIT


In [23]:
# Rows picked by an explicit list of labels, columns by a label slice
# (the slice runs from 'age' up to and including 'gender').
df.loc[['Joe', 'Frans'], 'age':'gender']  # df.loc[<rows>, <cols>]

Unnamed: 0,age,height,siblings,gender
Joe,10,4.5,1,F
Frans,20,6.1,3,F


#### Position-Based (Numerical) Indexing

In [24]:
# In this example, rows are specified as the positional range 0:2
# (end-exclusive, so positions 0 and 1).
# And cols are specified with a list of specific column positions [0,2,4]

df.iloc[0:2, [0, 2, 4]]  # df.iloc[ <rows>, <cols> ]

Unnamed: 0,age,siblings,IQ
Joe,10,1,85
Bob,15,0,95


In [28]:
# A list of booleans (one per row) selects the rows marked True.
# Here we keep Joe, Frans and Jimmy, and drop Bob, Sammy and Tommy.
print(df)

indexes = [True, False, True, False, True, False]
df[indexes]

       age  height  siblings gender   IQ College
Joe     10     4.5         1      F   85     MIT
Bob     15     5.0         0      F   95     VIT
Frans   20     6.1         3      F  105     NaN
Sammy   25     5.5         1      F   73     NaN
Jimmy   50     5.0         2      F   84     NaN
Tommy   40     6.0         1      F   40     NaN


Unnamed: 0,age,height,siblings,gender,IQ,College
Joe,10,4.5,1,F,85,MIT
Frans,20,6.1,3,F,105,
Jimmy,50,5.0,2,F,84,


In [35]:
# Comparing a column with a scalar yields a boolean Series: one True/False per row.
indexes = df['age'].gt(12)
print("indexes:: ", list(indexes), '\n')
# Same rows as the one-liner df[ df.age > 12 ].
df[indexes]

indexes::  [False, True, True, True, True, True] 



Unnamed: 0,age,height,siblings,gender,IQ,College
Bob,15,5.0,0,F,95,VIT
Frans,20,6.1,3,F,105,
Sammy,25,5.5,1,F,73,
Jimmy,50,5.0,2,F,84,
Tommy,40,6.0,1,F,40,


In [36]:
# ~ inverts a boolean mask element-wise (logical NOT).
# [F,  T, T, T, T, T] ==> [T,  F, F, F, F, F]
df[~ indexes]

Unnamed: 0,age,height,siblings,gender,IQ,College
Joe,10,4.5,1,F,85,MIT


In [38]:
df.age.gt(12) | df.IQ.gt(100)  # element-wise OR of two boolean Series

Joe      False
Bob       True
Frans     True
Sammy     True
Jimmy     True
Tommy     True
dtype: bool

In [39]:
# Combine conditions element-wise with | (or) and & (and). Each comparison must be
# parenthesised because | and & bind more tightly than > in Python.
df[ (df.age > 12) | (df.IQ > 100) ]

# Play around with different conditions and see results. Use & or | operators.

Unnamed: 0,age,height,siblings,gender,IQ,College
Bob,15,5.0,0,F,95,VIT
Frans,20,6.1,3,F,105,
Sammy,25,5.5,1,F,73,
Jimmy,50,5.0,2,F,84,
Tommy,40,6.0,1,F,40,


In [40]:
df.shape  # (number of rows, number of columns)

(6, 6)

In [42]:
df.head(n=2)  # First n rows; n defaults to 5 when omitted

Unnamed: 0,age,height,siblings,gender,IQ,College
Joe,10,4.5,1,F,85,MIT
Bob,15,5.0,0,F,95,VIT


In [43]:

df.tail(n=2)  # Last n rows; n defaults to 5 when omitted

Unnamed: 0,age,height,siblings,gender,IQ,College
Jimmy,50,5.0,2,F,84,
Tommy,40,6.0,1,F,40,


#### Manipulating the Index of a DataFrame

In [44]:
# Move the row labels back into an ordinary column (named 'index') and restore
# the default integer index. Reassigning the result instead of using
# inplace=True keeps the step explicit and chainable.
df = df.reset_index()
df

Unnamed: 0,index,age,height,siblings,gender,IQ,College
0,Joe,10,4.5,1,F,85,MIT
1,Bob,15,5.0,0,F,95,VIT
2,Frans,20,6.1,3,F,105,
3,Sammy,25,5.5,1,F,73,
4,Jimmy,50,5.0,2,F,84,
5,Tommy,40,6.0,1,F,40,


In [45]:
# Give the former index column a meaningful name. rename() returns a new
# frame, which is reassigned instead of mutating with inplace=True.
df = df.rename(columns={'index': 'name'})
df

Unnamed: 0,name,age,height,siblings,gender,IQ,College
0,Joe,10,4.5,1,F,85,MIT
1,Bob,15,5.0,0,F,95,VIT
2,Frans,20,6.1,3,F,105,
3,Sammy,25,5.5,1,F,73,
4,Jimmy,50,5.0,2,F,84,
5,Tommy,40,6.0,1,F,40,


In [46]:
# Promote the 'name' column to be the row index again. set_index() returns a
# new frame, which is reassigned instead of mutating with inplace=True.
df = df.set_index('name')
df

Unnamed: 0_level_0,age,height,siblings,gender,IQ,College
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Joe,10,4.5,1,F,85,MIT
Bob,15,5.0,0,F,95,VIT
Frans,20,6.1,3,F,105,
Sammy,25,5.5,1,F,73,
Jimmy,50,5.0,2,F,84,
Tommy,40,6.0,1,F,40,


In [47]:
df.index[:2].tolist()  # First two row labels as a plain Python list


['Joe', 'Bob']

In [48]:
df.columns  # Index object holding the column labels

Index(['age', 'height', 'siblings', 'gender', 'IQ', 'College'], dtype='object')

In [49]:
df.info()  # Prints the index range, per-column non-null counts and dtypes, and memory usage

<class 'pandas.core.frame.DataFrame'>
Index: 6 entries, Joe to Tommy
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       6 non-null      int32  
 1   height    6 non-null      float64
 2   siblings  6 non-null      int64  
 3   gender    6 non-null      object 
 4   IQ        6 non-null      int64  
 5   College   2 non-null      object 
dtypes: float64(1), int32(1), int64(2), object(2)
memory usage: 312.0+ bytes


In [50]:
# Summary statistics (count, mean, std, min, quartiles, max).
# By default only the NUMERICAL columns of a mixed-type frame are summarised.
df.describe()

Unnamed: 0,age,height,siblings,IQ
count,6.0,6.0,6.0,6.0
mean,26.666667,5.35,1.333333,80.333333
std,15.383974,0.62849,1.032796,22.535897
min,10.0,4.5,0.0,40.0
25%,16.25,5.0,1.0,75.75
50%,22.5,5.25,1.0,84.5
75%,36.25,5.875,1.75,92.5
max,50.0,6.1,3.0,105.0


In [51]:
# Column-wise mean of the numeric columns only. 'gender' and 'College' hold
# strings, so they are excluded explicitly with numeric_only=True: calling
# np.mean(df, axis=0) on the whole frame emits a warning on older pandas and
# raises a TypeError on newer versions, which no longer drop such columns silently.
df.mean(axis=0, numeric_only=True)

  return mean(axis=axis, dtype=dtype, out=out, **kwargs)


age         26.666667
height       5.350000
siblings     1.333333
IQ          80.333333
dtype: float64