# Pandas

In [107]:
import numpy as np
import pandas as pd

## Series

In [108]:
# Make pandas series from List.  

pd.Series( data = [1, 2, 3, 4],  
           index = ['a', 'b', 'c', 'd'])

a    1
b    2
c    3
d    4
dtype: int64

In [109]:
# Make pandas series from dictionary.  
d = {'a': 1,  'b':2, 'c':3}
s1 = pd.Series(d)
s1

a    1
b    2
c    3
dtype: int64

In [110]:
print( s1[1] )   # Numerical indexing

print( s1['b'] ) # Label based indexing 

print( s1[1:] )   # Numerical index based slicing


2
2
b    2
c    3
dtype: int64


In [111]:
s1 + s1

a    2
b    4
c    6
dtype: int64

In [112]:
# Can Apply numpy functions to series
np.mean(s1)

2.0

## DataFrames

In [113]:
my_dict = {"name" : ["Joe","Bob","Frans"],
           "age" : np.array([10,15,20]),
           "weight" : (75,123,239),
           "height" : pd.Series([4.5, 5, 6.1]),
           "siblings" : 1,
           "gender" : "M"}

df = pd.DataFrame(my_dict)
df

Unnamed: 0,name,age,weight,height,siblings,gender
0,Joe,10,75,4.5,1,M
1,Bob,15,123,5.0,1,M
2,Frans,20,239,6.1,1,M


In [114]:
my_dict = {"name" : ["Joe","Bob","Frans"],
           "age" : np.array([10,15,20]),
           "weight" : (75, 123, 239),
           "height" : (4.5, 5, 6.1),
           "siblings" : 1,
           "gender" : "M"}

df = pd.DataFrame(my_dict, index = my_dict['name'])
df

Unnamed: 0,name,age,weight,height,siblings,gender
Joe,Joe,10,75,4.5,1,M
Bob,Bob,15,123,5.0,1,M
Frans,Frans,20,239,6.1,1,M


In [115]:
# df['weight']
df.weight

Joe       75
Bob      123
Frans    239
Name: weight, dtype: int64

In [116]:
del df['name']  # here "del df.name" does not seem to work. 
del df['weight']
df["IQ"] = [85, 95, 105]
df

Unnamed: 0,age,height,siblings,gender,IQ
Joe,10,4.5,1,M,85
Bob,15,5.0,1,M,95
Frans,20,6.1,1,M,105


In [117]:
df['gender'] = 'F'
df

Unnamed: 0,age,height,siblings,gender,IQ
Joe,10,4.5,1,F,85
Bob,15,5.0,1,F,95
Frans,20,6.1,1,F,105


In [118]:
df['College'] = pd.Series(['MIT', 'VIT'], index=['Joe', 'Bob'])
df  # Notice the NaN for Frans

Unnamed: 0,age,height,siblings,gender,IQ,College
Joe,10,4.5,1,F,85,MIT
Bob,15,5.0,1,F,95,VIT
Frans,20,6.1,1,F,105,


## Location based Indexing

#### Label Based Indexing

In [119]:
df.loc['Joe':'Bob']  # df.loc[<rows>]

Unnamed: 0,age,height,siblings,gender,IQ,College
Joe,10,4.5,1,F,85,MIT
Bob,15,5.0,1,F,95,VIT


In [120]:
df.loc[ ['Joe', 'Frans'], 
        'age':'gender'   ]  # df.loc[ <rows>, <cols> ]

Unnamed: 0,age,height,siblings,gender
Joe,10,4.5,1,F
Frans,20,6.1,1,F


#### Numerical based indexing

In [121]:
# In this example, rows are specified as range 0:2
# And cols are specified with a list of specific column numbers [0,2,4]

df.iloc[0:2, [0, 2, 4]]  # df.loc[ <rows>, <cols> ]

Unnamed: 0,age,siblings,IQ
Joe,10,1,85
Bob,15,1,95


In [122]:
# Use a list of T, F, T, T, ... to get the indexes we need. 
# For example, we want joe and Frans. But not Bob. 

indexes = [True, False, True]
df[indexes]

Unnamed: 0,age,height,siblings,gender,IQ,College
Joe,10,4.5,1,F,85,MIT
Frans,20,6.1,1,F,105,


In [123]:
indexes = df['age'] > 12  # We make a list of [T, F, T, T, ...] using some condition
print("indexes:: \n", indexes, '\n')
df[indexes]
# df[ df.age > 12 ]

indexes:: 
 Joe      False
Bob       True
Frans     True
Name: age, dtype: bool 



Unnamed: 0,age,height,siblings,gender,IQ,College
Bob,15,5.0,1,F,95,VIT
Frans,20,6.1,1,F,105,


In [124]:
df[ (df.age > 12) | (df.IQ > 100) ]

# Play around with different conditions and see results. Use & or |  operators. 

Unnamed: 0,age,height,siblings,gender,IQ,College
Bob,15,5.0,1,F,95,VIT
Frans,20,6.1,1,F,105,


In [125]:
df.shape

(3, 6)

In [126]:
df.head(2)  # By default first 5 items are displayed 
df.tail(2)  # By default, last 5 items are displayed

Unnamed: 0,age,height,siblings,gender,IQ,College
Bob,15,5.0,1,F,95,VIT
Frans,20,6.1,1,F,105,


#### Manipulate index of data frame

In [130]:
# Change indexes. 
df.reset_index(inplace=True)
df

Unnamed: 0,index,age,height,siblings,gender,IQ,College
0,Joe,10,4.5,1,F,85,MIT
1,Bob,15,5.0,1,F,95,VIT
2,Frans,20,6.1,1,F,105,


In [131]:
df.rename(columns={'index':'name'}, inplace=True)
df

Unnamed: 0,name,age,height,siblings,gender,IQ,College
0,Joe,10,4.5,1,F,85,MIT
1,Bob,15,5.0,1,F,95,VIT
2,Frans,20,6.1,1,F,105,


In [133]:
df.set_index('name', inplace=True)
df

Unnamed: 0_level_0,age,height,siblings,gender,IQ,College
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Joe,10,4.5,1,F,85,MIT
Bob,15,5.0,1,F,95,VIT
Frans,20,6.1,1,F,105,


In [138]:
list(df.index[0:2]) 


['Joe', 'Bob']

In [139]:
df.columns

Index(['age', 'height', 'siblings', 'gender', 'IQ', 'College'], dtype='object')

In [140]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3 entries, Joe to Frans
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       3 non-null      int32  
 1   height    3 non-null      float64
 2   siblings  3 non-null      int64  
 3   gender    3 non-null      object 
 4   IQ        3 non-null      int64  
 5   College   2 non-null      object 
dtypes: float64(1), int32(1), int64(2), object(2)
memory usage: 156.0+ bytes


In [141]:
df.describe()  # This works on NUMERICAL cols only.  Gives various statistics. 

Unnamed: 0,age,height,siblings,IQ
count,3.0,3.0,3.0,3.0
mean,15.0,5.2,1.0,95.0
std,5.0,0.818535,0.0,10.0
min,10.0,4.5,1.0,85.0
25%,12.5,4.75,1.0,90.0
50%,15.0,5.0,1.0,95.0
75%,17.5,5.55,1.0,100.0
max,20.0,6.1,1.0,105.0


In [143]:
np.mean( df, axis=0 )

  return mean(axis=axis, dtype=dtype, out=out, **kwargs)


age         15.0
height       5.2
siblings     1.0
IQ          95.0
dtype: float64