In [1]:
import numpy as np
import pandas as pd

In [2]:
# Series with explicit string labels instead of the default 0..n-1 positions
s1 = pd.Series(
    data=[10, 20, 30, 40],
    index=list('abcd'),
)
print(s1)

a    10
b    20
c    30
d    40
dtype: int64


In [3]:
# Series.index holds the row labels as a pandas Index object
index_obj = s1.index
print(index_obj)

Index(['a', 'b', 'c', 'd'], dtype='object')


In [4]:
#Why the index is important - in both the Series and DataFrame structures of
#pandas we use the index to label and refer to the rows (and columns)

In [5]:
#Negative indexes (slices): a negative start position counts from the end
index_obj[-2:] #last 2 labels

Index(['c', 'd'], dtype='object')

In [6]:
index_obj[:-2] #everything except the last 2 labels (here that is the first 2, because there are 4 labels)

Index(['a', 'b'], dtype='object')

In [7]:
#range of indexes: positions 2 and 3 (the stop position 4 is excluded)
index_obj[2:4]

Index(['c', 'd'], dtype='object')

In [9]:
#warning - an Index object is immutable: a single label cannot be assigned in place,
#so the assignment below raises TypeError.
#(The labels can still be changed by building a new index, e.g. with rename().)
#The error is caught and printed so that "Run All" does not stop at this cell.
try:
    index_obj[0] = 'AA'
except TypeError as err:
    print(f"{type(err).__name__}: {err}")

TypeError: Index does not support mutable operations

In [10]:
#Workaround if you want to change an index label

print(s1.rename(index= { 'a': 'AA'})) #rename() returns a new Series; s1 itself is not modified
print(s1) #still shows the original label 'a'

AA    10
b     20
c     30
d     40
dtype: int64
a    10
b    20
c    30
d    40
dtype: int64


In [11]:
#to keep the change, assign the renamed Series back to s1
s1 = s1.rename(index= { 'a': 'AA'})
print(s1)

AA    10
b     20
c     30
d     40
dtype: int64


## Reindexing in Pandas Series and DataFrames

In [13]:
# Four integer values labelled 'a' through 'd'; used below to demonstrate reindex()
s2 = pd.Series(
    data=[8, 10, 12, 14],
    index=list('abcd'),
)
print(s2)

a     8
b    10
c    12
d    14
dtype: int64


In [14]:
#reindexing in series
#reindex() method: labels that did not exist before ('e', 'f') get NaN,
#and the int64 values become float64 so that NaN can be stored
s2 = s2.reindex(['a', 'b', 'c', 'd', 'e', 'f'])
print(s2)

a     8.0
b    10.0
c    12.0
d    14.0
e     NaN
f     NaN
dtype: float64


In [15]:
#reindex() method with fill_value
#fill_value - it only fills the labels newly created by this reindex call ('g');
#NaN values that already exist in the Series ('e', 'f') are left as NaN
s2 = s2.reindex(['a', 'b', 'c', 'd', 'e', 'f', 'g'], fill_value=100)
print(s2)

a      8.0
b     10.0
c     12.0
d     14.0
e      NaN
f      NaN
g    100.0
dtype: float64


In [17]:
#forward fill: every label missing from the original Series takes the last preceding value
new_index = range(12) #the labels 0 through 11
cars = pd.Series({0: 'BMW', 2: 'Toyota', 4: 'Audi'})
cars = cars.reindex(new_index, method='ffill')
print(cars)

0        BMW
1        BMW
2     Toyota
3     Toyota
4       Audi
5       Audi
6       Audi
7       Audi
8       Audi
9       Audi
10      Audi
11      Audi
dtype: object


In [19]:
#Reindexing in DataFrame
#5x5 frame of standard-normal draws; no seed is set here, so the values differ on every run
df1 = pd.DataFrame(
    np.random.randn(5, 5),
    index=list('abcde'),
    columns=[f'c{k}' for k in range(1, 6)],
)
print(df1)

         c1        c2        c3        c4        c5
a  0.809351 -0.963414  0.315374 -2.093022 -1.863182
b  0.444462  0.595545 -0.715835 -1.122231  0.756494
c -1.518347 -0.668090 -0.455313  0.491370 -1.147595
d -0.582754  0.196338  0.295717  0.028512 -2.009612
e -0.536911  1.473923 -0.452438  0.533474 -1.412212


In [21]:
#reindex rows and columns in one call: the new row 'f' and the new column 'c6' are filled with NaN
df1 = df1.reindex(index=['a','b','c','d','e','f'], columns=['c1','c2','c3','c4','c5','c6'])
print(df1)

         c1        c2        c3        c4        c5  c6
a  0.809351 -0.963414  0.315374 -2.093022 -1.863182 NaN
b  0.444462  0.595545 -0.715835 -1.122231  0.756494 NaN
c -1.518347 -0.668090 -0.455313  0.491370 -1.147595 NaN
d -0.582754  0.196338  0.295717  0.028512 -2.009612 NaN
e -0.536911  1.473923 -0.452438  0.533474 -1.412212 NaN
f       NaN       NaN       NaN       NaN       NaN NaN


## Dropping entries in pandas Series and DataFrames

In [2]:
import pandas as pd
import numpy as np
# Three car brands labelled 'a', 'b', 'c'; used below to demonstrate drop()
car = pd.Series(
    data=['BMW', 'Audi', 'Toyota'],
    index=list('abc'),
)
print(car)

a       BMW
b      Audi
c    Toyota
dtype: object


In [3]:
#drop values from series - drop() returns a new Series without the label 'a';
#the result is assigned back to car
car = car.drop('a')
print(car)

b      Audi
c    Toyota
dtype: object


In [8]:
# 3x3 frame of standard-normal draws: one row per brand, one column per test.
# No seed is set here, so the values differ on every run.
cars_df = pd.DataFrame(
    np.random.randn(3, 3),
    index=['BMW', 'Audi', 'Toyota'],
    columns=[f'test{k}' for k in range(1, 4)],
)
print(cars_df)

           test1     test2     test3
BMW    -0.850127 -0.257112 -1.303719
Audi   -1.508148  0.110887 -0.162927
Toyota -0.080251  1.475709  0.087193


In [9]:
#drop rows from a dataframe (by default drop() looks the label up among the row labels, axis=0)
cars_df = cars_df.drop('BMW')
print(cars_df)

           test1     test2     test3
Audi   -1.508148  0.110887 -0.162927
Toyota -0.080251  1.475709  0.087193


In [10]:
#drop column from a dataframe
cars_df = cars_df.drop('test1', axis=1)
print(cars_df)
#If you want to drop a column you need to indicate axis=1
#axis=0 (the default) refers to the row labels, while axis=1 refers to the column labels.

           test2     test3
Audi    0.110887 -0.162927
Toyota  1.475709  0.087193


## Handling Null/NaN values in pandas

In [11]:
import pandas as pd
import numpy as np

In [12]:
# Revenue per brand; Audi's value is missing (NaN), which makes the Series float64
revenue_series = pd.Series({
    'Toyota': 100,
    'BMW': 200,
    'Mercedes': 300,
    'Audi': np.nan,
})
print(revenue_series)

Toyota      100.0
BMW         200.0
Mercedes    300.0
Audi          NaN
dtype: float64


In [13]:
#checking for null using isnull(): returns a boolean Series that is True where the value is NaN
revenue_series.isnull()

Toyota      False
BMW         False
Mercedes    False
Audi         True
dtype: bool

In [14]:
#dropna() Series - it returns a new Series without the null/NaN values.
#revenue_series itself is unchanged because the result is not assigned back
revenue_series.dropna()

Toyota      100.0
BMW         200.0
Mercedes    300.0
dtype: float64

In [17]:
#dropna() Dataframe
#4x5 frame of standard-normal draws with default 0..n-1 row and column labels;
#no seed is set here, so the values differ on every run
df1 = pd.DataFrame(np.random.randn(4, 5))
print(df1)

          0         1         2         3         4
0  0.979743  0.462710  1.717713 -1.227572  0.046208
1  1.662072 -0.205818  0.566158 -0.773317  0.718604
2  0.981651 -0.852494  0.456300  0.128907  0.946295
3 -1.277895 -0.963629  0.580312  0.449990 -0.069141


In [18]:
#.loc[row_label, column_label] is used to select specific elements of a dataframe and assign values to them.
df1.loc[1, 2] = np.nan   #single cell: row 1, column 2
df1.loc[2, 1] = np.nan   #single cell: row 2, column 1
df1.loc[3, :] = np.nan   #the whole of row 3 (":" selects every column)
print(df1)

          0         1         2         3         4
0  0.979743  0.462710  1.717713 -1.227572  0.046208
1  1.662072 -0.205818       NaN -0.773317  0.718604
2  0.981651       NaN  0.456300  0.128907  0.946295
3       NaN       NaN       NaN       NaN       NaN


In [19]:
#how dropna() works in a dataframe: by default it drops every row that contains at least one NaN
#(only row 0 has no NaN here, so it is the only row left)
df1.dropna()

Unnamed: 0,0,1,2,3,4
0,0.979743,0.46271,1.717713,-1.227572,0.046208


In [20]:
#Disadvantages of dropping every row that contains a NaN value in a dataframe
#1. A whole row is deleted even though it has only one NaN value, discarding the rest of its data, which might be useful later
#2. It distorts the original distribution; it can produce skewed data.

In [21]:
#using how='all' - it drops only the rows in which every value is NaN (row 3 here)
df1.dropna(how='all')

Unnamed: 0,0,1,2,3,4
0,0.979743,0.46271,1.717713,-1.227572,0.046208
1,1.662072,-0.205818,,-0.773317,0.718604
2,0.981651,,0.4563,0.128907,0.946295


In [22]:
#Dataframe - dropna() along columns (axis=1): a column is dropped if it contains any NaN.
#Here columns 0,1,2,3,4 each have a NaN value (row 3 is all NaN), so every column is dropped
#and the result is an empty DataFrame that keeps only the row index
df1.dropna(axis=1)

0
1
2
3


In [23]:
# Four rows with 3, 4, 2 and 1 non-NaN values respectively; used below for thresh and fillna
df2 = pd.DataFrame(
    data=[
        [1, 2, 3, np.nan],
        [9, 8, 7, 6],
        [5, 4, np.nan, np.nan],
        [12, np.nan, np.nan, np.nan],
    ]
)
print(df2)

    0    1    2    3
0   1  2.0  3.0  NaN
1   9  8.0  7.0  6.0
2   5  4.0  NaN  NaN
3  12  NaN  NaN  NaN


In [25]:
#dropna with thresh parameter
#thresh=n - a row is kept only if it has at least n non-NaN values; rows with fewer than n are dropped
#thresh=3 keeps rows 0 and 1 (3 and 4 non-NaN values) and drops rows 2 and 3
df2.dropna(thresh=3)

Unnamed: 0,0,1,2,3
0,1,2.0,3.0,
1,9,8.0,7.0,6.0


In [26]:
df2.dropna(thresh=2) #row 3 has only 1 non-NaN value, so it is the only row dropped

Unnamed: 0,0,1,2,3
0,1,2.0,3.0,
1,9,8.0,7.0,6.0
2,5,4.0,,


In [27]:
#fillna(n) function - it returns a new DataFrame in which every NaN value is replaced with n
df2.fillna(0)

Unnamed: 0,0,1,2,3
0,1,2.0,3.0,0.0
1,9,8.0,7.0,6.0
2,5,4.0,0.0,0.0
3,12,0.0,0.0,0.0


In [28]:
df2.fillna({0:0, 1:50, 2:100, 3:200})
#a dict maps each column label to the value used to fill the NaN values of that column:
#column 0 with value 0 (column 0 has no NaN, so nothing changes there)
#column 1 with value 50
#column 2 with value 100
#column 3 with value 200

Unnamed: 0,0,1,2,3
0,1,2.0,3.0,200.0
1,9,8.0,7.0,6.0
2,5,4.0,100.0,200.0
3,12,50.0,100.0,200.0
