In [3]:
import numpy as np
import pandas as pd

In [4]:
#Pandas is an open-source library which is built on top of numpy
#Pandas can be thought of as the Excel of python.
#It is used for most of the preprocessing done before building a machine learning model
#Pandas also has built-in data visualization features available

In [5]:
#To install pandas we can use "pip install pandas" if we have a standard python distribution installed
#Else we can use "conda install pandas" if we have anaconda installed; the pip command will also work in conda

<h3>Series</h3>

In [6]:
#A Series is similar to a numpy array, but a pandas Series can also carry index labels for its entries

In [7]:
#For example, we have 2 lists: one with the index labels and one with the data values
index, data = [100, 200, 300], [1, 2, 3]

In [8]:
#Now to create a series we pass the list of values to pd.Series
pd.Series(data)

0    1
1    2
2    3
dtype: int64

In [9]:
#And it gives us the series of data 1,2,3
#One thing to notice here is the 0,1,2 on the left: these are the default index labels of the data points

In [10]:
#We can also have custom indexes as per our requirement by passing our own labels as index
pd.Series(data=data,index=index)

100    1
200    2
300    3
dtype: int64

In [11]:
#One more thing to notice here: we can pass the arguments positionally, in the order the function expects them,
# or we can pass them by keyword like data=data,index=index

In [12]:
#Now let's create a numpy array from the same list of values
array1=np.array(data)

In [13]:
#Display the array to check its contents
array1

array([1, 2, 3])

In [14]:
#We can also create a series from a numpy array
pd.Series(data=array1,index=index)

100    1
200    2
300    3
dtype: int32

In [15]:
#We can also create a series from a dictionary
dict1 = dict(a=1, b=2, c=3)
pd.Series(data=dict1)

a    1
b    2
c    3
dtype: int64

In [16]:
#So here the keys are used as the index and the values as the data for the series

In [17]:
#One of the best aspects of a pandas series is that it can hold any kind of python object, not just numbers
#For example, a series of built-in functions (note the dtype is object)
pd.Series(data=[sum,max,print])

0      <built-in function sum>
1      <built-in function max>
2    <built-in function print>
dtype: object

In [18]:
#Indexing a series by label works like looking up a key in a dictionary. I encourage you to go ahead and check the
# Introduction to python video if you don't know dictionary indexing

In [19]:
#So we will have one example of it
#Let's create a series from the numpy array, labelled with our custom index
se1 = pd.Series(array1, index=index)

In [20]:
#Display the series we just created
se1

100    1
200    2
300    3
dtype: int32

In [21]:
#Look up the value stored under the index label 100, just like a dictionary key
se1[100]

1

In [22]:
#We can perform mathematical operations on series

In [23]:
#For example, let's say we have 2 series: se1 from above
#and se2, which holds the same values but under the index labels 'a','b',300
se2 = pd.Series(array1, index=['a', 'b', 300])

In [24]:
#Now let's see both series, first se1
se1

100    1
200    2
300    3
dtype: int32

In [25]:
#and then se2
se2

a      1
b      2
300    3
dtype: int32

In [26]:
#Now let's add both; the entries are matched by index label, not by position
se1+se2

100    NaN
200    NaN
300    6.0
a      NaN
b      NaN
dtype: float64

In [27]:
#Note that pandas aligns the two series on their index labels before adding: where a label exists in both series (here 300)
# the values are added, and where a label exists in only one of the series the result is NaN

In [28]:
#One more thing to notice: the result is float64 because NaN is a float value, so the integers get converted to floats here; when every label matches and no NaN is produced, integer series stay integers

<h3>DataFrame</h3>

In [29]:
#A dataframe can be thought of as a matrix with labelled columns and a labelled index for the rows

In [30]:
#Let's go ahead and create a dataframe to understand
#We seed numpy's random generator first so that the same random numbers are produced every time the notebook is run
np.random.seed(101)
df=pd.DataFrame(data=np.random.randn(5,4),index=['a','b','c','d','e'],columns=['x','y','z','p'])

In [31]:
#Display the dataframe: 5 rows labelled a-e and 4 columns labelled x,y,z,p
df

Unnamed: 0,x,y,z,p
a,-1.355457,1.218857,2.053384,1.012857
b,1.378939,-0.383053,-0.84617,-1.339467
c,-1.624899,-0.711651,-1.183117,-0.100977
d,0.131765,0.947431,1.04967,-0.247348
e,0.299,0.431135,1.396473,2.230821


In [32]:
#So we can say the dataframe is a collection of series, because here x,y,z,p are 4 series which share the same index and combine together to form the dataframe

In [33]:
#Now let's check the indexing of the dataframe
#If we pass a column name inside square brackets we select that column
df['x']

a   -1.355457
b    1.378939
c   -1.624899
d    0.131765
e    0.299000
Name: x, dtype: float64

In [34]:
#We get the series x with the index printed out

In [35]:
#To verify that a single column is a series we can do...
type(df['x'])

pandas.core.series.Series

In [36]:
#We can also check the type of the whole object like
type(df) #Which is a dataframe

pandas.core.frame.DataFrame

In [37]:
#We can also select a column using attribute (dot) notation
df.x

a   -1.355457
b    1.378939
c   -1.624899
d    0.131765
e    0.299000
Name: x, dtype: float64

In [38]:
#But the [] square bracket notation is the better approach and is more widely used: it also works for column names that contain spaces or clash with dataframe method names

In [39]:
#To select multiple columns we can pass in a list of the column names
df[['x','z']]

Unnamed: 0,x,z
a,-1.355457,2.053384
b,1.378939,-0.84617
c,-1.624899,-1.183117
d,0.131765,1.04967
e,0.299,1.396473


In [40]:
#And it will give us a dataframe containing only the columns we have asked for

In [41]:
#We can add a new column to a dataframe by assigning a series to a new column name
df['new'] = df['x'].add(df['z'])

In [42]:
#Now let's display the dataframe again
df # has a new column named new which is the sum of x and z columns

Unnamed: 0,x,y,z,p,new
a,-1.355457,1.218857,2.053384,1.012857,0.697926
b,1.378939,-0.383053,-0.84617,-1.339467,0.532769
c,-1.624899,-0.711651,-1.183117,-0.100977,-2.808015
d,0.131765,0.947431,1.04967,-0.247348,1.181435
e,0.299,0.431135,1.396473,2.230821,1.695473


In [43]:
#To drop any column from a dataframe we might first try drop() with just the column name
#drop() looks for the label in the rows by default, so this raises a KeyError; we catch and print it
# so that the notebook still runs from top to bottom
try:
    df.drop('x')
except KeyError as err:
    print("KeyError:", err)

KeyError: "['x'] not found in axis"

In [44]:
#Here we are getting a key error "['x'] not found in axis", let's dig down
#df.drop() #Do shift+tab to check the documentation: axis=0 (the default) means the label is looked up in the rows,
# and to drop a column we have to set axis to 1
df.drop('x',axis=1)

Unnamed: 0,y,z,p,new
a,1.218857,2.053384,1.012857,0.697926
b,-0.383053,-0.84617,-1.339467,0.532769
c,-0.711651,-1.183117,-0.100977,-2.808015
d,0.947431,1.04967,-0.247348,1.181435
e,0.431135,1.396473,2.230821,1.695473


In [45]:
#One thing to notice: drop() returns a new dataframe and leaves df unchanged. If we want to permanently delete the column
# we have to pass inplace=True, which modifies df itself (or assign the result back, like df=df.drop('x',axis=1))

In [46]:
#Drop column x from df itself; errors='ignore' keeps this cell safe to re-run once x is already gone
df.drop('x',axis=1,inplace=True,errors='ignore')

In [47]:
#We can also drop any row; the only change is that we specify axis=0, or just don't specify the axis because axis=0 is the default
df.drop('e')

Unnamed: 0,y,z,p,new
a,1.218857,2.053384,1.012857,0.697926
b,-0.383053,-0.84617,-1.339467,0.532769
c,-0.711651,-1.183117,-0.100977,-2.808015
d,0.947431,1.04967,-0.247348,1.181435


In [48]:
#Rows

In [49]:
#We can select rows using loc (by index label) or iloc (by integer position)
#so
df.loc['e']

y      0.431135
z      1.396473
p      2.230821
new    1.695473
Name: e, dtype: float64

In [50]:
#It will provide us the series of the 'e' row

In [51]:
#We can also select the row by its integer position using iloc
df.iloc[4]

y      0.431135
z      1.396473
p      2.230821
new    1.695473
Name: e, dtype: float64

In [52]:
#We can select a single value by passing a row label and a column label, just like the subsetting we did in numpy;
# I will link that in the i button above and description below
df.loc['a','y']

1.2188568085437748

In [53]:
#We can also select a complete subset of the dataframe by passing a list of row labels and a list of column labels
df.loc[['a','b'],['y','z']]

Unnamed: 0,y,z
a,1.218857,2.053384
b,-0.383053,-0.84617


In [54]:
#Conditional selection

In [55]:
#Dataframes can have conditional selections just like we did in numpy, I will link it in the i button above and description
# below
#Comparing the dataframe with a value applies the comparison to every element

df>0

Unnamed: 0,y,z,p,new
a,True,True,True,True
b,False,False,False,True
c,False,False,False,False
d,True,True,False,True
e,True,True,True,True


In [56]:
#So here we get a dataframe of boolean values: True where the condition holds and False where it does not

In [57]:
#Now if we pass this boolean dataframe inside the square brackets of the dataframe
df[df>0]

Unnamed: 0,y,z,p,new
a,1.218857,2.053384,1.012857,0.697926
b,,,,0.532769
c,,,,
d,0.947431,1.04967,,1.181435
e,0.431135,1.396473,2.230821,1.695473


In [58]:
#We get all the values which are >0 and NaN where the condition fails

In [59]:
#If we apply the same condition to a single column then we get a boolean series
df['y'] >0

a     True
b    False
c    False
d     True
e     True
Name: y, dtype: bool

In [60]:
#Now let's use this boolean series to filter the rows of the entire dataframe
df[df['y'] >0]

Unnamed: 0,y,z,p,new
a,1.218857,2.053384,1.012857,0.697926
d,0.947431,1.04967,-0.247348,1.181435
e,0.431135,1.396473,2.230821,1.695473


In [61]:
#Notice that here we got only the rows for which the condition is True

In [63]:
#We can pick out a specific column from this sub dataframe
#Let's save the filtered dataframe first
test1 = df.loc[df['y'] > 0]
test1

Unnamed: 0,y,z,p,new
a,1.218857,2.053384,1.012857,0.697926
d,0.947431,1.04967,-0.247348,1.181435
e,0.431135,1.396473,2.230821,1.695473


In [64]:
#Now let's select one column of test1
test1['new']

a    0.697926
d    1.181435
e    1.695473
Name: new, dtype: float64

In [65]:
#For using multiple conditions we can combine them with the & operator
df[df['y'] >0]
#We have this single condition already

Unnamed: 0,y,z,p,new
a,1.218857,2.053384,1.012857,0.697926
d,0.947431,1.04967,-0.247348,1.181435
e,0.431135,1.396473,2.230821,1.695473


In [66]:
#Now let's put each condition inside its own brackets and join them with &
#The brackets are needed because & binds more tightly than > in python
df[(df['y'] >0) & (df['p'] >1)]

Unnamed: 0,y,z,p,new
a,1.218857,2.053384,1.012857,0.697926
e,0.431135,1.396473,2.230821,1.695473


In [67]:
#We get the values filtered out by the 2 conditions specified

In [68]:
#We can also use the OR operation using | (pipe): a row is kept if at least one of the conditions is True
df[(df['y'] >0) | (df['p'] >1)]

Unnamed: 0,y,z,p,new
a,1.218857,2.053384,1.012857,0.697926
d,0.947431,1.04967,-0.247348,1.181435
e,0.431135,1.396473,2.230821,1.695473


In [69]:
#Reset index

In [70]:
#One important operation of pandas is reset index; first let's look at the dataframe with its current index a-e
df

Unnamed: 0,y,z,p,new
a,1.218857,2.053384,1.012857,0.697926
b,-0.383053,-0.84617,-1.339467,0.532769
c,-0.711651,-1.183117,-0.100977,-2.808015
d,0.947431,1.04967,-0.247348,1.181435
e,0.431135,1.396473,2.230821,1.695473


In [76]:
#Now let's reset the index
df.reset_index()

Unnamed: 0,index,y,z,p,new
0,a,1.218857,2.053384,1.012857,0.697926
1,b,-0.383053,-0.84617,-1.339467,0.532769
2,c,-0.711651,-1.183117,-0.100977,-2.808015
3,d,0.947431,1.04967,-0.247348,1.181435
4,e,0.431135,1.396473,2.230821,1.695473


In [77]:
#It will reset the index to 0 to n-1 and will make the previous index a new column named index

In [78]:
# But to make it permanent we again have to provide inplace=True inside the brackets, as we did in the previous operation

In [79]:
#Now if we want to set one of the columns of the dataframe as the index we can use set_index
df.set_index('y')

Unnamed: 0_level_0,z,p,new
y,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1.218857,2.053384,1.012857,0.697926
-0.383053,-0.84617,-1.339467,0.532769
-0.711651,-1.183117,-0.100977,-2.808015
0.947431,1.04967,-0.247348,1.181435
0.431135,1.396473,2.230821,1.695473
