# Advanced Pandas

## Intro to Pandas

In [3]:
import pandas as pd

In [7]:
## Let's view the installed version of pandas
print(pd.__version__)

2.1.4


In [13]:
## Now let's create a Series in pandas.
## A Series is a one-dimensional labelled array; a DataFrame is the
## two-dimensional (tabular) counterpart made up of several Series.
A = pd.Series(data=[3, 4, 5, 6], index=list('abcd'))

In [15]:
## The underlying data is a NumPy array of integers (see the dtype in the output)
A.values

array([3, 4, 5, 6], dtype=int64)

In [17]:
## The index labels are strings, which pandas reports as dtype 'object'
A.index

Index(['a', 'b', 'c', 'd'], dtype='object')

In [19]:
## Notice that A.values (3, 4, 5, 6) is a NumPy ndarray
type(A.values)

numpy.ndarray

In [24]:
## Notice that A itself is a pandas Series
type(A)

pandas.core.series.Series

In [28]:
## Now let's display A: index labels on the left, values on the right
A

a    3
b    4
c    5
d    6
dtype: int64

In [30]:
## Let's index A by label: 'a' maps to the value 3
A['a']

3

In [36]:
## Let's slice by label; note that the end label 'c' is included in the result
A['a':'c']

a    3
b    4
c    5
dtype: int64

## Pandas Series

In [4]:
import pandas as pd

In [6]:
## Let's make a pandas Series from a dictionary:
## the dict keys become the index labels and the dict values become the data.

grades_dict = dict(A=90, B=80, C=70, D=60)
grades = pd.Series(data=grades_dict)
grades

A    90
B    80
C    70
D    60
dtype: int64

In [12]:
## grades is a pandas Series
type(grades)

pandas.core.series.Series

In [14]:
## grades_dict, the object the Series was built from, is a plain Python dict
type(grades_dict)

dict

In [26]:
## You can use explicit (label-based) indices; the end label 'C' is included
grades['A':'C']

A    90
B    80
C    70
dtype: int64

In [24]:
## You can also use implicit (positional) indices.
## .iloc states the positional intent explicitly; the end position 3 is excluded.
grades.iloc[0:3]

A    90
B    80
C    70
dtype: int64

## Pandas Dataframes Intro

In [4]:
import pandas as pd

In [6]:
## Let's create two Series that share the same index labels; a DataFrame is
## a group of Series, so these will become its columns.

grades_dict = dict(A=4, B=3.5, C=3, D=2.5)
grades = pd.Series(data=grades_dict)

marks_dict = dict(A=90, B=80, C=70, D=60)
marks = pd.Series(data=marks_dict)

In [8]:
## The grades Series holds floats, so its dtype is float64
grades

A    4.0
B    3.5
C    3.0
D    2.5
dtype: float64

In [10]:
## The marks Series holds only whole numbers, so its dtype is an integer type
marks

A    90
B    80
C    70
D    60
dtype: int64

In [12]:
## Now let's combine the two Series into a DataFrame: each dict key becomes a
## column name and the rows are aligned on the Series' index labels.

gradebook = pd.DataFrame(data=dict(grades=grades, marks=marks))
gradebook

Unnamed: 0,grades,marks
A,4.0,90
B,3.5,80
C,3.0,70
D,2.5,60


In [35]:
## Transposed version of the DataFrame: rows become columns and vice versa.
## gradebook itself is not modified, because the result is not assigned.
gradebook.T

Unnamed: 0,A,B,C,D
grades,4.0,3.5,3.0,2.5
marks,90.0,80.0,70.0,60.0


In [43]:
## Display gradebook again to confirm it still has its original orientation
gradebook

Unnamed: 0,grades,marks
A,4.0,90
B,3.5,80
C,3.0,70
D,2.5,60


In [49]:
## Let's access the mark 70 through the underlying NumPy array.
## Row 2 (counting from 0) is 'C' and column 1 (counting from 0) is 'marks'.
## The result shows as 70.0: .values returns a single array for the whole frame,
## and mixing the float 'grades' column with the integer 'marks' column upcasts it to float.

gradebook.values[2,1]

70.0

In [51]:
## Get the column labels (returned as a pandas Index)
gradebook.columns

Index(['grades', 'marks'], dtype='object')

In [14]:
## Shows the first 5 rows of a DataFrame (all 4 rows here, since there are fewer than 5)
gradebook.head()

Unnamed: 0,grades,marks
A,4.0,90
B,3.5,80
C,3.0,70
D,2.5,60


In [20]:
## Implicit (positional) indices: rows at positions 1 and 2; the end position 3 is excluded
gradebook.iloc[1:3]

Unnamed: 0,grades,marks
B,3.5,80
C,3.0,70


In [22]:
## Explicit (label-based) indices: rows 'B' through 'C'; the end label is included
gradebook.loc['B':'C']

Unnamed: 0,grades,marks
B,3.5,80
C,3.0,70


In [24]:
## Boolean logic: comparing a column yields a boolean Series, True where grades <= 3
gradebook['grades'] <= 3

A    False
B    False
C     True
D     True
Name: grades, dtype: bool

In [62]:
## Let's say we want to add another column: marks rescaled so the top mark becomes 100.
## Dividing by the column maximum (rather than a hard-coded 90) keeps the
## scaling correct if the marks ever change.

gradebook['ScaledMarks'] = 100*gradebook['marks']/gradebook['marks'].max()

In [57]:
## Now we have a new column, ScaledMarks, alongside grades and marks
gradebook

Unnamed: 0,grades,marks,ScaledMarks
A,4.0,90,100.0
B,3.5,80,88.888889
C,3.0,70,77.777778
D,2.5,60,66.666667


In [64]:
## Now let's delete the column.
## drop() returns a new DataFrame, and errors='ignore' keeps this cell safe to
## re-run: `del gradebook['ScaledMarks']` raises KeyError once the column is gone.

gradebook = gradebook.drop(columns='ScaledMarks', errors='ignore')
gradebook

Unnamed: 0,grades,marks
A,4.0,90
B,3.5,80
C,3.0,70
D,2.5,60


In [70]:
## Now let's mask the gradebook: keep only rows whose marks are strictly between 60 and 90

marks_in_range = (gradebook['marks'] > 60) & (gradebook['marks'] < 90)
new_gradebook = gradebook[marks_in_range]

In [72]:
## Only the rows that passed the mask remain
new_gradebook

Unnamed: 0,grades,marks
B,3.5,80
C,3.0,70


## Pandas: Handling Missing Values

In [2]:
import pandas as pd

In [20]:
## NaN ("Not a Number") is how pandas marks a missing value.
## Missing values are rarely useful in an analysis, so let's see how to deal with them.

## First we need a DataFrame with missing values: each dict is one row, and any
## key a row lacks ('c' in the first row, 'a' in the second) shows up as NaN.

A = pd.DataFrame(data=[dict(a=1, b=2), dict(b=3, c=4)])
A

Unnamed: 0,a,b,c
0,1.0,2,
1,,3,4.0


In [22]:
## Let's handle the missing values by replacing them with a value of our choosing.
## fillna() returns a new DataFrame; the result is not assigned here, so A keeps its NaNs.

A.fillna(0.0)

Unnamed: 0,a,b,c
0,1.0,2,0.0
1,0.0,3,4.0


In [26]:
## Let's rebuild the DataFrame so we start again from the version with NaNs

A = pd.DataFrame(data=[dict(a=1, b=2), dict(b=3, c=4)])
A

Unnamed: 0,a,b,c
0,1.0,2,
1,,3,4.0


In [28]:
## We can drop the missing values (NaNs) using dropna().
## Notice, however, that it removes every row containing a missing value rather than
## just the missing cells. Both rows here contain a NaN, so the result is empty.

A.dropna()

Unnamed: 0,a,b,c


In [30]:
## We can see what dropna() does more clearly by transposing the DataFrame,
## so that the old columns 'a', 'b', 'c' become rows.
## Now notice that row 'b' has no NaN values.

A = pd.DataFrame(data=[dict(a=1, b=2), dict(b=3, c=4)]).transpose()
A

Unnamed: 0,0,1
a,1.0,
b,2.0,3.0
c,,4.0


In [32]:
## If we use dropna() now, 'b' is the only remaining row because it is the only one with no NaNs
A.dropna()

Unnamed: 0,0,1
b,2.0,3.0
