# Advance Pandas

## Intro to Pandas

In [3]:
import pandas as pd

In [7]:
## Lets view the version of pandas
print(pd.__version__)

2.1.4


In [13]:
## Now lets create a series in pandas
## Series is a 1 dimensional array tipically
## Dataframes are multi dimensional arrays
A = pd.Series([3,4,5,6],index=['a','b','c','d'])

In [15]:
## each element is a type int
A.values

array([3, 4, 5, 6], dtype=int64)

In [17]:
## each element is a type object/string
A.index

Index(['a', 'b', 'c', 'd'], dtype='object')

In [19]:
## Notice A.values or 3,4,5,6 is an array
type(A.values)

numpy.ndarray

In [24]:
## Notice A is a series
type(A)

pandas.core.series.Series

In [28]:
## Now lets print A
A

a    3
b    4
c    5
d    6
dtype: int64

In [30]:
## Lets index A
A['a']

3

In [36]:
## Lets slice
A['a':'c']

a    3
b    4
c    5
dtype: int64

## Pandas Series

In [4]:
import pandas as pd

In [6]:
## Lets make a pandas series with a dictonary

grades_dict = {'A':90,'B':80,'C':70,'D':60}
grades = pd.Series(grades_dict)
grades

A    90
B    80
C    70
D    60
dtype: int64

In [12]:
type(grades)

pandas.core.series.Series

In [14]:
type(grades_dict)

dict

In [26]:
## You can use explicit indicies
grades['A':'C']

A    90
B    80
C    70
dtype: int64

In [24]:
## You can also use implicit indicies
grades[0:3]

A    90
B    80
C    70
dtype: int64

## Pandas Dataframes Intro

In [4]:
import pandas as pd

In [6]:
## Lets create two series and put them into a dataframe, which is a group of series

grades_dict = {'A':4,'B':3.5,'C':3,'D':2.5}
grades = pd.Series(grades_dict)

marks_dict = {'A':90,'B':80,'C':70,'D':60}
marks = pd.Series(marks_dict)

In [8]:
grades

A    4.0
B    3.5
C    3.0
D    2.5
dtype: float64

In [10]:
marks

A    90
B    80
C    70
D    60
dtype: int64

In [12]:
## Now lets create a dataframe using the two series I created

gradebook = pd.DataFrame({'grades':grades,'marks':marks})
gradebook

Unnamed: 0,grades,marks
A,4.0,90
B,3.5,80
C,3.0,70
D,2.5,60


In [35]:
## Transposed version of the dataframe
gradebook.T

Unnamed: 0,A,B,C,D
grades,4.0,3.5,3.0,2.5
marks,90.0,80.0,70.0,60.0


In [43]:
gradebook

Unnamed: 0,grades,marks
A,4.0,90
B,3.5,80
C,3.0,70
D,2.5,60


In [49]:
## Lets access grade 70 using values
## So in row 2, starting from 0 remember, and then column 1, again starting from 0, we get 70!

gradebook.values[2,1]

70.0

In [51]:
## get the columns
gradebook.columns

Index(['grades', 'marks'], dtype='object')

In [14]:
## Prints the top 5 records in a dataframe
gradebook.head()

Unnamed: 0,grades,marks
A,4.0,90
B,3.5,80
C,3.0,70
D,2.5,60


In [20]:
## implicit indicies
gradebook.iloc[1:3]

Unnamed: 0,grades,marks
B,3.5,80
C,3.0,70


In [22]:
## explicit indicies
gradebook.loc['B':'C']

Unnamed: 0,grades,marks
B,3.5,80
C,3.0,70


In [24]:
## Boolean logic
gradebook['grades'] <= 3

A    False
B    False
C     True
D     True
Name: grades, dtype: bool

In [62]:
## Lets say we want to add another column

gradebook['ScaledMarks'] = 100*gradebook['marks']/90

In [57]:
## Now we have a new column
gradebook

Unnamed: 0,grades,marks,ScaledMarks
A,4.0,90,100.0
B,3.5,80,88.888889
C,3.0,70,77.777778
D,2.5,60,66.666667


In [64]:
## now lets delete the column

del gradebook['ScaledMarks']
gradebook

Unnamed: 0,grades,marks
A,4.0,90
B,3.5,80
C,3.0,70
D,2.5,60


In [70]:
## Now lets mask the gradebook by marks > 60 but less than 90

new_gradebook = gradebook[(gradebook['marks']>60) & (gradebook['marks']<90)]

In [72]:
new_gradebook

Unnamed: 0,grades,marks
B,3.5,80
C,3.0,70


## Pandas: Handling Missing Values

In [2]:
import pandas as pd

In [20]:
## Understanding NaN values which means Not A Number, basically none
## This is missing value and therefore in most cases not helpful
## So lets get rid of them.

## First we need a dataframe with missing value
## Notice NaN's in this data frame

A = pd.DataFrame([{'a':1,'b':2},{'b':3,'c':4}])
A

Unnamed: 0,a,b,c
0,1.0,2,
1,,3,4.0


In [22]:
## Lets handle the missing values with the following method where we replace the missing values with a value of our choosing

A.fillna(0.0)

Unnamed: 0,a,b,c
0,1.0,2,0.0
1,0.0,3,4.0


In [26]:
## Lets reset the dataframe

A = pd.DataFrame([{'a':1,'b':2},{'b':3,'c':4}])
A

Unnamed: 0,a,b,c
0,1.0,2,
1,,3,4.0


In [28]:
## We can drop the missing value NaNs using this method
## Notice however it removes entire rows with missing values rather than just the missing values

A.dropna()

Unnamed: 0,a,b,c


In [30]:
## We can view the dropna() method better by rotating the graph by transposing it
## Now notice that 'b' has no NaN values

A = pd.DataFrame([{'a':1,'b':2},{'b':3,'c':4}])
A = A.T
A

Unnamed: 0,0,1
a,1.0,
b,2.0,3.0
c,,4.0


In [32]:
## If we use dropna() b will be the remaining row as it has no NaN's
A.dropna()

Unnamed: 0,0,1
b,2.0,3.0


## Pandas : loc and iloc

In [2]:
import pandas as pd

In [6]:
## Lets index a series using loc and iloc
## But first we need a series!

A = pd.Series(['a','b','c'], index=[1,3,5])
A

1    a
3    b
5    c
dtype: object

In [12]:
## So first off we can index an indicies using the following:
## At index 1, or row 0, we have the value of A
## Notice that this isn't the 0-2 method we notmally use, that is because this is implicit indicies
A[1]

'a'

In [16]:
## But if i use slicing we get rows 1 and 2 because those are the explicit index's I called for. The 3 and 5 are the implicit index's
A[1:3]

3    b
5    c
dtype: object

In [18]:
## Explicit indexing using loc
## 1 and 3 are the rows being called for by name since 1 and 3 are those row assigned index names

A.loc[1:3]

1    a
3    b
dtype: object

In [20]:
## Implicit indexing using iloc
## Rows 3 and 5 are called because of implicit indexing where the physical row number of 3 and 5 is 1 and 2 (since their the 2nd and 3rd rows)
A.iloc[1:3]

3    b
5    c
dtype: object

In [25]:
## Lets create our gradebook dataframe from earlier:

grades_dict = {'A':4,'B':3.5,'C':3,'D':2.5}
grades = pd.Series(grades_dict)

marks_dict = {'A':90,'B':80,'C':70,'D':60}
marks = pd.Series(marks_dict)

gradebook = pd.DataFrame({'grades':grades,'marks':marks})
gradebook

Unnamed: 0,grades,marks
A,4.0,90
B,3.5,80
C,3.0,70
D,2.5,60


In [27]:
## Lets say we want to access the entire 3rd row of the dataframe, being row C
## here is the iloc, implicit, index
gradebook.iloc[2,:]

grades     3.0
marks     70.0
Name: C, dtype: float64

In [33]:
## Lets say we want to access the entire 3rd row of the dataframe, being row C
## here is the loc, explicit, index

gradebook.loc['C',:]

grades     3.0
marks     70.0
Name: C, dtype: float64

## Pandas: Practice

In [5]:
import pandas as pd

In [15]:
## First we must import the dataset. 
## We are using the adult.data dataset
## There is no header row so we will need to signify that with the header parameter being set to None
## The adult.names is the file with details about adult.data, which is the file with the actual data

df = pd.read_csv('adult.data', header=None)

In [17]:
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [21]:
## Notice above how the column names are just numbers. Lets change those to be C1, C2, C3... to the end of the dataframe

df.columns = ['C'+str(x) for x in range(df.shape[1])]
df

Unnamed: 0,C0,C1,C2,C3,C4,C5,C6,C7,C8,C9,C10,C11,C12,C13,C14
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [23]:
## So now we can call the columns based on the strings

df['C14']

0         <=50K
1         <=50K
2         <=50K
3         <=50K
4         <=50K
          ...  
32556     <=50K
32557      >50K
32558     <=50K
32559     <=50K
32560      >50K
Name: C14, Length: 32561, dtype: object

In [27]:
## lets use .loc function to view all rows in C14

df.loc[:,'C14']

0         <=50K
1         <=50K
2         <=50K
3         <=50K
4         <=50K
          ...  
32556     <=50K
32557      >50K
32558     <=50K
32559     <=50K
32560      >50K
Name: C14, Length: 32561, dtype: object

In [29]:
## lets use .loc function to view all rows in C14 and C0

df.loc[:,['C14','C0']]

Unnamed: 0,C14,C0
0,<=50K,39
1,<=50K,50
2,<=50K,38
3,<=50K,53
4,<=50K,28
...,...,...
32556,<=50K,27
32557,>50K,40
32558,<=50K,58
32559,<=50K,22


In [31]:
## Lets view the top 5 rows

df.head()

Unnamed: 0,C0,C1,C2,C3,C4,C5,C6,C7,C8,C9,C10,C11,C12,C13,C14
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [33]:
## Lets look a little further at C14

df['C14'].unique()

array([' <=50K', ' >50K'], dtype=object)

In [35]:
## We can grab the two veriables given and store them in an array

L = df['C14'].unique()

In [37]:
L[0]

' <=50K'

In [39]:
L[1]

' >50K'

In [41]:
## Now lets say we want to replace L[0] or ' <=50K' with -1 and L[1] or ' >50K' with 1
## First we need something to compare against the original dataframe
## Lets start with L[0]
## We'll gets a boolean series for the column telling us which rows have L[0] or ' <=50K'

idx = df['C14']==L[0]
idx

0         True
1         True
2         True
3         True
4         True
         ...  
32556     True
32557    False
32558     True
32559     True
32560    False
Name: C14, Length: 32561, dtype: bool

In [43]:
## Now lets use the .loc function to find all the places where idx is True and replace them with -1

df['C14'].loc[idx] = -1
df['C14']

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['C14'].loc[idx] = -1


0           -1
1           -1
2           -1
3           -1
4           -1
         ...  
32556       -1
32557     >50K
32558       -1
32559       -1
32560     >50K
Name: C14, Length: 32561, dtype: object

In [45]:
## Now lets do the same for L[1] or ' >50K' with 1
idx = df['C14']==L[1]
idx

0        False
1        False
2        False
3        False
4        False
         ...  
32556    False
32557     True
32558    False
32559    False
32560     True
Name: C14, Length: 32561, dtype: bool

In [47]:
## Now lets use the .loc function to find all the places where idx is True and replace them with 1

df['C14'].loc[idx] = 1
df['C14']

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['C14'].loc[idx] = 1


0        -1
1        -1
2        -1
3        -1
4        -1
         ..
32556    -1
32557     1
32558    -1
32559    -1
32560     1
Name: C14, Length: 32561, dtype: object

In [51]:
## Now lets look at the top 5 columns again
## We want to investigate C1 further so to do this we're going to use the One-Hot Encoding method 
## Which will seperate all of the unique veriables in the column into their own columns and turning them into boolean columns 

df.head()

Unnamed: 0,C0,C1,C2,C3,C4,C5,C6,C7,C8,C9,C10,C11,C12,C13,C14
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,-1
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,-1
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,-1
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,-1
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,-1


In [55]:
## First lets find all of the unique veriables in C1

df['C1'].unique()

array([' State-gov', ' Self-emp-not-inc', ' Private', ' Federal-gov',
       ' Local-gov', ' ?', ' Self-emp-inc', ' Without-pay',
       ' Never-worked'], dtype=object)

In [57]:
## We can get the count using .size

df['C1'].unique().size

9

In [59]:
## View the original shape of the  dataframe
df.shape

(32561, 15)

In [61]:
## Ok now lets seperate the veriables vis the One-Hot Encoding method using the get_dummies function in pandas
df = pd.get_dummies(df,columns=['C1'])

In [63]:
## Notice the dataframe now has more columns due to the unique variable seperation
df.shape

(32561, 23)

In [65]:
## Notice all of the new C1 boolean columns at the end of the dataframe and that the C1 has been removed
df.head()

Unnamed: 0,C0,C2,C3,C4,C5,C6,C7,C8,C9,C10,...,C14,C1_ ?,C1_ Federal-gov,C1_ Local-gov,C1_ Never-worked,C1_ Private,C1_ Self-emp-inc,C1_ Self-emp-not-inc,C1_ State-gov,C1_ Without-pay
0,39,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,...,-1,False,False,False,False,False,False,False,True,False
1,50,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,...,-1,False,False,False,False,False,False,True,False,False
2,38,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,...,-1,False,False,False,False,True,False,False,False,False
3,53,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,...,-1,False,False,False,False,True,False,False,False,False
4,28,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,...,-1,False,False,False,False,True,False,False,False,False


In [69]:
## Lets also try get_dummies with C14

df = pd.get_dummies(df,columns=['C14'])

In [71]:
df.head()

Unnamed: 0,C0,C2,C3,C4,C5,C6,C7,C8,C9,C10,...,C1_ Federal-gov,C1_ Local-gov,C1_ Never-worked,C1_ Private,C1_ Self-emp-inc,C1_ Self-emp-not-inc,C1_ State-gov,C1_ Without-pay,C14_-1,C14_1
0,39,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,...,False,False,False,False,False,False,True,False,True,False
1,50,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,...,False,False,False,False,False,True,False,False,True,False
2,38,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,...,False,False,False,True,False,False,False,False,True,False
3,53,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,...,False,False,False,True,False,False,False,False,True,False
4,28,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,...,False,False,False,True,False,False,False,False,True,False


In [73]:
## Lets write this to a CSV

df.to_csv('abc.csv')