<a href="https://colab.research.google.com/github/Hbada/Python-data-analysis-lessons-notebooks/blob/master/Pandas_practice_notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# original notebook source: Google Colab
# exercise sources: Udemy course Python for Data Science... by Portilla
import numpy as np
import pandas as pd

## Create series

In [0]:
# create lists of data
labels = ['a', 'b', 'c']
my_data = [10, 20, 30]

In [4]:
# create initial array
arr = np.array(my_data)
arr

array([10, 20, 30])

In [0]:
# create a dictionary that maps each label (key) to its value
d = {'a':10, 'b':20, 'c':30} # keys play the role of the index, values the data

In [6]:
# create series
pd.Series(data = my_data)

0    10
1    20
2    30
dtype: int64

In [7]:
# create new series with an index
pd.Series(data=my_data, index=labels) # series use labels to denote indexes

a    10
b    20
c    30
dtype: int64

In [8]:
# shortcut to create series with index
pd.Series(my_data, labels)

a    10
b    20
c    30
dtype: int64

In [9]:
# create series from any numpy array
zeroes = np.zeros(9)
pd.Series(zeroes)
# can't use .reshape(3,3) yet because .Series() requires 1D array

0    0.0
1    0.0
2    0.0
3    0.0
4    0.0
5    0.0
6    0.0
7    0.0
8    0.0
dtype: float64

In [10]:
# turn dictionary into series
pd.Series(d)
# automatically converts dictionary keys to index in the series

a    10
b    20
c    30
dtype: int64

In [11]:
# create series of strings
pd.Series(labels)
# index is automatically created, and labels array becomes the values

0    a
1    b
2    c
dtype: object

In [12]:
# can also create series holding other object types
pd.Series(data=[sum,len]) # series of functions

0    <built-in function sum>
1    <built-in function len>
dtype: object

## Using index in a series

In [13]:
# first of two series: numbers passed as the data, country names as the index
ser1 = pd.Series(
    data=[1, 2, 3, 4],
    index=['Japan', 'Morocco', 'Canada', 'England'],
)
ser1


Japan      1
Morocco    2
Canada     3
England    4
dtype: int64

In [14]:
# second series: same layout, but 'Italy' takes the place of 'Morocco'
ser2 = pd.Series(
    data=[1, 5, 3, 4],
    index=['Japan', 'Italy', 'Canada', 'England'],
)
ser2

Japan      1
Italy      5
Canada     3
England    4
dtype: int64

In [15]:
# retrieve by label as index
ser2['Italy']

5

In [16]:
# see how indexes line up when adding two series: values are matched by label, not by position
ser1 + ser2 # labels found in only one series give NaN; NaN is a float, so the whole result is upcast to float64

Canada     6.0
England    8.0
Italy      NaN
Japan      2.0
Morocco    NaN
dtype: float64

## Create data frame

In [0]:
from numpy.random import randn
np.random.seed(101) # set a seed to match the course instructor's results

In [18]:
# create table of random normal distribution numbers in 5 rows, 4 columns
df = pd.DataFrame(randn(5, 4))
df

Unnamed: 0,0,1,2,3
0,2.70685,0.628133,0.907969,0.503826
1,0.651118,-0.319318,-0.848077,0.605965
2,-2.018168,0.740122,0.528813,-0.589001
3,0.188695,-0.758872,-0.933237,0.955057
4,0.190794,1.978757,2.605967,0.683509


In [19]:
# build a second 5x4 frame (randn draws fresh values), this time with row and column labels
df = pd.DataFrame(
    data=randn(5, 4),
    index=['A', 'B', 'C', 'D', 'E'],
    columns=['Apples', 'Oranges', 'Bananas', 'Mangos'],
)
df

Unnamed: 0,Apples,Oranges,Bananas,Mangos
A,0.302665,1.693723,-1.706086,-1.159119
B,-0.134841,0.390528,0.166905,0.184502
C,0.807706,0.07296,0.638787,0.329646
D,-0.497104,-0.75407,-0.943406,0.484752
E,-0.116773,1.901755,0.238127,1.996652


## Select portions of a dataframe

In [20]:
# select one column only
df['Apples']

A    0.302665
B   -0.134841
C    0.807706
D   -0.497104
E   -0.116773
Name: Apples, dtype: float64

In [21]:
# confirm that a column is considered a series
type(df['Apples'])

pandas.core.series.Series

In [22]:
# select multiple columns by inserting a list as index
df[['Apples', 'Mangos']]

Unnamed: 0,Apples,Mangos
A,0.302665,-1.159119
B,-0.134841,0.184502
C,0.807706,0.329646
D,-0.497104,0.484752
E,-0.116773,1.996652


In [23]:
# confirm that a multi-column selection is not a series, but rather a df
type(df[['Apples', 'Mangos']])

pandas.core.frame.DataFrame

In [24]:
# select a row using .loc method
df.loc['B']
# returns a series

Apples    -0.134841
Oranges    0.390528
Bananas    0.166905
Mangos     0.184502
Name: B, dtype: float64

In [25]:
# select a row by integer position with the .iloc indexer
df.iloc[3] # positions are zero-based, so 3 is the fourth row (label 'D')

Apples    -0.497104
Oranges   -0.754070
Bananas   -0.943406
Mangos     0.484752
Name: D, dtype: float64

In [26]:
# select a cell given a row and column
df.loc['B', 'Oranges']

0.39052784273374097

In [27]:
# display current df
df

Unnamed: 0,Apples,Oranges,Bananas,Mangos
A,0.302665,1.693723,-1.706086,-1.159119
B,-0.134841,0.390528,0.166905,0.184502
C,0.807706,0.07296,0.638787,0.329646
D,-0.497104,-0.75407,-0.943406,0.484752
E,-0.116773,1.901755,0.238127,1.996652


In [28]:
# select a bunch of cells given a few rows and few columns to include
df.loc[['B', 'C'], ['Bananas', 'Mangos']]

Unnamed: 0,Bananas,Mangos
B,0.166905,0.184502
C,0.638787,0.329646


## Add and remove columns and rows

In [29]:
# add a column to df, in the context of a column operation
df['Sum'] = df['Apples'] + df['Oranges'] + df['Bananas'] + df['Mangos'] # sum column is new and contains sums
df

Unnamed: 0,Apples,Oranges,Bananas,Mangos,Sum
A,0.302665,1.693723,-1.706086,-1.159119,-0.868817
B,-0.134841,0.390528,0.166905,0.184502,0.607094
C,0.807706,0.07296,0.638787,0.329646,1.849099
D,-0.497104,-0.75407,-0.943406,0.484752,-1.709828
E,-0.116773,1.901755,0.238127,1.996652,4.019761


In [30]:
# remove a column; requires axis=1 to specify it's a column
df.drop('Bananas', axis=1)

# notice the sum hasn't changed

Unnamed: 0,Apples,Oranges,Mangos,Sum
A,0.302665,1.693723,-1.159119,-0.868817
B,-0.134841,0.390528,0.184502,0.607094
C,0.807706,0.07296,0.329646,1.849099
D,-0.497104,-0.75407,0.484752,-1.709828
E,-0.116773,1.901755,1.996652,4.019761


In [31]:
# confirm Bananas column still exists
df

Unnamed: 0,Apples,Oranges,Bananas,Mangos,Sum
A,0.302665,1.693723,-1.706086,-1.159119,-0.868817
B,-0.134841,0.390528,0.166905,0.184502,0.607094
C,0.807706,0.07296,0.638787,0.329646,1.849099
D,-0.497104,-0.75407,-0.943406,0.484752,-1.709828
E,-0.116773,1.901755,0.238127,1.996652,4.019761


In [32]:
# delete columns for real (inplace=True modifies df itself):
# 'Bananas', plus 'Sum' because it was calculated from columns that included 'Bananas'
df.drop(columns=['Bananas', 'Sum'], inplace=True)
df

Unnamed: 0,Apples,Oranges,Mangos
A,0.302665,1.693723,-1.159119
B,-0.134841,0.390528,0.184502
C,0.807706,0.07296,0.329646
D,-0.497104,-0.75407,0.484752
E,-0.116773,1.901755,1.996652


In [33]:
# drop rows
df.drop('E', axis=0) # axis=0 can be omitted because it's the default value

Unnamed: 0,Apples,Oranges,Mangos
A,0.302665,1.693723,-1.159119
B,-0.134841,0.390528,0.184502
C,0.807706,0.07296,0.329646
D,-0.497104,-0.75407,0.484752


In [34]:
# notice it still has row E
df

Unnamed: 0,Apples,Oranges,Mangos
A,0.302665,1.693723,-1.159119
B,-0.134841,0.390528,0.184502
C,0.807706,0.07296,0.329646
D,-0.497104,-0.75407,0.484752
E,-0.116773,1.901755,1.996652


In [35]:
# delete a row for real
df.drop('E', axis=0, inplace=True)
df

Unnamed: 0,Apples,Oranges,Mangos
A,0.302665,1.693723,-1.159119
B,-0.134841,0.390528,0.184502
C,0.807706,0.07296,0.329646
D,-0.497104,-0.75407,0.484752


## Shape of dataframe

In [36]:
# retrieve shape of dataframe, in (rows, columns)
# this is why axis=0 refers to rows and axis=1 refers to columns above
df.shape

(4, 3)

## Conditional selection

In [37]:
# return boolean for whether cell > 0
booldf = df > 0
booldf

Unnamed: 0,Apples,Oranges,Mangos
A,True,True,False
B,False,True,True
C,True,True,True
D,False,False,True


In [38]:
# cells that are True return a value; False give null result
df[booldf]

Unnamed: 0,Apples,Oranges,Mangos
A,0.302665,1.693723,
B,,0.390528,0.184502
C,0.807706,0.07296,0.329646
D,,,0.484752


In [39]:
# shortcut to df of a condition
df[df > 0]

Unnamed: 0,Apples,Oranges,Mangos
A,0.302665,1.693723,
B,,0.390528,0.184502
C,0.807706,0.07296,0.329646
D,,,0.484752


In [40]:
# note that df itself is unchanged: the selections above returned new objects
df

Unnamed: 0,Apples,Oranges,Mangos
A,0.302665,1.693723,-1.159119
B,-0.134841,0.390528,0.184502
C,0.807706,0.07296,0.329646
D,-0.497104,-0.75407,0.484752


In [41]:
# find all positives in a column
df['Apples'] > 0 # returns a series

A     True
B    False
C     True
D    False
Name: Apples, dtype: bool

In [42]:
# select all rows that have True condition in Apples column
df[df['Apples'] > 0]

Unnamed: 0,Apples,Oranges,Mangos
A,0.302665,1.693723,-1.159119
C,0.807706,0.07296,0.329646


In [43]:
# selections based on the results of the condition
# save the conditional selection to a variable
resultdf = df[df['Apples'] > 0]
# select one column of the results
resultdf['Oranges'] # all the Oranges values, for rows with positive Apples values

A    1.693723
C    0.072960
Name: Oranges, dtype: float64

In [44]:
# shortcut for prior cell: filter the rows and pick the column in a single .loc call
df.loc[df['Apples'] > 0, 'Oranges']
# one .loc lookup replaces the chained form df[mask]['Oranges'], which indexes twice
# and becomes unreliable as soon as you try to assign through it

A    1.693723
C    0.072960
Name: Oranges, dtype: float64

In [45]:
# another shortcut; collect Orange & Mango cols for rows with positive Apples value
df.loc[df['Apples'] > 0, ['Oranges', 'Mangos']] # row mask first, then a list of column labels
# combining the steps like this, without breaking each one down into its own variable,
# is an example of abstraction
# (reminds me of gloves as abstraction of the German 'hand shoe' noun)

Unnamed: 0,Oranges,Mangos
A,1.693723,-1.159119
C,0.07296,0.329646


## Selecting with multiple conditions

In [46]:
# show DataFrame involving multiple conditions with 'and' (the & operator)
df[(df['Apples'] > 0) & (df['Mangos'] > 0)]

Unnamed: 0,Apples,Oranges,Mangos
C,0.807706,0.07296,0.329646


In [47]:
# show DataFrame involving multiple conditions with 'or'
df[(df['Apples'] > 0) | (df['Mangos'] > 0)]

Unnamed: 0,Apples,Oranges,Mangos
A,0.302665,1.693723,-1.159119
B,-0.134841,0.390528,0.184502
C,0.807706,0.07296,0.329646
D,-0.497104,-0.75407,0.484752


## Reset index

In [48]:
df

Unnamed: 0,Apples,Oranges,Mangos
A,0.302665,1.693723,-1.159119
B,-0.134841,0.390528,0.184502
C,0.807706,0.07296,0.329646
D,-0.497104,-0.75407,0.484752


In [49]:
# reset index
df.reset_index()
# notice letters for rows are now in a column and you see index number

Unnamed: 0,index,Apples,Oranges,Mangos
0,A,0.302665,1.693723,-1.159119
1,B,-0.134841,0.390528,0.184502
2,C,0.807706,0.07296,0.329646
3,D,-0.497104,-0.75407,0.484752


In [50]:
# note that df itself is unchanged: reset_index() returned a new DataFrame, so the numeric index does not show here
df

Unnamed: 0,Apples,Oranges,Mangos
A,0.302665,1.693723,-1.159119
B,-0.134841,0.390528,0.184502
C,0.807706,0.07296,0.329646
D,-0.497104,-0.75407,0.484752


In [0]:
# to make the reset permanent (so the numeric index stays in df), uncomment the next two lines
# df.reset_index(inplace=True)
# df

In [52]:
# create a new list of index values; split() breaks the string on whitespace into a list of strings
newind = 'CA NY WY OR'.split()
newind

['CA', 'NY', 'WY', 'OR']

In [53]:
# append a list as new column of values
df['State'] = newind
df

Unnamed: 0,Apples,Oranges,Mangos,State
A,0.302665,1.693723,-1.159119,CA
B,-0.134841,0.390528,0.184502,NY
C,0.807706,0.07296,0.329646,WY
D,-0.497104,-0.75407,0.484752,OR


In [54]:
# set a column's values as index
df.set_index('State', inplace=True) # inplace makes it permanent
df

Unnamed: 0_level_0,Apples,Oranges,Mangos
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
CA,0.302665,1.693723,-1.159119
NY,-0.134841,0.390528,0.184502
WY,0.807706,0.07296,0.329646
OR,-0.497104,-0.75407,0.484752


## Multi-index DataFrames
Aka. index hierarchy

In [0]:
# index levels: outer group label and inner row number for each of the 6 rows
outside = ['G1'] * 3 + ['G2'] * 3
inside = [1, 2, 3] * 2

In [56]:
# zip into tuples
hier_index = list(zip(outside, inside))
hier_index

[('G1', 1), ('G1', 2), ('G1', 3), ('G2', 1), ('G2', 2), ('G2', 3)]

In [58]:
# turn the list of tuples into a MultiIndex
hier_index = pd.MultiIndex.from_tuples(hier_index)
hier_index

MultiIndex(levels=[['G1', 'G2'], [1, 2, 3]],
           codes=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]])

In [60]:
# create randn with 6 rows, 2 col
df = pd.DataFrame(randn(6,2))
df

Unnamed: 0,0,1
0,0.147027,-0.479448
1,0.558769,1.02481
2,-0.925874,1.862864
3,-1.133817,0.610478
4,0.38603,2.084019
5,-0.376519,0.230336


In [62]:
# build another 6x2 frame (new random values), indexed by the MultiIndex, with column labels A & B
df = pd.DataFrame(data=randn(6, 2), index=hier_index, columns=['A', 'B'])
df

Unnamed: 0,Unnamed: 1,A,B
G1,1,0.992573,1.192241
G1,2,-1.04678,1.292765
G1,3,-1.467514,-0.494095
G2,1,-0.162535,0.485809
G2,2,0.392489,0.221491
G2,3,-0.855196,1.54199


## Selections in a multi-index dataframe

In [63]:
# select an outside index
df.loc['G1']

Unnamed: 0,A,B
1,0.992573,1.192241
2,-1.04678,1.292765
3,-1.467514,-0.494095


In [66]:
# select the outer group first, then pick a row inside that group
group_g1 = df.loc['G1']
group_g1.loc[1] # Series holding row 1 of section G1

A    0.992573
B    1.192241
Name: 1, dtype: float64

In [67]:
# confirm the indexes have no names
df.index.names

FrozenList([None, None])

In [0]:
# name the indexes
df.index.names = ['Groups', 'Num']

In [69]:
# see dataframe with new index labels
df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
Groups,Num,Unnamed: 2_level_1,Unnamed: 3_level_1
G1,1,0.992573,1.192241
G1,2,-1.04678,1.292765
G1,3,-1.467514,-0.494095
G2,1,-0.162535,0.485809
G2,2,0.392489,0.221491
G2,3,-0.855196,1.54199


In [71]:
# select a cell in a single .loc call: an (outer, inner) tuple picks the row, then the column label
df.loc[('G2', 2), 'B'] # section G2, row 2, col B

0.22149068500354543

In [73]:
# retrieve a cross section: something from each outer index
df.xs(1, level='Num') # select all Num=1 from all Groups

Unnamed: 0_level_0,A,B
Groups,Unnamed: 1_level_1,Unnamed: 2_level_1
G1,0.992573,1.192241
G2,-0.162535,0.485809


## My example of something more meaningful
Eg. fruit harvest sampling data

In [0]:
# outer labels (each work day repeated twice) and inner labels (sample number 1 or 2)
outside = [day for day in ('Mon', 'Tue', 'Wed', 'Th', 'Fri') for _ in range(2)]
inside = [1, 2] * 5

In [75]:
# zip into tuples
hier_index = list(zip(outside, inside))
hier_index

[('Mon', 1),
 ('Mon', 2),
 ('Tue', 1),
 ('Tue', 2),
 ('Wed', 1),
 ('Wed', 2),
 ('Th', 1),
 ('Th', 2),
 ('Fri', 1),
 ('Fri', 2)]

In [76]:
# turn the list of tuples into a MultiIndex
hier_index = pd.MultiIndex.from_tuples(hier_index)
hier_index # two rows for each day of the work week

MultiIndex(levels=[['Fri', 'Mon', 'Th', 'Tue', 'Wed'], [1, 2]],
           codes=[[1, 1, 3, 3, 4, 4, 2, 2, 0, 0], [0, 1, 0, 1, 0, 1, 0, 1, 0, 1]])

In [81]:
# build the harvest frame: 10 random rows under the MultiIndex, with column labels Apples & Oranges
df = pd.DataFrame(data=randn(10, 2), index=hier_index, columns=['Apples', 'Oranges'])
df

Unnamed: 0,Unnamed: 1,Apples,Oranges
Mon,1,0.197524,2.302987
Mon,2,0.729024,-0.863091
Tue,1,0.305632,0.243178
Tue,2,0.864165,-1.560931
Wed,1,-0.251897,-0.57812
Wed,2,0.236996,0.20078
Th,1,0.327845,0.674485
Th,2,-0.174057,0.78014
Fri,1,-0.383258,-0.409318
Fri,2,0.343539,0.196275


In [88]:
# name the indexes
df.index.names = ['Day', 'Sample']
# load the dataframe
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Apples,Oranges
Day,Sample,Unnamed: 2_level_1,Unnamed: 3_level_1
Mon,1,0.197524,2.302987
Mon,2,0.729024,-0.863091
Tue,1,0.305632,0.243178
Tue,2,0.864165,-1.560931
Wed,1,-0.251897,-0.57812
Wed,2,0.236996,0.20078
Th,1,0.327845,0.674485
Th,2,-0.174057,0.78014
Fri,1,-0.383258,-0.409318
Fri,2,0.343539,0.196275


In [89]:
# retrieve a cross section: only the second sample of the day
df.xs(2, level='Sample') # select all Sample=2 from every Day

Unnamed: 0_level_0,Apples,Oranges
Day,Unnamed: 1_level_1,Unnamed: 2_level_1
Mon,0.729024,-0.863091
Tue,0.864165,-1.560931
Wed,0.236996,0.20078
Th,-0.174057,0.78014
Fri,0.343539,0.196275


## Dealing with missing data

In [0]:
# create dictionary with some null values
d = {'A': [1, 2, np.nan], 'B': [5, np.nan, np.nan], 'C': [1, 2, 3]}

In [91]:
# create dataframe from the dictionary
df = pd.DataFrame(d)
df

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,,2
2,,,3


In [92]:
# drop rows with null or missing values (called 'na')
df.dropna() # default is axis=0, so rows with NaN are dropped

Unnamed: 0,A,B,C
0,1.0,5.0,1


In [94]:
# rebuild the same frame from scratch (dropna() above was not in place, so this is just a clean restart)
d = dict(A=[1, 2, np.nan], B=[5, np.nan, np.nan], C=[1, 2, 3])
df = pd.DataFrame(data=d)
df

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,,2
2,,,3


In [95]:
# drop based on columns, axis=1
df.dropna(axis=1)

Unnamed: 0,C
0,1
1,2
2,3


In [99]:
# drop NaN with a threshold
# thresh is the minimum number of non-NaN values a row must have in order to be kept
df.dropna(thresh=2) # keep rows with at least 2 non-NaN values

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,,2


In [100]:
df # full df remains

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,,2
2,,,3


In [103]:
# fill in missing values with a placeholder; the result is a new frame and df keeps its NaN values
df.fillna(value='Absent') # NOTE(review): mixing strings into numeric columns A and B presumably makes them object dtype - confirm with .dtypes

Unnamed: 0,A,B,C
0,1,5,1
1,2,Absent,2
2,Absent,Absent,3


In [105]:
# replace NaN with mean of that column
df['A'].fillna(value=df['A'].mean())

# many theories exist about the best ways to fill missing values
# check online for recommendations and philosophies

0    1.0
1    2.0
2    1.5
Name: A, dtype: float64