In [2]:
import numpy as np
import pandas as pd
from numpy.random import randn

## Dataframes

In [3]:
# Seed NumPy's global RNG so the randn() draws below are identical on every
# Restart & Run All.
np.random.seed(101)

In [4]:
# 5x4 frame of randn() draws: rows labelled a-e, columns labelled w-z.
df = pd.DataFrame(
    data=randn(5, 4),
    index=list('abcde'),
    columns=list('wxyz'),
)

In [5]:
df

Unnamed: 0,w,x,y,z
a,2.70685,0.628133,0.907969,0.503826
b,0.651118,-0.319318,-0.848077,0.605965
c,-2.018168,0.740122,0.528813,-0.589001
d,0.188695,-0.758872,-0.933237,0.955057
e,0.190794,1.978757,2.605967,0.683509


In [6]:
# Indexing with a single column label returns that column as a Series.
df['w']

a    2.706850
b    0.651118
c   -2.018168
d    0.188695
e    0.190794
Name: w, dtype: float64

In [7]:
# Indexing with a list of labels returns a DataFrame with just those columns.
df[['w', 'y']]

Unnamed: 0,w,y
a,2.70685,0.907969
b,0.651118,-0.848077
c,-2.018168,0.528813
d,0.188695,-0.933237
e,0.190794,2.605967


In [8]:
# Assigning to a label that does not exist yet adds a new column to df (in place).
# The sum is computed element-wise, row by row - much like column arithmetic in R.
df['aa'] = df['y'] + df['z']

In [9]:
df

Unnamed: 0,w,x,y,z,aa
a,2.70685,0.628133,0.907969,0.503826,1.411795
b,0.651118,-0.319318,-0.848077,0.605965,-0.242112
c,-2.018168,0.740122,0.528813,-0.589001,-0.060187
d,0.188695,-0.758872,-0.933237,0.955057,0.021819
e,0.190794,1.978757,2.605967,0.683509,3.289476


## Dropping rows and columns from DataFrames

In [10]:
# drop() returns a new frame; nothing is assigned here, so df keeps every row
# and column. Only the last expression of the cell is rendered as its output.
df.drop(columns='aa')  # frame without column 'aa' (result discarded)
df.drop(index='a')     # frame without row 'a' (displayed)

Unnamed: 0,w,x,y,z,aa
b,0.651118,-0.319318,-0.848077,0.605965,-0.242112
c,-2.018168,0.740122,0.528813,-0.589001,-0.060187
d,0.188695,-0.758872,-0.933237,0.955057,0.021819
e,0.190794,1.978757,2.605967,0.683509,3.289476


In [11]:
df

Unnamed: 0,w,x,y,z,aa
a,2.70685,0.628133,0.907969,0.503826,1.411795
b,0.651118,-0.319318,-0.848077,0.605965,-0.242112
c,-2.018168,0.740122,0.528813,-0.589001,-0.060187
d,0.188695,-0.758872,-0.933237,0.955057,0.021819
e,0.190794,1.978757,2.605967,0.683509,3.289476


In [12]:
# (rows, columns). Still 5 rows and 5 columns: the drop() calls above did not modify df.
df.shape

(5, 5)

In [13]:
# selecting columns (not the last expression of the cell, so this result is not displayed)
df[['w','y']]

# selecting rows by integer position: iloc[2] is the third row, i.e. the one labelled 'c'
df.iloc[2]

w    -2.018168
x     0.740122
y     0.528813
z    -0.589001
aa   -0.060187
Name: c, dtype: float64

## Subsetting

In [14]:
# Subsetting by label with .loc[rows, columns]

df.loc['b', 'y']  # single value at row 'b', column 'y' (not displayed: not the last expression)

# rows 'a' and 'b', restricted to columns 'w' and 'y'

df.loc[ ['a', 'b'], ['w', 'y'] ]

Unnamed: 0,w,y
a,2.70685,0.907969
b,0.651118,-0.848077


## First Review

In [15]:
# Practice frame: same shape and labels as df, filled with fresh randn() draws.
df_review = pd.DataFrame(
    data=randn(5, 4),
    index=list('abcde'),
    columns=list('wxyz'),
)
df_review

Unnamed: 0,w,x,y,z
a,0.302665,1.693723,-1.706086,-1.159119
b,-0.134841,0.390528,0.166905,0.184502
c,0.807706,0.07296,0.638787,0.329646
d,-0.497104,-0.75407,-0.943406,0.484752
e,-0.116773,1.901755,0.238127,1.996652


In [16]:
# Add a derived column in place: 'new' is the element-wise sum of 'w' and 'z'.
df_review['new'] = df_review['w'] + df_review['z']
df_review

Unnamed: 0,w,x,y,z,new
a,0.302665,1.693723,-1.706086,-1.159119,-0.856454
b,-0.134841,0.390528,0.166905,0.184502,0.049661
c,0.807706,0.07296,0.638787,0.329646,1.137352
d,-0.497104,-0.75407,-0.943406,0.484752,-0.012352
e,-0.116773,1.901755,0.238127,1.996652,1.879879


In [17]:
# Both drop() calls return new frames that are thrown away here, so the
# df_review displayed at the end still contains column 'new' and row 'a'.
df_review.drop(columns='new')  # frame without column 'new' (result discarded)
df_review.drop(index='a')      # frame without row 'a' (result discarded)
df_review

Unnamed: 0,w,x,y,z,new
a,0.302665,1.693723,-1.706086,-1.159119,-0.856454
b,-0.134841,0.390528,0.166905,0.184502,0.049661
c,0.807706,0.07296,0.638787,0.329646,1.137352
d,-0.497104,-0.75407,-0.943406,0.484752,-0.012352
e,-0.116773,1.901755,0.238127,1.996652,1.879879


In [18]:
# Selection by label: rows 'a' and 'c', columns 'w' and 'z'
df_review.loc[['a', 'c'], ['w', 'z']]

Unnamed: 0,w,z
a,0.302665,-1.159119
c,0.807706,0.329646


## Conditional Selection

In [19]:
df

Unnamed: 0,w,x,y,z,aa
a,2.70685,0.628133,0.907969,0.503826,1.411795
b,0.651118,-0.319318,-0.848077,0.605965,-0.242112
c,-2.018168,0.740122,0.528813,-0.589001,-0.060187
d,0.188695,-0.758872,-0.933237,0.955057,0.021819
e,0.190794,1.978757,2.605967,0.683509,3.289476


In [20]:
df>0 # boolean frame: True where the value is positive (not displayed: not the last expression)
df[df>0] # index df with that mask: positive values are kept, all other cells become NaN

Unnamed: 0,w,x,y,z,aa
a,2.70685,0.628133,0.907969,0.503826,1.411795
b,0.651118,,,0.605965,
c,,0.740122,0.528813,,
d,0.188695,,,0.955057,0.021819
e,0.190794,1.978757,2.605967,0.683509,3.289476


In [21]:
# Keep only the rows whose 'z' value is negative.
cond = df['z'].lt(0)   # boolean Series, one entry per row
Result = df.loc[cond]
Result


Unnamed: 0,w,x,y,z,aa
c,-2.018168,0.740122,0.528813,-0.589001,-0.060187


In [22]:
## Multiple Conditions
# Boolean Series are combined element-wise with & (and) and | (or).
cond1 = df['z'].gt(0)
cond2 = df['w'].gt(1)
df.loc[cond1 & cond2]  ## and (result discarded: not the last expression)
df.loc[cond1 | cond2]  ## or  (displayed)

Unnamed: 0,w,x,y,z,aa
a,2.70685,0.628133,0.907969,0.503826,1.411795
b,0.651118,-0.319318,-0.848077,0.605965,-0.242112
d,0.188695,-0.758872,-0.933237,0.955057,0.021819
e,0.190794,1.978757,2.605967,0.683509,3.289476


## Missing Data

In [23]:
# Take an independent copy. A plain `df_1 = df` only binds a second name to the
# same DataFrame object, so any in-place change made through df_1 would also
# change df and invalidate the outputs shown earlier.
df_1 = df.copy()
df_1

Unnamed: 0,w,x,y,z,aa
a,2.70685,0.628133,0.907969,0.503826,1.411795
b,0.651118,-0.319318,-0.848077,0.605965,-0.242112
c,-2.018168,0.740122,0.528813,-0.589001,-0.060187
d,0.188695,-0.758872,-0.933237,0.955057,0.021819
e,0.190794,1.978757,2.605967,0.683509,3.289476


In [24]:
## Create dataframe from dictionary
# Keys become column names; each list is one column. np.nan marks the missing entries.
d = {
    'A': [1, 2, np.nan],
    'B': [5, np.nan, np.nan],
    'C': [1, 2, 3],
}
df_2 = pd.DataFrame(data=d)
df_2

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,,2
2,,,3


In [25]:
# dropna() returns a new frame each time and the results are not assigned, so
# df_2 itself is unchanged - which is why the full frame is displayed below.
df_2.dropna() # new frame without the rows that contain any NaN
df_2.dropna(axis=1) # new frame without the columns that contain any NaN
df_2

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,,2
2,,,3


In [26]:
# thresh is the minimum number of NON-NaN values a row needs in order to be kept:
# rows 0 and 1 have at least two real values and stay; row 2 has only one and is dropped.
df_2.dropna(thresh=2)


Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,,2


In [27]:
## Filling na data
# Replace every NaN with a placeholder string; a new frame is returned and df_2 is unchanged.
df_2.fillna('Fill')

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,Fill,2
2,Fill,Fill,3


In [28]:
## Filling na data with the mean of column 'A'
# Note: the single scalar df_2['A'].mean() (1.5 here) is used for every NaN in the
# frame, so the gaps in column 'B' are filled with A's mean as well, not B's.
df_2
df_2.fillna(value= df_2['A'].mean())



Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,1.5,2
2,1.5,1.5,3


## Group By

In [31]:
# Toy sales records: parallel lists, one entry per sales person.
data = {
    'Company': ['GOOG', 'GOOG', 'MSFT', 'MSFT', 'FB', 'FB'],
    'Person': ['Sam', 'Charlie', 'Amy', 'Vanessa', 'Carl', 'Sarah'],
    'Sales': [200, 120, 340, 124, 243, 350],
}

In [32]:
# Build the frame from the dict: keys become columns, rows get a default 0..5 integer index.
Sample_df = pd.DataFrame(data)
Sample_df

Unnamed: 0,Company,Person,Sales
0,GOOG,Sam,200
1,GOOG,Charlie,120
2,MSFT,Amy,340
3,MSFT,Vanessa,124
4,FB,Carl,243
5,FB,Sarah,350


In [41]:
# Group once and reuse the GroupBy object instead of grouping a second time.
by_company = Sample_df.groupby('Company')
# numeric_only=True restricts the aggregation to 'Sales'. Without it, pandas >= 2.0
# also "sums" the 'Person' strings by concatenating them, which would not match
# the Sales-only table shown below.
TotalSum_df = by_company.sum(numeric_only=True)
TotalSum_df

Unnamed: 0_level_0,Sales
Company,Unnamed: 1_level_1
FB,593
GOOG,320
MSFT,464


In [42]:
# Summary statistics per company: one row per group, (column, statistic) pairs as columns.
Sample_df.groupby('Company').describe()

Unnamed: 0_level_0,Sales,Sales,Sales,Sales,Sales,Sales,Sales,Sales
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
Company,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
FB,2.0,296.5,75.660426,243.0,269.75,296.5,323.25,350.0
GOOG,2.0,160.0,56.568542,120.0,140.0,160.0,180.0,200.0
MSFT,2.0,232.0,152.735065,124.0,178.0,232.0,286.0,340.0


In [43]:
# Same summary, flipped: statistics run down the rows, one column per company.
Sample_df.groupby('Company').describe().T

Unnamed: 0,Company,FB,GOOG,MSFT
Sales,count,2.0,2.0,2.0
Sales,mean,296.5,160.0,232.0
Sales,std,75.660426,56.568542,152.735065
Sales,min,243.0,120.0,124.0
Sales,25%,269.75,140.0,178.0
Sales,50%,296.5,160.0,232.0
Sales,75%,323.25,180.0,286.0
Sales,max,350.0,200.0,340.0
