# Prepatation

In [134]:
import numpy as np
import pandas as pd
import random
from sklearn import datasets

In [136]:
iris = datasets.load_iris()
species = iris.target
yesno = np.random.randint(2, size=len(species))
iris = pd.DataFrame(iris.data)
species = pd.DataFrame(species)
yesno = pd.DataFrame(yesno)
species = species.replace({0:'setosa', 1:'versicolor', 2:'verginica'})
iris = pd.concat([iris.reset_index(drop=True), species], axis=1)
iris = pd.concat([iris.reset_index(drop=True), yesno], axis=1)
iris.columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'species', 'yes_no']

In [137]:
iris.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,yes_no
0,5.1,3.5,1.4,0.2,setosa,1
1,4.9,3.0,1.4,0.2,setosa,0
2,4.7,3.2,1.3,0.2,setosa,0
3,4.6,3.1,1.5,0.2,setosa,1
4,5.0,3.6,1.4,0.2,setosa,0


---

# groupby()
Group the dataset by desired categorical variables and aggregate numerical variables by desired functions (e.g. mean, sd, max, min, head(1) etc...)

In [138]:
iris_groupby = iris.groupby(['species']).max().reset_index()
iris_groupby

Unnamed: 0,species,sepal_length,sepal_width,petal_length,petal_width,yes_no
0,setosa,5.8,4.4,1.9,0.6,1
1,verginica,7.9,3.8,6.9,2.5,1
2,versicolor,7.0,3.4,5.1,1.8,1


In [139]:
iris.groupby(['species'])['sepal_length', 'petal_length'].mean().reset_index()

Unnamed: 0,species,sepal_length,petal_length
0,setosa,5.006,1.462
1,verginica,6.588,5.552
2,versicolor,5.936,4.26


# pivot_table()
Very similar to groupby(), except we can choose multiple categorical variables.

In [143]:
iris.pivot_table(values=['sepal_length', 'petal_length'], index=['yes_no', 'species'], aggfunc=np.mean)

Unnamed: 0_level_0,Unnamed: 1_level_0,petal_length,sepal_length
yes_no,species,Unnamed: 2_level_1,Unnamed: 3_level_1
0,setosa,1.470833,4.941667
0,verginica,5.635,6.595
0,versicolor,4.283333,5.925
1,setosa,1.453846,5.065385
1,verginica,5.496667,6.583333
1,versicolor,4.238462,5.946154


In [144]:
# identical to the above one
iris.pivot_table(values=['sepal_length', 'petal_length'], index=['species'], aggfunc=np.mean)

Unnamed: 0_level_0,petal_length,sepal_length
species,Unnamed: 1_level_1,Unnamed: 2_level_1
setosa,1.462,5.006
verginica,5.552,6.588
versicolor,4.26,5.936


# crosstab()
Often used for a confusion matrix.

In [145]:
pd.crosstab(iris["species"],iris["yes_no"],margins=True)

yes_no,0,1,All
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
setosa,24,26,50
verginica,20,30,50
versicolor,24,26,50
All,68,82,150


# One-Hot Encoding
Expand categorical variables into new variables of binaries. This is REQUIRED if one is to apply machine-learning algorithms (except for LightGBM).

In [123]:
# One way
iris_1 = pd.get_dummies(iris)
iris_1.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species_setosa,species_verginica,species_versicolor
0,5.1,3.5,1.4,0.2,1,0,0
1,4.9,3.0,1.4,0.2,1,0,0
2,4.7,3.2,1.3,0.2,1,0,0
3,4.6,3.1,1.5,0.2,1,0,0
4,5.0,3.6,1.4,0.2,1,0,0


In [126]:
# Most popular way
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder(handle_unknown='ignore')
iris_2 = enc.fit_transform(iris)
iris_2.toarray()

array([[0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 1., 0.]])

# merge()
Merge two distinct data files. Inner, outer, left, or right.

# apply() and vectorize()
Used when each element of columns (or a vector) is to be applied by a function.

In [94]:
iris[['sepal_length', 'petal_length']].apply(lambda l: np.square(l)).head()

Unnamed: 0,sepal_length,petal_length
0,26.01,1.96
1,24.01,1.96
2,22.09,1.69
3,21.16,2.25
4,25.0,1.96


In [100]:
sq = np.vectorize(lambda l: np.square(l))
sq(iris['sepal_length'].to_numpy())[0:5]

array([26.01, 24.01, 22.09, 21.16, 25.  ])