# SLU15 - Working with Real Data - Examples

In [1]:
import pandas as pd

import warnings
warnings.filterwarnings('ignore')


# Load the Avengers dataset from a path relative to the working directory
# and preview its first three rows.
avengers = pd.read_csv('data/avengers.csv')
avengers.head(3)

Unnamed: 0,Name,Active,Gender,Membership,Universe,Appearances,TotalDeaths,TotalReturns
0,"Henry Jonathan ""Hank"" Pym",YES,MALE,Full,Earth-616,1269,1,0
1,Janet van Dyne,YES,FEMALE,Full,Earth-616,1165,1,1
2,"Anthony Edward ""Tony"" Stark",YES,MALE,Full,Earth-616,3068,1,1


### Select different column types (dtypes) with `select_dtypes`

In [2]:
# Keep only the columns stored with the generic `object` dtype
# (the text columns of this dataset).
non_numerical_data = avengers.select_dtypes(include="object")

non_numerical_data.head(3)

Unnamed: 0,Name,Active,Gender,Membership,Universe
0,"Henry Jonathan ""Hank"" Pym",YES,MALE,Full,Earth-616
1,Janet van Dyne,YES,FEMALE,Full,Earth-616
2,"Anthony Edward ""Tony"" Stark",YES,MALE,Full,Earth-616


In [3]:
# we can also exclude specific types
# Dropping the `object` columns leaves every other column (here: the numeric ones).
numerical_data =  avengers.select_dtypes(exclude="object")

numerical_data.head(3)

Unnamed: 0,Appearances,TotalDeaths,TotalReturns
0,1269,1,0
1,1165,1,1
2,3068,1,1


### Apply functions over variables (or columns)

In [4]:
import numpy as np

# `apply` works column by column by default (axis=0), so `np.mean` is applied
# to each non-object column and the result holds one mean per column.
avengers.select_dtypes(exclude="object").apply(np.mean)

Appearances     446.219355
TotalDeaths       0.509677
TotalReturns      0.348387
dtype: float64

### Apply functions over observations (or rows)

In [5]:
from numpy.linalg import norm

def normalize(row):
    """
    Rescale a vector of values so that its length becomes 1 (a unit vector).

    The length is obtained with ``numpy.linalg.norm`` (the Euclidean norm for
    a 1-D vector) and every component is divided by it: v / ||v||.
    """
    length = norm(row)
    unit_vector = row / length
    return unit_vector

# axis=1 hands each row (observation) to `normalize`, so every row is
# rescaled independently of the others.
(avengers.select_dtypes(exclude="object")
         .apply(normalize, axis=1)
         .head(3)
)

Unnamed: 0,Appearances,TotalDeaths,TotalReturns
0,1.0,0.000788,0.0
1,0.999999,0.000858,0.000858
2,1.0,0.000326,0.000326


### Scaling data

In [6]:
from sklearn.preprocessing import MinMaxScaler

def scale_data(df, scaler, plot=True):
    """
    Return a copy of `df` whose numeric columns were rescaled by `scaler`.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. It is copied first, so the caller's frame is not modified.
    scaler : object
        Any object exposing ``fit_transform(X)`` that returns an array-like
        with the same shape as ``X`` (e.g. a scikit-learn scaler).
    plot : bool, default True
        Currently unused; kept so that existing calls remain valid.

    Returns
    -------
    pandas.DataFrame
        The copy, with its numeric/boolean columns replaced by the scaler's
        output and every other column left unchanged.
    """
    df = df.copy()
    # Select numeric (and boolean) columns explicitly. Merely excluding
    # "object" would also pass `category` or datetime columns to the scaler,
    # which cannot be scaled.
    cols = df.select_dtypes(include=["number", "bool"]).columns
    if len(cols) == 0:
        # Nothing to scale: skip the scaler call and return the untouched copy.
        return df
    df[cols] = scaler.fit_transform(df[cols])
    return df

# `pipe` calls scale_data(avengers, min_max_scaler); `describe` then
# summarises the scaled columns.
min_max_scaler = MinMaxScaler()
(avengers.pipe(scale_data, min_max_scaler)
         .describe())

Unnamed: 0,Appearances,TotalDeaths,TotalReturns
count,155.0,155.0,155.0
mean,0.102567,0.101935,0.069677
std,0.162742,0.153518,0.134055
min,0.0,0.0,0.0
25%,0.014662,0.0,0.0
50%,0.036019,0.0,0.0
75%,0.121681,0.2,0.2
max,1.0,1.0,1.0


### Standardize all variables

In [7]:
from sklearn.preprocessing import StandardScaler

# Same pipeline as the min-max example, but with a StandardScaler passed
# to scale_data.
standard_scaler = StandardScaler()
(avengers.pipe(scale_data, standard_scaler)
         .describe())

Unnamed: 0,Appearances,TotalDeaths,TotalReturns
count,155.0,155.0,155.0
mean,-6.87622e-17,6.87622e-17,0.0
std,1.003241,1.003241,1.003241
min,-0.6322885,-0.666151,-0.521453
25%,-0.5419045,-0.666151,-0.521453
50%,-0.4102428,-0.666151,-0.521453
75%,0.1178275,0.6408541,0.975311
max,5.532327,5.868874,6.962366


### Use `map` to easily replace values in a categorical column with numbers

In [8]:
# The mapped Series is only displayed; `avengers` itself is not modified.
# Values absent from the dict (anything other than 'YES'/'NO') would map to NaN.
(avengers['Active'].map({'YES': 1, 'NO': 0})
                   .head(n=3))

0    1
1    1
2    1
Name: Active, dtype: int64

### Convert a column to the `category` dtype with `astype('category')`

In [9]:
# Re-bind `avengers` to a frame whose `Universe` column is stored as
# `category`, then summarise the category columns only.
avengers = avengers.assign(Universe=avengers['Universe'].astype('category'))

avengers.describe(include='category')

Unnamed: 0,Universe
count,155
unique,7
top,Earth-616
freq,144


### Pandas `get_dummies` turns categorical columns into numerical ones

In [10]:
# One-hot encode every `category` column. `drop_first=True` omits the first
# level of each encoded column (7 universes -> 6 indicator columns).
# NOTE: this overwrites `avengers`, so on a re-run there are no `category`
# columns left to encode.
categorical_features = avengers.select_dtypes(include='category').columns
avengers = pd.get_dummies(avengers, columns=categorical_features, drop_first=True)
avengers.head(3)

Unnamed: 0,Name,Active,Gender,Membership,Appearances,TotalDeaths,TotalReturns,Universe_Earth-616,Universe_Earth-6311,Universe_Earth-691,Universe_Earth-8009,Universe_Earth-921,Universe_Earth-96020
0,"Henry Jonathan ""Hank"" Pym",YES,MALE,Full,1269,1,0,1,0,0,0,0,0
1,Janet van Dyne,YES,FEMALE,Full,1165,1,1,1,0,0,0,0,0
2,"Anthony Edward ""Tony"" Stark",YES,MALE,Full,3068,1,1,1,0,0,0,0,0


In [11]:
# (rows, columns) of the frame after one-hot encoding.
avengers.shape

(155, 13)

In [12]:
# NOTE(review): this repeats the earlier encoding cell. At this point
# `avengers` no longer holds any `category` column, so `categorical_features`
# is empty and the displayed frame is the same as before.
categorical_features = avengers.select_dtypes(include='category').columns
avengers = pd.get_dummies(avengers, columns=categorical_features, 
                          drop_first=True)
avengers.head(3)

Unnamed: 0,Name,Active,Gender,Membership,Appearances,TotalDeaths,TotalReturns,Universe_Earth-616,Universe_Earth-6311,Universe_Earth-691,Universe_Earth-8009,Universe_Earth-921,Universe_Earth-96020
0,"Henry Jonathan ""Hank"" Pym",YES,MALE,Full,1269,1,0,1,0,0,0,0,0
1,Janet van Dyne,YES,FEMALE,Full,1165,1,1,1,0,0,0,0,0
2,"Anthony Edward ""Tony"" Stark",YES,MALE,Full,3068,1,1,1,0,0,0,0,0


### The pandas `category` dtype can behave as an ordinal variable (when declared as ordered, its categories are ranked from smallest to largest). You can use this to transform ordinal variables into sorted numbers.

In [13]:
# Convert `Membership` to `category` and list the inferred categories
# (they come out in alphabetical order here).
avengers = avengers.assign(Membership=avengers['Membership'].astype('category'))
avengers['Membership'].cat.categories

Index(['Academy', 'Full', 'Honorary', 'Probationary'], dtype='object')

We can set the hierarchy of the values manually using `.cat.set_categories` with a category column:

In [14]:
# Membership levels listed from lowest to highest; `ordered=True` makes
# pandas treat this list order as the ranking of the categories.
ordered_cats = ['Honorary', 'Academy', 'Probationary', 'Full']
avengers['Membership'] = avengers['Membership'].cat.set_categories(
    ordered_cats, ordered=True
)

In [15]:
# With an ordered categorical, min/max follow the declared category order
# rather than alphabetical order.
avengers['Membership'].min(), avengers['Membership'].max()

('Honorary', 'Full')

In [17]:
# `.cat.codes` replaces each category with its integer position in the
# category order. The result is only displayed, not stored.
# `sample(10)` is called without a seed, so the rows shown differ between runs.
(avengers.assign(Membership=avengers.Membership.cat.codes)
         .sample(10))

Unnamed: 0,Name,Active,Gender,Membership,Appearances,TotalDeaths,TotalReturns,Universe_Earth-616,Universe_Earth-6311,Universe_Earth-691,Universe_Earth-8009,Universe_Earth-921,Universe_Earth-96020
130,Flash Thompson,YES,MALE,0,746,0,0,1,0,0,0,0,0
60,Craig Hollis,YES,MALE,3,33,0,0,1,0,0,0,0,0
26,Stakar,NO,MALE,0,100,0,0,0,0,1,0,0,0
96,Jessica Miriam Drew,YES,FEMALE,3,525,0,0,1,0,0,0,0,0
42,John F. Walker,NO,MALE,3,352,0,0,1,0,0,0,0,0
29,Carol Susan Jane Danvers,YES,FEMALE,3,935,0,0,1,0,0,0,0,0
115,Yvette,YES,FEMALE,1,22,0,0,1,0,0,0,0,0
149,Alexis,YES,FEMALE,3,13,0,0,1,0,0,0,0,0
106,Dennis Sykes,NO,MALE,3,6,1,0,1,0,0,0,0,0
126,Taki Matsuya,YES,MALE,1,18,0,0,1,0,0,0,0,0
