# Tutorial 2 - Data Preprocessing

##### The following notebook is a modified version of the talk given by April Chen in 2016, titled "Depy 2016 Talk: Pre-Modeling: Data Preprocessing and Feature Exploration in Python". More information can be found in the original repo: https://github.com/aprilypchen/depy2016


### The objective of the tutorial is to show the importance of data manipulation, and how to apply common data pre-processing techniques to improve model performance.

In [None]:
# Bread and butter libraries to deal with dataframes and matrices
import numpy as np
import pandas as pd

##### For the following workshop, we will be using an edited version of the "adult" dataset from the public UCI repository. The dataset consists of information on various individuals, including age, education, marital status, gender, and income.

In [None]:
# Machine learning models cannot deal with null values; we will go over
# techniques to deal with them later on.
# Treat the literal string '#NAME?' as a missing value when parsing the CSV.
na_values = ['#NAME?']
df = pd.read_csv('adult.csv', na_values=na_values)

In [None]:
# Peek at the first 10 rows
df.head(10)

#### Binary Classification Problem: Predict, based on various features from the dataset, whether someone's income is greater than 50K or not

In [None]:
# Observe the class distribution of the target. In practice, when unbalanced
# datasets are not treated properly, performance metrics can be very misleading.
df['income'].value_counts()

In [None]:
# Binarise the target: 0 when income is '<=50K', 1 for every other value
df['income'] = (df['income'] != '<=50K').astype('int64')

In [None]:
# Confirm the target column now holds 0/1 labels
df.income.head(10)

## 1. Benchmark performance with unprocessed data

In [None]:
# Benchmark frame: discard every row containing at least one missing value,
# so the model can be fitted without errors
df_unprocessed = df.dropna(axis=0, how='any')

# Shapes before and after dropping the incomplete rows
print(df.shape, df_unprocessed.shape, sep='\n')

In [None]:
# Keep only the numeric columns so the model doesn't throw errors.
# Potential loss of information from the dropped categorical features is evident.
# select_dtypes replaces the column-by-column drop(col_name, 1), whose positional
# `axis` argument is no longer accepted by recent pandas releases.
df_unprocessed = df_unprocessed.select_dtypes(
    include=['int32', 'int64', 'float32', 'float64'])

In [None]:
# Split into features and target variable.
# `axis=1` is passed by keyword: recent pandas no longer accepts it positionally.
X_unprocessed = df_unprocessed.drop('income', axis=1)
y_unprocessed = df_unprocessed.income

In [None]:
# Preview the numeric-only feature matrix
X_unprocessed.head(5)

### Import algo to measure baseline accuracy

In [None]:
# Import common ML tools from sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

### Split data into train/test split

In [None]:
# Hold out 30% of the rows for testing. A fixed random_state makes the baseline
# reproducible, in line with the seeded split used for the processed data.
X_train_unproc, X_test_unproc, y_train, y_test = train_test_split(
    X_unprocessed, y_unprocessed, train_size=0.70, test_size=0.30, random_state=1)

In [None]:
# Fit a logistic regression and report its accuracy on held-out data
def find_model_perf(X_train, y_train, X_test, y_test):
    """Train a default LogisticRegression and score it on the test data.

    Parameters
    ----------
    X_train, y_train : training features and labels passed to ``fit``.
    X_test, y_test : held-out features and labels used for scoring.

    Returns
    -------
    The value of ``accuracy_score(y_test, predictions)``.
    """
    classifier = LogisticRegression().fit(X_train, y_train)
    return accuracy_score(y_test, classifier.predict(X_test))

In [None]:
# Baseline accuracy on the unprocessed, numeric-only data
acc_unprocessed = find_model_perf(
    X_train=X_train_unproc, y_train=y_train,
    X_test=X_test_unproc, y_test=y_test)

In [None]:
# Display the baseline accuracy
acc_unprocessed

#### NOTE: when feeding test features into your prediction model, ensure the test set has gone through the same preprocessing as your training set

## 2. Explore feature space to determine how to perform data pre-processing, then feed processed data into model to evaluate performance difference

In [None]:
# Separate features from the target variable.
# `axis=1` is passed by keyword: recent pandas no longer accepts it positionally.
y = df.income
X = df.drop(['income'], axis=1)

In [None]:
# Preview the target labels
y.head(15)

In [None]:
# Preview the feature matrix before any encoding
X.head(5)

### Dealing with Categorical Data: One-hot encoding

#### One simple strategy to convert categorical data to numerical data is to create one dummy variable for each possible categorical value, then flag it with a 1 when the value is present

In [None]:
# Example: one-hot encode the 'education' column and show the first 5 rows
pd.get_dummies(X['education']).head(5)

#### Determine how many possible values there are for each feature

In [None]:
# Count the distinct values of every object-dtype feature to help decide
# which categorical variables to use in the model
for col_name in X.columns:
    if X[col_name].dtypes != 'object':
        continue
    unique_cat = len(X[col_name].unique())
    print(
        "Feature '{col_name}' has {unique_cat} unique categories".format(
            col_name=col_name, unique_cat=unique_cat
        )
    )

In [None]:
# Investigate why there is a high number of unique values for 'native_country':
# list its 10 most frequent categories
X['native_country'].value_counts().sort_values(ascending=False).head(10)

In [None]:
# In this case, bucket the low-frequency categories as "Other": every value that
# is not exactly 'United-States' (missing values included) becomes 'Other'.
# The kept label no longer carries the stray trailing space ('United-States '),
# which would otherwise leak into the one-hot encoded column name.
X['native_country'] = X['native_country'].where(
    X['native_country'] == 'United-States', 'Other')

print(X['native_country'].value_counts().sort_values(ascending=False))

#### Create list of important categorical features to encode

In [None]:
# Create a list of the categorical features to dummy (one-hot encode)
todummy_list = ['workclass', 'education', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'native_country']

In [None]:
# Function to dummy all the categorical variables used for modeling
def dummy_df(df, todummy_list):
    """Return ``df`` with the listed columns one-hot encoded.

    Each column named in ``todummy_list`` is removed and replaced by its
    dummy columns, which are appended on the right and named
    ``<column>_<category>``. Missing values get no dummy column of their own
    (``dummy_na=False``). The frame passed in is not modified in place.

    Parameters
    ----------
    df : pandas.DataFrame holding the columns to encode.
    todummy_list : iterable of column names to encode.

    Returns
    -------
    pandas.DataFrame with the encoded columns.
    """
    for x in todummy_list:
        dummies = pd.get_dummies(df[x], prefix=x, dummy_na=False)
        # `axis=1` must be a keyword: recent pandas rejects it positionally.
        df = df.drop(x, axis=1)
        df = pd.concat([df, dummies], axis=1)
    return df

In [None]:
# One-hot encode the selected categorical features, then preview the result
X = dummy_df(X, todummy_list)
print(X.head(5))

### Investigate null values

In [None]:
# Count missing values per column and show the columns with the most
X.isnull().sum().sort_values(ascending=False).head()

In [None]:
# Impute missing values with each column's median.
# scikit-learn 0.20 introduced sklearn.impute.SimpleImputer and 0.22 removed
# sklearn.preprocessing.Imputer, so prefer the former and fall back to the latter.
try:
    from sklearn.impute import SimpleImputer
except ImportError:
    from sklearn.preprocessing import Imputer
    imp = Imputer(missing_values='NaN', strategy='median', axis=0)
else:
    imp = SimpleImputer(missing_values=np.nan, strategy='median')

imp.fit(X)
# Carry over the row index so X keeps the same row labels as before imputation
X = pd.DataFrame(data=imp.transform(X), columns=X.columns, index=X.index)

In [None]:
# Sanity check: re-count missing values per column after imputation
X.isnull().sum().sort_values(ascending=False).head()

### PCA to find most important components

In [None]:
# Use PCA from sklearn.decomposition to project the features onto
# 10 principal components
from sklearn.decomposition import PCA

pca = PCA(n_components=10)
X_pca = pd.DataFrame(pca.fit_transform(X))

In [None]:
# Preview the principal-component representation
X_pca.head(10)

### Evaluate the same algorithm, but with preprocessed dataset

In [None]:
# sklearn.cross_validation was removed in scikit-learn 0.20; train_test_split
# lives in sklearn.model_selection (the module the baseline section imports from).
from sklearn.model_selection import train_test_split

# Same 70/30 proportions as the baseline split, seeded for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    X_pca, y, train_size=0.70, test_size=0.30, random_state=1)

In [None]:
# Accuracy of the same model on the preprocessed (PCA-transformed) data
acc_proc = find_model_perf(
    X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)

In [None]:
# Display the accuracy obtained with the preprocessed data
acc_proc

In [None]:
# Relative change in accuracy versus the baseline, as a percentage.
# Scale to percent before rounding so the result carries no floating-point tail
# (rounding first and multiplying after can yield e.g. 7.199999999999999).
improvement = np.round((acc_proc - acc_unprocessed) / acc_unprocessed * 100, 1)

In [None]:
# Display the relative improvement, in percent
improvement