# Final Capstone

In this notebook we load ~10MM rows of data from the Kaggle Avazu click prediction competition. We then create train/test splits, perform feature engineering, and then make predictions which we validate with our test split of data. Similar to the Kaggle competition we use Log Loss as our metric for evaluation of the model's performance.

With the exception of a date column ('hour'), all of the columns in this data set are categorical, even though some are encoded as integers (source: Kaggle docs and boards). Therefore, after 'id' and 'hour' are dropped, all remaining values are converted to strings so that each column can be processed by its own CountVectorizer, which is fit on the training data only and then used to transform both the training and test data.

Predictions were created with Logistic Regression and Multinomial Naive Bayes. Predictions were also attempted with a multi-layer perceptron (MLP) and a Random Forest, but these have not been completed because of the time required to fit them.

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xgboost
from scipy import sparse
from scipy.sparse import hstack
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction import FeatureHasher
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, log_loss, roc_auc_score, roc_curve, confusion_matrix
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.utils import resample

# sklearn.cross_validation was removed in scikit-learn 0.20; train_test_split
# now lives in sklearn.model_selection. The old path is kept as a fallback.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split



In [2]:
# Loading 10MM rows
%time data = pd.read_csv("../assets/trainaa")

KeyboardInterrupt: 

In [3]:
# Sanity-check the size of the raw frame before any rows are sampled out.
shape_msg = f"Shape of data before sampling: {data.shape}"
print(shape_msg)

NameError: name 'data' is not defined

In [None]:
# Taking 2MM random rows (20% of the loaded frame, sampled without replacement).
# random_state pins the sample so a Restart & Run All reproduces the same rows;
# 99 matches the seed already used for train_test_split in this notebook.
train = data.sample(frac=0.2, replace=False, random_state=99)

In [None]:
# Confirm how many rows and columns survived the sampling step.
sampled_msg = f"Shape of train after sampling: {train.shape}"
print(sampled_msg)

In [None]:
# Remove the two columns that are not used as features ('hour' and 'id').
train = train.drop(['hour', 'id'], axis=1)

In [None]:
# Split features from the 'click' target, then hold out a test partition.
# The split is stratified on the target and seeded for repeatability.
features = train.drop(columns=['click'])
target = train['click']
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    random_state=99,
    stratify=target,
)

In [None]:
# Display the column labels of the training features (bare expression, shown as the cell output).
X_train.columns

In [None]:
# From EDA we know there's a negative value here (-1); recode it to 1.
# The column is rebuilt with .assign() instead of calling replace(inplace=True)
# on `X_train.C20`: the in-place form mutates a Series pulled off a frame that is
# itself a slice of `train`, which raises SettingWithCopyWarning and, under pandas
# Copy-on-Write, never writes the change back into the DataFrame.
X_train = X_train.assign(C20=X_train['C20'].replace(to_replace=-1, value=1))
X_test = X_test.assign(C20=X_test['C20'].replace(to_replace=-1, value=1))

In [None]:
# Two banner dimensions (C15, C16) combined into one categorical feature.
# An 'x' separator keeps distinct pairs distinct: plain concatenation would map
# e.g. ('1', '23') and ('12', '3') to the same string. 'x' is a word character,
# so each value is still a single token for the CountVectorizer pattern used later.
# .assign() returns a new frame, avoiding a column write on a slice of `train`.
X_train = X_train.assign(
    banner=X_train['C15'].astype(str) + 'x' + X_train['C16'].astype(str)
)
X_test = X_test.assign(
    banner=X_test['C15'].astype(str) + 'x' + X_test['C16'].astype(str)
)

In [None]:
# C15 and C16 have been folded into 'banner', so drop the originals from both splits.
banner_sources = ['C15', 'C16']
X_train = X_train.drop(banner_sources, axis=1)
X_test = X_test.drop(banner_sources, axis=1)

In [None]:
# Convert all values to strings since these are all categorical columns.
# astype(str) does the conversion column-wise in one call; the previous
# applymap(str) invoked str() once per cell and is deprecated in recent pandas.
X_train = X_train.astype(str)
X_test = X_test.astype(str)

In [None]:
# Plain Python list of feature column names, iterated over when vectorizing.
columns = X_train.columns.tolist()

In [None]:
# Accumulators for the per-column sparse matrices (train and test respectively).
train_vec, test_vec = [], []

In [None]:
# create a vectorized representation of all columns, while ensuring congruent shapes between train and test

# Reset the accumulators inside this cell so it is safe to re-run: appending to
# lists created in another cell would add a second copy of every column's
# matrix and silently double the width of the stacked feature matrices.
train_vec = []
test_vec = []

for col in columns:
    # A fresh vectorizer per column. The token pattern accepts tokens of one or
    # more word characters, so single-character category values are kept.
    cvec = CountVectorizer(stop_words=None, min_df=1, token_pattern=r"(?u)\b\w+\b")
    # Fit on train only; test is transformed with the same fitted vocabulary,
    # so both matrices have the same number of columns.
    train_transform = cvec.fit_transform(X_train[col])
    test_transform = cvec.transform(X_test[col])
    train_vec.append(train_transform)
    test_vec.append(test_transform)
    print(f"completed {col}. train shape {train_transform.shape} - test shape {test_transform.shape}")


In [None]:
# Place the per-column training matrices side by side in one sparse matrix.
# `hstack` is scipy.sparse.hstack, imported by name at the top of the notebook.
train_matrix = hstack(train_vec)

In [None]:
# Place the per-column test matrices side by side in one sparse matrix.
# `hstack` is scipy.sparse.hstack, imported by name at the top of the notebook.
test_matrix = hstack(test_vec)

In [None]:
# Report both stacked shapes so a column-count mismatch is easy to spot.
matrix_shapes_msg = f"Train shape: {train_matrix.shape} Test shape: {test_matrix.shape}"
print(matrix_shapes_msg)

In [None]:
# Start the results table from the true test labels; model predictions are added as columns later.
results = y_test.to_frame()

In [None]:
# Multinomial Naive Bayes: fit on the training matrix (fit() returns the estimator),
# then store its predicted class labels for the test matrix.
nb = MultinomialNB().fit(train_matrix, y_train)
results['NB'] = y_pred_nb = nb.predict(test_matrix)

In [None]:
# Log loss is defined on predicted probabilities. Scoring the hard 0/1 labels in
# y_pred_nb treats every prediction as fully confident, so each mistake is charged
# the maximum (clipped) penalty. Score predict_proba instead;
# labels=nb.classes_ ties the probability columns to the class order.
log_loss(y_test, nb.predict_proba(test_matrix), labels=nb.classes_)

In [None]:
# Per-class precision / recall / F1 for the Naive Bayes label predictions.
nb_report = classification_report(y_test, y_pred_nb)
print(nb_report)

In [None]:
# Logistic regression with default settings on the sparse training matrix.
lr = LogisticRegression()
lr.fit(train_matrix, y_train)
# Hard class labels: kept for the results table and the classification report.
y_pred_lr = lr.predict(test_matrix)
results['LR'] = y_pred_lr
# Log loss is scored on predicted probabilities, not on the 0/1 labels above;
# labels=lr.classes_ ties the probability columns to the class order.
log_loss(y_test, lr.predict_proba(test_matrix), labels=lr.classes_)

In [None]:
# Per-class precision / recall / F1 for the logistic regression label predictions.
lr_report = classification_report(y_test, y_pred_lr)
print(lr_report)

In [None]:
# Logistic regression with explicit class weights (0.3 for class 0, 0.7 for class 1).
lr2 = LogisticRegression(class_weight= {0:.3, 1:.7} )
lr2.fit(train_matrix, y_train)
# Hard class labels: kept for the results table and the classification report.
y_pred_lr2 = lr2.predict(test_matrix)
results['LR2'] = y_pred_lr2
# Log loss is scored on predicted probabilities, not on the 0/1 labels above;
# labels=lr2.classes_ ties the probability columns to the class order.
log_loss(y_test, lr2.predict_proba(test_matrix), labels=lr2.classes_)

In [None]:
# Per-class precision / recall / F1 for the class-weighted logistic regression.
lr2_report = classification_report(y_test, y_pred_lr2)
print(lr2_report)

In [None]:
mlp = MLPClassifier(early_stopping=True)
mlp.fit(train_SVD_scaled, y_train)
y_pred_mlp = mlp.predict(test_SVD_scaled)
results['MLP'] = y_pred_mlp
log_loss(y_test, y_pred_mlp)

In [None]:
print(classification_report(y_test, y_pred_mlp))

In [None]:
tsvd = TruncatedSVD(n_components=25) 

In [None]:
train_SVD = tsvd.fit_transform(train_matrix)
test_SVD = tsvd.transform(test_matrix)

In [None]:
mmx = MinMaxScaler()
train_SVD_scaled = mmx.fit_transform(train_SVD)
test_SVD_scaled = mmx.transform(test_SVD)

In [None]:
# NOTE(review): same estimator settings and inputs as `mlp` — confirm a second fit is intended.
# Seeded so the fit is repeatable across reruns.
mlp2 = MLPClassifier(early_stopping=True, random_state=99)
mlp2.fit(train_SVD_scaled, y_train)
# Hard class labels: kept for the results table and the classification report.
y_pred_mlp2 = mlp2.predict(test_SVD_scaled)
results['MLP2'] = y_pred_mlp2
# Log loss is scored on predicted probabilities, not on the 0/1 labels above;
# labels=mlp2.classes_ ties the probability columns to the class order.
log_loss(y_test, mlp2.predict_proba(test_SVD_scaled), labels=mlp2.classes_)

In [None]:
# Per-class precision / recall / F1 for the second MLP's label predictions.
mlp2_report = classification_report(y_test, y_pred_mlp2)
print(mlp2_report)

In [None]:
# Soft-voting ensemble of NB, MLP and LR: average the predicted click probabilities.
# The 'NB' / 'MLP' / 'LR' columns of `results` hold hard 0/1 labels, so averaging
# them only yields vote fractions (0, 1/3, 2/3, 1), which log loss scores poorly.
# Column 1 of predict_proba is taken as P(click=1); this assumes classes_ is [0, 1] — TODO confirm.
results['AVG'] = np.mean(
    [
        nb.predict_proba(test_matrix)[:, 1],
        mlp.predict_proba(test_SVD_scaled)[:, 1],
        lr.predict_proba(test_matrix)[:, 1],
    ],
    axis=0,
)

In [None]:
# Log loss of the ensemble column against the true test labels.
log_loss(y_true=y_test, y_pred=results['AVG'])

In [None]:
# classification_report needs discrete class labels, but results['AVG'] holds
# fractional averages, which scikit-learn rejects as a mix of binary and
# continuous targets. Threshold at 0.5 to get a 0/1 prediction first.
avg_labels = (results['AVG'] >= 0.5).astype(int)
print(classification_report(y_test, avg_labels))

In [None]:
# Random forest on the reduced, scaled features with balanced class weights.
# random_state pins the forest so reruns are reproducible.
rf = RandomForestClassifier(class_weight='balanced', random_state=99)
rf.fit(train_SVD_scaled, y_train)
# Hard class labels: kept for the results table and the classification report.
y_pred_rf = rf.predict(test_SVD_scaled)
results['RF'] = y_pred_rf
# Log loss is scored on predicted probabilities, not on the 0/1 labels above;
# labels=rf.classes_ ties the probability columns to the class order.
log_loss(y_test, rf.predict_proba(test_SVD_scaled), labels=rf.classes_)

In [None]:
# Per-class precision / recall / F1 for the random forest label predictions.
rf_report = classification_report(y_test, y_pred_rf)
print(rf_report)

In [None]:
# Soft-voting ensemble of NB, MLP, LR and RF: average the predicted click probabilities.
# The per-model columns of `results` hold hard 0/1 labels, so averaging them only
# yields vote fractions, which log loss scores poorly.
# Column 1 of predict_proba is taken as P(click=1); this assumes classes_ is [0, 1] — TODO confirm.
# Note: this overwrites the earlier three-model 'AVG' column.
results['AVG'] = np.mean(
    [
        nb.predict_proba(test_matrix)[:, 1],
        mlp.predict_proba(test_SVD_scaled)[:, 1],
        lr.predict_proba(test_matrix)[:, 1],
        rf.predict_proba(test_SVD_scaled)[:, 1],
    ],
    axis=0,
)
log_loss(y_test, results['AVG'])

In [None]:
# classification_report needs discrete class labels, but results['AVG'] holds
# fractional averages, which scikit-learn rejects as a mix of binary and
# continuous targets. Threshold at 0.5 first (an exact 0.5 counts as a click).
avg_labels = (results['AVG'] >= 0.5).astype(int)
print(classification_report(y_test, avg_labels))