In [1]:
# This notebook contains steps to create a sparse matrix of features from the training dataset
# The id, device_id, device_ip and click columns are left out
# Object columns are cast as categorical and one-hot-encoded, then joined to a sparse CSR matrix of numeric features
# The resulting sparse matrix (~29k features x 40MM rows) has been stored as an .npz file in the /assets/ directory
# Clicks from the training set have been stored in a CSV file called y_train
# TODO (if needed) - check to make sure that correct index values have been persisted through transformations and joining

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from scipy import sparse


In [3]:
# loading in test data as well as the Sample Submission file
sample = pd.read_csv('../assets/sampleSubmission')
test = pd.read_csv('../assets/test')

In [4]:
# Because of GitHub space limits (no files over 2GB), train data file was split into 5 pieces

# Loading the first file with header row to use for column names
trainaa = pd.read_csv("../assets/trainaa")
print("trainaa loaded")

# Loading subsequent files as separate dataframes with common column names
trainab = pd.read_csv("../assets/trainab", header=None, names=trainaa.columns)
print("trainab loaded")
trainac = pd.read_csv("../assets/trainac", header=None, names=trainaa.columns)
print("trainac loaded")
trainad = pd.read_csv("../assets/trainad", header=None, names=trainaa.columns)
print("trainad loaded")
trainae = pd.read_csv("../assets/trainae", header=None, names=trainaa.columns)
print("trainae loaded")

# Concatenating all files into one training set for EDA purposes
%time train = pd.concat([trainaa, trainab, trainac, trainad, trainae], ignore_index=True)
train.head(1)

trainaa loaded
trainab loaded
trainac loaded
trainad loaded
trainae loaded
CPU times: user 11.2 s, sys: 35 s, total: 46.2 s
Wall time: 1min 3s


Unnamed: 0,id,click,hour,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,...,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21
0,1.000009e+18,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,1,2,15706,320,50,1722,0,35,-1,79


In [5]:
# creating a list of categorical columns
# dropping device_id and device_ip columns (with 2MM+ and 6MM+ values) for now
obj_cols_train = train.select_dtypes(include='object').drop(columns=['device_id',  'device_ip'])
obj_cols_test = test.select_dtypes(include='object').drop(columns=['device_id',  'device_ip'])

In [6]:
obj_cols_train.dtypes

site_id          object
site_domain      object
site_category    object
app_id           object
app_domain       object
app_category     object
device_model     object
dtype: object

In [7]:
obj_cols_test.dtypes

site_id          object
site_domain      object
site_category    object
app_id           object
app_domain       object
app_category     object
device_model     object
dtype: object

In [None]:
%time obj_dummies_train = pd.get_dummies(obj_cols_train)

In [None]:
obj_dummies_test = pd.get_dummies(obj_cols_test)

In [None]:
%train dummies_aligned_train, dummies_aligned_test = obj_dummies_train.align(obj_dummies_test, fill_value=0)

In [10]:
# Label Encoding the object columns to make them into 1s and 0s

#le_train = LabelEncoder()
#le_test = LabelEncoder()

#%time obj_cols_enc_train = obj_cols_train.apply(le_train.fit_transform)
#obj_cols_enc_test = obj_cols_test.apply(le_test.fit_transform)

KeyboardInterrupt: 

In [None]:
assert red = blue

In [9]:
# setting columns as categorical so we can generate sparse matrix
obj_cols_train = obj_cols_train.apply(lambda x: x.astype('category'))
obj_cols_test = obj_cols_test.apply(lambda x: x.astype('category'))


In [11]:
# checking shapes
print(f"Train object cols encoded shape: {obj_cols_enc_train.shape}\n Test object cols encoded shape: {obj_cols_enc_test.shape}")




NameError: name 'obj_cols_enc_train' is not defined

In [None]:
# One Hot Encoding the categorical columns

ohe_train = OneHotEncoder()
ohe_test = OneHotEncoder()

%time onehot_train = ohe_train.fit_transform(obj_cols_enc_train)
onehot_test = ohe_test.fit_transform(obj_cols_enc_test)

In [None]:
object_cols_onehot.shape

In [None]:
# OneHot returned a sparse matrix

type(object_cols_onehot)

In [None]:
# Y (which we will need for validation)
y = train.click.copy()

# Test IDs (which we will need for sample submission)
test_ids = test['id'].copy()

In [None]:
# Numeric columns
num_cols_train = train.select_dtypes(exclude='object').drop(columns=['id','click'])

num_cols_test = test.select_dtypes(exclude='object').drop(columns=['id'])

In [None]:
# Normalizing numeric columns 

ss_train = StandardScaler()
ss_test = StandardScaler()

%time num_cols_scaled_train = ss_train.fit_transform(num_cols_train)
num_cols_scaled_test = ss_test.fit_transform(num_cols_test)


In [None]:
num_cols_train_scaled.shape

In [None]:
num_cols_test_scaled.shape

In [None]:
# Making a sparse matrix from the scaled numeric columns

#%time num_sparse_train = sparse.csr_matrix(num_cols_scaled_train)
#num_sparse_test = sparse.csr_matrix(num_cols_scaled_test)

In [None]:
# checking shape

#numeric_sparse.shape#

In [None]:
# assembling a complete sparse matrix from numeric and dummy features

#%time sparse_features = sparse.hstack([numeric_sparse, object_cols_onehot])

In [None]:
# checking shape - ~1 Trillion "cells" represented
#sparse_features.shape

In [None]:
# save sparse matrix to a file

#%time saved = sparse.save_npz('../assets/sparse_no_dev_id_dev_ip', sparse_features)

In [None]:
# save y to a file

#%time y_saved = y.to_csv('../assets/y_train')