In [1]:
# This notebook contains steps to create a sparse matrix of features from the training dataset
# The id, device_id, device_ip and click columns are left out
# Object columns are cast as categorical and one-hot-encoded, then joined to a sparse CSR matrix of numeric features
# The resulting sparse matrix (~40MM rows x ~30k features) is saved in .npz format under ../assets/ as sparse_no_dev_id_dev_ip
# Clicks from the training set are saved in CSV format to ../assets/y_train
# TODO (if needed) - check to make sure that correct index values have been persisted through transformations and joining

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from scipy import sparse


In [3]:
# Reading the Sample Submission file and the test data (in that order) from the assets folder
sample, test = map(pd.read_csv, ('../assets/sampleSubmission', '../assets/test'))

In [4]:
# Because of GitHub space limits (no files over 2GB), train data file was split into 5 pieces

# Loading the first file with header row to use for column names
trainaa = pd.read_csv("../assets/trainaa")
print("trainaa loaded")

# Loading subsequent files as separate dataframes with common column names
trainab = pd.read_csv("../assets/trainab", header=None, names=trainaa.columns)
print("trainab loaded")
trainac = pd.read_csv("../assets/trainac", header=None, names=trainaa.columns)
print("trainac loaded")
trainad = pd.read_csv("../assets/trainad", header=None, names=trainaa.columns)
print("trainad loaded")
trainae = pd.read_csv("../assets/trainae", header=None, names=trainaa.columns)
print("trainae loaded")

# Concatenating all files into one training set for EDA purposes
%time train = pd.concat([trainaa, trainab, trainac, trainad, trainae], ignore_index=True)
train.head(1)

trainaa loaded
trainab loaded
trainac loaded
trainad loaded
trainae loaded


Unnamed: 0,id,click,hour,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,...,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21
0,1.000009e+18,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,1,2,15706,320,50,1722,0,35,-1,79


In [5]:
# Building a DataFrame of the string-typed (categorical) columns.
# device_id and device_ip (2MM+ and 6MM+ distinct values) are excluded for now.
high_cardinality_cols = ['device_id', 'device_ip']
object_cols = train.select_dtypes(include='object').drop(high_cardinality_cols, axis=1)

In [6]:
# listing the remaining columns: the string-typed features minus device_id and device_ip
object_cols.dtypes

site_id          object
site_domain      object
site_category    object
app_id           object
app_domain       object
app_category     object
device_model     object
dtype: object

In [7]:
# Casting every string column to pandas' category dtype, one column at a time,
# ahead of building the sparse matrix
object_cols = object_cols.apply(pd.Series.astype, dtype='category')


In [8]:
# Keeping every non-string column except id, which is not used as a feature
numeric_cols = train.select_dtypes(exclude='object').drop('id', axis=1)

In [9]:
# Pulling out clicks to make our y (the target), then removing them from the features.
# y is read from `train` instead of `numeric_cols`, and the drop tolerates an already
# missing column, so this cell can be re-run after `click` has been removed without
# raising; on a first run the result is the same as before.
y = train['click'].copy()
numeric_cols = numeric_cols.drop(columns=['click'], errors='ignore')
y.shape

(40428967,)

In [10]:
# Label Encoding the object columns to make them into 1s and 0s

le = LabelEncoder()
%time object_cols_encoded = object_cols.apply(le.fit_transform)

CPU times: user 6min 43s, sys: 27.9 s, total: 7min 11s
Wall time: 7min 15s


In [11]:
# checking shape - expecting one integer-coded column per categorical feature
object_cols_encoded.shape

(40428967, 7)

In [12]:
# One Hot Encoding the categorical columns: the integer codes from the label-encoding step
# are expanded into indicator columns. The fitted encoder is kept in `ohe`.

ohe = OneHotEncoder() 
%time object_cols_onehot = ohe.fit_transform(object_cols_encoded)

CPU times: user 30.5 s, sys: 50.3 s, total: 1min 20s
Wall time: 1min 29s


In [13]:
# checking shape - the second dimension is the number of one-hot indicator columns
object_cols_onehot.shape

(40428967, 29906)

In [14]:
# OneHot returned a sparse matrix - confirming the container type before stacking it
# with the numeric features

type(object_cols_onehot)

scipy.sparse.csr.csr_matrix

In [15]:
# Normalizing numeric columns before sparsifying the resulting matrix.
# The fitted scaler is kept in `ss`; the scaled values are stored in `numeric_cols_scaled`.
# NOTE(review): standardized columns presumably contain few exact zeros, so the sparse
# copy built from them will store nearly every value - confirm this is intended.

ss = StandardScaler()
%time numeric_cols_scaled = ss.fit_transform(numeric_cols)

In [22]:
# checking shape of the scaled numeric array.
# NOTE: the NameError recorded under this cell comes from an out-of-order run (execution
# count 22); `numeric_cols_scaled` is assigned by the scaling cell, so run that cell first.
numeric_cols_scaled.shape

NameError: name 'numeric_cols_scaled' is not defined

In [None]:
# checking the container type returned by the scaler (this cell has no recorded execution)
type(numeric_cols_scaled)

In [16]:
# Making a sparse (CSR) matrix from the scaled numeric columns so it can be stacked
# side by side with the one-hot matrix

%time numeric_sparse = sparse.csr_matrix(numeric_cols_scaled)

CPU times: user 45.3 s, sys: 1min 57s, total: 2min 42s
Wall time: 4min 11s


In [17]:
# checking shape - the row count must match the one-hot matrix for the horizontal stack

numeric_sparse.shape

(40428967, 13)

In [18]:
# assembling a complete sparse matrix from numeric and dummy features

%time sparse_features = sparse.hstack([numeric_sparse, object_cols_onehot])

In [19]:
# checking shape - ~1 Trillion "cells" represented
# (rows x columns; the numeric columns are followed by the one-hot columns)
sparse_features.shape

(40428967, 29919)

In [20]:
# save sparse matrix to a file
# NOTE(review): save_npz is documented to append '.npz' when the name lacks it, so the file
# on disk should be sparse_no_dev_id_dev_ip.npz - confirm. `saved` only captures the call's
# return value (presumably None); it is not the matrix.

%time saved = sparse.save_npz('../assets/sparse_no_dev_id_dev_ip', sparse_features)

In [21]:
# save y to a file (the path has no .csv extension)
# NOTE(review): whether Series.to_csv writes a header row by default appears to depend on the
# pandas version - confirm what downstream readers of y_train expect. `y_saved` only captures
# the call's return value (presumably None when a path is given).

%time y_saved = y.to_csv('../assets/y_train')