In [6]:
# import necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, StratifiedKFold
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from category_encoders import TargetEncoder

In [7]:
# relative paths of the train and test input files
train_data_path, test_data_path = "train.gz", "test.gz"

In [8]:
# Build the training sample without holding the whole file in memory:
# stream the CSV in chunks of 1,000,000 rows and keep a 5% sample
# (without replacement) of each chunk.
train_data = pd.read_csv(train_data_path, chunksize=1000000)

# Gather the per-chunk samples first and concatenate once. Calling pd.concat
# inside the loop copies every row collected so far on each iteration.
# The same random_state is used for every chunk, exactly as before.
train_samples = [
    chunk.sample(frac=.05, replace=False, random_state=123)
    for chunk in train_data
]

# pd.concat raises on an empty list, so keep the original result (an empty
# frame) when the reader yields no chunks.
train_df = pd.concat(train_samples, axis=0) if train_samples else pd.DataFrame()

# the list is only scaffolding; drop it so the samples are not referenced twice
del train_samples

In [9]:
# Read the test data in chunks of 1,000,000 rows and stitch them back together.
test_data = pd.read_csv(test_data_path, chunksize=1000000)

# Materialise the chunks and concatenate once; concatenating inside the loop
# copies the growing frame again on every iteration.
test_chunks = list(test_data)

# pd.concat raises on an empty list, so keep the original result (an empty
# frame) when the reader yields no chunks.
test_df = pd.concat(test_chunks, axis=0) if test_chunks else pd.DataFrame()

# release the per-chunk frames now that they live in test_df
del test_chunks

In [10]:
# Keep only the test rows whose index labels run from 0 through 2021448.
# .loc slices by label and includes both endpoints.
# NOTE(review): 2021448 is also the row count train_df.info() reports for the
# training sample. If the aim is to match that length, this slice keeps one row
# more than that (assuming the default 0-based integer index) - confirm intent.
# Re-running this cell overwrites test_df with the already-sliced frame.
test_df = test_df.loc[0:2021448]

In [11]:
# preview the first five rows of the sampled training data
train_df.head(5)

Unnamed: 0,id,click,hour,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,...,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21
373315,2.754752e+18,1,14102102,1005,1,d9750ee7,98572c79,f028772b,ecad2386,7801e8d9,...,1,0,17753,320,50,1993,2,1063,-1,33
459286,9.630799e+18,0,14102102,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,1,0,15699,320,50,1722,0,35,100083,79
262398,1.048226e+19,0,14102102,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,1,0,15703,320,50,1722,0,35,100083,79
789396,1.830856e+19,0,14102104,1005,1,b8eae5f9,1e334bd3,f028772b,ecad2386,7801e8d9,...,1,0,19950,320,50,1800,3,167,100077,23
383229,3.559389e+18,0,14102102,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,1,0,15701,320,50,1722,0,35,-1,79


In [12]:
# Summary of the training sample: index range, column names, dtypes and memory usage.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2021448 entries, 373315 to 40318589
Data columns (total 24 columns):
 #   Column            Dtype  
---  ------            -----  
 0   id                float64
 1   click             int64  
 2   hour              int64  
 3   C1                int64  
 4   banner_pos        int64  
 5   site_id           object 
 6   site_domain       object 
 7   site_category     object 
 8   app_id            object 
 9   app_domain        object 
 10  app_category      object 
 11  device_id         object 
 12  device_ip         object 
 13  device_model      object 
 14  device_type       int64  
 15  device_conn_type  int64  
 16  C14               int64  
 17  C15               int64  
 18  C16               int64  
 19  C17               int64  
 20  C18               int64  
 21  C19               int64  
 22  C20               int64  
 23  C21               int64  
dtypes: float64(1), int64(14), object(9)
memory usage: 385.6+ MB


In [13]:
# print how often each distinct value occurs, one column at a time
for column_name in train_df.columns:
    column_counts = train_df[column_name].value_counts()
    print(column_counts)

1.156582e+19    1
1.608933e+19    1
1.050306e+19    1
1.604667e+19    1
5.166533e+18    1
               ..
5.820502e+18    1
4.844970e+18    1
3.788262e+18    1
5.010368e+18    1
1.072459e+19    1
Name: id, Length: 2021448, dtype: int64
0    1677654
1     343794
Name: click, dtype: int64
14102209    22395
14102210    21890
14102813    21550
14102212    20458
14102814    19423
            ...  
14102419     1733
14102423     1305
14102420     1171
14102421      992
14102422      724
Name: hour, Length: 240, dtype: int64
1005    1857442
1002     110650
1010      45052
1012       5780
1007       1782
1001        456
1008        286
Name: C1, dtype: int64
0    1454765
1     563132
7       2150
2        653
4        350
5        284
3        114
Name: banner_pos, dtype: int64
85f751fd    728733
1fbe01fe    324585
e151e245    131951
d9750ee7     48385
5b08c53b     45472
             ...  
063584d1         1
2a555dc4         1
c685914b         1
4edf288e         1
23029590         1
Name: si

In [14]:
# some feature engineering and pipeline building
# get the day of the week through a pipeline class
class DateTransformer(BaseEstimator, TransformerMixin):
    '''Custom transformer that derives date features from a timestamp column.

    Parameters
    ----------
    data_col : str
        Name of the column holding the timestamp. The string form of each
        value is read as YYMMDD followed by any trailing characters
        (e.g. 14102102); only the first six characters are used.

    transform() returns a copy of the input with two added columns:
    'fdate' (the parsed date) and 'weekday' (pandas day-of-week as a string,
    '0' for Monday through '6' for Sunday).
    '''
    def __init__(self, data_col):
        self.data_col = data_col

    def build_date(self, hour):
        '''Return the 'DD-MM-YYYY' string for a YYMMDD... value (century fixed to 20).'''
        stamp = str(hour)
        day = stamp[4:6]
        month = stamp[2:4]
        year = '20' + stamp[0:2]
        return day + '-' + month + '-' + year

    def fit(self, X, y=None):
        '''Nothing is learned from the data; returns self.'''
        return self

    def transform(self, X, y=None):
        '''Return a copy of X with the 'fdate' and 'weekday' columns added.'''
        X_ = X.copy()
        # read the column named at construction time instead of a hard-coded one
        date_strings = X_[self.data_col].apply(self.build_date)
        # An explicit day-first format is required: without it pandas has to
        # guess, and a string such as '05-10-2014' can be read month-first,
        # which yields the wrong date and therefore the wrong weekday.
        X_['fdate'] = pd.to_datetime(date_strings, format='%d-%m-%Y')
        X_['weekday'] = X_['fdate'].dt.dayofweek.astype(str)
        return X_

# many of our categorical variables have high cardinality. let's try target encoding/mean encoding on them
class EncoderTransformer(BaseEstimator, TransformerMixin):
    '''Custom transformer for encoding categorical variables.

    Parameters
    ----------
    encoder : object
        Encoder exposing fit_transform(X, y) and transform(X).
    target : array-like
        Target values handed to the encoder when it is fitted.
    train : int
        1 fits the encoder on the incoming data and encodes it;
        0 encodes with the encoder as previously fitted.
        Any other value returns an unmodified copy of the input.

    Note that the encoder is fitted inside transform() (when train == 1),
    not inside fit(); fit() is a no-op.
    '''
    def __init__(self, encoder, target, train):
        self.encoder = encoder
        self.target = target
        self.train = train

    def fit(self, X, y=None):
        '''No-op; returns self.'''
        return self

    def transform(self, X, y=None):
        '''Return an encoded copy of X according to the train flag.'''
        X_ = X.copy()
        if self.train == 1:
            X_ = self.encoder.fit_transform(X_, self.target)
        elif self.train == 0:
            # Encode unseen data from X alone. Passing the training target here
            # ties the call to the training labels (and to their length),
            # which is neither available nor meaningful at prediction time.
            X_ = self.encoder.transform(X_)
        return X_

In [15]:
# columns treated as categorical, including the raw 'hour' timestamp
cat_cols = [
    'hour',
    'C1', 'banner_pos',
    'site_id', 'site_domain', 'site_category',
    'app_id', 'app_domain', 'app_category',
    'device_id', 'device_ip', 'device_model',
    'device_type', 'device_conn_type',
    'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21',
]

In [16]:
# target encoder applied to every listed column ('hour' is not among them)
encoder = TargetEncoder(
    cols=[
        'C1', 'banner_pos',
        'site_id', 'site_domain', 'site_category',
        'app_id', 'app_domain', 'app_category',
        'device_id', 'device_ip', 'device_model',
        'device_type', 'device_conn_type',
        'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21',
    ]
)



In [17]:
# split the training sample into predictors and target;
# the test frame only has its 'id' column removed
X_train = train_df.drop(columns=['click', 'id'])
y_train = train_df['click'].values
X_test = test_df.drop(columns='id')

In [18]:
# columns that are selected from the encoded frame and passed to the scaler
num_cols = [
    'C1', 'banner_pos',
    'site_id', 'site_domain', 'site_category',
    'app_id', 'app_domain', 'app_category',
    'device_id', 'device_ip', 'device_model',
    'device_type', 'device_conn_type',
    'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21',
]

In [19]:
# first pipeline: derive the date columns, then target-encode.
# train = 1 puts EncoderTransformer into its fit-and-encode mode.
train = 1
cat_pipeline = Pipeline([
    ('date_maker', DateTransformer(data_col='hour')),
    ('en_coder', EncoderTransformer(encoder=encoder, target=y_train, train=train)),
])

In [20]:
# second pipeline: standardise the encoded columns.
# (the two pipelines could alternatively be combined with ColumnTransformer)
num_pipeline = Pipeline([('std_scaler', StandardScaler())])

In [21]:
# fit and transform the training data: run the encoding pipeline first,
# keep only num_cols from its output, then fit and apply the scaler
X_train_tr = num_pipeline.fit_transform(
    cat_pipeline.fit_transform(X_train)[num_cols]
)

In [22]:
# Logistic regression fits the binary target. The classes are imbalanced,
# so a grid search is run first to choose the class weights.
lr = LogisticRegression(solver='newton-cg')

# candidate weights for class 0; class 1 receives the complement (1 - weight)
weights = np.linspace(0.0, 0.99, 6)

# one class_weight dictionary per candidate weight
param_grid = {
    'class_weight': [{0: w, 1: 1.0 - w} for w in weights]
}

# F1-scored search over stratified folds (StratifiedKFold defaults), all cores
gridsearch = GridSearchCV(
    estimator=lr,
    param_grid=param_grid,
    scoring='f1',
    cv=StratifiedKFold(),
    n_jobs=-1,
    verbose=2,
)
gridsearch = gridsearch.fit(X_train_tr, y_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


24 fits failed out of a total of 30.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
12 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\x2939\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\x2939\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\linear_model\_logistic.py", line 1233, in fit
    fold_coefs_ = Parallel(n_jobs=self.n_jobs, verbose=self.verbose, prefer=prefer)(
  File "c:\Users\x2939\AppData\Local\Programs\Python\Python39\lib\site-packages\joblib\parallel.py", line 1085, in __call__
    if self.dispatch_one_batch(iterator):
  

In [24]:
# fit the final sklearn logistic regression with fixed class weights
# NOTE(review): the weights are hard-coded here rather than read from
# gridsearch.best_params_ - confirm they still match the search result
lr = LogisticRegression(max_iter=5000, class_weight={0: 0.8, 1: 0.2})
lr.fit(X_train_tr, y_train)

In [25]:
# Use the statsmodels Logit regression to better understand our model, in particular
# to see whether any variables are statistically insignificant.
# The 5th variable (site category) seems to be statistically insignificant (P > 0.05).
# We can take that variable out and rerun the process.
logit_model = sm.Logit(y_train, sm.add_constant(X_train_tr))
result = logit_model.fit()

# both summary layouts are kept and printed
stats1, stats2 = result.summary(), result.summary2()
print(stats1)
print(stats2)

Optimization terminated successfully.
         Current function value: 0.360985
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:              2021448
Model:                          Logit   Df Residuals:                  2021426
Method:                           MLE   Df Model:                           21
Date:                Mon, 19 Sep 2022   Pseudo R-squ.:                  0.2084
Time:                        12:59:50   Log-Likelihood:            -7.2971e+05
converged:                       True   LL-Null:                   -9.2178e+05
Covariance Type:            nonrobust   LLR p-value:                     0.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -2.0094      0.003   -777.599      0.000      -2.014      -2.004
x1             0.0330      0.

In [26]:
# Checking for multicollinearity. This helps find correlated groups of variables.
# A high VIF (>10) signifies issues with collinearity.
# A few variables show a high VIF, meaning we need to go back and do a proper feature selection process.
# For each column of the design matrix, calculate the VIF and save it in a dataframe.
x_temp = sm.add_constant(X_train_tr)

# Label the rows from num_cols, the list that selected the columns of X_train_tr.
# Slicing the column names of X_train by position only matches as long as both
# orderings happen to agree, and would mislabel silently once they diverge.
hold = ['const'] + list(num_cols)

vif = pd.DataFrame({
    "VIF Factor": [
        variance_inflation_factor(x_temp, col_idx)
        for col_idx in range(x_temp.shape[1])
    ],
    "features": hold,
})
vif.round(1)

Unnamed: 0,VIF Factor,features
0,1.0,const
1,12.2,C1
2,1.4,banner_pos
3,7.6,site_id
4,7.4,site_domain
5,2.0,site_category
6,2.6,app_id
7,2.9,app_domain
8,2.6,app_category
9,1.1,device_id
