In [1]:
import os
import sys
import json
import time
import pandas as pd
import numpy as np
from scipy.sparse import coo_matrix, vstack
import scipy
import random
from sklearn.utils import shuffle
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, make_scorer, precision_recall_fscore_support
from sklearn.model_selection import GridSearchCV
from hypopt import GridSearch
from hypopt.model_selection import GridSearch

import warnings
warnings.filterwarnings('ignore')

## get vocab and numer_stats files

In [2]:
with open('../data/processed/mm-cpc-generator/train/categorical-vocab.json') as f:
    cat_vocab = json.load(f)
f.close()

with open("../data/processed/mm-cpc-generator/train/numerical-stats.json") as f:
    numer_stats = json.load(f)
f.close()

In [3]:
def get_vocab_key_len_mapper(cat_vocab):
    idx_mapper = {}
    for key, value in cat_vocab.items():
        temp_idx = {}
        for i, val in enumerate(value):
            temp_idx[val] = i
        idx_mapper[key] = temp_idx
    return idx_mapper

In [4]:
def get_sparse_array(cat_vocab, numer_stats, data_df, data_type="csv"):
    if data_type == "csv":
        data_df = pd.read_csv(data_df)
    idx_mapper = get_vocab_key_len_mapper(cat_vocab)
    count = 0
    data_df = data_df.drop(["conversion_target"], axis=1)
    for col in numer_stats:
        if numer_stats[col]['std'] == 0:
            continue
        data_df[col] = (data_df[col] - numer_stats[col]['mean'])/numer_stats[col]['std']
    df_cols = list(data_df.columns)
    for idx, row in data_df.iterrows():
        row_list = []
        for col in df_cols:
            if col in idx_mapper:
                temp = [0]*(len(idx_mapper[col]) + 1)
                if row[col] in idx_mapper[col]:
                    temp[idx_mapper[col][row[col]]] = 1
                else:
                    temp[-1] = 1
            else:
                temp = [row[col]]
            row_list += temp
        if idx == 0:
            pos_arr = coo_matrix(row_list)
        else:
            temp_sparse = coo_matrix(row_list)
            pos_arr = vstack([pos_arr, temp_sparse])
        count += 1
        if count%10000 == 0:
            print("finished ", count, "samples")
    return pos_arr

## create sparse array for positive training labels

In [None]:
dir_path = "../data/processed/mm-cpc-generator/train/"
files = os.listdir(dir_path)
flag = True
for i,file in enumerate(files):
    if "positive" in file:
        filepath = dir_path + file
        if flag:
            pos_sparse_arr = get_sparse_array(cat_vocab, numer_stats,filepath)
            flag = False
        else:
            temp_arr = get_sparse_array(cat_vocab, numer_stats,filepath)
            pos_sparse_arr = vstack([pos_sparse_arr, temp_arr])

scipy.sparse.save_npz('../data/intermediate/pos_sparse_arr',pos_sparse_arr)

## Create sparse array for negative training labels

In [None]:
def getnegativearray(cat_vocab, numer_stats, dir_path, count):
    files = os.listdir(dir_path)
    flag = True
    counter = 0
    files = [d for d in files if "negative" in d]
    random.shuffle(files)
    for i,file in enumerate(files):
        if counter == count:
            break
        filepath = dir_path + file
        if flag:
            sparse_arr = get_sparse_array(cat_vocab, numer_stats, filepath)
            flag = False
        else:
            temp_arr = get_sparse_array(cat_vocab, numer_stats, filepath)
            sparse_arr = vstack([sparse_arr, temp_arr])
        counter += 1
    return sparse_arr

In [None]:
dir_path = "../data/processed/mm-cpc-generator/train/"
neg_sparse_arr = getnegativearray(cat_vocab, numer_stats, dir_path, count=14)
scipy.sparse.save_npz('../data/intermediate/neg_sparse_arr',neg_sparse_arr)

## Prepare Validation data

In [None]:
filepath = "../data/processed/mm-cpc-generator/validation/part-00000-f7421232-0a5f-43a1-a30d-359dd1e1b618-c000.csv"
valid_df = pd.read_csv(filepath)
y_valid = valid_df.conversion_target.values
X_valid = get_sparse_array(cat_vocab, numer_stats, valid_df, data_type="pandas")
scipy.sparse.save_npz('../data/intermediate/valid_sparse_arr', X_valid)

## Prepare Test data

In [None]:
filepath = "../data/processed/mm-cpc-generator/test/part-00000-66d695ad-776c-44b9-8feb-9f3906f297a0-c000.csv"
test_df = pd.read_csv(filepath)
y_test = test_df.conversion_target.values
X_test = get_sparse_array(cat_vocab, numer_stats, test_df, data_type="pandas")
scipy.sparse.save_npz('../data/intermediate/test_sparse_arr',X_test)

## Load training data

In [3]:
X_pos = scipy.sparse.load_npz('../data/intermediate/pos_sparse_arr.npz')
X_neg = scipy.sparse.load_npz('../data/intermediate/full_neg_sparse_arr.npz')
X_train = vstack([X_pos, X_neg])
y_pos = [1]*X_pos.shape[0]
y_neg = [0]*X_neg.shape[0]
y_train = y_pos + y_neg
y_train = np.array(y_train)
X_train, y_train = shuffle(X_train, y_train)

## Load Validation data

In [4]:
filepath = "../data/processed/mm-cpc-generator/validation/part-00000-f7421232-0a5f-43a1-a30d-359dd1e1b618-c000.csv"
valid_df = pd.read_csv(filepath)
y_valid = valid_df.conversion_target.values
X_valid = scipy.sparse.load_npz('../data/intermediate/valid_sparse_arr.npz')

## Load Test data

In [5]:
filepath = "../data/processed/mm-cpc-generator/test/part-00000-66d695ad-776c-44b9-8feb-9f3906f297a0-c000.csv"
test_df = pd.read_csv(filepath)
y_test = test_df.conversion_target.values
X_test = scipy.sparse.load_npz('../data/intermediate/test_sparse_arr.npz')

## Logistic Regression model

In [5]:
clf = LogisticRegression()
clf.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

## ROC of the model

In [6]:
y_train_pred = clf.predict_proba(X_train)
y_valid_pred = clf.predict_proba(X_valid)
y_test_pred = clf.predict_proba(X_test)
train_auc_score = roc_auc_score(y_train, y_train_pred.T[1])
valid_auc_score = roc_auc_score(y_valid, y_valid_pred.T[0])
test_auc_score = roc_auc_score(y_test, y_test_pred.T[0])
print("training roc auc score is: ", train_auc_score)
print("validation roc auc score is: ", valid_auc_score)
print("test roc auc score is: ", test_auc_score)

training roc auc score is:  1.0
validation roc auc score is:  0.8456836796445666
test roc auc score is:  0.9349833819723681


## Precision, Recall and F1 score 

In [7]:
y_test_dis = clf.predict(X_test)
precision_recall_fscore_support(y_test, y_test_dis)

(array([0.88689899, 0.08928571]),
 array([9.99797873e-01, 1.55400155e-04]),
 array([9.39970526e-01, 3.10260308e-04]),
 array([252317,  32175]))

## GridSearch

In [6]:
param_grid = [{'C':np.linspace(0.001, 100, num=30),'penalty':['l2']}]
gs = GridSearch(model=LogisticRegression())
scorer = make_scorer(roc_auc_score)
gs.fit(X_train, y_train, param_grid, X_valid, y_valid, scoring=scorer)

Comparing 30 parameter setting(s) using 30 CPU thread(s) ( 1 job(s) per thread ).


LogisticRegression(C=0.001, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=0, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [7]:
y_train_pred = gs.predict_proba(X_train)
y_valid_pred = gs.predict_proba(X_valid)
y_test_pred = gs.predict_proba(X_test)
train_auc_score = roc_auc_score(y_train, y_train_pred.T[1])
valid_auc_score = roc_auc_score(y_valid, y_valid_pred.T[0])
test_auc_score = roc_auc_score(y_test, y_test_pred.T[0])
print("training roc auc score is: ", train_auc_score)
print("validation roc auc score is: ", valid_auc_score)
print("test roc auc score is: ", test_auc_score)

training roc auc score is:  1.0
validation roc auc score is:  0.8227894296169733
test roc auc score is:  0.9034792923797628


In [8]:
y_test_dis = gs.predict(X_test)
precision_recall_fscore_support(y_test, y_test_dis)

(array([0.88689859, 0.0877193 ]),
 array([9.99793910e-01, 1.55400155e-04]),
 array([9.39968552e-01, 3.10250683e-04]),
 array([252317,  32175]))

## Merging Validation and Training Data

In [8]:
X_train = vstack([X_train, X_valid])
y_train = np.concatenate([y_train, y_valid])

### Logistic Regression

In [9]:
clf = LogisticRegression()
clf.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

### Accuracy of the model

In [10]:
y_train_pred = clf.predict_proba(X_train)
y_valid_pred = clf.predict_proba(X_valid)
y_test_pred = clf.predict_proba(X_test)
train_auc_score = roc_auc_score(y_train, y_train_pred.T[1])
valid_auc_score = roc_auc_score(y_valid, y_valid_pred.T[1])
test_auc_score = roc_auc_score(y_test, y_test_pred.T[1])
print("training roc auc score is: ", train_auc_score)
print("validation roc auc score is: ", valid_auc_score)
print("test roc auc score is: ", test_auc_score)

training roc auc score is:  0.9993892079471345
validation roc auc score is:  0.9870191685449049
test roc auc score is:  0.9402191538394806


### Precision, Recall and F1 score

In [11]:
y_test_dis = clf.predict(X_test)
precision_recall_fscore_support(y_test, y_test_dis)

(array([0.88691596, 0.22580645]),
 array([9.99904882e-01, 2.17560218e-04]),
 array([9.40027348e-01, 4.34701608e-04]),
 array([252317,  32175]))