In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
datapath = "data/data_no_outlier/"
file_Xtrain = "X_train_no_outlier.csv"
file_ytrain = "y_train_no_outlier.csv"
file_Xtest  = "X_test_no_outlier.csv"
file_ytest  = "y_test_no_outlier.csv"

In [3]:
dfx0  = pd.read_csv(datapath+file_Xtrain)
dfx0_ = pd.read_csv(datapath+file_Xtest)
print(dfx0.shape, dfx0_.shape)

(471866, 95) (232412, 95)


  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
dfx0 = pd.concat([dfx0, dfx0_], axis=0, ignore_index=True)
dfx0.shape

(704278, 95)

In [5]:
dfy0  = pd.read_csv(datapath+file_ytrain, names=["fullVisitorId", "transactionRevenue"])
dfy0_ = pd.read_csv(datapath+file_ytest,  names=["fullVisitorId", "transactionRevenue"])
print(dfy0.shape, dfy0_.shape)

(471866, 2) (232412, 2)


In [6]:
dfy0 = pd.concat([dfy0, dfy0_], axis=0, ignore_index=True)
dfy0.shape

(704278, 2)

In [7]:
df_total = dfx0.copy()
df_total["transactionRevenue"] = dfy0["transactionRevenue"]
df_total["trans_label"] = (df_total["transactionRevenue"]>0)*1
df_total.shape

(704278, 97)

In [8]:
df_total.head()

Unnamed: 0,fullVisitorId,avg_hits,avg_pageviews,bounces,newVisits,fake_traffic,ctadwords,channelGrouping_Affiliates,channelGrouping_Direct,channelGrouping_Display,...,visit_hour_17,visit_hour_18,visit_hour_19,visit_hour_20,visit_hour_21,visit_hour_22,visit_hour_23,visitNumber,transactionRevenue,trans_label
0,6144139465131417809,3.0,3.0,0.0,1.0,0,0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0.0,0
1,539966107187723079,2.0,2.0,0.0,0.0,0,0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0.0,0
2,5943783354516112054,1.0,1.0,1.0,1.0,0,0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0.0,0
3,181201331571930526,3.0,2.0,0.0,1.0,0,0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1,0.0,0
4,818252312438640620,1.0,1.0,1.0,1.0,0,0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0.0,0


### Preprocess for Imbalanced Test Dataset

In [9]:
print(dfx0_.shape, dfy0_.shape)

(232412, 95) (232412, 2)


## 1) Resample the Imbalanced Data

In [10]:
from imblearn.over_sampling import SMOTE

Using TensorFlow backend.


In [11]:
df1 = df_total.drop("fullVisitorId", axis=1)
df1.shape

(704278, 96)

In [12]:
# Target variable is "trans_label"

dfx1 = df1.drop("trans_label", axis=1)
dfy1 = pd.DataFrame({"trans_label" : df1["trans_label"]})
print(dfx1.shape, dfy1.shape)

(704278, 95) (704278, 1)


In [13]:
dfy1["trans_label"].value_counts()

0    694333
1      9945
Name: trans_label, dtype: int64

In [14]:
# Resample the dataset

smote = SMOTE()
dfx1_resampled, dfy1_resampled = smote.fit_resample(dfx1, dfy1)
print(dfx1_resampled.shape, dfy1_resampled.shape)

(1388666, 95) (1388666, 1)


In [15]:
dfy1_resampled["trans_label"].value_counts()

1    694333
0    694333
Name: trans_label, dtype: int64

## 2) Predictor Variables Standardization

In [16]:
from sklearn.preprocessing import StandardScaler

In [17]:
def standardize_feature(series):
    '''
    Use StandardScaler to standardizing a single feature
    ----------
    Parameters
    series: A Series
    ----------
    Returns
    scaled_array: A standardized numpy.array
    '''
    arr = np.reshape(series.tolist(), (-1,1))
    stscaler = StandardScaler().fit(arr)
    scaled_array = stscaler.transform(arr)
    return scaled_array

In [18]:
def standardize_dataframe(df):
    '''
    Use StandardScaler to standardizing a dataframe
    ----------
    Parameters
    series: A dataframe
    ----------
    Returns
    scaled_array: A standardized numpy.array
    '''
    stscaler = StandardScaler().fit(df)
    scaled_array = stscaler.transform(df)
    return scaled_array

In [19]:
# Standardize the dataframe

dfx2 = standardize_dataframe(dfx1_resampled)
dfx2.shape

(1388666, 95)

In [20]:
# Delete "transactionRevenue" from Predictor variables

dfx2 = pd.DataFrame(dfx2, columns=dfx1_resampled.columns)
dfx2 = dfx2.drop("transactionRevenue", axis=1)
dfx2.shape

(1388666, 94)

### Preprocess for Imbalanced Test Dataset

In [21]:
dfy2_ = dfy0_.copy()
dfy2_["trans_label"] = (dfy2_["transactionRevenue"]>0)*1
dfy2_ = dfy2_.drop(["fullVisitorId", "transactionRevenue"], axis=1)
print(dfy2_.shape)

(232412, 1)


In [22]:
dfx2_ = standardize_dataframe(dfx0_)
dfx2_ = pd.DataFrame(dfx2_, columns=dfx0_.columns)
dfx2_ = dfx2_.drop("fullVisitorId", axis=1)
print(dfx2_.shape)

(232412, 94)


## 3) Train & Test Split

In [23]:
from sklearn.model_selection import train_test_split

In [24]:
X_train, X_test, y_train, y_test = \
train_test_split(dfx2, dfy1_resampled, test_size=0.1, random_state=42)

print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(1249799, 94) (1249799, 1) (138867, 94) (138867, 1)


In [25]:
y_train["trans_label"].sum()

624510

## 4) Feature Selection

In [26]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics  import accuracy_score, classification_report

In [27]:
# 3 features by their importance

features_selected = ['avg_pageviews',
                     'subContinent_Northern America',
                     'avg_hits']

In [28]:
# Select Features 

dfx4 = X_train[features_selected]
dfx4.shape

(1249799, 3)

### Preprocess for Balanced Test Dataset

In [29]:
dftx4 = X_test[features_selected]
dftx4.shape

(138867, 3)

### Preprocess for Imbalanced Test Dataset

In [30]:
dfx4_ = dfx2_[features_selected]
dfx4_.shape

(232412, 3)

## 5) Feature Extraction: PCA

In [31]:
# NOT using PCA
X_train_pca = dfx4
print(X_train_pca.shape)

(1249799, 3)


### Preprocess for Balanced Test Dataset

In [32]:
X_test_pca = dftx4
print(X_test_pca.shape)

(138867, 3)


### Preprocess for Imbalanced Test Dataset

In [33]:
X_ibtest = dfx4_[features_selected]
y_ibtest = dfy2_
print(X_ibtest.shape, y_ibtest.shape)

(232412, 3) (232412, 1)


## 6) Modeling: Classification for Transaction or Non-transaction

In [34]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree         import DecisionTreeClassifier
from sklearn.svm          import SVC
from sklearn.ensemble     import RandomForestRegressor
from sklearn.metrics      import mean_squared_error, r2_score
from sklearn.metrics      import accuracy_score, classification_report
#from sklearn.metrics      import accuracy_score, classification_report, f1_score, precision_score, recall_score
import pickle

In [35]:
# 6.1) Run Logistic Regression

s1_lr = LogisticRegression()
s1_lr.fit(X_train_pca, y_train)

  y = column_or_1d(y, warn=True)


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [36]:
# Save and load the fitted model
modelfile = 's1_lr.sav'
pickle.dump(s1_lr, open(modelfile, 'wb'))
#s1_lr = pickle.load(open(modelfile, 'rb'))

In [37]:
# model evaluation
labels = [0, 1]

y_pred_train = s1_lr.predict(X_train_pca)
accuracy = accuracy_score(y_train, y_pred_train)
classrpt = classification_report(y_train, y_pred_train, labels=labels)
print(f"Train Accuracy   : {accuracy}")
print(classrpt)
print("==============================")

y_pred_test = s1_lr.predict(X_test_pca)
accuracy = accuracy_score(y_test, y_pred_test)
classrpt = classification_report(y_test, y_pred_test, labels=labels)
print(f"Test  Accuracy   : {accuracy}")
print(classrpt)

Train Accuracy   : 0.9530100440150776
              precision    recall  f1-score   support

           0       0.96      0.94      0.95    625289
           1       0.94      0.96      0.95    624510

    accuracy                           0.95   1249799
   macro avg       0.95      0.95      0.95   1249799
weighted avg       0.95      0.95      0.95   1249799

Test  Accuracy   : 0.9535238753627572
              precision    recall  f1-score   support

           0       0.96      0.94      0.95     69044
           1       0.94      0.96      0.95     69823

    accuracy                           0.95    138867
   macro avg       0.95      0.95      0.95    138867
weighted avg       0.95      0.95      0.95    138867



In [38]:
# Imbalanced Data Test

y_pred_test = s1_lr.predict(X_ibtest)
accuracy = accuracy_score(y_ibtest, y_pred_test)
classrpt = classification_report(y_ibtest, y_pred_test, labels=labels)
print(f"Test  Accuracy   : {accuracy}")
print(classrpt)

Test  Accuracy   : 0.5617481025076158
              precision    recall  f1-score   support

           0       1.00      0.56      0.71    229180
           1       0.03      1.00      0.06      3232

    accuracy                           0.56    232412
   macro avg       0.52      0.78      0.39    232412
weighted avg       0.99      0.56      0.71    232412



In [39]:
y_pred_test.sum()

105083