In [1]:
import pandas as pd
import numpy as np
import shap
import sklearn
import seaborn as sn
import matplotlib.pyplot as plt
from numpy.linalg import eig

  from .autonotebook import tqdm as notebook_tqdm


# Load dataset

In [4]:
# Read the labelled training table from its CSV file.
train_csv_path = "../data_format1/full_data/train_format1.csv"
df = pd.read_csv(train_csv_path)

In [6]:
# Show the loaded training frame as this cell's rich output.
df

Unnamed: 0,user_id,merchant_id,label
0,34176,3906,0
1,34176,121,0
2,34176,4356,1
3,34176,2217,0
4,230784,4818,0
...,...,...,...
260859,359807,4325,0
260860,294527,3971,0
260861,294527,152,0
260862,294527,2537,0


# Processing

## NOTE: Recent runs are just attempts to verify the use of the sklearn library.

1. There is no need to perform a train/test split: the train and test sets are loaded from separate CSV files.
2. `Y` is the `label` column; `X` is every other column.

#

In [19]:
from sklearn.model_selection import train_test_split

In [20]:
# Hold out 20% of the labelled rows for validation.
# - random_state pins the shuffle, so the split (and every metric computed
#   from it) is the same after a kernel restart.
# - stratify keeps the share of each label value equal in both partitions;
#   the recorded outputs suggest label 1 is rare, so an unstratified split
#   could noticeably shift the positive rate between train and test.
train, test = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['label'],
)
print(train)

        user_id  merchant_id  label
177145    15242         3835      1
124319   339694         2738      0
67725    417351         1892      0
181905    88472         3826      0
15448    330669         1659      0
...         ...          ...    ...
105128   149941         2769      0
169850   356980         2592      1
85180    238971         4659      0
16841    214449         4273      0
131925   188677         4976      0

[208691 rows x 3 columns]


In [21]:
# Feature matrix for training: every column of the split except the target.
X_train = train.drop(columns=['label'])
X_train


Unnamed: 0,user_id,merchant_id
177145,15242,3835
124319,339694,2738
67725,417351,1892
181905,88472,3826
15448,330669,1659
...,...,...
105128,149941,2769
169850,356980,2592
85180,238971,4659
16841,214449,4273


In [22]:
# Target vector for training: the 'label' column of the split.
y_train = train.loc[:, 'label']
y_train

177145    1
124319    0
67725     0
181905    0
15448     0
         ..
105128    0
169850    1
85180     0
16841     0
131925    0
Name: label, Length: 208691, dtype: int64

# Model Instantiation and Training

In [23]:
from sklearn.naive_bayes import GaussianNB
# Build a Gaussian Naive Bayes classifier with default settings.
gnb = GaussianNB()
# Fit on the training features/labels; left as the last expression so the
# cell displays the estimator returned by fit().
gnb.fit(X_train, y_train)

# Model Testing

In [24]:
# Split the held-out partition into its target column and the remaining
# feature columns.
y_test = test['label']
X_test = test.drop(columns=['label'])

In [25]:
# Predict on the held-out features, then report how many predictions
# disagree with the true labels.
y_pred = gnb.predict(X_test)
n_total = X_test.shape[0]
n_wrong = (y_test != y_pred).sum()
print("Number of mislabeled points out of a total %d points : %d" % (n_total, n_wrong))

Number of mislabeled points out of a total 52173 points : 3179


# Model Evaluation

In [15]:
# Read the separate test table from its CSV file.
test_csv_path = "../data_format1/full_data/test_format1.csv"
test_df = pd.read_csv(test_csv_path)

In [17]:
# Features of the test file: everything except the 'prob' column.
# NOTE: this rebinds X_test, so it no longer refers to the held-out split.
X_test = test_df.drop(columns=['prob'])
X_test


Unnamed: 0,user_id,merchant_id
0,163968,4605
1,360576,1581
2,98688,1964
3,98688,3645
4,295296,3361
...,...,...
261472,228479,3111
261473,97919,2341
261474,97919,3971
261475,32639,3536


In [18]:
# Take the 'prob' column of the test file. The recorded output shows NaN for
# every displayed row, so this is presumably an unfilled placeholder rather
# than ground truth — TODO confirm before using it for evaluation.
y_test = test_df['prob']
y_test

0        NaN
1        NaN
2        NaN
3        NaN
4        NaN
          ..
261472   NaN
261473   NaN
261474   NaN
261475   NaN
261476   NaN
Name: prob, Length: 261477, dtype: float64

In [14]:
# Re-predict the training features and count disagreements with the
# training labels (this is a training-set error, not a held-out estimate).
y_pred = gnb.predict(X_train)
n_total = X_train.shape[0]
n_wrong = (y_train != y_pred).sum()
print("Number of mislabeled points out of a total %d points : %d" % (n_total, n_wrong))

Number of mislabeled points out of a total 260864 points : 15952


## Feature Engineered 

In [2]:

# Load the merged (feature-engineered) train and test tables.
merged_train_path = "../data_format1/use_data/merged_df.csv"
merged_test_path = '../data_format1/test_merged_df.csv'
train_df = pd.read_csv(merged_train_path)
test_df = pd.read_csv(merged_test_path)

In [3]:
# Separate the merged training table into target and feature columns.
y_train = train_df['label']
X_train = train_df.drop(columns=['label'])

In [4]:
from sklearn.naive_bayes import GaussianNB
# Fresh Gaussian Naive Bayes classifier (default settings) for the merged
# feature table; rebinds the name `gnb`.
gnb = GaussianNB()
# Fit on the merged training features/labels; the returned estimator is the
# cell's displayed value.
gnb.fit(X_train, y_train)

In [5]:
# Score the classifier on the same rows it was trained on and report how
# many predictions disagree with the labels.
y_pred = gnb.predict(X_train)
n_total = X_train.shape[0]
n_wrong = (y_train != y_pred).sum()
print("Number of mislabeled points out of a total %d points back on TRAIN SET: %d" % (n_total, n_wrong))

Number of mislabeled points out of a total 385 points back on TRAIN SET: 76


In [6]:
# Split the merged test table into the 'prob' column and the feature columns,
# then ask the classifier for per-class probabilities on those features.
y_test = test_df['prob']
X_test = test_df.drop(columns=['prob'])
y_probs = gnb.predict_proba(X_test)
y_probs

array([[1.79562059e-167, 1.00000000e+000],
       [1.83472348e-167, 1.00000000e+000],
       [1.16843605e-014, 1.00000000e+000],
       ...,
       [2.20107519e-156, 1.00000000e+000],
       [8.17974487e-157, 1.00000000e+000],
       [2.92702080e-183, 1.00000000e+000]])

In [7]:
# Load the "df_50" training table from the notebook's working directory.
# NOTE(review): what the "50" suffix stands for is not documented here — confirm.
train50_df = pd.read_csv('./df_50.csv')

In [8]:
# Separate the df_50 table into target and feature columns.
y_train50 = train50_df['label']
X_train50 = train50_df.drop(columns=['label'])

In [10]:
# Separate Gaussian Naive Bayes classifier for the df_50 table. Relies on
# GaussianNB having been imported by an earlier cell.
gnb50 = GaussianNB()
# Fit on the df_50 features/labels; the returned estimator is the cell's
# displayed value.
gnb50.fit(X_train50, y_train50)

In [11]:
# Score gnb50 on the rows it was trained on and report how many predictions
# disagree with the labels.
y_pred50 = gnb50.predict(X_train50)
n_total50 = X_train50.shape[0]
n_wrong50 = (y_train50 != y_pred50).sum()
print("Number of mislabeled points out of a total %d points back on TRAIN50 SET: %d" % (n_total50, n_wrong50))

Number of mislabeled points out of a total 385 points back on TRAIN50 SET: 77


In [12]:
# Sum of the 'label' column. If labels are 0/1 (assumption — confirm), this
# is the number of positive rows, useful for comparing against the
# mislabeled count reported for this table.
train50_df['label'].sum()

28