In [1]:
import pandas as pd
import numpy as np
import shap
import sklearn
import seaborn as sn
import matplotlib.pyplot as plt
from numpy.linalg import eig

  from .autonotebook import tqdm as notebook_tqdm


# Load dataset

In [4]:
df = pd.read_csv("../data_format1/full_data/train_format1.csv")

In [6]:
df

Unnamed: 0,user_id,merchant_id,label
0,34176,3906,0
1,34176,121,0
2,34176,4356,1
3,34176,2217,0
4,230784,4818,0
...,...,...,...
260859,359807,4325,0
260860,294527,3971,0
260861,294527,152,0
260862,294527,2537,0


# Processing

## NOTE: Recent runs are just attempts to verify the use of the sklearn library.

1. No need to perform train test split, load train and test from separate CSV files.
2. Y = label, X = everything else

#

In [19]:
from sklearn.model_selection import train_test_split

In [20]:
train, test = train_test_split(df, test_size=0.2)
print(train)

        user_id  merchant_id  label
177145    15242         3835      1
124319   339694         2738      0
67725    417351         1892      0
181905    88472         3826      0
15448    330669         1659      0
...         ...          ...    ...
105128   149941         2769      0
169850   356980         2592      1
85180    238971         4659      0
16841    214449         4273      0
131925   188677         4976      0

[208691 rows x 3 columns]


In [21]:
X_train = train.drop(['label'], axis=1)
X_train


Unnamed: 0,user_id,merchant_id
177145,15242,3835
124319,339694,2738
67725,417351,1892
181905,88472,3826
15448,330669,1659
...,...,...
105128,149941,2769
169850,356980,2592
85180,238971,4659
16841,214449,4273


In [22]:
y_train = train['label']
y_train

177145    1
124319    0
67725     0
181905    0
15448     0
         ..
105128    0
169850    1
85180     0
16841     0
131925    0
Name: label, Length: 208691, dtype: int64

# Model Instantiation and Training

In [23]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(X_train, y_train)

# Model Testing

In [24]:
X_test = test.drop(['label'], axis=1)
y_test = test['label']

In [25]:
y_pred = gnb.predict(X_test)
print("Number of mislabeled points out of a total %d points : %d" % (X_test.shape[0], (y_test != y_pred).sum()))

Number of mislabeled points out of a total 52173 points : 3179


# Model Evaluation

In [15]:
test_df = pd.read_csv("../data_format1/full_data/test_format1.csv")

In [17]:
X_test = test_df.drop(['prob'], axis=1)
X_test


Unnamed: 0,user_id,merchant_id
0,163968,4605
1,360576,1581
2,98688,1964
3,98688,3645
4,295296,3361
...,...,...
261472,228479,3111
261473,97919,2341
261474,97919,3971
261475,32639,3536


In [18]:
y_test = test_df['prob']
y_test

0        NaN
1        NaN
2        NaN
3        NaN
4        NaN
          ..
261472   NaN
261473   NaN
261474   NaN
261475   NaN
261476   NaN
Name: prob, Length: 261477, dtype: float64

In [14]:
y_pred = gnb.predict(X_train)
print("Number of mislabeled points out of a total %d points : %d" % (X_train.shape[0], (y_train != y_pred).sum()))

Number of mislabeled points out of a total 260864 points : 15952


## Shenanigans Below

In [29]:
train_df_full = pd.read_csv("../data_format1/full_data/train_format1.csv")
train_df = pd.read_csv("../data_format1/use_data/train_format1[161-320].csv")
#same number of samples when merging with full or subset dataset
merged = pd.read_csv("../data_format1/use_data/merged_dataset.csv")

#only pick those in merged whose ["user_id", "seller_id"] match with ["user_id", "merchant_id"] in the train_format1 document
training_key = train_df.loc[:, ["user_id", "merchant_id", "label"]] 
training_set = merged.merge(
        training_key, 
        how="inner", 
        left_on=["user_id", "seller_id"], 
        right_on=["user_id", "merchant_id"]
)

In [30]:
training_set

Unnamed: 0,user_id,item_id,cat_id,seller_id,brand_id,time_stamp,action_type,age_range,gender,merchant_id,label
0,379824,198,656,145,3462.0,1111,0,5.0,1.0,145,0
1,379824,198,656,145,3462.0,1111,0,5.0,1.0,145,0
2,379824,198,656,145,3462.0,1111,2,5.0,1.0,145,0
3,379824,198,656,145,3462.0,1110,0,5.0,1.0,145,0
4,379824,198,656,145,3462.0,1110,0,5.0,1.0,145,0
...,...,...,...,...,...,...,...,...,...,...,...
380,122632,175,1181,4760,247.0,1109,0,3.0,0.0,4760,0
381,122632,175,1181,4760,247.0,1108,0,3.0,0.0,4760,0
382,122632,175,1181,4760,247.0,1108,0,3.0,0.0,4760,0
383,95362,253,962,3263,626.0,1111,0,0.0,0.0,3263,0


In [31]:
merged_train, merged_test = train_test_split(training_set, test_size=0.2)

In [33]:
Xt_merge = merged_train.drop(['label'], axis=1)
yt_merge = merged_train['label']
print(Xt_merge)


     user_id  item_id  cat_id  seller_id  brand_id  time_stamp  action_type  \
283    32155      186     267       1200    2276.0        1111            0   
287   367079      279     898       3323     683.0        1111            0   
378    36385      219     349       1943    6208.0        1111            2   
236   389062      221     598       4405    4631.0        1111            2   
201   316235      211    1188       2277    1246.0        1110            0   
..       ...      ...     ...        ...       ...         ...          ...   
368   113078      175    1181       4760     247.0        1111            0   
292   246586      186     267       1200    2276.0        1110            0   
68    353962      184     656        145    3462.0        1111            0   
343   378696      175    1181       4760     247.0        1110            0   
281     5534      198     656        145    3462.0        1110            0   

     age_range  gender  merchant_id  
283        4.

In [34]:
print(yt_merge)

283    0
287    0
378    0
236    0
201    0
      ..
368    0
292    0
68     0
343    0
281    0
Name: label, Length: 308, dtype: int64


In [35]:
mergedGNB = GaussianNB()
mergedGNB.fit(Xt_merge, yt_merge)

In [37]:
Xtest_merge = merged_test.drop(['label'], axis=1)
ytest_merge = merged_test['label']

In [42]:
Xtest_merge

Unnamed: 0,user_id,item_id,cat_id,seller_id,brand_id,time_stamp,action_type,age_range,gender,merchant_id
355,218579,175,1181,4760,247.0,1111,0,0.0,1.0,4760
140,185381,279,898,3323,683.0,1111,0,3.0,1.0,3323
214,275260,175,1181,4760,247.0,1111,0,4.0,0.0,4760
60,134532,279,898,3323,683.0,1111,0,0.0,1.0,3323
91,46699,175,1181,4760,247.0,1110,0,4.0,1.0,4760
...,...,...,...,...,...,...,...,...,...,...
216,275260,175,1181,4760,247.0,1111,2,4.0,0.0,4760
150,130186,279,898,3323,683.0,1110,3,0.0,0.0,3323
237,227925,277,1023,4282,7989.0,627,0,3.0,0.0,4282
210,108672,175,1181,4760,247.0,1111,0,0.0,1.0,4760


In [40]:
ytest_merge

355    0
140    0
214    0
60     1
91     0
      ..
216    0
150    1
237    1
210    0
137    0
Name: label, Length: 77, dtype: int64

In [41]:
merged_pred = gnb.predict(Xtest_merge)
print("Number of mislabeled points out of a total %d points : %d" % (Xtest_merge.shape[0], (ytest_merge != merged_pred).sum()))

ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- action_type
- age_range
- brand_id
- cat_id
- gender
- ...
