# Neural Network Classifier

In [2]:
from keras import models
from keras import layers
from keras import optimizers
from keras import losses
from keras import metrics
import pandas as pd

from sklearn.model_selection import train_test_split

### Testing Dataset Creation

In [58]:
train_df_full = pd.read_csv("../data_format1/full_data/train_format1.csv")
train_df = pd.read_csv("../data_format1/use_data/train_format1[161-320].csv")
#same number of samples when merging with full or subset dataset
merged = pd.read_csv("../data_format1/use_data/merged_dataset.csv")

#only pick those in merged whose ["user_id", "seller_id"] match with ["user_id", "merchant_id"] in the train_format1 document
training_key = train_df.loc[:, ["user_id", "merchant_id", "label"]] 
training_set = merged.merge(
        training_key, 
        how="inner", 
        left_on=["user_id", "seller_id"], 
        right_on=["user_id", "merchant_id"]
)

In [59]:
training_set.shape

(385, 11)

In [60]:
training_set.head()

Unnamed: 0,user_id,item_id,cat_id,seller_id,brand_id,time_stamp,action_type,age_range,gender,merchant_id,label
0,379824,198,656,145,3462.0,1111,0,5.0,1.0,145,0
1,379824,198,656,145,3462.0,1111,0,5.0,1.0,145,0
2,379824,198,656,145,3462.0,1111,2,5.0,1.0,145,0
3,379824,198,656,145,3462.0,1110,0,5.0,1.0,145,0
4,379824,198,656,145,3462.0,1110,0,5.0,1.0,145,0


In [78]:
train, test = train_test_split(training_set, test_size=0.2)

In [79]:
model = models.Sequential()
model.add(layers.Dense(10, activation='relu', input_shape=(train.shape[1]-1,) ))        #label kept getting counted as a feature
model.add(layers.Dense(5, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))

model.compile(optimizer=optimizers.Adam(learning_rate=0.001),loss='binary_crossentropy',metrics=['acc'])

In [80]:
history = model.fit(train.drop('label', axis=1), train['label'],
                    epochs=10,
                    validation_data= (test.drop('label',axis=1), test['label'])
                    )

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


### Baseline

In [3]:
# FULL DATASET
train_complete = pd.read_csv('../data_format1/use_data/merged_df.csv')
train_complete.shape

(385, 182)

In [5]:
train_complete, val_complete = train_test_split(train_complete, test_size=0.2)

In [4]:
train_50 = pd.read_csv("df_50.csv")
train_50.shape

(385, 45)

In [7]:
train_50_complete, val_50_complete = train_test_split(train_50, test_size=0.2)

In [None]:
# DIFFERING AMOUNTS OF PCA

In [20]:
model = models.Sequential()
model.add(layers.Dense(10, activation='relu', input_shape=(train_complete.shape[1]-1,) ))        #label kept getting counted as a feature
model.add(layers.Dense(5, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))

model.compile(optimizer=optimizers.Adam(learning_rate=0.001),loss='binary_crossentropy',metrics=['acc'])

In [19]:
history = model.fit(train_complete.drop('label', axis=1), train_complete['label'],
                    epochs=10,
                    validation_data= (val_complete.drop('label',axis=1), val_complete['label'])
                    )

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [26]:
model = models.Sequential()
model.add(layers.Dense(10, activation='relu', input_shape=(train_50_complete.shape[1]-1,) ))        #label kept getting counted as a feature
model.add(layers.Dense(5, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))

model.compile(optimizer=optimizers.Adam(learning_rate=0.001),loss='binary_crossentropy',metrics=['acc'])

In [27]:
history = model.fit(train_50_complete.drop('label', axis=1), train_50_complete['label'],
                    epochs=10,
                    validation_data= (val_50_complete.drop('label',axis=1), val_50_complete['label'])
                    )

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [51]:
from sklearn.decomposition import PCA

x = train_50.copy()
y = train_50['label']

In [52]:
pca = PCA(n_components=5)
principalComponents = pca.fit_transform(x)

In [53]:
principalDF = pd.DataFrame(data = principalComponents, columns = ['1', '2', '3', '4', '5'])
finalDF = pd.concat([principalDF, y], axis = 1)

In [54]:
finalDF.head()

Unnamed: 0,1,2,3,4,5,label
0,175507.95037,2535.847367,-13.409704,-52.627332,23.207866,0
1,175507.95037,2535.847367,-13.409704,-52.627332,23.207866,0
2,175507.95037,2535.847367,-13.409704,-52.627332,23.207866,0
3,175507.95027,2535.844857,-12.657206,-52.920406,23.195732,0
4,175507.95027,2535.844857,-12.657206,-52.920406,23.195732,0


In [55]:
train_final, val_final = train_test_split(finalDF, test_size=0.2)

In [56]:
model = models.Sequential()
model.add(layers.Dense(10, activation='relu', input_shape=(finalDF.shape[1]-1,) ))        #label kept getting counted as a feature
model.add(layers.Dense(5, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))

model.compile(optimizer=optimizers.Adam(learning_rate=0.001),loss='binary_crossentropy',metrics=['acc'])

In [57]:
history = model.fit(train_final.drop('label', axis=1), train_final['label'],
                    epochs=10,
                    validation_data= (val_final.drop('label',axis=1), val_final['label'])
                    )

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [46]:
model.evaluate(test.drop(['label'], axis=1), test['label'],)



[73.73917388916016, 0.9350649118423462]

super high loss?