In [None]:
%load_ext autoreload
%autoreload 2
import numpy as np
import pandas as pd
# import seaborn as sns
import matplotlib.pyplot as plt
from utils.utils import F1_accuracy, thresholding

# Apply the ggplot style sheet to matplotlib figures created from here on.
plt.style.use('ggplot')

import warnings as w
# Suppresses every Python warning for the rest of the session (deprecation notices included).
w.filterwarnings('ignore')

In [None]:
# Read the CSV, discard the identifier columns, swap the strings 'Male'/'Female'
# for 1.0/0.0 wherever they occur, and remove every row that has a missing value.
train = (
    pd.read_csv("../data/test.csv")
    .drop(columns=['id', 'user_id'])
    .replace('Male', 1.)
    .replace('Female', 0.)
    .dropna()
)
# Parse the "month-day hour:minute" strings and keep only the hour component.
train['date'] = pd.to_datetime(train['date'], format="%m-%d %H:%M").dt.hour
train.head()

In [None]:
# plt.figure(figsize=(15,8))
# sns.countplot(x='product',hue='isClick',data=train)

In [None]:
from sklearn.preprocessing import LabelEncoder
from deepctr.models import DeepFM
from deepctr.feature_column import SparseFeat, get_feature_names
from sklearn.model_selection import train_test_split

In [None]:
target = ['isClick']
# `data_all` is an alias of `train` (no copy), so the encoding below also rewrites `train`.
data_all = train
sparse_features = data_all.columns
encoder = LabelEncoder()
# Every column, the target included, is replaced by integer category codes.
# The single encoder is refit per column, so afterwards it only remembers the last one.
for column in sparse_features:
    encoded = encoder.fit_transform(data_all[column])
    data_all[column] = encoded

In [None]:
# Inspection only: position of the target column inside the column index.
sparse_features.get_loc(target[0])

In [None]:
# Remove the target column from the feature list by label. With errors='ignore'
# the call is a no-op when the label is absent, so re-running this cell does not
# raise KeyError.
sparse_features = sparse_features.drop(target, errors='ignore')
sparse_features

In [None]:
# Hold out 20% of the encoded rows as `validate_`; random_state=0 keeps the split reproducible.
train_, validate_ = train_test_split(data_all, test_size=.2, random_state=0)

##### Convert the sparse features into network inputs

In [None]:
# One embedding column per sparse feature. The vocabulary size is derived from the
# full encoded dataset (`data_all`) rather than the training split alone: a category
# code that only occurs in the validation rows would otherwise index past the end of
# the embedding table at prediction time.
fixlen_feature_columns = [
    SparseFeat(feat, vocabulary_size=data_all[feat].max() + 1, embedding_dim=4)
    for feat in sparse_features
]

##### The deep neural network inputs (DNN features) and the factorization machine inputs (linear features) are currently the same
- consider separating sparse and dense features

In [None]:
# The linear (FM) part and the deep part currently share the very same column list.
dnn_feature_columns = linear_feature_columns = fixlen_feature_columns

feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

##### actually training the model

In [None]:
# Model inputs are dicts keyed by feature name, one entry per column,
# built for the training split and for the held-out split.
train_model_input = dict((name, train_[name]) for name in feature_names)
test_model_input = dict((name, validate_[name]) for name in feature_names)

# TODO: dropout necessary for sparse features?
model = DeepFM(linear_feature_columns, dnn_feature_columns, task='binary')
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=['binary_crossentropy'],
)

In [None]:
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

# Write the model to disk only when it is the best one seen so far.
checkpoints = ModelCheckpoint(
    filepath='../model_data/Deep_fm.h5',
    save_best_only=True,
)
# Stop after 5 epochs without a better validation loss and restore the best weights.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
)

In [None]:
# Training settings gathered in one place; 20% of the training inputs is held back
# for validation, which is what the two callbacks monitor.
fit_options = dict(
    batch_size=256,
    epochs=15,
    verbose=2,  # change to 1 for bar information
    validation_split=.2,
    callbacks=[checkpoints, early_stopping],
)
history = model.fit(train_model_input, train_[target].values, **fit_options)

In [None]:
from tensorflow.keras.models import load_model
from deepctr.layers import Linear, DNN, FM, NoMask, _Add, Concat, PredictionLayer

# Name -> class mapping handed to Keras so it can rebuild the DeepCTR layers
# stored in the checkpoint.
deepctr_layers = {
    'Concat': Concat,
    'Linear': Linear,
    'DNN': DNN,
    'NoMask': NoMask,
    'FM': FM,
    '_Add': _Add,
    'PredictionLayer': PredictionLayer,
}
pretrained = load_model('../model_data/Deep_fm.h5', custom_objects=deepctr_layers)
pretrained.summary()

In [None]:
# Ground-truth labels of the held-out split.
true = validate_[target].values
# NOTE(review): this scores the in-memory `model`, not the `pretrained` checkpoint
# loaded from disk in the previous cell — confirm that is intended.
prediction = model.predict(test_model_input)

In [None]:
# Binarise the predictions with a 0.5 cut-off — presumably values above the threshold
# become 1; verify against utils.utils.thresholding.
out = thresholding(prediction, .5)
# Project helper; judging by its name and the unpacking it returns (F1 score, accuracy) — TODO confirm.
F1, acc = F1_accuracy(out, true)

In [None]:
# Print each thresholded prediction next to its ground-truth label
# (one line per validation row).
for row, predicted in enumerate(out):
    print(f"Ground truth: {true[row]}, predicted as: {predicted}")

In [2]:
%load_ext autoreload
%autoreload 2
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE

##### create balanced data
- if **data imbalance is the cause of the problem**, use `imblearn.over_sampling.SMOTE` (Synthetic Minority Over-sampling Technique) to **balance the data better**: it creates new minority samples by interpolation instead of duplicating existing rows
- else might be model problem
- currently negative : positive = 1 : 1
- **observation: before balancing the data, all predicted values after the sigmoid are small; after balancing, the values tend to go to 0.5**

In [None]:
# Read the training CSV, discard the identifier columns, swap the strings
# 'Male'/'Female' for 1.0/0.0 wherever they occur, and drop rows with missing values.
data = (
    pd.read_csv('../data/train.csv')
    .drop(columns=['id', 'user_id'])
    .replace('Male', 1.)
    .replace('Female', 0.)
    .dropna()
)
# Column labels are kept so a DataFrame can be rebuilt after the frame becomes an array.
columns = data.columns
# Parse the "year/month/day hour:minute" strings and keep only the hour component.
data['date'] = pd.to_datetime(data['date'], format="%Y/%m/%d %H:%M").dt.hour

In [None]:
# Seed SMOTE so the synthetic samples are identical from run to run
# (same seed as the other SMOTE instances in this notebook).
sm = SMOTE(random_state=42)
# np.asarray accepts both the DataFrame and an already-converted array, so this
# cell can be re-run after `data` has been rebound to an ndarray.
data = np.asarray(data)
# The last column is treated as the label; every other column is a feature.
labels = data[..., -1]
train = data[..., :-1]
X_res, y_res = sm.fit_resample(train, labels)

In [None]:
# Compare the number of labels before and after the SMOTE resampling.
print(f"Before transform: {labels.shape}, after transform: {y_res.shape}")

In [None]:
# Re-attach the resampled labels as the last column (the position they were sliced
# from) and save the balanced set. This must be built from X_res / y_res:
# `data_all` is not the SMOTE output and is not defined in this session.
balanced = np.hstack((X_res, y_res[:, np.newaxis]))
dataframe = pd.DataFrame(balanced, columns=columns)
dataframe.to_csv('../data/balanced_data_1_1.csv', index=False)

In [None]:
# Seeded SMOTE instance, used by the toy example in the following cell.
sm = SMOTE(random_state=42)

In [None]:
# Toy imbalanced problem: 90 samples of class 0 followed by 10 of class 1,
# each with two uniformly random features.
class_sizes = [90, 10]
test_label = np.repeat([0, 1], class_sizes)
test_sample = np.random.rand(100, 2)
X_res, y_res = sm.fit_resample(test_sample, test_label)

In [3]:
%load_ext autoreload
%autoreload 2
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [32]:
def balance_data(data: pd.DataFrame, random_state: int = 42) -> pd.DataFrame:
    """
    Perform Synthetic Minority Over-Sampling (SMOTE) Technique
    to balance positive and negative data

    Parameters
    ----------
    data: pd.DataFrame
        frame whose LAST column is treated as the class label and whose
        other columns are treated as features; all values must be
        convertible to int32
    random_state: int
        seed used for both SMOTE and the row shuffle, so repeated calls
        on the same input return the same frame

    Returns
    -------
    balanced_train: pd.DataFrame
        hstack of the return values from SMOTE.fit_resample, cast to
        int32 (fractional feature values are truncated), shuffled
        row-wise, and wrapped in a dataframe with the original columns
    """
    data_all = data.values
    train, labels = data_all[..., :-1], data_all[..., -1]
    sm = SMOTE(random_state=random_state)
    X_res, y_res = sm.fit_resample(train, labels)
    # Labels go back in as the last column, matching the input layout.
    combined = np.array(np.hstack((X_res, y_res[:, np.newaxis])), dtype=np.int32)
    # Seeded generator: the shuffle is reproducible and does not touch NumPy's global RNG.
    np.random.default_rng(random_state).shuffle(combined)
    balanced_train = pd.DataFrame(combined, columns=data.columns)
    return balanced_train

In [33]:
# Read the training CSV, discard the identifier columns, swap the strings
# 'Male'/'Female' for 1.0/0.0 wherever they occur, drop rows with missing values,
# then reduce the "year/month/day hour:minute" timestamp to its hour.
data = (
    pd.read_csv('../data/train.csv')
    .drop(columns=['id', 'user_id'])
    .replace('Male', 1.)
    .replace('Female', 0.)
    .dropna()
    .assign(date=lambda frame: pd.to_datetime(frame['date'], format="%Y/%m/%d %H:%M").dt.hour)
)

In [34]:
# Oversample the cleaned training frame with SMOTE via balance_data.
balanced_data = balance_data(data)

In [36]:
# Number of rows whose last column (the label) is non-zero after balancing.
data_value = balanced_data.to_numpy()
data_label = data_value[:, -1]
np.count_nonzero(data_label)

280104