In [1]:
import h5py
import pickle
import numpy as np
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
import os

import warnings
from sklearn.exceptions import DataConversionWarning, ConvergenceWarning
warnings.filterwarnings(action='ignore', category=DataConversionWarning)
warnings.filterwarnings(action='ignore', category=ConvergenceWarning)

In [None]:
data_path = os.path.join("..", "data")
name_ext = input("Enter the file-name identifier extension")
#eg 30h_15h uses dataset w.r.t 30 user history size and 15 news size.
if len(name_ext) == 0:
    name_ext = "30h_15n"

In [1]:
## Load Train data
h5f = h5py.File(
    os.path.join(data_path,
                 'train_static_neural_shuffle_3057_200_' + name_ext + '.h5'),
    'r')
train = h5f['dataset_1'][:]
h5f.close()

with open(
        os.path.join(
            data_path,
            'static_neural_labels_temp_shuffle_new_' + name_ext + '.pickle'),
        'rb') as handle:
    l = pickle.load(handle)

## Load Test data
h5f = h5py.File(
    os.path.join(data_path,
                 'test_static_neural_shuffle_765_500_' + name_ext + '.h5'),
    'r')
test = h5f['dataset_1'][:]
h5f.close()

with open(
        os.path.join(
            data_path, 'static_neural_labels_temp_test_shuffle_new_' +
            name_ext + '.pickle'), 'rb') as handle:
    l_test = pickle.load(handle)

In [6]:
nsamples, nx, ny = train.shape
training1 = train.reshape((nsamples * nx, ny))

nsamples, nx, ny = l.shape
train_labels = l.reshape((nsamples * nx, ny))

mean = np.nanmean(training1, axis=0)
std = np.nanstd(training1, axis=0)
X_train_scaled = (training1 - mean) / std
X_train_scaled = np.nan_to_num(X_train_scaled)

In [13]:
nsamples, nx, ny = test.shape
training1 = test.reshape((nsamples * nx, ny))

nsamples, nx, ny = l_test.shape
test_labels = l_test.reshape((nsamples * nx, ny))

mean = np.nanmean(training1, axis=0)
std = np.nanstd(training1, axis=0)
X_test_scaled = (training1 - mean) / std
X_test_scaled = np.nan_to_num(X_test_scaled)

  


In [14]:
X_train_scaled.shape

(611400, 8116)

In [15]:
X_test_scaled.shape

(382500, 8116)

In [25]:
train_labels.shape

(611400, 1)

In [26]:
test_labels.shape

(382500, 1)

In [27]:
# Logistic Reg
clf = LogisticRegression(random_state=0).fit(X_train_scaled, train_labels)
pred_labels = clf.predict(X_test_scaled)
print("ACC = {}\n".format(clf.score(X_test_scaled, test_labels)))
print(classification_report(test_labels, pred_labels))
print("AUC = {}\n".format(roc_auc_score(test_labels, pred_labels)))

ACC = 0.9611607843137255

              precision    recall  f1-score   support

           0       0.99      0.97      0.98    373306
           1       0.33      0.61      0.43      9194

    accuracy                           0.96    382500
   macro avg       0.66      0.79      0.70    382500
weighted avg       0.97      0.96      0.97    382500

AUC = 0.7889849080566969



In [28]:
# Decision Trees
clf = tree.DecisionTreeClassifier(random_state=0,
                                  max_depth=10).fit(X_train_scaled,
                                                    train_labels)
pred_labels = clf.predict(X_test_scaled)
print("ACC = {}\n".format(clf.score(X_test_scaled, test_labels)))
print(classification_report(test_labels, pred_labels))
print("AUC = {}\n".format(roc_auc_score(test_labels, pred_labels)))

ACC = 0.9535921568627451

              precision    recall  f1-score   support

           0       0.99      0.96      0.98    373306
           1       0.28      0.59      0.38      9194

    accuracy                           0.95    382500
   macro avg       0.63      0.78      0.68    382500
weighted avg       0.97      0.95      0.96    382500

AUC = 0.7765142786567709



In [30]:
# RandomForest
clf = RandomForestClassifier(n_estimators=50).fit(X_train_scaled, train_labels)
pred_labels = clf.predict(X_test_scaled)
print("ACC = {}\n".format(clf.score(X_test_scaled, test_labels)))
print(classification_report(test_labels, pred_labels))
print("AUC = {}\n".format(roc_auc_score(test_labels, pred_labels)))

ACC = 0.9670640522875817

              precision    recall  f1-score   support

           0       0.98      0.98      0.98    373306
           1       0.33      0.36      0.34      9194

    accuracy                           0.97    382500
   macro avg       0.66      0.67      0.66    382500
weighted avg       0.97      0.97      0.97    382500

AUC = 0.6687882344464029

