### modeling

In [13]:
# import statements
import pandas as pd
import numpy as np
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from imblearn.under_sampling import RandomUnderSampler
import sklearn.metrics as metrics

In [2]:
# Pickled test-set features; every model below calls predict() on this object.
# NOTE(review): unpickling executes arbitrary code - only load files this project produced.
X_test_30_transform = pd.read_pickle(
    "data/X_test_30_transform.pkl"
)

In [3]:
# Pickled training features; consumed by the under-sampler, the SGD chunking
# and the full-data random forest further down.
X_train_30_transform = pd.read_pickle(
    "data/X_train_30_transform.pkl"
)

In [4]:
# Pickled training labels; paired with X_train_30_transform in every fit call.
y_train_30 = pd.read_pickle(
    "data/y_train_30.pkl"
)

In [5]:
# Pickled test labels; the reference passed first to each metric in the results cells.
y_test_30 = pd.read_pickle(
    "data/y_test_30.pkl"
)

### resampling

In [31]:
# Seeded random under-sampler. Per the imbalanced-learn docs, a float
# sampling_strategy is the target minority/majority ratio after resampling
# (only valid for binary targets).
rus = RandomUnderSampler(
    random_state=42,
    sampling_strategy=0.05,
)


In [32]:
# Under-sample the training set; the sampler returns the reduced features and
# their matching labels as a pair.
(
    X_train_transform_resampled,
    y_train_resampled,
) = rus.fit_resample(X_train_30_transform, y_train_30)

In [None]:
# Persist the under-sampled features alongside the other pickles in data/.
# NOTE(review): this cell has no execution count, so the file may not have been written.
X_train_transform_resampled.to_pickle(
    "data/X_train_transform_resampled.pkl"
)

In [None]:
# Persist the under-sampled labels alongside the other pickles in data/.
# NOTE(review): this cell has no execution count, so the file may not have been written.
y_train_resampled.to_pickle(
    "data/y_train_resampled.pkl"
)

### classifying

#### stochastic gradient descent

In [8]:
# Linear classifier with logistic loss, trained incrementally via partial_fit below.
# random_state is pinned to 42 (the value used for the sampler and both forests)
# so the per-epoch shuffling is reproducible on Restart & Run All.
# NOTE(review): the recorded results show this model predicting no positives.
# class_weight="balanced" is not supported by partial_fit, so addressing the
# imbalance would need explicit class weights or resampled training chunks.
SGDC = SGDClassifier(
    loss="log_loss",
    penalty="l2",
    alpha=0.0001,
    warm_start=True,
    random_state=42,
)

In [9]:
# Split the training features into consecutive 50,000-row chunks for incremental
# fitting. .iloc makes the slicing explicitly positional, so chunk boundaries
# are by row position regardless of the index dtype and stay aligned with the
# label chunks built from the same offsets.
list_df = [
    X_train_30_transform.iloc[start:start + 50000]
    for start in range(0, len(X_train_30_transform), 50000)
]


In [10]:
# Split the training labels into consecutive 50,000-row chunks, mirroring the
# feature chunks. .iloc makes the slicing explicitly positional; a bare
# `series[i:j]` is label-based on a float index, which would misalign labels
# and features.
# Assumes y_train_30 is a pandas object (its resampled counterpart is saved
# with .to_pickle) - TODO confirm.
list_y = [
    y_train_30.iloc[start:start + 50000]
    for start in range(0, len(y_train_30), 50000)
]

In [14]:
# Incrementally fit the SGD model, one (features, labels) chunk per call.
# partial_fit is given the complete label set because an individual chunk may
# not contain every class; the set is loop-invariant, so compute it once.
all_classes = np.unique(y_train_30)
# zip() stops at the shorter list, so fail loudly instead of silently
# dropping trailing chunks if the two lists ever diverge.
assert len(list_df) == len(list_y), "feature/label chunk counts differ"
for counter, (df, y) in enumerate(zip(list_df, list_y), start=1):
    SGDC.partial_fit(df, y, classes=all_classes)
    print(f"{counter}. fit done!")

1. fit done!
2. fit done!
3. fit done!
4. fit done!


In [15]:
# Class predictions from the incrementally-trained SGD model on the test features.
y_pred = SGDC.predict(
    X_test_30_transform
)

#### random forest

In [16]:
# Forest fitted on the full (non-resampled) training data; seeded for repeatability.
rfc = RandomForestClassifier(
    n_estimators=250,
    max_depth=40,
    random_state=42,
    n_jobs=-1,  # -1 = use all available processors
)

In [17]:
# Fit on the complete training set; as the cell's last expression, the fitted
# estimator is what the notebook displays.
rfc.fit(X=X_train_30_transform, y=y_train_30)

In [18]:
# Class predictions from the full-data forest on the test features.
rfc_pred = rfc.predict(
    X_test_30_transform
)

#### random forest undersampled

In [33]:
# Second forest with the same hyper-parameters as `rfc`, to be trained on the
# under-sampled data so the two runs differ only in their training set.
rfc_sampled = RandomForestClassifier(
    n_estimators=250,
    max_depth=40,
    random_state=42,
    n_jobs=-1,  # -1 = use all available processors
)

In [34]:
# Fit on the under-sampled training data; the fitted estimator is displayed as
# the cell's output.
rfc_sampled.fit(
    X=X_train_transform_resampled,
    y=y_train_resampled,
)

In [35]:
# Predictions of the under-sampled forest on the original (non-resampled) test features.
rfc_pred_sampled_pred = rfc_sampled.predict(
    X_test_30_transform
)

#### results

In [22]:
# Score the SGD predictions against the test labels.
# NOTE(review): the recorded output shows zero predicted positives (recall 0.0),
# so the high accuracy reflects the class imbalance rather than model quality.
print("stochastic gradient descent")
for label, score_fn in (
    ("accuracy: ", metrics.accuracy_score),
    ("recall: ", metrics.recall_score),
    ("F1: ", metrics.f1_score),
):
    print(label, score_fn(y_test_30, y_pred))
# Trailing expression: the notebook renders the confusion matrix as cell output.
metrics.confusion_matrix(y_test_30, y_pred)

stochastic gradient descent
accuracy:  0.9971992870912596
recall:  0.0
F1:  0.0


array([[54832,     0],
       [  154,     0]])

In [23]:
# Score the full-data forest's predictions against the test labels.
print("random forest")
for label, score_fn in (
    ("accuracy: ", metrics.accuracy_score),
    ("recall: ", metrics.recall_score),
    ("F1: ", metrics.f1_score),
):
    print(label, score_fn(y_test_30, rfc_pred))
# Trailing expression: the notebook renders the confusion matrix as cell output.
metrics.confusion_matrix(y_test_30, rfc_pred)

random forest
accuracy:  0.997635761830284
recall:  0.21428571428571427
F1:  0.336734693877551


array([[54823,     9],
       [  121,    33]])

In [24]:
# Score the under-sampled forest's predictions against the test labels.
# NOTE(review): an identical cell follows with a later execution count and
# different recorded numbers; this cell's output appears to predate the current
# rfc_sampled fit - confirm and drop one of the two.
print("random forest undersampled")
for label, score_fn in (
    ("accuracy: ", metrics.accuracy_score),
    ("recall: ", metrics.recall_score),
    ("F1: ", metrics.f1_score),
):
    print(label, score_fn(y_test_30, rfc_pred_sampled_pred))
# Trailing expression: the notebook renders the confusion matrix as cell output.
metrics.confusion_matrix(y_test_30, rfc_pred_sampled_pred)

random forest undersampled
accuracy:  0.99690830393191
recall:  0.35064935064935066
F1:  0.38848920863309355


array([[54762,    70],
       [  100,    54]])

In [36]:
# Score the under-sampled forest's predictions against the test labels.
print("random forest undersampled")
scorers = (
    ("accuracy: ", metrics.accuracy_score),
    ("recall: ", metrics.recall_score),
    ("F1: ", metrics.f1_score),
)
for label, score_fn in scorers:
    print(label, score_fn(y_test_30, rfc_pred_sampled_pred))
# Trailing expression: the notebook renders the confusion matrix as cell output.
metrics.confusion_matrix(y_test_30, rfc_pred_sampled_pred)

random forest undersampled
accuracy:  0.99325282799258
recall:  0.538961038961039
F1:  0.3091247672253259


array([[54532,   300],
       [   71,    83]])