In [201]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import resample
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [179]:
#extract processed data
df = pd.read_csv("processed_data.csv")

#caterogerize whether a household is a superuser by gasuge 
df['SuperUser'] = 0
df.loc[df[df['GASUSGE'] >= df.GASUSGE.quantile(0.9)].index, 'SuperUser'] = 1

#remove extreme high correlation varaibles
df.drop(['ANNMILES', 'GASUSGE','STRATUMID','TDAYDATE','FLAG100'], axis=1, inplace=True)

#separate prediction variable and id from dataframe
df_y = df.pop('SuperUser')
id = df.pop("HOUSEID")

#create train and test dataset
x_train, x_test, y_train, y_test = train_test_split(df, df_y, test_size=0.3, random_state=42)


In [180]:
y_train.value_counts()

SuperUser
0    4527
1     489
Name: count, dtype: int64

In [183]:
print(y_train.value_counts())

SuperUser
0    4527
1     489
Name: count, dtype: int64


In [182]:
x = pd.concat([x_train, y_train], axis=1)
non_superuser = x[x.SuperUser == 0]
superuser = x[x.SuperUser == 1]
superuser_upsampled = resample(superuser, replace=True, n_samples = len(non_superuser), random_state=42)
upsampled = pd.concat([non_superuser,superuser_upsampled])
up_y_train = upsampled.SuperUser
up_x_train = upsampled.drop('SuperUser', axis=1)
up_y_train.value_counts()


SuperUser
0    4527
1    4527
Name: count, dtype: int64

In [184]:
non_superuser_downsampled = resample(non_superuser, replace = False, n_samples = len(superuser), random_state=42)
downsampled = pd.concat([non_superuser_downsampled, superuser])
d_y_train = downsampled.SuperUser
d_x_train = downsampled.drop('SuperUser', axis=1)
d_y_train.value_counts()

SuperUser
0    489
1    489
Name: count, dtype: int64

In [185]:
y_test.value_counts()

SuperUser
0    1922
1     228
Name: count, dtype: int64

In [194]:
knn = KNeighborsClassifier(n_neighbors=2)
#knn.fit(up_x_train, up_y_train)
#knn.fit(d_x_train, d_y_train)
knn.fit(x_train, y_train)
y_pred = knn.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)
class_report = classification_report(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")
print("Classification Report:\n", class_report)

print('Confusion matrix')
print(pd.DataFrame(confusion_matrix(y_test, y_pred)))


Accuracy: 89.12%
Classification Report:
               precision    recall  f1-score   support

           0       0.89      1.00      0.94      1922
           1       0.20      0.01      0.02       228

    accuracy                           0.89      2150
   macro avg       0.55      0.50      0.48      2150
weighted avg       0.82      0.89      0.84      2150

Confusion matrix
      0  1
0  1914  8
1   226  2


In [202]:
rfc = RandomForestClassifier(class_weight='balanced')
rfc.fit(up_x_train, up_y_train)
#rfc.fit(d_x_train, d_y_train)
#rfc.fit(x_train, y_train)
y_pred = rfc.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)
class_report = classification_report(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"ROCAUC: {accuracy * 100:.2f}%")
print(f"F1: {accuracy * 100:.2f}%")
print("Classification Report:\n", class_report)
print(pd.DataFrame(confusion_matrix(y_test, y_pred)))

Accuracy: 89.30%
ROCAUC: 89.30%
F1: 89.30%
Classification Report:
               precision    recall  f1-score   support

           0       0.89      1.00      0.94      1922
           1       0.33      0.01      0.02       228

    accuracy                           0.89      2150
   macro avg       0.61      0.50      0.48      2150
weighted avg       0.84      0.89      0.85      2150

      0  1
0  1918  4
1   226  2
