In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, mean_squared_error


In [3]:
# Load the preprocessed household data.
df = pd.read_csv("processed_data.csv")

# Categorize whether a household is a "super user" by gas usage: 1 when its
# GASUSGE is at or above the 90th percentile of the column, 0 otherwise.
superuser_cutoff = df['GASUSGE'].quantile(0.9)
df['SuperUser'] = (df['GASUSGE'] >= superuser_cutoff).astype('int64')

# Remove extremely highly correlated variables. GASUSGE in particular must be
# dropped because the SuperUser target above is derived directly from it.
df = df.drop(columns=['ANNMILES', 'GASUSGE', 'STRATUMID', 'TDAYDATE', 'FLAG100'])

# Separate the prediction target and the household id from the feature frame.
# After these two pops, `df` holds only the feature columns.
df_y = df.pop('SuperUser')
# NOTE(review): `id` shadows the Python builtin of the same name; the name is
# kept here in case later cells reference it.
id = df.pop("HOUSEID")

# Create train and test datasets: 30% held out for testing, fixed seed so the
# split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    df, df_y, test_size=0.3, random_state=42
)


In [4]:
# Class balance of the training target (SuperUser: 1 = super user, 0 = not).
y_train.value_counts()

SuperUser
0    4527
1     489
Name: count, dtype: int64

In [5]:
# Fit a 2-nearest-neighbour classifier on the training split, then predict the
# labels of the held-out test split.
knn = KNeighborsClassifier(n_neighbors=2).fit(x_train, y_train)
y_pred = knn.predict(x_test)

# Evaluation metrics on the test split.
accuracy = accuracy_score(y_test, y_pred)
class_report = classification_report(y_test, y_pred)
# NOTE(review): despite its name, `mae` holds the mean *squared* error, and it
# is not reported below. The name is kept in case later cells reference it.
mae = mean_squared_error(y_test, y_pred)

print(f"Accuracy: {accuracy * 100:.2f}%")
print("Classification Report:\n", class_report)

Accuracy: 88.79%
Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.99      0.94      1922
           1       0.19      0.02      0.03       228

    accuracy                           0.89      2150
   macro avg       0.54      0.50      0.49      2150
weighted avg       0.82      0.89      0.84      2150

