# Set up the selected model

In [None]:
# Import the necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split,GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay, make_scorer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
from tabulate import tabulate

In [2]:
# Read the raw CSV into the DataFrame that every later cell starts from
df = pd.read_csv(filepath_or_buffer="Data/data set.csv")

## Rebuilding the model

In [3]:
# Build the working frame without the 'CONS_NO' and '10/3/2014' columns.
# `drop` returns a new DataFrame, so the raw `df` is left untouched.
df_index = df.drop(columns=['CONS_NO', '10/3/2014'])

In [4]:
# Balance the classes by undersampling the majority class (FLAG = 0)
# down to the number of minority rows (FLAG = 1).
df_clean = df_index.copy()
df_minority = df_clean[df_clean['FLAG'] == 1]

# Seed the draw so the same majority rows are selected on every run; without a
# random_state the training data (and every metric below) changes on each re-run.
df_majority = df_clean[df_clean['FLAG'] == 0].sample(n=len(df_minority), random_state=42)

# reset_index() stores the old row labels in an 'index' column, which a later cell drops.
df_undersampling = pd.concat([df_minority, df_majority]).reset_index()

In [5]:
# Continue from an independent copy of the undersampled data
df_index = df_undersampling.copy(deep=True)

In [6]:
# Remove the 'index' column that reset_index() added to the undersampled frame
df_index = df_index.drop(columns=['index'])

In [7]:
# Independent copy that will hold only the feature columns for imputation
df_filtered = df_index.copy(deep=True)

In [8]:
# Leave the target out of the imputation: FLAG has no nulls and must not
# act as a feature when the imputer looks for neighbours
df_filtered = df_filtered.drop(columns=['FLAG'])

In [9]:
# Fill the missing values of the feature columns with a KNN imputer (5 neighbours).
# fit_transform returns a NumPy array, so the column names are re-attached in the next cell.
imputer = KNNImputer(n_neighbors=5)
imputed = imputer.fit_transform(df_filtered)
# Display the (rows, columns) of the imputed array as the cell output
imputed.shape

(7230, 1033)

In [10]:
# Wrap the imputed array in a DataFrame, reusing the feature column names
df_imputed = pd.DataFrame(data=imputed, columns=df_filtered.columns)

In [11]:
# Independent copy of the imputed dataset
df_low = df_imputed.copy(deep=True)

In [12]:
# Re-attach the FLAG label; values are matched to rows by index label
df_low = df_low.assign(FLAG=df_undersampling['FLAG'])

In [13]:
# Separate the label (y) from the feature matrix (X)
y = df_low['FLAG']
X = df_low.drop(columns=['FLAG'])

In [14]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Gradient Boosting base estimator with a fixed seed
gb = GradientBoostingClassifier(random_state=42)

# Hyper-parameter grid; each list holds a single value, so only one combination is fitted
param_grid = {
    'n_estimators': [50],
    'max_depth': [5],
    'min_samples_split': [2],
}

# 3-fold cross-validated search scored on recall, using all available cores
grid_search_gb = GridSearchCV(
    estimator=gb,
    param_grid=param_grid,
    scoring='recall',
    cv=3,
    n_jobs=-1,
)

# Train on the training split
grid_search_gb.fit(X_train, y_train)

# Predict the labels of the held-out test split
y_pred_gb = grid_search_gb.predict(X_test)

## Testing the model with the entire dataset

In [None]:
# Start from the cleaned (not undersampled) data and keep only the feature columns.
# 'CONS_NO' and '10/3/2014' were already dropped from df_index before df_clean was
# copied from it, so dropping them again here raised a KeyError. errors='ignore'
# lets the cell run whether or not those columns are still present.
df_full = df_clean.drop(columns=['CONS_NO', '10/3/2014'], errors='ignore')

# The label must not be fed to the imputer; it is re-attached in a later cell.
df_full = df_full.drop(columns=['FLAG'])

In [None]:
# Fill the missing values of the full feature set with a separate KNN imputer (5 neighbours).
# NOTE(review): this imputer is fitted on the full data, not reused from the one fitted on
# the undersampled data above — confirm that this is intended.
imputer_full = KNNImputer(n_neighbors=5)
imputed_full = imputer_full.fit_transform(df_full)
# Display the (rows, columns) of the imputed array as the cell output
imputed_full.shape

In [None]:
# Wrap the imputed array in a DataFrame, reusing the full feature column names
df_imputed_full = pd.DataFrame(data=imputed_full, columns=df_full.columns)

In [None]:
# Independent copy of the imputed full dataset
df_index_full = df_imputed_full.copy(deep=True)

In [None]:
# Re-attach the FLAG label from the cleaned data; values are matched to rows by index label
df_index_full = df_index_full.assign(FLAG=df_clean['FLAG'])

In [None]:
# Separate the label (y_full) from the feature matrix (X_full)
y_full = df_index_full['FLAG']
X_full = df_index_full.drop(columns=['FLAG'])

In [None]:
# Hold out 20% of the full dataset for testing; the fixed seed makes the split repeatable
X_train_full, X_test_full, y_train_full, y_test_full = train_test_split(
    X_full,
    y_full,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Predict on the held-out split of the full dataset. The predictions must come from
# X_test_full so they line up row-for-row with y_test_full in the evaluation cell;
# predicting on X_train_full yields a different number of rows and makes the
# metric calls fail on inconsistent sample counts.
# NOTE(review): rows of the full dataset can also appear in the undersampled training
# set, so this score is likely optimistic — confirm whether a disjoint hold-out is needed.
y_pred_gb = grid_search_gb.predict(X_test_full)

In [None]:
# Accuracy of the tuned Gradient Boosting model against the full-dataset test labels
accuracy_best_gb = accuracy_score(y_true=y_test_full, y_pred=y_pred_gb)

# Print the accuracy followed by the per-class classification report
print(f"Accuracy (Gradient Boosting - Best Parameters): {accuracy_best_gb:.2f}")
print("Classification Report (Gradient Boosting - Best Parameters):")
print(classification_report(y_true=y_test_full, y_pred=y_pred_gb))

### Future Work

One notable finding is that handling the class imbalance with undersampling instead of SMOTE (Synthetic Minority Over-sampling Technique) made a significant difference in the model's performance, so exploring other techniques for balancing imbalanced datasets would be worthwhile. Additionally, the amount of null values varies considerably across the dataset. Reducing the quantity of null data and imputing the remaining gaps with nearest neighbors significantly improved the model's performance. Alternative approaches, such as setting null values to zero, might also yield improved results.

### Next steps and recommendations

To investigate these findings further, several approaches can be taken. First, it is essential to investigate the origin of the numerous gaps in the electric grid data. Null values may have various causes, such as a user recently joining the electric grid (resulting in no previous records), power outages, or disconnections due to non-payment. Handling these null values more carefully is crucial, as they significantly influence each user's consumption behavior. Another avenue to explore is dividing the data into segments corresponding to electrical cutoff cycles, allowing for a more nuanced analysis of consumption patterns, alongside factors like billing cost and market electricity prices.