In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_absolute_error, mean_squared_error, confusion_matrix, \
    precision_recall_fscore_support, roc_auc_score
from tensorflow.keras.models import Sequential
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Dense, Conv1D, Flatten, Conv2D
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
import numpy as np
from sklearn.model_selection import GridSearchCV
from imblearn.over_sampling import SMOTE
from keras.callbacks import CSVLogger

In [2]:
# ---- Experiment settings -------------------------------------------------
# Seed TensorFlow's global random generator.
tf.random.set_seed(1234)

# Number of training epochs for the neural networks.
epochs_number = 1
# Fraction (0-1) of the whole dataset held out as the test set.
test_set_size = 0.1
# Set to 1 to over-sample the minority class in the training set; any other value disables it.
oversampling_flag = 0
# Target minority/majority ratio after over-sampling (handed to SMOTE as sampling_strategy).
oversampling_percentage = 0.2

In [3]:
def read_data(data_path=r'C:\electricity theft detection\preprocessedR.csv'):
    """Load the preprocessed dataset and return a train/test split.

    Reads the CSV at ``data_path``, takes the ``FLAG`` column as the target,
    drops ``FLAG`` and ``CONS_NO`` from the features, converts the remaining
    column labels to datetimes and splits the rows with ``train_test_split``.
    Class-balance statistics are printed along the way.

    The split size and the optional SMOTE step are controlled by the
    module-level settings ``test_set_size``, ``oversampling_flag`` and
    ``oversampling_percentage``.

    Parameters
    ----------
    data_path : str or path-like, optional
        Location of the preprocessed CSV file. Defaults to the location the
        notebook was originally written against.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    # The default path is a raw string: a plain literal would contain the
    # invalid escape sequences "\e" and "\p", which Python warns about.
    rawData = pd.read_csv(data_path)

    # Setting the target and dropping the unnecessary columns
    y = rawData['FLAG']
    X = rawData.drop(['FLAG', 'CONS_NO'], axis=1)

    # Class counts are taken as boolean sums. Indexing the result of .count()
    # with [0] relies on integer-position fallback on a label-indexed Series,
    # which pandas has deprecated.
    total_count = y.shape[0]
    normal_count = (y == 0).sum()
    fraud_count = (y == 1).sum()
    print('Normal Consumers:                    ', normal_count)
    print('Consumers with Fraud:                ', fraud_count)
    print('Total Consumers:                     ', total_count)
    print("Classification assuming no fraud:     %.2f" % (normal_count / total_count * 100), "%")

    # Column labels are parsed as dates (every remaining column name must be
    # parseable). NOTE(review): the reindex call re-selects the columns in
    # their existing order, so it does not sort them chronologically; switch
    # to a sorted index if chronological order is actually required.
    X.columns = pd.to_datetime(X.columns)
    X = X.reindex(X.columns, axis=1)

    # Splitting the dataset into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_set_size, random_state=0)
    print("Test set assuming no fraud:           %.2f" % ((y_test == 0).sum() / y_test.shape[0] * 100), "%\n")

    # Oversampling of the minority class to counter the class imbalance.
    # Only the training set is resampled; the test set is left untouched.
    if oversampling_flag == 1:
        over = SMOTE(sampling_strategy=oversampling_percentage, random_state=0)
        X_train, y_train = over.fit_resample(X_train, y_train)
        print("Oversampling statistics in training set: ")
        print('Normal Consumers:                    ', (y_train == 0).sum())
        print('Consumers with Fraud:                ', (y_train == 1).sum())
        print("Total Consumers                      ", X_train.shape[0])

    return X_train, X_test, y_train, y_test


  rawData = pd.read_csv('C:\electricity theft detection\preprocessedR.csv')


In [4]:
def results(y_test, prediction):
    """Print evaluation metrics for a set of predictions.

    Prints accuracy (in percent), RMSE, MAE, the per-class F1 scores (in
    percent), the ROC AUC (in percent) and the confusion matrix.

    Parameters
    ----------
    y_test : array-like
        True labels.
    prediction : array-like
        Predicted labels, aligned with ``y_test``.

    Returns
    -------
    None
        All metrics are written to standard output.
    """
    print("Accuracy", 100 * accuracy_score(y_test, prediction))
    # RMSE is computed as sqrt(MSE). The `squared=False` shortcut of
    # mean_squared_error is deprecated and absent from newer scikit-learn
    # releases, while this form works on every version.
    print("RMSE:", np.sqrt(mean_squared_error(y_test, prediction)))
    print("MAE:", mean_absolute_error(y_test, prediction))
    # Index 2 of the (precision, recall, fscore, support) tuple is the F-score array.
    print("F1:", 100 * precision_recall_fscore_support(y_test, prediction)[2])
    # NOTE(review): the AUC is computed from whatever `prediction` holds; the
    # callers in this notebook pass hard class labels rather than scores.
    print("AUC:", 100 * roc_auc_score(y_test, prediction))
    print(confusion_matrix(y_test, prediction), "\n")

In [5]:
def LR(X_train, X_test, y_train, y_test):
    """Fit a logistic-regression classifier and print its test-set metrics.

    The classifier is trained on ``X_train``/``y_train``; its predictions on
    ``X_test`` are compared against ``y_test`` by ``results``. Nothing is
    returned.
    """
    print('Logistic Regression:')
    # Disabled hyper-parameter search, kept as an inert string literal.
    '''
    # Parameters selection 
    param_grid = {'C': [0.1,1,10,100],'solver': ['newton-cg', 'lbfgs']}
    grid = GridSearchCV(LogisticRegression(max_iter=1000,random_state=0), param_grid=param_grid, n_jobs=-1)
    grid.fit(X_train, y_train)
    df = pd.DataFrame(grid.cv_results_)
    print(df[['param_C', 'param_solver', 'mean_test_score', 'rank_test_score']])
    '''
    classifier = LogisticRegression(
        solver='newton-cg',
        C=1000,
        max_iter=1000,
        n_jobs=-1,
    )
    classifier.fit(X_train, y_train)
    predicted_labels = classifier.predict(X_test)
    results(y_test, predicted_labels)


In [10]:
# Load the dataset and build the train/test split; read_data() also prints
# the class-balance summary.
X_train, X_test, y_train, y_test = read_data()

  print('Normal Consumers:                    ', y[y['FLAG'] == 0].count()[0])
  print('Consumers with Fraud:                ', y[y['FLAG'] == 1].count()[0])
  print("Classification assuming no fraud:     %.2f" % (y[y['FLAG'] == 0].count()[0] / y.shape[0] * 100), "%")


Normal Consumers:                     36677
Consumers with Fraud:                 3579
Total Consumers:                      40256
Classification assuming no fraud:     91.11 %
Test set assuming no fraud:           90.78 %



In [11]:
# Train and evaluate logistic regression at notebook level so that the fitted
# `model` remains available to the cells that follow.
print('Logistic Regression:')
# Disabled hyper-parameter search, kept as an inert string literal.
'''
# Parameters selection 
param_grid = {'C': [0.1,1,10,100],'solver': ['newton-cg', 'lbfgs']}
grid = GridSearchCV(LogisticRegression(max_iter=1000,random_state=0), param_grid=param_grid, n_jobs=-1)
grid.fit(X_train, y_train)
df = pd.DataFrame(grid.cv_results_)
print(df[['param_C', 'param_solver', 'mean_test_score', 'rank_test_score']])
'''
model = LogisticRegression(
    solver='newton-cg',
    C=1000,
    max_iter=1000,
    n_jobs=-1,
)
model.fit(X_train, y_train)

# Score the held-out set and print the metrics.
prediction = model.predict(X_test)
results(y_test, prediction)

Logistic Regression:
Accuracy 90.66070541480377
RMSE: 0.3056025946420649
MAE: 0.09339294585196224
F1: [95.04480759 18.96551724]
AUC: 55.2596044999834
[[3606   49]
 [ 327   44]] 





In [12]:
import pickle
# Persist the fitted estimator to 'saved_model.pkl', wrapped in a dict under the "model" key.
data = {"model": model}
with open('saved_model.pkl','wb')as file:
    pickle.dump(data,file)

In [13]:
# Reload the pickled dict from 'saved_model.pkl'.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open('saved_model.pkl','rb') as file:
    data = pickle.load(file)

# Re-score the test set with the restored estimator and print the metrics.
model = data["model"]
prediction = model.predict(X_test)
results(y_test, prediction)

Accuracy 90.66070541480377
RMSE: 0.3056025946420649
MAE: 0.09339294585196224
F1: [95.04480759 18.96551724]
AUC: 55.2596044999834
[[3606   49]
 [ 327   44]] 





In [15]:
import csv

# Write the first five test-set rows to "Customer1.csv" without the index column.
X_test.head(n=5).to_csv("Customer1.csv", index=False)


In [17]:
# Take the test-set row at position 10 (a Series).
row = X_test.iloc[10]

# Turn the Series back into a one-row frame before handing it to the model.
row_as_dataframe = row.to_frame().T
model.predict(row_as_dataframe)

array([0], dtype=int64)