In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_absolute_error, mean_squared_error, confusion_matrix, \
    precision_recall_fscore_support, roc_auc_score
from tensorflow.keras.models import Sequential
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Dense, Conv1D, Flatten, Conv2D
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
import numpy as np
from sklearn.model_selection import GridSearchCV
from imblearn.over_sampling import SMOTE
from keras.callbacks import CSVLogger

In [15]:
tf.random.set_seed(1234)
epochs_number = 1  # number of epochs for the neural networks
test_set_size = 0.1  # percentage of the test size comparing to the whole dataset
oversampling_flag = 0  # set to 1 to over-sample the minority class
oversampling_percentage = 0.2  # percentage of the minority class after the oversampling comparing to majority class

In [16]:
def read_data():
    rawData = pd.read_csv('C:\electricity theft detection\preprocessedR.csv')

    # Setting the target and dropping the unnecessary columns
    y = rawData[['FLAG']]
    X = rawData.drop(['FLAG', 'CONS_NO'], axis=1)

    print('Normal Consumers:                    ', y[y['FLAG'] == 0].count()[0])
    print('Consumers with Fraud:                ', y[y['FLAG'] == 1].count()[0])
    print('Total Consumers:                     ', y.shape[0])
    print("Classification assuming no fraud:     %.2f" % (y[y['FLAG'] == 0].count()[0] / y.shape[0] * 100), "%")

    # columns reindexing according to dates
    X.columns = pd.to_datetime(X.columns)
    X = X.reindex(X.columns, axis=1)

    # Splitting the dataset into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X, y['FLAG'], test_size=test_set_size, random_state=0)
    print("Test set assuming no fraud:           %.2f" % (y_test[y_test == 0].count() / y_test.shape[0] * 100), "%\n")

    # Oversampling of minority class to encounter the imbalanced learning
    if oversampling_flag == 1:
        over = SMOTE(sampling_strategy=oversampling_percentage, random_state=0)
        X_train, y_train = over.fit_resample(X_train, y_train)
        print("Oversampling statistics in training set: ")
        print('Normal Consumers:                    ', y_train[y_train == 0].count())
        print('Consumers with Fraud:                ', y_train[y_train == 1].count())
        print("Total Consumers                      ", X_train.shape[0])

    return X_train, X_test, y_train, y_test


  rawData = pd.read_csv('C:\electricity theft detection\preprocessedR.csv')


In [13]:
def results(y_test, prediction):
    print("Accuracy", 100 * accuracy_score(y_test, prediction))
    print("RMSE:", mean_squared_error(y_test, prediction, squared=False))
    print("MAE:", mean_absolute_error(y_test, prediction))
    print("F1:", 100 * precision_recall_fscore_support(y_test, prediction)[2])
    print("AUC:", 100 * roc_auc_score(y_test, prediction))
    print(confusion_matrix(y_test, prediction), "\n")

In [5]:
def ANN(X_train, X_test, y_train, y_test,csv_filename='NN epochs results.csv'):
    print('Artificial Neural Network:')
    # for i in range(4,100,3):
    #     print("Epoch:",i)

    # Model creation
    model = Sequential()
    model.add(Dense(1000, input_dim=1034, activation='relu'))
    model.add(Dense(100, activation='relu'))
    model.add(Dense(100, activation='relu'))
    model.add(Dense(100, activation='relu'))
    model.add(Dense(10, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(loss=keras.losses.binary_crossentropy,
                  optimizer='adam',
                  metrics=['accuracy'])
    csv_logger = CSVLogger('NN epochs results.csv')

    # model.fit(X_train, y_train, validation_split=0, epochs=i, shuffle=True, verbose=0)
    model.fit(X_train, y_train, validation_split=0, epochs=epochs_number, shuffle=True, verbose=1,callbacks=[csv_logger])
    prediction = np.argmax(model.predict(X_test), axis=-1)
    model.summary()
    results(y_test, prediction)

In [6]:
def CNN1D(X_train, X_test, y_train, y_test,csv_filename='1D-Cnn Epochs Results.csv'):
    print('1D - Convolutional Neural Network:')

    # Transforming the dataset into tensors
    X_train = X_train.to_numpy().reshape(X_train.shape[0], X_train.shape[1], 1)
    X_test = X_test.to_numpy().reshape(X_test.shape[0], X_test.shape[1], 1)

    # Model creation
    model = Sequential()
    model.add(Conv1D(100, kernel_size=7, input_shape=(1034, 1), activation='relu'))
    model.add(Flatten())
    model.add(Dense(100, activation='relu'))
    model.add(Dense(100, activation='relu'))
    model.add(Dense(64, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(loss=keras.losses.binary_crossentropy,
                  optimizer='adam',
                  metrics=['accuracy'])
    csv_logger = CSVLogger('1D-Cnn Epochs Results.csv')

    # model.fit(X_train, y_train, epochs=1, validation_split=0.1, shuffle=False, verbose=1)
    model.fit(X_train, y_train, epochs=epochs_number, validation_split=0, shuffle=False, verbose=1,callbacks=[csv_logger])
    prediction = np.argmax(model.predict(X_test), axis=-1)
    model.summary()
    results(y_test, prediction)


In [7]:
def CNN2D(X_train, X_test, y_train, y_test,csv_filename='2D-CNN Epochs Results.csv'):
    print('2D - Convolutional Neural Network:')

    # Transforming every row of the train set into a 2D array and then into a tensor
    n_array_X_train = X_train.to_numpy()
    n_array_X_train_extended = np.hstack((n_array_X_train, np.zeros(
        (n_array_X_train.shape[0], 2))))  # adding two empty columns in order to make the number of columns
    # an exact multiple of 7
    week = []
    for i in range(n_array_X_train_extended.shape[0]):
        a = np.reshape(n_array_X_train_extended[i], (-1, 7, 1))
        week.append(a)
    X_train_reshaped = np.array(week)

    # Transforming every row of the train set into a 2D array and then into a tensor
    n_array_X_test = X_test.to_numpy()  # X_test to 2D - array
    n_array_X_train_extended = np.hstack((n_array_X_test, np.zeros((n_array_X_test.shape[0], 2))))
    week2 = []
    for i in range(n_array_X_train_extended.shape[0]):
        b = np.reshape(n_array_X_train_extended[i], (-1, 7, 1))
        week2.append(b)
    X_test_reshaped = np.array(week2)

    input_shape = (1, 148, 7, 1)  # input shape of the tensor

    # Model creation
    model = Sequential()
    model.add(Conv2D(kernel_size=(1, 3), filters=32, input_shape=input_shape[1:], activation='relu', data_format='channels_last'))

    model.add(Flatten())
    model.add(Dense(100, activation='relu'))
    model.add(Dense(100, activation='relu'))
    model.add(Dense(64, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(loss=keras.losses.binary_crossentropy,
                  optimizer='adam',
                  metrics=['accuracy'])
    csv_logger = CSVLogger('2D-CNN Epochs Results.csv')
    model.summary()
    #     model.fit(X_train_reshaped, y_train, validation_split=0.1, epochs=i, shuffle=False, verbose=0)
    model.fit(X_train_reshaped, y_train, validation_split=0.1, epochs=epochs_number, shuffle=False, verbose=1,callbacks=[csv_logger])

    # prediction = model.predict_classes(X_test)
    # prediction = model.predict_classes(X_test_reshaped)
    prediction = np.argmax(model.predict(X_test_reshaped), axis=-1)
    # model.summary()
    results(y_test, prediction)


In [8]:
def LR(X_train, X_test, y_train, y_test):
    print('Logistic Regression:')
    '''
    # Parameters selection 
    param_grid = {'C': [0.1,1,10,100],'solver': ['newton-cg', 'lbfgs']}
    grid = GridSearchCV(LogisticRegression(max_iter=1000,random_state=0), param_grid=param_grid, n_jobs=-1)
    grid.fit(X_train, y_train)
    df = pd.DataFrame(grid.cv_results_)
    print(df[['param_C', 'param_solver', 'mean_test_score', 'rank_test_score']])
    '''
    model = LogisticRegression(C=1000, max_iter=1000, n_jobs=-1, solver='newton-cg')
    model.fit(X_train, y_train)
    prediction = model.predict(X_test)
    results(y_test, prediction)


In [9]:
def DT(X_train, X_test, y_train, y_test):
    print('Decision Tree:')
    model = DecisionTreeClassifier(random_state=0)
    model.fit(X_train, y_train)
    prediction = model.predict(X_test)
    results(y_test, prediction)


In [10]:
def RF(X_train, X_test, y_train, y_test):
    print('Random Forest:')
    '''
    # Parameters selection 
    param_grid = {'n_estimators':[10,100,1000]}
    grid = GridSearchCV(RandomForestClassifier(random_state=0), param_grid=param_grid, n_jobs=-1)
    grid.fit(X_train, y_train)
    df = pd.DataFrame(grid.cv_results_)
    print(df[['param_criterion', 'mean_test_score', 'rank_test_score']])
    '''

    model = RandomForestClassifier(n_estimators=100, min_samples_leaf=1, max_features=0.5,  # max_depth=10,
                                   random_state=0, n_jobs=-1)
    model.fit(X_train, y_train)
    prediction = model.predict(X_test)
    results(y_test, prediction)


In [11]:
def SVM(X_train, X_test, y_train, y_test):
    model = SVC(random_state=0)
    model.fit(X_train, y_train)
    prediction = model.predict(X_test)
    results(y_test, prediction)

In [5]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from category_encoders.target_encoder import TargetEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV,GridSearchCV

# estimators = [('encoder', TargetEncoder()), ('clf', XGBClassifier(random_state=8))]
# pipe = Pipeline(steps=estimators)
# pipe
params={"learning_rate":[0.05,0.10,0.15,0.20,0.25,0.30],
        "max_depth":[3,4,5,6,7,8,10,12,15],
        "min_child_weight":[1,3,5,7],
        "gamma":[0.0,0.1,0.2,0.3,0.4],
        "colsample_bytree":[0.3,0.4,0.5,0.7]
        
        }


# from skopt import BayesSearchCV
# from skopt.space import Real,Categorical,Integer

# search_space={
#     'clf__max_depth':Integer(2,8),
#     'clf__learning_rate':Real(0.001,1.0,prior='log-uniform'),
#     'clf__subsample':Real(0.5,1.0),
#     'clf__colsample_bytree':Real(0.5,1.0),
#     'clf__colsample_bylevel':Real(0.5,1.0),
#     'clf__colsample_bynode':Real(0.5,1.0),
#     'clf__reg_alpha':Real(0.0,10.0),
#     'clf__reg_lambda':Real(0.0,10.0),
#     'clf__gamma':Real(0.0,10.0)
# }
# opt=BayesSearchCV(pipe,search_space,cv=3,n_iter=10,scoring='roc_auc',random_state=8)
X_train, X_test, y_train, y_test = read_data()
# opt.fit(X_train,y_train)

classifier=XGBClassifier()
random_search=RandomizedSearchCV(classifier,param_distributions=params,n_iter=5,scoring='roc_auc',n_jobs=-1,cv=5,verbose=3)
random_search.fit(X_train, y_train)
 

# def XGBoost(X_train, X_test, y_train, y_test):
#     print('XGBoost Classifier:')
    
#     # Create XGBoost classifier
#     model = XGBClassifier(random_state=8)
    
#     # Train the model
#     model.fit(X_train, y_train)
    
#     # Predict on the test set
#     y_pred = model.predict(X_test)
    
#     # Calculate accuracy
#     accuracy = accuracy_score(y_test, y_pred)
#     print("Accuracy:", accuracy)
    
    # You can save the predictions or any other metrics you want into a CSV file here if needed


  print('Normal Consumers:                    ', y[y['FLAG'] == 0].count()[0])
  print('Consumers with Fraud:                ', y[y['FLAG'] == 1].count()[0])
  print("Classification assuming no fraud:     %.2f" % (y[y['FLAG'] == 0].count()[0] / y.shape[0] * 100), "%")


Normal Consumers:                     36677
Consumers with Fraud:                 3579
Total Consumers:                      40256
Classification assuming no fraud:     91.11 %
Test set assuming no fraud:           90.78 %

Fitting 5 folds for each of 5 candidates, totalling 25 fits


In [6]:
random_search.best_estimator_
 

In [7]:
random_search.best_params_

{'min_child_weight': 7,
 'max_depth': 12,
 'learning_rate': 0.25,
 'gamma': 0.3,
 'colsample_bytree': 0.4}

In [8]:
def XGBoost(X_train, X_test, y_train, y_test):
    print('XGBoost Classifier:')
    
    # Create XGBoost classifier
    model = XGBClassifier(colsample_bytree=0.4,gamma=0.3,learning_rate=0.25,max_depth=12,min_child_weight=7,random_state=8)
    
    # Train the model
    model.fit(X_train, y_train)
    
    # Predict on the test set
    y_pred = model.predict(X_test)
    
    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy:", accuracy)

In [17]:
X_train, X_test, y_train, y_test = read_data()

  print('Normal Consumers:                    ', y[y['FLAG'] == 0].count()[0])
  print('Consumers with Fraud:                ', y[y['FLAG'] == 1].count()[0])
  print("Classification assuming no fraud:     %.2f" % (y[y['FLAG'] == 0].count()[0] / y.shape[0] * 100), "%")


Normal Consumers:                     36677
Consumers with Fraud:                 3579
Total Consumers:                      40256
Classification assuming no fraud:     91.11 %
Test set assuming no fraud:           90.78 %



In [18]:
print(X_test.head())

       2014-01-01  2014-01-02  2014-01-03  2014-01-04  2014-01-05  2014-01-06  \
10300    0.126948    0.095892    0.097888    0.088591    0.091852    0.089954   
7510     0.512272    0.380156    0.379236    0.454310    0.485959    0.250616   
4510     0.289163    0.232522    0.289375    0.187381    0.320677    0.229754   
11468    0.000000    0.000000    0.000000    0.000000    0.000000    0.000000   
21356    0.000000    0.000000    0.000000    0.000000    0.000000    0.000000   

       2014-01-07  2014-01-08  2014-01-09  2014-01-10  ...  2016-10-22  \
10300    0.090976    0.079683    0.083723    0.090830  ...    0.668178   
7510     0.432965    0.401316    0.299193    0.385676  ...    0.040849   
4510     0.210165    0.299809    0.236994    0.250835  ...    0.560226   
11468    0.000000    0.000000    0.000000    0.000000  ...    0.000000   
21356    0.000000    0.000000    0.000000    0.000000  ...    0.000000   

       2016-10-23  2016-10-24  2016-10-25  2016-10-26  2016-10-27  2

In [19]:
print(y_test.head())

10300    0
7510     0
4510     0
11468    0
21356    0
Name: FLAG, dtype: int64


In [71]:
ANN(X_train, X_test, y_train, y_test,'NN epochs results.csv')

Artificial Neural Network:


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m1133/1133[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 8ms/step - accuracy: 0.9111 - loss: 0.3065
[1m126/126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step


Accuracy 90.78489816194735
RMSE: 0.3035638621122853
MAE: 0.09215101838052658
F1: [95.16989975  0.        ]
AUC: 50.0
[[3655    0]
 [ 371    0]] 



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [73]:
CNN1D(X_train, X_test, y_train, y_test,'1D-Cnn Epochs Results.csv')

1D - Convolutional Neural Network:


  super().__init__(


[1m1133/1133[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m94s[0m 82ms/step - accuracy: 0.9101 - loss: 0.2708
[1m126/126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 14ms/step


Accuracy 90.78489816194735
RMSE: 0.3035638621122853
MAE: 0.09215101838052658
F1: [95.16989975  0.        ]
AUC: 50.0
[[3655    0]
 [ 371    0]] 



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [76]:
CNN2D(X_train, X_test, y_train, y_test)

2D - Convolutional Neural Network:


  super().__init__(


[1m1019/1019[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 19ms/step - accuracy: 0.9117 - loss: 0.2873 - val_accuracy: 0.9183 - val_loss: 0.2363
[1m126/126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step
Accuracy 90.78489816194735
RMSE: 0.3035638621122853
MAE: 0.09215101838052658
F1: [95.16989975  0.        ]
AUC: 50.0
[[3655    0]
 [ 371    0]] 



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [62]:
RF(X_train, X_test, y_train, y_test)

Random Forest:


KeyboardInterrupt: 

In [63]:
LR(X_train, X_test, y_train, y_test)

Logistic Regression:
Accuracy 90.66070541480377
RMSE: 0.3056025946420649
MAE: 0.09339294585196224
F1: [95.04480759 18.96551724]
AUC: 55.2596044999834
[[3606   49]
 [ 327   44]] 





In [64]:
DT(X_train, X_test, y_train, y_test)

Decision Tree:
Accuracy 84.05365126676601
RMSE: 0.39932879602194954
MAE: 0.15946348733233978
F1: [91.13993928 20.34739454]
AUC: 56.22221157001634
[[3302  353]
 [ 289   82]] 





In [65]:
SVM(X_train, X_test, y_train, y_test)

Accuracy 90.8842523596622
RMSE: 0.301922964352462
MAE: 0.09115747640337804
F1: [95.21948678  2.13333333]
AUC: 50.53908355795148
[[3655    0]
 [ 367    4]] 





In [9]:
XGBoost(X_train, X_test, y_train, y_test)

XGBoost Classifier:
Accuracy: 0.9227521112767014
