<a href="https://colab.research.google.com/github/MariamKotob/Electricity-Gas-Consumption/blob/main/Fraud%20Detection%20in%20Electricity%20and%20Gas%20Consumption.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install pandas-profiling from the archive of the project's GitHub master branch.
# NOTE(review): this install is unpinned, so the version obtained can differ between
# runs — consider pinning a released version.
!pip install https://github.com/pandas-profiling/pandas-profiling/archive/master.zip

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

from sklearn import preprocessing
from sklearn.model_selection import RandomizedSearchCV

import matplotlib as plt
import pandas_profiling
import seaborn as sns
import pandas as pd
import numpy as np

In [None]:
# Connect Google Drive to access the dataset.
# Colab-only step: the Drive is mounted under /content/drive, which is the root
# of the dataset directory entered in the next cell.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
cd /content/drive/My\ Drive/DataDSC

/content/drive/My Drive/DataDSC


## 1- Loading data to the notebook

In [None]:
# 1 - Read the five CSV files of the dataset from the current working directory:
# client and invoice tables for train and test, plus the sample submission.
cliTrain, invTrain, cliTest, invTest, samplesub = (
    pd.read_csv(csv_name)
    for csv_name in (
        'client_train.csv',
        'invoice_train.csv',
        'client_test.csv',
        'invoice_test.csv',
        'SampleSubmission.csv',
    )
)

## 2- Exploring dataset

In [None]:
#2 Exploring dataset
# Preview the first 5 rows of the client training table.
cliTrain.head()

In [None]:
# Preview the first 5 rows of the invoice training table.
invTrain.head()

In [None]:
# (rows, columns) of the invoice training table.
print(invTrain.shape)

In [None]:
# (rows, columns) of the invoice test table.
print(invTest.shape)

In [None]:
# (rows, columns) of the client training table.
print(cliTrain.shape)

In [None]:
# (rows, columns) of the client test table.
print(cliTest.shape)

In [None]:
# Explore data types, column names and non-null counts of the client table.
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line to the output.
cliTrain.info()

In [None]:
# Data types, column names and non-null counts of the invoice table.
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line to the output.
invTrain.info()

In [None]:
# Class balance: number of clients per value of the 'target' label column.
cliTrain['target'].value_counts()

In [None]:
# Use pandas_profiling to explore the features of the client table and their relations.
eda_cli_report = pandas_profiling.ProfileReport(cliTrain)

# Export the report to a standalone HTML file.
eda_cli_report.to_file("client_report.html")

# Keep the report as the LAST expression of the cell so the notebook renders it:
# a bare expression in the middle of a cell is evaluated but never displayed.
eda_cli_report

In [None]:
# Profile the invoice training table, passing minimal=True to ProfileReport.
# NOTE(review): presumably the minimal mode is chosen because the invoice table is
# much larger than the client table — confirm.
eda_inv_report = pandas_profiling.ProfileReport(invTrain, minimal=True)
# Write the report to an HTML file (it is not displayed inline in this cell).
eda_inv_report.to_file("invoice_report.html")

## 3- Cleaning and preprocessing data

In [None]:
#3 Cleaning and re-organizing data
# Encode the 'counter_type' variable of the training invoices: ELEC -> 0, GAZ -> 1.
# The identity entries (0 -> 0, 1 -> 1) make the cell safe to re-run: with only
# the string keys, a second execution would map the already-encoded values to NaN.
# Any value outside the mapping still becomes NaN, exactly as before.
d = {"ELEC": 0, "GAZ": 1, 0: 0, 1: 1}
invTrain['counter_type'] = invTrain['counter_type'].map(d)

# Turn 'invoice_date' from object dtype into datetime dtype so it can be processed.
# Converting a column that is already datetime leaves it unchanged, so this part
# is re-runnable as well.
for invoices in (invTrain, invTest):
    invoices['invoice_date'] = pd.to_datetime(invoices['invoice_date'])

In [None]:
# Sanity check of the encoding: number of training invoices per counter_type value.
invTrain['counter_type'].value_counts()

In [None]:
# Number of training clients in each client category.
cliTrain['client_catg'].value_counts()

In [None]:
# Aggregation plan for the invoice data: a dict mapping each invoice column to the
# list of statistics that will be computed for it per client.
aggs = {
    # The four consumption levels all get the same three statistics.
    **{
        f'consommation_level_{level}': ['sum', 'mean', 'std']
        for level in range(1, 5)
    },
    'months_number': ['sum', 'mean'],
    'reading_remarque': ['sum', 'mean', 'std'],
}

In [None]:
# Number of invoices (transactions) recorded for each client.
df = (
    invTrain
    .groupby('client_id')
    .size()
    .reset_index(name='{}transactions_count'.format('1'))
)

# Per-client statistics for the invoice columns listed in `aggs`.
agg_trans = invTrain.groupby(['client_id']).agg(aggs)
# Flatten the two-level (column, statistic) header into single names such as
# 'months_number_sum', then turn client_id back into a regular column.
agg_trans.columns = ['_'.join(parts).strip() for parts in agg_trans.columns]
agg_trans = agg_trans.reset_index()

# Final per-client table: the transaction count followed by the statistics.
agg_trans = df.merge(agg_trans, on='client_id', how='left')

In [None]:
# Inspect the first 10 rows of the per-client aggregated invoice features.
agg_trans.head(10)

In [None]:
# (rows, columns) of the aggregated table; it holds one row per client_id found in the invoices.
agg_trans.shape

In [None]:
# Build the training dataframe: every client row from cliTrain, extended with that
# client's aggregated invoice features (left join on client_id).
train = cliTrain.merge(agg_trans, on='client_id', how='left')

In [None]:
# (rows, columns) of the merged training table.
train.shape

In [None]:
# Inspect the first 10 rows of the merged training table.
train.head(10)

In [None]:
# Same re-organizing as for the training data, applied to create the new test set.
# Encode 'counter_type': ELEC -> 0, GAZ -> 1. The identity entries (0 -> 0, 1 -> 1)
# keep the cell re-runnable; with only the string keys a second execution would
# turn the already-encoded values into NaN.
d = {"ELEC": 0, "GAZ": 1, 0: 0, 1: 1}
invTest['counter_type'] = invTest['counter_type'].map(d)

# Test-specific names are used for the intermediates so that the training
# aggregate `agg_trans` is not overwritten: re-running the cell that builds `train`
# after this one would otherwise merge test aggregates onto the training clients.
agg_trans_test = invTest.groupby(['client_id']).agg(aggs)
# Flatten the (column, statistic) header into names such as 'months_number_sum'.
agg_trans_test.columns = ['_'.join(col).strip() for col in agg_trans_test.columns.values]
agg_trans_test = agg_trans_test.reset_index()

# Number of invoices (transactions) per test client.
transactions_count_test = (invTest.groupby('client_id')
      .size()
      .reset_index(name='{}transactions_count'.format('1')))

agg_trans_test = pd.merge(transactions_count_test, agg_trans_test, on='client_id', how='left')

# One row per test client, in cliTest order, with the aggregated invoice features.
test = pd.merge(cliTest, agg_trans_test, on='client_id', how='left')

In [None]:
# (rows, columns) of the merged test table.
test.shape

In [None]:
# Inspect the first 10 rows of the merged test table.
test.head(10)

In [None]:
# Columns removed before modelling: the client identifier, the creation date and
# the standard-deviation aggregates.
# NOTE(review): 'old_index_std' and 'month_std' are not produced by the aggregation
# cells of this notebook; names that are absent are simply skipped.
col_to_drop = ['client_id', 'creation_date', 'old_index_std',
               'reading_remarque_std', 'month_std',
               'consommation_level_1_std',
               'consommation_level_2_std',
               'consommation_level_3_std',
               'consommation_level_4_std'
               ]

# errors='ignore' skips names that are not present, and re-assigning (instead of
# dropping in place) keeps this cell re-runnable without raising on a second run.
train = train.drop(columns=col_to_drop, errors='ignore')
test = test.drop(columns=col_to_drop, errors='ignore')

# Export labels from the client training table.
target = cliTrain['target']

# Drop the label column (target) from the features; on a re-run the column is
# already gone, which errors='ignore' tolerates.
train = train.drop(columns='target', errors='ignore')

In [None]:
# Count the missing (null) values of every training column and keep only the
# columns that have at least one, ordered from most to fewest (top 30 at most).
all_data_na = train.isnull().sum()
all_data_na = (
    all_data_na[all_data_na != 0]
    .sort_values(ascending=False)
    .head(30)
)

# Present the counts as a one-column table and show its first 20 rows.
missing_data = pd.DataFrame({'Missing ': all_data_na})
missing_data.head(20)

Unnamed: 0,Missing


## 4- Model call, train and test


In [None]:
# 4 - Set up the model that will be trained on the current training data.
# A random forest classifier is tried here (the candidate expected to work best).
model2 = RandomForestClassifier(
    n_estimators=100,
    max_features='sqrt',
    bootstrap=True,
    random_state=55,
    n_jobs=-1,
    verbose=1,
)

In [None]:
# Send the training data to the model to start training.
# Fits the random forest `model2` on the feature table `train` with the labels `target`.
model2.fit(train, target)

In [None]:
# Run the trained model on the test feature table; `prediction` is the result of
# model2.predict(test) and is displayed as the cell output.
prediction = model2.predict(test) 
prediction

In [None]:
# Saving the initial model output.
# Every client gets its OWN prediction: the whole `prediction` array is used.
# (Indexing it with [1] picked a single value — the prediction for the second
# test row — and broadcast it to all clients.)
# The rows of `test` follow cliTest, so the ids are taken from cliTest and the
# result is then laid out in the row order of the sample submission file.
# NOTE(review): these are predict() outputs; if the submission is scored with a
# ranking metric such as ROC AUC, model2.predict_proba(test)[:, 1] may be the
# value to submit — confirm against the competition rules.
submission = samplesub[["client_id"]].merge(
    pd.DataFrame({
        "client_id": cliTest["client_id"],
        "target": prediction
    }),
    on="client_id",
    how="left",
)

# Exporting data to a submission file
submission.to_csv('SUB.csv', index=False)

submission.head(10)

Unnamed: 0,client_id,target
0,test_Client_0,0.0
1,test_Client_1,0.0
2,test_Client_10,0.0
3,test_Client_100,0.0
4,test_Client_1000,0.0
5,test_Client_10000,0.0
6,test_Client_10001,0.0
7,test_Client_10002,0.0
8,test_Client_10003,0.0
9,test_Client_10004,0.0


## 5- Features Selection and Model Optimization

In [None]:
# Variable importances and hyperparameter optimization

# Feature importances of the fitted random forest, most important first.
# The feature names are the COLUMNS of `train` (the frame the model was fitted
# on); passing the DataFrame itself does not provide one name per importance.
fea_i = (
    pd.DataFrame({'feature': train.columns,
                  'importance': model2.feature_importances_})
    .sort_values('importance', ascending=False)
)
fea_i.head()

In [None]:
# Search space sampled by the randomized hyper-parameter search of the random forest.
# 'auto' is no longer offered for max_features: for classifiers it was an alias of
# 'sqrt', and scikit-learn deprecated it in 1.1 and removed it in 1.3, so recent
# versions reject candidates that use it.
# NOTE: shortening this list changes which candidates a seeded search samples.
param_grid = {
    # 50 evenly spaced values from 10 to 200, truncated to integers.
    'n_estimators': np.linspace(10, 200).astype(int),
    # None (no depth limit) or one of 50 values between 3 and 20.
    'max_depth': [None] + list(np.linspace(3, 20).astype(int)),
    # 'sqrt', None (all features) or a fraction of the features from 0.5 to 0.9.
    'max_features': ['sqrt', None] + list(np.arange(0.5, 1, 0.1)),
    # None (no limit) or an integer between 10 and 50 (values repeat after truncation).
    'max_leaf_nodes': [None] + list(np.linspace(10, 50, 500).astype(int)),
    'min_samples_split': [2, 5, 10],
    'bootstrap': [True, False]
}

In [None]:
# Optimize the model performance with a randomized search over `param_grid`.
# Base estimator whose hyper-parameters are sampled by the search.
estimator = RandomForestClassifier(random_state=50)

# The search tries 10 sampled configurations, scoring each one by ROC AUC with
# 3-fold cross-validation (n_jobs is left at its default).
rs = RandomizedSearchCV(
    estimator=estimator,
    param_distributions=param_grid,
    n_iter=10,
    scoring='roc_auc',
    cv=3,
    random_state=55,
    verbose=1,
)

# Run the search on the training features and labels.
rs.fit(train, target)

In [None]:
# Show the hyper-parameter combination the search reports as best (best_params_).
rs.best_params_

In [None]:
# Keep the search's best_estimator_ as the model used for the final predictions.
best_model = rs.best_estimator_

In [None]:
# Predict on the test feature table with the tuned model.
rf_predictions = best_model.predict(test)

In [None]:
# Display the predictions of the tuned model.
rf_predictions

In [None]:
# Build the submission for the tuned model.
# Every client gets its OWN prediction: the whole `rf_predictions` array is used.
# (Indexing it with [1] picked a single value — the prediction for the second
# test row — and broadcast it to all clients.)
# The rows of `test` follow cliTest, so the ids are taken from cliTest and the
# result is then laid out in the row order of the sample submission file.
submission = samplesub[["client_id"]].merge(
    pd.DataFrame({
        "client_id": cliTest["client_id"],
        "target": rf_predictions
    }),
    on="client_id",
    how="left",
)

submission.head(10)

In [None]:
# Write the current `submission` table to SUB2.csv, without the DataFrame index column.
submission.to_csv('SUB2.csv', index=False)