# I. Models
## 1. Undersamlping 
- Random undersampling : EasyEnsembleClassifier using adaboost classifier
- Random undersampling : EasyEnsembleClassifier using gradientboosting classifier



In [77]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

#import Data
df = pd.read_csv('../../Data/cleaned_data_one_hot.csv')

# drop BatchId
df = df.drop(['BatchId'], axis=1)
df.head()

Unnamed: 0,AccountId,SubscriptionId,CustomerId,ProductId,Value,PricingStrategy,ProductCategory_airtime,ProductCategory_data_bundles,ProductCategory_financial_services,ProductCategory_movies,...,ProviderId_2,ProviderId_3,ProviderId_4,ProviderId_5,ProviderId_6,Hour_1,Hour_2,Hour_3,Hour_4,Fraud
0,3957,887,4406,10,1000,2,1,0,0,0,...,0,0,0,0,1,0,0,0,1,0
1,4841,3829,4406,6,20,2,0,0,1,0,...,0,0,1,0,0,0,0,0,1,0
2,4229,222,4683,1,500,2,1,0,0,0,...,0,0,0,0,1,0,0,0,1,0
3,648,2185,988,21,21800,2,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,4841,3829,988,6,644,2,0,0,1,0,...,0,0,1,0,0,0,0,0,1,0


In [78]:
# split data into train and test
from sklearn.model_selection import train_test_split
X = df.drop(['Fraud'], axis=1)
y = df['Fraud']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [79]:
# find the number of fraud and non-fraud cases in the training data
print('Number of fraud and not fraud cases in training data: \n', y_train.value_counts())
print('Number of fraud and not fraud cases in test data: \n', y_test.value_counts())



Number of fraud and not fraud cases in training data: 
 0    76372
1      157
Name: Fraud, dtype: int64
Number of fraud and not fraud cases in test data: 
 0    19097
1       36
Name: Fraud, dtype: int64


In [80]:
## import EasyEnsembleClassifier
from imblearn.ensemble import EasyEnsembleClassifier
# import AdaBoostClassifier
from sklearn.ensemble import AdaBoostClassifier

from sklearn.metrics import classification_report, confusion_matrix, f1_score

# create model
eec = EasyEnsembleClassifier(n_estimators=20, estimator=AdaBoostClassifier() , random_state=42, n_jobs=-1, verbose=0)
eec.fit(X_train, y_train)

# predict
y_pred = eec.predict(X_test)
print('Matrix confusion : \n',confusion_matrix(y_test, y_pred))
print('\n Classification report : \n',classification_report(y_test, y_pred))

Matrix confusion : 
 [[18888   209]
 [    0    36]]

 Classification report : 
               precision    recall  f1-score   support

           0       1.00      0.99      0.99     19097
           1       0.15      1.00      0.26        36

    accuracy                           0.99     19133
   macro avg       0.57      0.99      0.63     19133
weighted avg       1.00      0.99      0.99     19133



In [81]:
from sklearn.ensemble import GradientBoostingClassifier
# create model
eec = EasyEnsembleClassifier(n_estimators=50, estimator=GradientBoostingClassifier() , random_state=42, n_jobs=-1, verbose=0 )
eec.fit(X_train, y_train)

# predict
y_pred = eec.predict(X_test)
print('Matrix confusion : \n',confusion_matrix(y_test, y_pred))
print('\n Classification report : \n',classification_report(y_test, y_pred))

Matrix confusion : 
 [[18810   287]
 [    0    36]]

 Classification report : 
               precision    recall  f1-score   support

           0       1.00      0.98      0.99     19097
           1       0.11      1.00      0.20        36

    accuracy                           0.98     19133
   macro avg       0.56      0.99      0.60     19133
weighted avg       1.00      0.98      0.99     19133



In [82]:

# create model
eec_GBC = EasyEnsembleClassifier(n_estimators=40, estimator=GradientBoostingClassifier(), sampling_strategy=0.25 , random_state=42, n_jobs=-1, verbose=0 )
eec_GBC.fit(X_train, y_train)

# predict
y_pred = eec_GBC.predict(X_test)
print('Matrix confusion : \n',confusion_matrix(y_test, y_pred))
print('\n Classification report : \n',classification_report(y_test, y_pred))

Matrix confusion : 
 [[19009    88]
 [    0    36]]

 Classification report : 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     19097
           1       0.29      1.00      0.45        36

    accuracy                           1.00     19133
   macro avg       0.65      1.00      0.72     19133
weighted avg       1.00      1.00      1.00     19133



In [145]:
from sklearn.ensemble import RandomForestClassifier
# create model
rfc = EasyEnsembleClassifier(n_estimators=5, estimator=RandomForestClassifier(), sampling_strategy=0.05 , random_state=42, n_jobs=-1, verbose=0 )
rfc.fit(X_train, y_train)

# predict
y_pred = rfc.predict(X_test)
print('Matrix confusion : \n',confusion_matrix(y_test, y_pred))
print('\n Classification report : \n',classification_report(y_test, y_pred))

Matrix confusion : 
 [[19076    21]
 [    0    36]]

 Classification report : 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     19097
           1       0.63      1.00      0.77        36

    accuracy                           1.00     19133
   macro avg       0.82      1.00      0.89     19133
weighted avg       1.00      1.00      1.00     19133



In [146]:
from sklearn.ensemble import VotingClassifier

# create model
vc = VotingClassifier(estimators=[('eec', eec), ('rfc_und_over', rfc), ('eec_GBC',eec_GBC)], voting='hard')
vc.fit(X_train, y_train)

# predict
y_pred = vc.predict(X_test)
print('Matrix confusion : \n',confusion_matrix(y_test, y_pred))
print('\n Classification report : \n',classification_report(y_test, y_pred))


Matrix confusion : 
 [[19032    65]
 [    0    36]]

 Classification report : 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     19097
           1       0.36      1.00      0.53        36

    accuracy                           1.00     19133
   macro avg       0.68      1.00      0.76     19133
weighted avg       1.00      1.00      1.00     19133



In [85]:
# create model
eec = EasyEnsembleClassifier(n_estimators=20, estimator=AdaBoostClassifier(), sampling_strategy=0.25 , random_state=42, n_jobs=-1, verbose=0)
eec.fit(X_train, y_train)

# predict
y_pred = eec.predict(X_test)
print('Matrix confusion : \n',confusion_matrix(y_test, y_pred))
print('\n Classification report : \n',classification_report(y_test, y_pred))

Matrix confusion : 
 [[19030    67]
 [    0    36]]

 Classification report : 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     19097
           1       0.35      1.00      0.52        36

    accuracy                           1.00     19133
   macro avg       0.67      1.00      0.76     19133
weighted avg       1.00      1.00      1.00     19133



In [178]:
from sklearn.ensemble import RandomForestClassifier
# create model
rfc2 = EasyEnsembleClassifier(n_estimators=2, estimator=RandomForestClassifier(n_estimators=15 ), sampling_strategy=0.01, random_state=42, n_jobs=-1, verbose=0 )
rfc2.fit(X_train, y_train)

# predict
y_pred = rfc2.predict(X_test)
print('Matrix confusion : \n',confusion_matrix(y_test, y_pred))
print('\n Classification report : \n',classification_report(y_test, y_pred))

Matrix confusion : 
 [[19086    11]
 [    0    36]]

 Classification report : 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     19097
           1       0.77      1.00      0.87        36

    accuracy                           1.00     19133
   macro avg       0.88      1.00      0.93     19133
weighted avg       1.00      1.00      1.00     19133



## 2. Undersampling and Oversampling
a) Resample
- SMOTEENN : SMOTENC (Categorical and numerical features)+ ENN



...

In [86]:
X_train.shape

(76529, 29)

In [87]:
#import SMOTENC 
from imblearn.over_sampling import SMOTENC
#import ENN
from imblearn.under_sampling import EditedNearestNeighbours

#import SMOTEENN
from imblearn.combine import SMOTEENN

# create model of over and under sampling : SMOTENC and ENN
# categorical features : 'AccountId','SubscriptionId','CustomerId', 'ProductId','PricingStrategy','ProductCategory_airtime','ProductCategory_data_bundles','ProductCategory_financial_services','ProductCategory_movies','ProductCategory_other','ProductCategory_ticket','ProductCategory_transport','ProductCategory_tv','ProductCategory_utility_bill','ChannelId_1','ChannelId_2','ChannelId_3','ChannelId_5','ProviderId_1','ProviderId_2','ProviderId_3','ProviderId_4','ProviderId_5','ProviderId_6','Hour_1','Hour_2','Hour_3','Hour_4'
#smotenc= SMOTENC(categorical_features=['AccountId','SubscriptionId','CustomerId', 'ProductId','PricingStrategy','ProductCategory_airtime','ProductCategory_data_bundles','ProductCategory_financial_services','ProductCategory_movies','ProductCategory_other','ProductCategory_ticket','ProductCategory_transport','ProductCategory_tv','ProductCategory_utility_bill','ChannelId_1','ChannelId_2','ChannelId_3','ChannelId_5','ProviderId_1','ProviderId_2','ProviderId_3','ProviderId_4','ProviderId_5','ProviderId_6','Hour_1','Hour_2','Hour_3','Hour_4'], random_state=42)
smotenc= SMOTENC(categorical_features=[0,1,2,3,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,23,24,25,26,27,28])
enn_model = EditedNearestNeighbours(sampling_strategy="auto", n_neighbors=3, kind_sel='all',n_jobs=-1)

smotenc_enn = SMOTEENN(sampling_strategy='not minority', smote=smotenc, enn=enn_model, n_jobs=-1)




In [88]:
#X_resampled_enn, y_resampled_enn = enn_model.fit_resample(X_train, y_train)

In [89]:
# print ('Shape of undersampled data : ', y_resampled_enn.value_counts())

In [90]:
#X_resampled_smotenc, y_resampled_smotenc = smotenc.fit_resample(X_train, y_train)
#print('Number of fraud and not fraud cases in training data: \n', y_resampled_smotenc.value_counts())

In [91]:
#
# create new X_resampled and y_resampled
# X_resampled, y_resampled = smotenc_enn.fit_resample(X_train, y_train)

# # find the number of fraud and non-fraud cases in the training data
# print('Number of fraud and not fraud cases in training data: \n', y_resampled.value_counts())

## 2. Undersampling and Oversampling
a) Resample

- RandomUnderSampler + SMOTEENN 

In [96]:
print (y_train.value_counts())

0    76372
1      157
Name: Fraud, dtype: int64


In [107]:
# import randomUnderSampler
from imblearn.under_sampling import RandomUnderSampler

# create model under sampling 
rus = RandomUnderSampler(sampling_strategy=0.04, random_state=42)

In [108]:
# fit and resample data
X_resampled_rus, y_resampled_rus = rus.fit_resample(X_train, y_train)

In [109]:
print ('value count of undersampled data : ', y_resampled_rus.value_counts())
print ('Shape of undersampled data : ', X_resampled_rus.shape)

value count of undersampled data :  0    3925
1     157
Name: Fraud, dtype: int64
Shape of undersampled data :  (4082, 29)


In [110]:
# Oversampling with SMOTENC fit and resample data
X_resampled_smotenc, y_resampled_smotenc = smotenc.fit_resample(X_resampled_rus, y_resampled_rus)

In [112]:
print ('value count of undersampled data : ', y_resampled_smotenc.value_counts())
print ('Shape of undersampled data : ', X_resampled_smotenc.shape)

value count of undersampled data :  0    3925
1    3925
Name: Fraud, dtype: int64
Shape of undersampled data :  (7850, 29)


## 2. Undersampling and Oversampling
b) Model classifier
- random forest classifier

In [144]:
# create model random forest
rfc_und_over = RandomForestClassifier(n_estimators=30, random_state=42, n_jobs=-1, verbose=0)

# fit and predict
rfc_und_over.fit(X_resampled_smotenc, y_resampled_smotenc)
y_pred = rfc_und_over.predict(X_test)

print('Matrix confusion : \n',confusion_matrix(y_test, y_pred))
print('\n Classification report : \n',classification_report(y_test, y_pred))


Matrix confusion : 
 [[19058    39]
 [    1    35]]

 Classification report : 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     19097
           1       0.47      0.97      0.64        36

    accuracy                           1.00     19133
   macro avg       0.74      0.99      0.82     19133
weighted avg       1.00      1.00      1.00     19133



# II. Evaluation

- Prediction on test set

In [None]:
#import Data

df_test = pd.read_csv('../../Data/test.csv')

df_test.head()


Unnamed: 0,TransactionId,BatchId,AccountId,SubscriptionId,CustomerId,CurrencyCode,CountryCode,ProviderId,ProductId,ProductCategory,ChannelId,Amount,Value,TransactionStartTime,PricingStrategy
0,TransactionId_50600,BatchId_35028,AccountId_2441,SubscriptionId_4426,CustomerId_2857,UGX,256,ProviderId_5,ProductId_3,airtime,ChannelId_3,1000.0,1000,2019-02-13T10:01:40Z,4
1,TransactionId_95109,BatchId_45139,AccountId_3439,SubscriptionId_2643,CustomerId_3874,UGX,256,ProviderId_5,ProductId_15,financial_services,ChannelId_3,2000.0,2000,2019-02-13T10:02:12Z,2
2,TransactionId_47357,BatchId_74887,AccountId_4841,SubscriptionId_3829,CustomerId_2857,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-50.0,50,2019-02-13T10:02:30Z,2
3,TransactionId_28185,BatchId_11025,AccountId_2685,SubscriptionId_4626,CustomerId_3105,UGX,256,ProviderId_5,ProductId_10,airtime,ChannelId_3,3000.0,3000,2019-02-13T10:02:38Z,4
4,TransactionId_22140,BatchId_29804,AccountId_4841,SubscriptionId_3829,CustomerId_3105,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-60.0,60,2019-02-13T10:02:58Z,2


In [None]:
# set TransactionId as index
df_test.set_index('TransactionId', inplace=True)

# Drop the columns that are not needed
df_test.drop(['CurrencyCode', 
         'CountryCode', 
         'Amount','BatchId'], axis=1, inplace=True)






# Isolate Date and Time
df_test['TransactionStartTime'] = pd.to_datetime(df_test['TransactionStartTime'])
df_test['Hour'] = df_test['TransactionStartTime'].dt.time
df_test.drop(['TransactionStartTime'], axis=1, inplace=True)

# Convert Time to int & only keep the hour, no minutes
df_test['Hour'] = df_test['Hour'].astype(str).str[:2].astype(int)

# Convert time into 4 categories; 4-11, 12-13, 14-19, 20-3
df_test['Hour'] = df_test['Hour'].apply(lambda x: 25 if x in range(0,4) else x)
df_test['Hour'] = df_test['Hour'].apply(lambda x: 1 if x in range(4,12) else x) 
df_test['Hour'] = df_test['Hour'].apply(lambda x: 2 if x in range(14,20) else x)
df_test['Hour'] = df_test['Hour'].apply(lambda x: 3 if x in range(12,14) else x)
df_test['Hour'] = df_test['Hour'].apply(lambda x: 25 if x in range(20,24) else x)
df_test['Hour'] = df_test['Hour'].apply(lambda x: 4 if x == 25 else x)


# split the data
df_test['AccountId'] = df_test['AccountId'].str.split('_').str[1]
# df_test['BatchId'] = df_test['BatchId'].str.split('_').str[1]
df_test['SubscriptionId'] = df_test['SubscriptionId'].str.split('_').str[1]
df_test['CustomerId'] = df_test['CustomerId'].str.split('_').str[1]
df_test['ProviderId'] = df_test['ProviderId'].str.split('_').str[1]
df_test['ProductId'] = df_test['ProductId'].str.split('_').str[1]
df_test['ChannelId'] = df_test['ChannelId'].str.split('_').str[1]

# convert to int one hot encoding
df_test = pd.get_dummies(df_test, columns=['ProductCategory'])
df_test = pd.get_dummies(df_test, columns=['ChannelId'])
df_test = pd.get_dummies(df_test, columns=['ProviderId'])
df_test = pd.get_dummies(df_test, columns=['Hour'])



df_test.head()




Unnamed: 0,AccountId,SubscriptionId,CustomerId,ProductId,Value,PricingStrategy,ProductCategory_airtime,ProductCategory_data_bundles,ProductCategory_financial_services,ProductCategory_movies,...,ProviderId_1,ProviderId_2,ProviderId_3,ProviderId_4,ProviderId_5,ProviderId_6,Hour_1,Hour_2,Hour_3,Hour_4
0,2441,4426,2857,3,1000,4,1,0,0,0,...,0,0,0,0,1,0,1,0,0,0
1,3439,2643,3874,15,2000,2,0,0,1,0,...,0,0,0,0,1,0,1,0,0,0
2,4841,3829,2857,6,50,2,0,0,1,0,...,0,0,0,1,0,0,1,0,0,0
3,2685,4626,3105,10,3000,4,1,0,0,0,...,0,0,0,0,1,0,1,0,0,0
4,4841,3829,3105,6,60,2,0,0,1,0,...,0,0,0,1,0,0,1,0,0,0


In [None]:
print(df.shape)
print (df.columns)
print(df_test.shape)
print (df_test.columns)


(95662, 30)
Index(['AccountId', 'SubscriptionId', 'CustomerId', 'ProductId', 'Value',
       'PricingStrategy', 'ProductCategory_airtime',
       'ProductCategory_data_bundles', 'ProductCategory_financial_services',
       'ProductCategory_movies', 'ProductCategory_other',
       'ProductCategory_ticket', 'ProductCategory_transport',
       'ProductCategory_tv', 'ProductCategory_utility_bill', 'ChannelId_1',
       'ChannelId_2', 'ChannelId_3', 'ChannelId_5', 'ProviderId_1',
       'ProviderId_2', 'ProviderId_3', 'ProviderId_4', 'ProviderId_5',
       'ProviderId_6', 'Hour_1', 'Hour_2', 'Hour_3', 'Hour_4', 'Fraud'],
      dtype='object')
(45019, 30)
Index(['AccountId', 'SubscriptionId', 'CustomerId', 'ProductId', 'Value',
       'PricingStrategy', 'ProductCategory_airtime',
       'ProductCategory_data_bundles', 'ProductCategory_financial_services',
       'ProductCategory_movies', 'ProductCategory_retail',
       'ProductCategory_ticket', 'ProductCategory_transport',
       'ProductCa

In [None]:
y_pred=rfc.predict(df_test)
print (y_pred.value_counts())


Feature names unseen at fit time:
- ChannelId_4
- ProductCategory_retail
Feature names seen at fit time, yet now missing:
- ProductCategory_other



ValueError: X has 30 features, but EasyEnsembleClassifier is expecting 29 features as input.