# I. Models
## 1. Undersamlping 
- Random undersampling : EasyEnsembleClassifier using adaboost classifier
- Random undersampling : EasyEnsembleClassifier using gradientboosting classifier



In [22]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

#import Data
df = pd.read_csv('../../Data/training_clean.csv')


df.head()

Unnamed: 0,Value,ProviderId_3,ProviderId_5,ProviderId_1,ProductCategory_utility_bill,ProductCategory_financial_services,ChannelId_3,Hour_3,Hour_4,ChannelId_1,PricingStrategy_4,SubscriptionId,ProductCategory_transport,AccountId,PricingStrategy_0,Fraud
0,1000,0,0,0,0,0,1,0,1,0,0,887,0,3957,0,0
1,20,0,0,0,0,1,0,0,1,0,0,3829,0,4841,0,0
2,500,0,0,0,0,0,1,0,1,0,0,222,0,4229,0,0
3,21800,0,0,1,1,0,1,0,1,0,0,2185,0,648,0,0
4,644,0,0,0,0,1,0,0,1,0,0,3829,0,4841,0,0


In [23]:
# split data into train and test
from sklearn.model_selection import train_test_split
X = df.drop(['Fraud'], axis=1)
y = df['Fraud']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [24]:
# find the number of fraud and non-fraud cases in the training data
print('Number of fraud and not fraud cases in training data: \n', y_train.value_counts())
print('Number of fraud and not fraud cases in test data: \n', y_test.value_counts())



Number of fraud and not fraud cases in training data: 
 0    76372
1      157
Name: Fraud, dtype: int64
Number of fraud and not fraud cases in test data: 
 0    19097
1       36
Name: Fraud, dtype: int64


In [25]:
## import EasyEnsembleClassifier
from imblearn.ensemble import EasyEnsembleClassifier
# import AdaBoostClassifier
from sklearn.ensemble import AdaBoostClassifier

from sklearn.metrics import classification_report, confusion_matrix, f1_score

# create model
eec = EasyEnsembleClassifier(n_estimators=20, estimator=AdaBoostClassifier() , random_state=42, n_jobs=-1, verbose=0)
eec.fit(X_train, y_train)

# predict
y_pred = eec.predict(X_test)
print('Matrix confusion : \n',confusion_matrix(y_test, y_pred))
print('\n Classification report : \n',classification_report(y_test, y_pred))

Matrix confusion : 
 [[18817   280]
 [    0    36]]

 Classification report : 
               precision    recall  f1-score   support

           0       1.00      0.99      0.99     19097
           1       0.11      1.00      0.20        36

    accuracy                           0.99     19133
   macro avg       0.56      0.99      0.60     19133
weighted avg       1.00      0.99      0.99     19133



In [26]:
from sklearn.ensemble import GradientBoostingClassifier
# create model
eec = EasyEnsembleClassifier(n_estimators=50, estimator=GradientBoostingClassifier() , random_state=42, n_jobs=-1, verbose=0 )
eec.fit(X_train, y_train)

# predict
y_pred = eec.predict(X_test)
print('Matrix confusion : \n',confusion_matrix(y_test, y_pred))
print('\n Classification report : \n',classification_report(y_test, y_pred))

Matrix confusion : 
 [[18817   280]
 [    0    36]]

 Classification report : 
               precision    recall  f1-score   support

           0       1.00      0.99      0.99     19097
           1       0.11      1.00      0.20        36

    accuracy                           0.99     19133
   macro avg       0.56      0.99      0.60     19133
weighted avg       1.00      0.99      0.99     19133



In [27]:

# create model
eec_GBC = EasyEnsembleClassifier(n_estimators=40, estimator=GradientBoostingClassifier(), sampling_strategy=0.25 , random_state=42, n_jobs=-1, verbose=0 )
eec_GBC.fit(X_train, y_train)

# predict
y_pred = eec_GBC.predict(X_test)
print('Matrix confusion : \n',confusion_matrix(y_test, y_pred))
print('\n Classification report : \n',classification_report(y_test, y_pred))

Matrix confusion : 
 [[18986   111]
 [    0    36]]

 Classification report : 
               precision    recall  f1-score   support

           0       1.00      0.99      1.00     19097
           1       0.24      1.00      0.39        36

    accuracy                           0.99     19133
   macro avg       0.62      1.00      0.70     19133
weighted avg       1.00      0.99      1.00     19133



In [28]:
from sklearn.ensemble import RandomForestClassifier
# create model
rfc = EasyEnsembleClassifier(n_estimators=5, estimator=RandomForestClassifier(), sampling_strategy=0.05 , random_state=42, n_jobs=-1, verbose=0 )
rfc.fit(X_train, y_train)

# predict
y_pred = rfc.predict(X_test)
print('Matrix confusion : \n',confusion_matrix(y_test, y_pred))
print('\n Classification report : \n',classification_report(y_test, y_pred))

Matrix confusion : 
 [[19072    25]
 [    0    36]]

 Classification report : 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     19097
           1       0.59      1.00      0.74        36

    accuracy                           1.00     19133
   macro avg       0.80      1.00      0.87     19133
weighted avg       1.00      1.00      1.00     19133



In [29]:
from sklearn.ensemble import VotingClassifier

# create model
vc = VotingClassifier(estimators=[('eec', eec), ('rfc_und_over', rfc), ('eec_GBC',eec_GBC)], voting='hard')
vc.fit(X_train, y_train)

# predict
y_pred = vc.predict(X_test)
print('Matrix confusion : \n',confusion_matrix(y_test, y_pred))
print('\n Classification report : \n',classification_report(y_test, y_pred))


Matrix confusion : 
 [[18990   107]
 [    0    36]]

 Classification report : 
               precision    recall  f1-score   support

           0       1.00      0.99      1.00     19097
           1       0.25      1.00      0.40        36

    accuracy                           0.99     19133
   macro avg       0.63      1.00      0.70     19133
weighted avg       1.00      0.99      1.00     19133



In [30]:
# create model
eec = EasyEnsembleClassifier(n_estimators=20, estimator=AdaBoostClassifier(), sampling_strategy=0.25 , random_state=42, n_jobs=-1, verbose=0)
eec.fit(X_train, y_train)

# predict
y_pred = eec.predict(X_test)
print('Matrix confusion : \n',confusion_matrix(y_test, y_pred))
print('\n Classification report : \n',classification_report(y_test, y_pred))

Matrix confusion : 
 [[19006    91]
 [    0    36]]

 Classification report : 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     19097
           1       0.28      1.00      0.44        36

    accuracy                           1.00     19133
   macro avg       0.64      1.00      0.72     19133
weighted avg       1.00      1.00      1.00     19133



In [31]:
from sklearn.ensemble import RandomForestClassifier
# create model
rfc2 = EasyEnsembleClassifier(n_estimators=2, estimator=RandomForestClassifier(n_estimators=15 ), sampling_strategy=0.01, random_state=42, n_jobs=-1, verbose=0 )
rfc2.fit(X_train, y_train)

# predict
y_pred = rfc2.predict(X_test)
print('Matrix confusion : \n',confusion_matrix(y_test, y_pred))
print('\n Classification report : \n',classification_report(y_test, y_pred))

Matrix confusion : 
 [[19084    13]
 [    0    36]]

 Classification report : 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     19097
           1       0.73      1.00      0.85        36

    accuracy                           1.00     19133
   macro avg       0.87      1.00      0.92     19133
weighted avg       1.00      1.00      1.00     19133



## 2. Undersampling and Oversampling
a) Resample
- SMOTEENN : SMOTENC (Categorical and numerical features)+ ENN



...

In [32]:
X_train.shape

(76529, 15)

In [33]:
#import SMOTENC 
from imblearn.over_sampling import SMOTENC
#import ENN
from imblearn.under_sampling import EditedNearestNeighbours

#import SMOTEENN
from imblearn.combine import SMOTEENN

# create model of over and under sampling : SMOTENC and ENN
# categorical features : 'AccountId','SubscriptionId','CustomerId', 'ProductId','PricingStrategy','ProductCategory_airtime','ProductCategory_data_bundles','ProductCategory_financial_services','ProductCategory_movies','ProductCategory_other','ProductCategory_ticket','ProductCategory_transport','ProductCategory_tv','ProductCategory_utility_bill','ChannelId_1','ChannelId_2','ChannelId_3','ChannelId_5','ProviderId_1','ProviderId_2','ProviderId_3','ProviderId_4','ProviderId_5','ProviderId_6','Hour_1','Hour_2','Hour_3','Hour_4'
#smotenc= SMOTENC(categorical_features=['AccountId','SubscriptionId','CustomerId', 'ProductId','PricingStrategy','ProductCategory_airtime','ProductCategory_data_bundles','ProductCategory_financial_services','ProductCategory_movies','ProductCategory_other','ProductCategory_ticket','ProductCategory_transport','ProductCategory_tv','ProductCategory_utility_bill','ChannelId_1','ChannelId_2','ChannelId_3','ChannelId_5','ProviderId_1','ProviderId_2','ProviderId_3','ProviderId_4','ProviderId_5','ProviderId_6','Hour_1','Hour_2','Hour_3','Hour_4'], random_state=42)
#smotenc= SMOTENC(categorical_features=[0,1,2,3,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,23,24,25,26,27,28])
smotenc= SMOTENC(categorical_features=[ 'ProviderId_3', 'ProviderId_5', 'ProviderId_1', 'ProductCategory_utility_bill', 'ProductCategory_financial_services', 'ChannelId_3', 'Hour_3', 'Hour_4', 'ChannelId_1', 'PricingStrategy_4', 'SubscriptionId', 'ProductCategory_transport', 'AccountId', 'PricingStrategy_0'])
enn_model = EditedNearestNeighbours(sampling_strategy="auto", n_neighbors=3, kind_sel='all',n_jobs=-1)

smotenc_enn = SMOTEENN(sampling_strategy='not minority', smote=smotenc, enn=enn_model, n_jobs=-1)




In [34]:
#X_resampled_enn, y_resampled_enn = enn_model.fit_resample(X_train, y_train)

In [35]:
# print ('Shape of undersampled data : ', y_resampled_enn.value_counts())

In [36]:
#X_resampled_smotenc, y_resampled_smotenc = smotenc.fit_resample(X_train, y_train)
#print('Number of fraud and not fraud cases in training data: \n', y_resampled_smotenc.value_counts())

In [37]:
#
# create new X_resampled and y_resampled
# X_resampled, y_resampled = smotenc_enn.fit_resample(X_train, y_train)

# # find the number of fraud and non-fraud cases in the training data
# print('Number of fraud and not fraud cases in training data: \n', y_resampled.value_counts())

## 2. Undersampling and Oversampling
a) Resample

- RandomUnderSampler + SMOTEENN 

In [38]:
print (y_train.value_counts())

0    76372
1      157
Name: Fraud, dtype: int64


In [39]:
# import randomUnderSampler
from imblearn.under_sampling import RandomUnderSampler

# create model under sampling 
rus = RandomUnderSampler(sampling_strategy=0.04, random_state=42)

In [40]:
# fit and resample data
X_resampled_rus, y_resampled_rus = rus.fit_resample(X_train, y_train)

In [41]:
print ('value count of undersampled data : ', y_resampled_rus.value_counts())
print ('Shape of undersampled data : ', X_resampled_rus.shape)

value count of undersampled data :  0    3925
1     157
Name: Fraud, dtype: int64
Shape of undersampled data :  (4082, 15)


In [42]:
# Oversampling with SMOTENC fit and resample data
X_resampled_smotenc, y_resampled_smotenc = smotenc.fit_resample(X_resampled_rus, y_resampled_rus)

  [cat not in np.arange(self.n_features_) for cat in categorical_features]


ValueError: Some of the categorical indices are out of range. Indices should be between 0 and 14

In [None]:
print ('value count of undersampled data : ', y_resampled_smotenc.value_counts())
print ('Shape of undersampled data : ', X_resampled_smotenc.shape)

value count of undersampled data :  0    3925
1    3925
Name: Fraud, dtype: int64
Shape of undersampled data :  (7850, 29)


## 2. Undersampling and Oversampling
b) Model classifier
- random forest classifier

In [None]:
# create model random forest
rfc_und_over = RandomForestClassifier(n_estimators=30, random_state=42, n_jobs=-1, verbose=0)

# fit and predict
rfc_und_over.fit(X_resampled_smotenc, y_resampled_smotenc)
y_pred = rfc_und_over.predict(X_test)

print('Matrix confusion : \n',confusion_matrix(y_test, y_pred))
print('\n Classification report : \n',classification_report(y_test, y_pred))


Matrix confusion : 
 [[19058    39]
 [    1    35]]

 Classification report : 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     19097
           1       0.47      0.97      0.64        36

    accuracy                           1.00     19133
   macro avg       0.74      0.99      0.82     19133
weighted avg       1.00      1.00      1.00     19133



# II. Evaluation

- Prediction on test set

In [None]:
#import Data

df_test = pd.read_csv('../../Data/test_clean.csv')

df_test.head()


Unnamed: 0,TransactionId,BatchId,AccountId,SubscriptionId,CustomerId,CurrencyCode,CountryCode,ProviderId,ProductId,ProductCategory,ChannelId,Amount,Value,TransactionStartTime,PricingStrategy
0,TransactionId_50600,BatchId_35028,AccountId_2441,SubscriptionId_4426,CustomerId_2857,UGX,256,ProviderId_5,ProductId_3,airtime,ChannelId_3,1000.0,1000,2019-02-13T10:01:40Z,4
1,TransactionId_95109,BatchId_45139,AccountId_3439,SubscriptionId_2643,CustomerId_3874,UGX,256,ProviderId_5,ProductId_15,financial_services,ChannelId_3,2000.0,2000,2019-02-13T10:02:12Z,2
2,TransactionId_47357,BatchId_74887,AccountId_4841,SubscriptionId_3829,CustomerId_2857,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-50.0,50,2019-02-13T10:02:30Z,2
3,TransactionId_28185,BatchId_11025,AccountId_2685,SubscriptionId_4626,CustomerId_3105,UGX,256,ProviderId_5,ProductId_10,airtime,ChannelId_3,3000.0,3000,2019-02-13T10:02:38Z,4
4,TransactionId_22140,BatchId_29804,AccountId_4841,SubscriptionId_3829,CustomerId_3105,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-60.0,60,2019-02-13T10:02:58Z,2


In [None]:
print(df.shape)
print (df.columns)
print(df_test.shape)
print (df_test.columns)


(95662, 30)
Index(['AccountId', 'SubscriptionId', 'CustomerId', 'ProductId', 'Value',
       'PricingStrategy', 'ProductCategory_airtime',
       'ProductCategory_data_bundles', 'ProductCategory_financial_services',
       'ProductCategory_movies', 'ProductCategory_other',
       'ProductCategory_ticket', 'ProductCategory_transport',
       'ProductCategory_tv', 'ProductCategory_utility_bill', 'ChannelId_1',
       'ChannelId_2', 'ChannelId_3', 'ChannelId_5', 'ProviderId_1',
       'ProviderId_2', 'ProviderId_3', 'ProviderId_4', 'ProviderId_5',
       'ProviderId_6', 'Hour_1', 'Hour_2', 'Hour_3', 'Hour_4', 'Fraud'],
      dtype='object')
(45019, 30)
Index(['AccountId', 'SubscriptionId', 'CustomerId', 'ProductId', 'Value',
       'PricingStrategy', 'ProductCategory_airtime',
       'ProductCategory_data_bundles', 'ProductCategory_financial_services',
       'ProductCategory_movies', 'ProductCategory_retail',
       'ProductCategory_ticket', 'ProductCategory_transport',
       'ProductCa

In [None]:
y_pred_EEC_rfc2=rfc2.predict(df_test)
print (y_pred.value_counts())

y_pred_rfc=rfc_und_over.predict(df_test)
print (y_pred.value_counts())


Feature names unseen at fit time:
- ChannelId_4
- ProductCategory_retail
Feature names seen at fit time, yet now missing:
- ProductCategory_other



ValueError: X has 30 features, but EasyEnsembleClassifier is expecting 29 features as input.