In [13]:
# import libraries
from pathlib import Path
import pandas as pd
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from sklearn.preprocessing import StandardScaler

In [2]:
# read in csv file
filepath = Path('PS_20174392719_1491204439457_log-5-1.csv')
aml_dataset= pd.read_csv(filepath)

In [3]:
# view df
aml_dataset.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,355,CASH_OUT,129493.82,C1386423433,496957.0,367463.18,C1136528453,0.0,129493.82,0,0
1,355,PAYMENT,5787.63,C1365884506,513408.0,507620.37,M1473495757,0.0,0.0,0,0
2,355,CASH_OUT,172499.24,C468847406,150346.0,0.0,C1947417438,0.0,172499.24,0,0
3,355,CASH_OUT,377391.01,C1354691509,30759.0,0.0,C1791454133,1409725.39,1697870.91,0,0
4,355,PAYMENT,12469.5,C1287195798,315.0,0.0,M87390822,0.0,0.0,0,0


In [4]:
aml_dataset.shape

(636262, 11)

## Cleaning and Prepping Dataset for Analysis

In [5]:
# drop unecessary columns
aml_dataset = aml_dataset.drop(columns=['nameOrig','nameDest','isFlaggedFraud'])
# review the df
aml_dataset.head()

Unnamed: 0,step,type,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud
0,355,CASH_OUT,129493.82,496957.0,367463.18,0.0,129493.82,0
1,355,PAYMENT,5787.63,513408.0,507620.37,0.0,0.0,0
2,355,CASH_OUT,172499.24,150346.0,0.0,0.0,172499.24,0
3,355,CASH_OUT,377391.01,30759.0,0.0,1409725.39,1697870.91,0
4,355,PAYMENT,12469.5,315.0,0.0,0.0,0.0,0


In [6]:
# using get_dummies function to encode type column to binary
aml_dataset = pd.get_dummies(aml_dataset)
# review the df
aml_dataset.head()

Unnamed: 0,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,type_CASH_IN,type_CASH_OUT,type_DEBIT,type_PAYMENT,type_TRANSFER
0,355,129493.82,496957.0,367463.18,0.0,129493.82,0,0,1,0,0,0
1,355,5787.63,513408.0,507620.37,0.0,0.0,0,0,0,0,1,0
2,355,172499.24,150346.0,0.0,0.0,172499.24,0,0,1,0,0,0
3,355,377391.01,30759.0,0.0,1409725.39,1697870.91,0,0,1,0,0,0
4,355,12469.5,315.0,0.0,0.0,0.0,0,0,0,0,1,0


## Splitting Data into Testing and Training sets

In [7]:
# separate the data into labels and features
# separate the y variable
y = aml_dataset['isFraud']

# separate the X variable
X = aml_dataset.drop(columns='isFraud')

In [8]:
# review the y variable 
y.head()

0    0
1    0
2    0
3    0
4    0
Name: isFraud, dtype: int64

In [9]:
# checking y for fraud vs not fraud
y.value_counts()

0    635772
1       490
Name: isFraud, dtype: int64

In [10]:
# review the x variable
X.head()

Unnamed: 0,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,type_CASH_IN,type_CASH_OUT,type_DEBIT,type_PAYMENT,type_TRANSFER
0,355,129493.82,496957.0,367463.18,0.0,129493.82,0,1,0,0,0
1,355,5787.63,513408.0,507620.37,0.0,0.0,0,0,0,1,0
2,355,172499.24,150346.0,0.0,0.0,172499.24,0,1,0,0,0
3,355,377391.01,30759.0,0.0,1409725.39,1697870.91,0,1,0,0,0
4,355,12469.5,315.0,0.0,0.0,0.0,0,0,0,1,0


## Prepping Variables and Data for Models

In [11]:
# Split the data using train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [24]:
# Create the StandardScaler instance
X_scaler = StandardScaler()

# Fit the scaler to the features training dataset
X_scaler.fit(X_train)

# Scale both the training and testing data from the features dataset
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

X_train_scaled

array([[-0.38670632, -0.03442918, -0.28938054, ..., -0.08289869,
        -0.71924374, -0.30311178],
       [ 0.31778955, -0.11357049, -0.28562729, ..., -0.08289869,
        -0.71924374, -0.30311178],
       [-0.38670632, -0.39211606, -0.26758961, ..., -0.08289869,
         1.39034927, -0.30311178],
       ...,
       [ 0.47434418, -0.3865021 , -0.27596878, ..., -0.08289869,
         1.39034927, -0.30311178],
       [ 1.17884005, -0.38056772, -0.26909172, ..., -0.08289869,
         1.39034927, -0.30311178],
       [-0.62153827,  3.19355382, -0.2720152 , ..., -0.08289869,
        -0.71924374,  3.29911292]])

In [15]:
# assign randomoversampler to a variable
random_oversampler = RandomOverSampler(random_state=1)

# Fit the original training data to the random_oversampler model
X_resampled, y_resampled = random_oversampler.fit_resample(X_train_scaled, y_train)

In [16]:
# check to ensure balanced data
y_resampled.value_counts()

1    476833
0    476833
Name: isFraud, dtype: int64

## AdaBoostClassifier Model

In [17]:
# instantiate adaboostclassifier
ada_model = AdaBoostClassifier(random_state = 1)

# fit the model using the training data
aml_model = ada_model.fit(X_resampled, y_resampled)

# make a prediction using the testing data
aml_pred = aml_model.predict(X_test_scaled)


In [18]:
# printing the evaluation metrics
print(balanced_accuracy_score(y_test, aml_pred))
confusion_matrix(y_test, aml_pred)
print(classification_report_imbalanced(y_test, aml_pred))

0.9885538219411965
                   pre       rec       spe        f1       geo       iba       sup

          0       1.00      0.98      0.99      0.99      0.99      0.98    158939
          1       0.05      0.99      0.98      0.10      0.99      0.98       127

avg / total       1.00      0.98      0.99      0.99      0.99      0.98    159066



## LinearSVC Model

In [19]:
# instantiate linearsvc model
linearsvc_model = LinearSVC(random_state=1)

#fit the model using the training data
aml_model = linearsvc_model.fit(X_resampled, y_resampled)

# make a prediction using the data
aml_pred = aml_model.predict(X_test_scaled)



In [20]:
# printing the evaluation metrics
print(balanced_accuracy_score(y_test, aml_pred))
print(classification_report_imbalanced(y_test, aml_pred))

0.9391389347460743
                   pre       rec       spe        f1       geo       iba       sup

          0       1.00      0.95      0.93      0.97      0.94      0.88    158939
          1       0.01      0.93      0.95      0.03      0.94      0.88       127

avg / total       1.00      0.95      0.93      0.97      0.94      0.88    159066



## Logistic Regression Model

In [21]:
# instantiate logisticregression model
lr_model = LogisticRegression(random_state=1)

# fit the model using the training data
aml_model = lr_model.fit(X_resampled, y_resampled)

# make a prediction using the data
aml_pred = aml_model.predict(X_test_scaled)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [22]:
# printing the evaluation metrics
print(balanced_accuracy_score(y_test, aml_pred))
print(classification_report_imbalanced(y_test, aml_pred))

0.9480793973699513
                   pre       rec       spe        f1       geo       iba       sup

          0       1.00      0.95      0.94      0.98      0.95      0.90    158939
          1       0.02      0.94      0.95      0.03      0.95      0.90       127

avg / total       1.00      0.95      0.94      0.97      0.95      0.90    159066

