# Import packages and Data

In [29]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline


df = pd.read_csv(r'C:\Users\miesn\Data_for_Sci-kit_Learn\Financial_Fraud.csv')

# Pre-processing DataFrame for Model

In [2]:
df.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


In [3]:
df= df.drop(['nameOrig','nameDest','isFlaggedFraud'],axis = 1)

In [4]:
df_fraud = df[df['isFraud'] == 1]
df_nofraud = df[df['isFraud'] == 0]

In [5]:
df_nofraud = df_nofraud.head(12000)
df= pd.concat([df_fraud, df_nofraud], axis = 0)

In [6]:
df['type']= df['type'].astype('category')
type_encode = LabelEncoder()
df['type']= type_encode.fit_transform(df.type)

In [7]:
type_one_hot = OneHotEncoder()
type_one_hot_encode = type_one_hot.fit_transform(df.type.values.reshape(-1,1)).toarray()
ohe_variable = pd.DataFrame(type_one_hot_encode, columns = ["type_"+str(int(i)) for i in range(type_one_hot_encode.shape[1])])
df = pd.concat([df, ohe_variable], axis=1)
df = df.drop('type', axis = 1)
df

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


Unnamed: 0,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,type_0,type_1,type_2,type_3,type_4
0,1.0,9839.64,170136.00,160296.36,0.00,0.00,0.0,0.0,0.0,0.0,0.0,1.0
1,1.0,1864.28,21249.00,19384.72,0.00,0.00,0.0,0.0,1.0,0.0,0.0,0.0
2,1.0,181.00,181.00,0.00,0.00,0.00,1.0,0.0,0.0,0.0,0.0,1.0
3,1.0,181.00,181.00,0.00,21182.00,0.00,1.0,0.0,1.0,0.0,0.0,0.0
4,1.0,11668.14,41554.00,29885.86,0.00,0.00,0.0,0.0,0.0,0.0,0.0,1.0
5,1.0,7817.71,53860.00,46042.29,0.00,0.00,0.0,0.0,1.0,0.0,0.0,0.0
6,1.0,7107.77,183195.00,176087.23,0.00,0.00,0.0,0.0,1.0,0.0,0.0,0.0
7,1.0,7861.64,176087.23,168225.59,0.00,0.00,0.0,0.0,0.0,0.0,0.0,1.0
8,1.0,4024.36,2671.00,0.00,0.00,0.00,0.0,0.0,1.0,0.0,0.0,0.0
9,1.0,5337.77,41720.00,36382.23,41898.00,40348.79,0.0,0.0,0.0,0.0,0.0,1.0


In [8]:
df.isnull().any()

step              True
amount            True
oldbalanceOrg     True
newbalanceOrig    True
oldbalanceDest    True
newbalanceDest    True
isFraud           True
type_0            True
type_1            True
type_2            True
type_3            True
type_4            True
dtype: bool

In [9]:
df = df.fillna(0)

In [10]:
df.to_csv('fraud_prediction.csv')

# Setting up data for Model

In [11]:
features = df.drop('isFraud', axis = 1).values
target = df['isFraud'].values

In [12]:
X_train, X_test, y_train, y_test = train_test_split(features,target,test_size = 0.3,random_state = 42,stratify = target)

In [13]:
knn_classifier = KNeighborsClassifier(n_neighbors=3)

knn_classifier.fit(X_train,y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=3, p=2,
           weights='uniform')

# Running Model

In [14]:
accuracy_score= knn_classifier.score(X_test,y_test)

print( "{0:.2%}".format(accuracy_score))

98.31%


# Refitting Model

In [22]:
grid = {'n_neighbors': np.arange(1,5)}

knn_classifier = KNeighborsClassifier()
knn= GridSearchCV(knn_classifier, grid, cv = 10)
knn.fit(X_train,y_train)

GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_neighbors': array([1, 2, 3, 4])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [25]:
knn.best_params_


{'n_neighbors': 1}

In [27]:
print( "{0:.2%}".format(knn.best_score_))

98.51%


In [32]:
pipeline_order = [('scaler', StandardScaler()),('knn',KNeighborsClassifier(n_neighbors = 1))]
pipeline = Pipeline(pipeline_order)

In [35]:
knn_classifier_scaled = pipeline.fit(X_train, y_train)
scaled_score = knn_classifier_scaled.score(X_test,y_test)

print( "{0:.2%}".format(scaled_score))

99.60%
