In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
df = pd.read_csv('fraudTrain.csv')

In [2]:
#Feature Selection/Extraction: 

df['trans_date_trans_time']= df['trans_date_trans_time'].str[10:13]
#taking only the hour time from this format: 'M/D/YYYY  HH:MM:SS AM/PM'

df = df.rename(columns={'trans_date_trans_time': 'trans_hour'})

print(df['trans_hour'].unique())

[' 00' ' 01' ' 02' ' 03' ' 04' ' 05' ' 06' ' 07' ' 08' ' 09' ' 10' ' 11'
 ' 12' ' 13' ' 14' ' 15' ' 16' ' 17' ' 18' ' 19' ' 20' ' 21' ' 22' ' 23']


In [3]:
df = df.drop(columns=['cc_num', 'merchant', 'first', 'last', 'street', 'city', 'state', 'zip', 'city_pop','job', 'trans_num', 'unix_time' ])  

In [4]:
df['lat'] = abs(df['lat'] - df['merch_lat'])
df['long'] = abs(df['long'] - df['merch_long'])

df = df.rename(columns = {'long': 'long_distance', 'lat': 'lat_distance'})

df = df.drop(columns=['merch_lat'])
df = df.drop(columns=['merch_long'])
df = df.drop(columns=['Unnamed: 0'])

In [5]:
df['dob'] = df['dob'].str[0:4] 
#taking only the birth year from this format: 'M/D/YYYY' so we can get the age of the card holder

df['dob'] = df['dob'].astype(int) 
#turning the column to integer values so we can get the age(performing a column type transformation)

df['dob'] = 2020 - df['dob'] #because the dataset has transacions up until no more than 2020
df = df.rename(columns={'dob': 'age'})

In [6]:
#Encoding: 
from sklearn.preprocessing import LabelEncoder
print('number of different hours: ', len(df['trans_hour'].unique()), '==>',df['trans_hour'].unique() )
#Now we can use label encoding on the trans_hour column

le=LabelEncoder()
df['trans_hour'] = le.fit_transform(df['trans_hour'])
#we then renamed the column to trans_hour

number of different hours:  24 ==> [' 00' ' 01' ' 02' ' 03' ' 04' ' 05' ' 06' ' 07' ' 08' ' 09' ' 10' ' 11'
 ' 12' ' 13' ' 14' ' 15' ' 16' ' 17' ' 18' ' 19' ' 20' ' 21' ' 22' ' 23']


In [7]:
import category_encoders as ce
print('number of different categories: ', len(df['category'].unique()))
#Performing binary encoding on the 'category' column
encoder = ce.BinaryEncoder(cols=['category'])

df_binary=encoder.fit_transform(df['category'])

df = pd.concat([df, df_binary], axis=1) #this line is to add the new encoded columns inside the dataframe

df = df.drop(columns=['category'])#dropping the main category column after adding the new encoded columns inside df

number of different categories:  14


In [8]:
import pandas as pd
print('number of different genders: ', len(df['gender'].unique()), '==>', df['gender'].unique() )
#Now we can use one hot encoding on the gender column 

df = pd.get_dummies(df, columns=['gender'], prefix = 'gender')

number of different genders:  2 ==> ['F' 'M']


In [9]:
df.head()

Unnamed: 0,trans_hour,amt,lat_distance,long_distance,age,is_fraud,category_0,category_1,category_2,category_3,gender_F,gender_M
0,0,4.97,0.067507,0.870215,32,0,0,0,0,1,1,0
1,0,107.23,0.271247,0.024038,42,0,0,0,1,0,1,0
2,0,220.11,0.969904,0.107519,58,0,0,0,1,1,0,1
3,0,45.0,0.803731,0.447271,53,0,0,1,0,0,0,1
4,0,41.96,0.254299,0.830441,34,0,0,1,0,1,0,1


In [10]:
from time import time
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# splitting into the fraud and legit datasets
fraud_dataSet = df[df.is_fraud == 1]
legit_dataSet = df[df.is_fraud == 0]

# building a legit dataset in the length of the frauds (7506), so it will be 7506 frauds and 7506 legits
legit_sample = legit_dataSet.sample(n=7506) #7506

# concatenating the two legit(7506) and fraud(7506) datasets(15012)
new_dataset = pd.concat([legit_sample,fraud_dataSet], axis=0)

X = new_dataset.drop(columns='is_fraud',axis=1)
y = new_dataset['is_fraud']

Start_Time = time()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)

model = RandomForestClassifier(n_estimators=300, max_depth=20,min_samples_split=1, min_samples_leaf= 1, random_state=42)

#training the Decision Tree Model with Training Data
model.fit(X_train,y_train)

# Predictions on the test data
y_test_pred = model.predict(X_test)

# Calculate evaluation metrics for test data (you can keep this part as is)
test_accuracy = accuracy_score(y_test, y_test_pred)
test_precision = precision_score(y_test, y_test_pred, average='macro')
test_recall = recall_score(y_test, y_test_pred, average='macro')
test_f1 = f1_score(y_test, y_test_pred, average='macro')
test_auc_roc = roc_auc_score(y_test, y_test_pred)

End_Time = time()

print('Test Data Metrics:')
print('Test Accuracy: ', test_accuracy)
print('Test Precision: ', test_precision)
print('Test Recall: ', test_recall)
print('Test F1: ', test_f1)
print('Test AUC-ROC: ', test_auc_roc)

print('time: ', End_Time-Start_Time)

Test Data Metrics:
Test Accuracy:  0.9686944937833037
Test Precision:  0.9689164927207042
Test Recall:  0.9686944937833037
Test F1:  0.9686907881065578
Test AUC-ROC:  0.9686944937833036
time:  0.33054447174072266


In [11]:
X

Unnamed: 0,trans_hour,amt,lat_distance,long_distance,age,category_0,category_1,category_2,category_3,gender_F,gender_M
572082,13,71.34,0.103084,0.315114,47,0,0,1,1,0,1
160892,6,76.78,0.922326,0.011214,44,0,0,0,1,1,0
844799,16,87.56,0.523307,0.673423,19,0,0,1,1,0,1
825520,16,11.33,0.141052,0.360211,50,1,1,1,0,1,0
695080,19,21.63,0.595633,0.213326,84,1,0,1,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...
1295399,1,977.01,0.744314,0.431289,34,0,1,1,1,1,0
1295491,1,1210.91,0.583707,0.547452,34,0,1,1,1,1,0
1295532,2,10.24,0.758544,0.458932,26,0,1,0,0,0,1
1295666,3,21.69,0.815372,0.743649,51,0,1,0,0,1,0


In [12]:
y

572082     0
160892     0
844799     0
825520     0
695080     0
          ..
1295399    1
1295491    1
1295532    1
1295666    1
1295733    1
Name: is_fraud, Length: 15012, dtype: int64

In [13]:
model.predict([[22,5203,0.744314,0.431289,34,0,1,0,0,1,0]])



array([1], dtype=int64)

In [14]:
import joblib
joblib.dump(model,"model_joblib.pkl")

['model_joblib.pkl']

In [15]:
classifier = joblib.load('model_joblib.pkl')

In [16]:
new_obs = [[22,5203,0.744314,0.431289,34,0,1,0,0,1,0]]

In [17]:
classifier.predict(new_obs)



array([1], dtype=int64)

In [18]:
from flask import Flask, request, jsonify
import joblib
app = Flask(__name__)

In [19]:
@app.route('/predict', methods=['POST'])
def predict():
    try:
        data = request.json
        prediction = model.predict([data['features']])
        return jsonify({'prediction':int(prediction[0])})
    except Exception as e:
        return jsonify({'error': str(e)})

In [None]:
def run_app():
    # Disable reloader and debugger
    app.run(port=5000, debug=False, use_reloader=False)
if __name__ == '__main__':
        run_app()

 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
Press CTRL+C to quit
127.0.0.1 - - [01/Dec/2023 16:51:51] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [01/Dec/2023 16:51:55] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [01/Dec/2023 16:52:43] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [01/Dec/2023 16:52:44] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [01/Dec/2023 16:53:55] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [01/Dec/2023 16:53:56] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [01/Dec/2023 16:54:00] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [01/Dec/2023 16:55:01] "POST /predict HTTP/1.1" 200 -
