# Importing important modules and loading Dataset

In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
import sklearn
from sklearn.metrics import classification_report

import joblib

import datetime
from datetime import date, datetime

In [2]:
train = pd.read_csv('../datasets/fraudTrain.csv')
test = pd.read_csv('../datasets/fraudTest.csv')

dataset = pd.concat([train, test], ignore_index = True)
dataset.drop(['Unnamed: 0'], axis=1, inplace=True)

# Preprocessing datasets

In [3]:
class PreprocessorML():
    def __init__(self, normalization=False):
        self.norm = normalization
    

    def one_hot_category(self, dataset):
        hot = ['gas_transport', 'grocery_pos', 'home', 'shopping_pos', 'kids_pets', 'shopping_net', 'entertainment',
               'food_dining', 'personal_care', 'health_fitness', 'misc_pos', 'misc_net', 'grocery_net', 'travel']
        
        for category in hot:
            dataset[category] = pd.Series([1 if x.category == category else 0 for x in dataset.itertuples()],
                                          index=dataset.index)
        
        return dataset


    def add_time(self, dataset):
        dataframe = dataset.sort_values(by=['cc_num', 'unix_time'])

        delta_time = []

        previous_row = dataframe.iloc[0]

        delta_time.append(0)

        for row in dataframe[1:].itertuples():

            if row.cc_num == previous_row.cc_num:
                delta_time.append(row.unix_time - previous_row.unix_time)
            else:
                delta_time.append(0)

            previous_row = row

        dataframe['delta_time'] = pd.Series(delta_time, index=dataframe.index)

        return dataframe


    def parse_time(self, string):
        return datetime.strptime(string, "%Y-%m-%d %H:%M:%S")


    def add_workhour_category(self, dataset):
        dataset['work_hours'] = dataset['trans_date_trans_time'].apply(
            lambda x: 
                int(self.parse_time(x).hour >= 6 and self.parse_time(x).hour <= 18))
        return dataset


    def add_weekend_category(self, dataset):
        dataset['weekend'] = dataset['trans_date_trans_time'].apply(
            lambda x: 
                int(self.parse_time(x).weekday() >= 5 and self.parse_time(x).weekday() <= 6))
        return dataset

    
    def add_age(self, dataset):
        dataset['age'] = dataset['dob'].apply(lambda x: (date.today() - date.fromisoformat(x)).days // 365)
        return dataset


    def add_distance(self, dataset):
        lat1 = dataset['lat']
        lon1 = dataset['long']
        lat2 = dataset['merch_lat']
        lon2 = dataset['merch_long']
        dataset['distance'] = np.arccos(np.sin(lat1) * np.sin(lat2) + np.cos(lat1) * np.cos(lat2) * np.cos(lon1 - lon2)) * 6371
        return dataset

    
    def add_gender(self, dataset):
        dataset['gender'] = pd.Categorical(dataset['gender'], categories=['F', 'M'])
        hot = pd.get_dummies(dataset['gender'], columns = ['F', 'M'])
        
        return dataset.join(hot)


    def add_weekday(self, dataset):
        dataset['weekday'] = dataset['trans_date_trans_time'].apply(
            lambda x: int(self.parse_time(x).weekday()))
        
        return dataset
    
    def add_hour(self, dataset):
        dataset['hour'] = dataset['trans_date_trans_time'].apply(
            lambda x: int(self.parse_time(x).hour))
        
        return dataset


    def preprocess(self, dataset, columns_to_delete=['cc_num', 
                      'city', 
                      'dob', 
                      'job', 
                      'first', 
                      'last',
                      'trans_date_trans_time',
                      'category',
                      'trans_num',
                      'lat',
                      'long',
                      'merch_lat',
                      'merch_long',
                      'unix_time',
                      'street',
                      'merchant',
                      'state',
                      'gender']):
        dataset = self.add_age(dataset)
        dataset = self.add_time(dataset)
        dataset = self.add_distance(dataset)
        dataset = self.one_hot_category(dataset)
        dataset = self.add_workhour_category(dataset)
        dataset = self.add_weekend_category(dataset)
        dataset = self.add_gender(dataset)
        print(type(dataset))
        dataset = self.add_hour(dataset)
        print(type(dataset))
        dataset = self.add_weekday(dataset)
        print(type(dataset))

        dataset = dataset.drop(columns_to_delete, axis = 1)

        return dataset

In [4]:
preprocessor = PreprocessorML()
dataset = preprocessor.preprocess(dataset)
dataset

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,amt,zip,city_pop,is_fraud,age,delta_time,distance,gas_transport,grocery_pos,home,...,misc_pos,misc_net,grocery_net,travel,work_hours,weekend,F,M,hour,weekday
1017,7.27,82514,1645,0,37,0,7543.700547,0,0,0,...,0,1,0,0,1,0,True,False,12,1
2724,52.94,82514,1645,0,37,71862,6264.188302,1,0,0,...,0,0,0,0,1,0,True,False,8,2
2726,82.08,82514,1645,0,37,159,939.418661,1,0,0,...,0,0,0,0,1,0,True,False,8,2
2882,34.79,82514,1645,0,37,13838,4937.313870,0,0,0,...,0,0,0,0,1,0,True,False,12,2
2907,27.18,82514,1645,0,37,1952,3941.877584,0,0,1,...,0,0,0,0,1,0,True,False,13,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1849259,66.11,61335,532,0,67,8077,3043.216367,1,0,0,...,0,0,0,0,0,0,False,True,2,3
1849567,4.58,61335,532,0,67,11005,5377.002667,0,0,0,...,0,1,0,0,0,0,False,True,5,3
1850234,95.96,61335,532,0,67,21729,2151.742759,1,0,0,...,0,0,0,0,1,0,False,True,11,3
1850235,149.48,61335,532,0,67,101,5406.444488,0,1,0,...,0,0,0,0,1,0,False,True,11,3


In [5]:
dataset.to_csv('../../models/dataset_ml_v2.csv', index=False)

# Fitting  Random Forest Tree

In [6]:
dataset = pd.read_csv('../../models/dataset_ml_v2.csv')

In [7]:
dataset

Unnamed: 0,amt,zip,city_pop,is_fraud,age,delta_time,distance,gas_transport,grocery_pos,home,...,misc_pos,misc_net,grocery_net,travel,work_hours,weekend,F,M,hour,weekday
0,7.27,82514,1645,0,37,0,7543.700547,0,0,0,...,0,1,0,0,1,0,True,False,12,1
1,52.94,82514,1645,0,37,71862,6264.188302,1,0,0,...,0,0,0,0,1,0,True,False,8,2
2,82.08,82514,1645,0,37,159,939.418661,1,0,0,...,0,0,0,0,1,0,True,False,8,2
3,34.79,82514,1645,0,37,13838,4937.313870,0,0,0,...,0,0,0,0,1,0,True,False,12,2
4,27.18,82514,1645,0,37,1952,3941.877584,0,0,1,...,0,0,0,0,1,0,True,False,13,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1852389,66.11,61335,532,0,67,8077,3043.216367,1,0,0,...,0,0,0,0,0,0,False,True,2,3
1852390,4.58,61335,532,0,67,11005,5377.002667,0,0,0,...,0,1,0,0,0,0,False,True,5,3
1852391,95.96,61335,532,0,67,21729,2151.742759,1,0,0,...,0,0,0,0,1,0,False,True,11,3
1852392,149.48,61335,532,0,67,101,5406.444488,0,1,0,...,0,0,0,0,1,0,False,True,11,3


In [8]:
y = dataset['is_fraud']
X = dataset.drop(['is_fraud'], axis=1)

X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
y_test

1541144    0
1731581    0
354659     0
1493788    0
468148     0
          ..
297820     0
1392794    0
1582515    0
1805770    0
774513     0
Name: is_fraud, Length: 370479, dtype: int64

In [10]:
tree = RandomForestRegressor(n_estimators=100, bootstrap=True, oob_score=True, n_jobs=-1)
preds = tree.fit(X_train, y_train)
pred = tree.predict(X_test)

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


KeyboardInterrupt: 

In [None]:
report = classification_report(y_test, [1 if x >= 0.5 else 0 for x in pred])

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


In [None]:
print(report)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00    368595
           1       0.94      0.82      0.87      1884

    accuracy                           1.00    370479
   macro avg       0.97      0.91      0.94    370479
weighted avg       1.00      1.00      1.00    370479



In [None]:
joblib.dump(tree, 'random_forest_regressor.ml')

['random_forest_regressor.ml']

# Fitting XGBoost tree