In [26]:
#import everything for data preprocessing
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# import everything for model building
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score, roc_auc_score, roc_curve



# Binary 0/1 columns (the `fraud` target included); standardDev leaves these untouched.
CAT_COL = [
    "repeat_retailer",
    "used_chip",
    "used_pin_number",
    "online_order",
    "fraud",
]

# Continuous-valued feature columns.
NON_CAT_COL = [
    "distance_from_home",
    "distance_from_last_transaction",
    "ratio_to_median_purchase_price",
]


## Read Data

In [13]:
dataSet = pd.read_csv('card_transdata.csv')

# Show the distinct values present in each column (before missing rows are removed)
for col in dataSet:
    print(col, dataSet[col].unique())

# Discard every row that contains a missing value
dataSet = dataSet.dropna()

# Number of transactions with no chip, no PIN and not ordered online.
# The notebook interprets this combination as a wireless payment.
print(len(dataSet.query("used_chip == 0 and used_pin_number == 0 and online_order == 0")))

# Number of transactions with chip, no PIN and ordered online.
# The notebook interprets this combination as an NFC payment.
print(len(dataSet.query("used_chip == 1 and used_pin_number == 0 and online_order == 1")))




distance_from_home [57.87785658 10.8299427   5.09107949 ...  2.91485699  4.25872939
 58.10812496]
distance_from_last_transaction [0.31114001 0.1755915  0.80515259 ... 1.47268669 0.24202337 0.31811012]
ratio_to_median_purchase_price [1.94593998 1.29421881 0.42771456 ... 0.21807549 0.47582206 0.38691985]
repeat_retailer [1. 0.]
used_chip [1. 0.]
used_pin_number [0. 1.]
online_order [0. 1.]
fraud [0. 1.]
204124
205221


## Handle Outliers

In [7]:
def handle_outliers(df, columns=None, lower_pct=1, upper_pct=99):
    """Winsorize continuous columns of ``df`` in place.

    Each selected column is clipped to its own ``lower_pct`` / ``upper_pct``
    percentiles (NaNs are ignored when the percentiles are computed).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify; the selected columns are overwritten in place.
    columns : str or iterable of str, optional
        Columns to winsorize.  Defaults to every numeric column that is not
        listed in ``CAT_COL``: clipping a 0/1 flag (or the target) at its
        percentiles would erase a class that is rarer than the cut-off.
    lower_pct, upper_pct : float, default 1 and 99
        Percentiles (0-100) used as the lower and upper clipping bounds.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned so the call can be chained.
    """
    if columns is None:
        numeric_names = df.select_dtypes(include=[np.number]).columns
        columns = [name for name in numeric_names if name not in CAT_COL]
    elif isinstance(columns, str):
        columns = [columns]
    else:
        columns = list(columns)

    # Nothing to clip (e.g. a frame holding only categorical columns)
    if not columns:
        return df

    selected = df[columns]

    # Row 0 holds the lower bound of each column, row 1 the upper bound
    lower, upper = np.nanpercentile(
        selected.to_numpy(dtype=float), [lower_pct, upper_pct], axis=0
    )

    # axis=1 aligns the per-column bounds with the frame's columns
    df[columns] = selected.clip(lower=lower, upper=upper, axis=1)
    return df
    

## Standardize features (z-score scaling)

In [16]:
def standardDev(df, cat_cols=None):
    """Return a z-score standardized copy of ``df``.

    Every column not listed in ``cat_cols`` is replaced by
    ``(x - mean) / std`` using pandas' sample standard deviation.  The input
    frame itself is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose non-categorical columns are standardized.
    cat_cols : iterable of str, optional
        Columns to leave untouched.  Defaults to the notebook-level
        ``CAT_COL`` list.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the standardized columns.
    """
    if cat_cols is None:
        cat_cols = CAT_COL
    skipped = set(cat_cols)

    result = df.copy()
    for feature_name in df.columns:
        if feature_name in skipped:
            continue
        column = df[feature_name]
        std_value = column.std()
        # A constant column has std 0 and a single-row column has std NaN;
        # dividing by either would turn the whole column into NaN/inf.
        # Fall back to 1 so such a column is only centred.
        if not std_value > 0:
            std_value = 1.0
        result[feature_name] = (column - column.mean()) / std_value
    return result

## Split the data

In [27]:
# Split BEFORE scaling: computing the mean/std on the full data set would leak
# test-set statistics into the training features.
X_train, X_test, y_train, y_test = train_test_split(
    dataSet.drop(columns='fraud'), dataSet['fraud'], test_size=0.3, random_state=42
)

# Work on independent copies so the column assignments below do not write
# into (or warn about) slices of dataSet.
X_train, X_test = X_train.copy(), X_test.copy()

# Learn the scaling parameters from the training rows only, then apply the
# same transformation to both partitions. The binary columns are left as-is.
# NOTE: dataSet itself is intentionally left unscaled, so re-running this cell
# always starts from the same input.
scaler = StandardScaler().fit(X_train[NON_CAT_COL])
X_train[NON_CAT_COL] = scaler.transform(X_train[NON_CAT_COL])
X_test[NON_CAT_COL] = scaler.transform(X_test[NON_CAT_COL])




## Logistic regression

In [22]:
# Train a logistic-regression classifier on the training split
logisticRegr = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Predict the held-out rows and preview the first ten labels
predictions = logisticRegr.predict(X_test)
print("logistic regression predictions: ", predictions[:10])

# Test-set accuracy (the same quantity a classifier's .score() reports),
# computed from the predictions already made above
score = accuracy_score(y_test, predictions)
print("Logisitic regression score: ", score*100, "%")


Logisitic regression score:  95.84899999999999 %
