# Homework 4 - Fraud Prediction:

In [None]:
from google.colab import drive
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.svm import SVC
# Mount Google Drive at /content/drive; the CSV files loaded below are read from this mount point.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Load the Test and Train Sets:

In [None]:
# Load the train and test data from the mounted Google Drive folder.
# Both CSV files are read with the same delimiter and encoding.
csv_read_options = dict(delimiter=',', encoding='ISO-8859-1')
train_df = pd.read_csv('/content/drive/My Drive/training.csv', **csv_read_options)
test_df = pd.read_csv('/content/drive/My Drive/test.csv', **csv_read_options)

# EDA and preparation for training:

In [None]:
# Drop the columns with too many missing values from both frames.
sparse_columns = ['ANUMBER_02', 'ANUMBER_03', 'ANUMBER_04', 'ANUMBER_05']
train_df = train_df.drop(columns=sparse_columns)
test_df = test_df.drop(columns=sparse_columns)

# Encode categorical features as integer codes.
le = LabelEncoder()
# Each column is listed once; the original list repeated 'P_METHOD',
# 'P_CARD_TYPE' and 'DEL_DAY', which only caused redundant re-encoding.
categorial_features = ['O_EMAIL', 'O_TELEPHONE', 'FLAG_DIIDENTICAL', 'FLAG_NEWSLETTER',
              'CHK_LADR', 'CHK_RADR', 'CHK_KTO', 'CHK_CARD', 'CHK_COOKIE',
              'CHK_IP', 'FAIL_LPLZ', 'FAIL_LORT', 'FAIL_LPLZORTMATCH',
              'FAIL_RPLZ', 'FAIL_RORT', 'FAIL_RPLZORTMATCH', 'NEW_CUSTOMER',
              'P_METHOD', 'P_CARD_TYPE', 'DEL_DAY']

for feature in categorial_features:
  # Fit the encoder once on the values of BOTH frames so that a given category
  # is mapped to the same integer in train and test. Fitting separately on each
  # frame (as before) can assign different codes to the same category whenever
  # the two frames do not contain exactly the same set of values.
  le.fit(pd.concat([train_df[feature], test_df[feature]], ignore_index=True))
  train_df[feature] = le.transform(train_df[feature])
  test_df[feature] = le.transform(test_df[feature])

# Replace NaN values in 'REM_CUR' and 'REM_MAX' with 0.0 in both frames for better learning.
rem_columns = ['REM_CUR', 'REM_MAX']
for frame in (train_df, test_df):
  frame[rem_columns] = frame[rem_columns].fillna(0.0)

# Check whether there are NaN values left: print the NaN count per training column.
missing_per_column = train_df.isna().sum()
print(missing_per_column)


# Create new features:

# 1. Feature 'DEL_AMOUNT_DIFF':

In [None]:
# Create new feature 'DEL_AMOUNT_DIFF': 'DEL_AMOUNT' minus its mean.
# The mean is computed on the training data only and reused for the test data,
# so both frames are centred on the same reference value (centring the test
# frame on its own mean would shift the feature relative to what the model saw).
del_amount_mean = train_df['DEL_AMOUNT'].mean()
train_df['DEL_AMOUNT_DIFF'] = train_df['DEL_AMOUNT'] - del_amount_mean
test_df['DEL_AMOUNT_DIFF'] = test_df['DEL_AMOUNT'] - del_amount_mean

# Description:

The new feature 'DEL_AMOUNT_DIFF' is the difference between an order's 'DEL_AMOUNT' and the mean 'DEL_AMOUNT'. This feature can help to improve the performance because it shows how much an order deviates from the average, which could be related to the likelihood that a customer pays.

# 2. Feature 'DEL_VALUE_DIFF'

In [None]:
# Create new feature 'DEL_VALUE_DIFF': 'DEL_VALUE' minus its mean.
# The mean is computed on the training data only and reused for the test data,
# so both frames are centred on the same reference value (centring the test
# frame on its own mean would shift the feature relative to what the model saw).
del_value_mean = train_df['DEL_VALUE'].mean()
train_df['DEL_VALUE_DIFF'] = train_df['DEL_VALUE'] - del_value_mean
test_df['DEL_VALUE_DIFF'] = test_df['DEL_VALUE'] - del_value_mean

# Description: 

The new feature 'DEL_VALUE_DIFF' is the difference between an order's 'DEL_VALUE' and the mean 'DEL_VALUE'. It can help to improve the performance of the model because it shows how much a specific order's value varies from the average order. This could be related to the likelihood that a customer pays.

# 3. Feature: 'SESSION_TIME_DIFF'

In [None]:
# Create new feature 'SESSION_TIME_DIFF': 'SESSION_TIME' minus its mean.
# The mean is computed on the training data only and reused for the test data,
# so both frames are centred on the same reference value (centring the test
# frame on its own mean would shift the feature relative to what the model saw).
session_time_mean = train_df['SESSION_TIME'].mean()
train_df['SESSION_TIME_DIFF'] = train_df['SESSION_TIME'] - session_time_mean
test_df['SESSION_TIME_DIFF'] = test_df['SESSION_TIME'] - session_time_mean

# Description:

The new feature 'SESSION_TIME_DIFF' compares an order's session time to the average session time. This can show whether an order was placed much faster or slower than usual, which may indicate that the customer made a very quick choice or picked the items very carefully.

# 4. Feature 'VALUE_AMOUNT_CMP'

In [None]:
# Create new feature 'VALUE_AMOUNT_CMP': this order's average value per item
# compared with the customer's overall average value per item.
def _value_amount_cmp(df):
  """Return (DEL_VALUE / DEL_AMOUNT) - (DEL_VALUE_TOTAL / DEL_AMOUNT_TOTAL) per row.

  Division by zero yields +/-inf (or NaN for 0/0), which fillna() alone does not
  catch and which would otherwise reach the model as non-finite input. Both
  ratios are therefore cleaned of infinities first. Rows that had a finite
  result before are unchanged.

  Fallbacks:
    * customer average undefined -> use the order's own value per item
      (same fallback as the original fillna).
    * order average undefined as well -> 0.0, i.e. "no deviation".
  """
  order_avg = (df['DEL_VALUE'] / df['DEL_AMOUNT']).replace([np.inf, -np.inf], np.nan)
  customer_avg = (df['DEL_VALUE_TOTAL'] / df['DEL_AMOUNT_TOTAL']).replace([np.inf, -np.inf], np.nan)
  return (order_avg - customer_avg).fillna(order_avg).fillna(0.0)


# Apply the identical computation to both frames.
train_df['VALUE_AMOUNT_CMP'] = _value_amount_cmp(train_df)
test_df['VALUE_AMOUNT_CMP'] = _value_amount_cmp(test_df)

# Description:

The new feature 'VALUE_AMOUNT_CMP' compares this order's average value per item to this customer's overall average value per item. This can help to identify anomalous behaviour that could indicate fraud, so the feature can help to improve the performance at predicting fraud.

# Create test and train sets:

In [None]:
#Create test and train sets
# Features: every training column except the label; label: 'TARGET_FRAUD'.
X_train = train_df.drop(['TARGET_FRAUD'], axis=1)
y_train = train_df['TARGET_FRAUD']
# Select exactly the training feature columns, in training order. This yields a
# new frame instead of an alias of test_df, so the 'prediction' column that is
# later added to test_df never leaks into the model input (which would break a
# re-run of the prediction cell), and a column missing from the test data fails
# here with a clear KeyError.
# NOTE(review): 'Id' appears to remain among the features - confirm this is intended.
X_test = test_df[X_train.columns]

# Hyperparameter Tuning:

In [None]:
# NOTE: The grid search in this cell is disabled (every statement is commented out).
# The model cell further down hard-codes C=100, penalty='l1', solver='liblinear'
# instead of reading them from grid_search.best_params_. Uncomment the code to re-run the search.

# Define the hyperparameters to tune
#hyperparameters = {
    #'penalty': ['l1', 'l2'],
    #'C': [0.001, 0.01, 0.1, 1, 10, 100],
    #'solver': ['liblinear']
#}

# Initialize the model
#model = LogisticRegression()

# Use GridSearchCV to tune the hyperparameters
#grid_search = GridSearchCV(model, hyperparameters, scoring='roc_auc', cv=5)
#grid_search.fit(X_train, y_train)

# Print the best hyperparameters
#print(f'Best hyperparameters: {grid_search.best_params_}')

Best hyperparameters: {'C': 100, 'penalty': 'l1', 'solver': 'liblinear'}


# Result:

In [None]:
#Create the model
# Hyperparameters match the recorded grid-search result. random_state is fixed
# because the liblinear solver shuffles the data, so repeated runs now produce
# the same predictions.
model = LogisticRegression(C = 100, penalty = 'l1', solver = 'liblinear', random_state = 42)

#fit the model
model.fit(X_train, y_train)

#predict
# Restrict the input to the training feature columns (in training order): if
# X_test is the same object as test_df, it gains the 'prediction' column below
# and a re-run of this cell would otherwise fail on the extra feature.
# Column 1 of predict_proba belongs to model.classes_[1]
# (presumably fraud == 1 - verify against the label encoding).
test_df['prediction'] = model.predict_proba(X_test[X_train.columns])[:, 1]

#Create csv
# Write only the row identifier and the predicted probability, without the index.
test_df[['Id', 'prediction']].to_csv('predictions_target_fraud.csv', index = False)