In [1]:
import pandas as pd
import os
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler, MaxAbsScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import tree
import numpy as np

train = pd.read_csv('HW3_training.csv')

## Loading and preping data for Linear and Logistic Regressions

In [None]:
df_model = train.copy()

# filters fraudulent observations
fraud = df_model[df_model['isFraud'] == 1]
# samples n observations from original data
df_model = df_model.sample(200000)
# removes those identical fraudulent data from df_model
df_model = df_model[df_model['isFraud']==0]
df_model = df_model.append(fraud)

# creates x and y variables
x = df_model.drop(columns=['isFraud'])
x = pd.get_dummies(x, columns=['type'], drop_first=True) # create dummies
y = df_model['isFraud']
# split data into test and train
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=4)

### Linear Model

In [None]:
# creates Linear Regression obj and runs predict on x_test
lin_model = LinearRegression().fit(x_train,y_train)
pred = lin_model.predict(x_test)

In [None]:
# Sets the threshold to where I change isFraud to 0 or 1
threshold=1
indx = 0
for i in range(len(x_test)):
    # if isFraud = 1
    if y_test.iloc[i] == 1:
        # and the threshold is greater than prediction where isFraud = 1
        if threshold >= pred[i] > 0:
            threshold = pred[i] + (threshold/8)+0.02
            indx = i

In [None]:
# converts to 0 or 1
for i in range(len(pred)):
    if pred[i] >= threshold:
        pred[i]=1
    else:
        pred[i] = 0

In [None]:
print( sum(pred==y_test)/len(y_test))
f1_score(y_test, pred)

#### Linear Regression is terrible at predicting fraud

### Logistic Regression

In [None]:
logit_model = LogisticRegression().fit(x_train,y_train)
y_pred = logit_model.predict(x_test)

In [None]:
print( sum(y_pred==y_test)/len(y_test))
f1_score(y_test, y_pred)

### KNN

#### Standard KNN

In [None]:
# f1 score and K value
F1 = [0, -1]

# tests different k neighbors and stores the best performing values
for i in range(1, 12):
    knn_model = KNeighborsClassifier(i)
    knn_model.fit(x_train, y_train)
    y_pred = knn_model.predict(x_test)
    f1 = f1_score(y_test, y_pred)
    if f1 > F1[0]:
        F1[0] = f1
        F1[1] = i

In [None]:
print( sum(y_pred==y_test)/len(y_test))
F1

#### Standard Scaling KNN

In [None]:
df_model = train.copy()

# filters fraudulent observations
fraud = df_model[df_model['isFraud'] == 1]
# samples n observations from original data
df_model = df_model.sample(200000)
# removes those identical fraudulent data from df_model
df_model = df_model[df_model['isFraud']==0]
df_model = df_model.append(fraud)

#Rescaling features
scaler=StandardScaler()
features = [['amount','oldbalanceOrg','newbalanceOrig', 'oldbalanceDest', 'newbalanceDest']]
for feature in features:
    df_model[feature] = scaler.fit_transform(df_model[feature])

x = df_model.drop(columns=['isFraud'])
x = pd.get_dummies(x, columns=['type'], drop_first=True) # create dummies
y = df_model['isFraud']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=4)

# f1 score and K value
F1 = [0, -1]

# tests different k neighbors and stores the best performing values
for i in range(1,12):
    knn = KNeighborsClassifier(i)
    knn.fit(x_train,y_train)
    y_pred=knn.predict(x_test)
    f1 = f1_score(y_test, y_pred)
    if f1 > F1[0]:
        F1[0] = f1
        F1[1] = i

In [None]:
print( sum(y_pred==y_test)/len(y_test))
F1

#### Robust Scaling KNN

In [None]:
df_model = train.copy()

# filters fraudulent observations
fraud = df_model[df_model['isFraud'] == 1]
# samples n observations from original data
df_model = df_model.sample(200000)
# removes those identical fraudulent data from df_model
df_model = df_model[df_model['isFraud']==0]
df_model = df_model.append(fraud)

#Rescaling features
scaler=RobustScaler()
features = [['amount','oldbalanceOrg','newbalanceOrig', 'oldbalanceDest', 'newbalanceDest']]
for feature in features:
    df_model[feature] = scaler.fit_transform(df_model[feature])

x = df_model.drop(columns=['isFraud'])
x = pd.get_dummies(x, columns=['type'], drop_first=True) # create dummies
y = df_model['isFraud']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=4)

# f1 score and K value
F1 = [0, -1]

# tests different k neighbors and stores the best performing values
for i in range(1, 12):
    knn = KNeighborsClassifier(i)
    knn.fit(x_train,y_train)
    y_pred=knn.predict(x_test)
    f1 = f1_score(y_test, y_pred)
    if f1 > F1[0]:
        F1[0] = f1
        F1[1] = i

In [None]:
print( sum(y_pred==y_test)/len(y_test))
F1

#### MinMax KNN

In [None]:
df_model = train.copy()

# filters fraudulent observations
fraud = df_model[df_model['isFraud'] == 1]
# samples n observations from original data
df_model = df_model.sample(200000)
# removes those identical fraudulent data from df_model
df_model = df_model[df_model['isFraud']==0]
df_model = df_model.append(fraud)

#Rescaling features
scaler=MinMaxScaler()
features = [['amount','oldbalanceOrg','newbalanceOrig', 'oldbalanceDest', 'newbalanceDest']]
for feature in features:
    df_model[feature] = scaler.fit_transform(df_model[feature])

x = df_model.drop(columns=['isFraud'])
x = pd.get_dummies(x, columns=['type'], drop_first=True) # create dummies
y = df_model['isFraud']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=4)

# f1 score and K value
F1 = [0, -1]

# tests different k neighbors and stores the best performing values
for i in range(1, 12):
    knn = KNeighborsClassifier(i)
    knn.fit(x_train,y_train)
    y_pred=knn.predict(x_test)
    f1 = f1_score(y_test, y_pred)
    if f1 > F1[0]:
        F1[0] = f1
        F1[1] = i

In [None]:
print( sum(y_pred==y_test)/len(y_test))
F1

#### MaxAbs Scaling

In [None]:
df_model = train.copy()

# filters fraudulent observations
fraud = df_model[df_model['isFraud'] == 1]
# samples n observations from original data
df_model = df_model.sample(200000)
# removes those identical fraudulent data from df_model
df_model = df_model[df_model['isFraud']==0]
df_model = df_model.append(fraud)

#Rescaling features
scaler=MaxAbsScaler()
features = [['amount','oldbalanceOrg','newbalanceOrig', 'oldbalanceDest', 'newbalanceDest']]
for feature in features:
    df_model[feature] = scaler.fit_transform(df_model[feature])

x = df_model.drop(columns=['isFraud'])
x = pd.get_dummies(x, columns=['type'], drop_first=True) # create dummies
y = df_model['isFraud']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=4)

# f1 score and K value
F1 = [0, -1]

# tests different k neighbors and stores the best performing values
for i in range(1, 12):
    knn = KNeighborsClassifier(i)
    knn.fit(x_train,y_train)
    y_pred=knn.predict(x_test)
    f1 = f1_score(y_test, y_pred)
    if f1 > F1[0]:
        F1[0] = f1
        F1[1] = i

In [None]:
print( sum(y_pred==y_test)/len(y_test))
F1

##### KNN with Robust Scaling yields the best performance with K = 2

### Trees

#### Standard Scaling

In [32]:
df_model = train.copy()

# filters fraudulent observations
fraud = df_model[df_model['isFraud'] == 1]
# samples n observations from original data
df_model = df_model.sample(190000)
# removes those identical fraudulent data from df_model
df_model = df_model[df_model['isFraud']==0]
df_model = df_model.append(fraud)

scaler=StandardScaler()
features = [['amount','oldbalanceOrg','newbalanceOrig', 'oldbalanceDest', 'newbalanceDest']]
for feature in features:
    df_model[feature] = scaler.fit_transform(df_model[feature])

x = df_model.drop(columns=['isFraud'])
x = pd.get_dummies(x, columns=['type'], drop_first=True) # create dummies
y = df_model['isFraud']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=4)

F1 = [0, -1]
best_standard_model = None

# tests different depths and stores the best performing values
for i in range(1, 120):
    tree_classifier = tree.DecisionTreeClassifier(max_depth=i)
    tree_model = tree_classifier.fit(x_train, y_train)
    y_pred = tree_model.predict(x_test)
    f1 = f1_score(y_test, y_pred)
    if f1 > F1[0]:
        F1[0] = f1
        F1[1] = i
        best_standard_model = tree_model

F1

  df_model = df_model.append(fraud)


[0.8372093023255814, 17]

#### Robust Scaling

In [44]:
df_model = train.copy()

# filters fraudulent observations
fraud = df_model[df_model['isFraud'] == 1]
# samples n observations from original data
df_model = df_model.sample(170000)
# removes those identical fraudulent data from df_model
df_model = df_model[df_model['isFraud']==0]
df_model = df_model.append(fraud)


scaler=RobustScaler()
features = [['amount','oldbalanceOrg','newbalanceOrig', 'oldbalanceDest', 'newbalanceDest']]
for feature in features:
    df_model[feature] = scaler.fit_transform(df_model[feature])

x = df_model.drop(columns=['isFraud'])
x = pd.get_dummies(x, columns=['type'], drop_first=True) # create dummies
y = df_model['isFraud']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=4)

F1 = [0, -1]
best_robust_model = None

# tests different depths and stores the best performing values
for i in range(1, 120):
    tree_classifier = tree.DecisionTreeClassifier(max_depth=i)
    tree_model = tree_classifier.fit(x_train, y_train)
    y_pred = tree_model.predict(x_test)
    f1 = f1_score(y_test, y_pred)
    if f1 > F1[0]:
        F1[0] = f1
        F1[1] = i
        best_robust_model = tree_model
F1

  df_model = df_model.append(fraud)


[0.8513119533527697, 19]

#### MinMax Scaling

In [18]:
df_model = train.copy()

# filters fraudulent observations
fraud = df_model[df_model['isFraud'] == 1]
# samples n observations from original data
df_model = df_model.sample(175000)
# removes those identical fraudulent data from df_model
df_model = df_model[df_model['isFraud']==0]
df_model = df_model.append(fraud)


scaler=MinMaxScaler()
features = [['amount','oldbalanceOrg','newbalanceOrig', 'oldbalanceDest', 'newbalanceDest']]
for feature in features:
    df_model[feature] = scaler.fit_transform(df_model[feature])

x = df_model.drop(columns=['isFraud'])
x = pd.get_dummies(x, columns=['type'], drop_first=True) # create dummies
y = df_model['isFraud']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=4)

F1 = [0, -1]
best_MinMax_model = None

# tests different depths and stores the best performing values
for i in range(1, 120):
    tree_classifier = tree.DecisionTreeClassifier(max_depth=i)
    tree_model = tree_classifier.fit(x_train, y_train)
    y_pred = tree_model.predict(x_test)
    f1 = f1_score(y_test, y_pred)
    if f1 > F1[0]:
        F1[0] = f1
        F1[1] = i
        best_MinMax_model = tree_model
F1

  df_model = df_model.append(fraud)


[0.8535825545171339, 93]

#### MaxAbs Scaling

In [14]:
df_model = train.copy()

# filters fraudulent observations
fraud = df_model[df_model['isFraud'] == 1]
# samples n observations from original data
df_model = df_model.sample(172000)
# removes those identical fraudulent data from df_model
df_model = df_model[df_model['isFraud']==0]
df_model = df_model.append(fraud)

scaler=MaxAbsScaler()
features = [['amount','oldbalanceOrg','newbalanceOrig', 'oldbalanceDest', 'newbalanceDest']]
for feature in features:
    df_model[feature] = scaler.fit_transform(df_model[feature])

x = df_model.drop(columns=['isFraud'])
x = pd.get_dummies(x, columns=['type'], drop_first=True) # create dummies
y = df_model['isFraud']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.185, random_state=52)

F1 = [0, -1]
best_MaxAbs_model = None

# tests different depths and stores the best performing values
for i in range(1, 120):
    tree_classifier = tree.DecisionTreeClassifier(max_depth=i)
    tree_model = tree_classifier.fit(x_train, y_train)
    y_pred = tree_model.predict(x_test)
    f1 = f1_score(y_test, y_pred)
    if f1 > F1[0]:
        F1[0] = f1
        F1[1] = i
        best_MaxAbs_model = tree_model
F1

  df_model = df_model.append(fraud)


[0.8680351906158359, 98]

#### No Scaling

In [17]:
df_model = train.copy()

# filters fraudulent observations
fraud = df_model[df_model['isFraud'] == 1]
# samples n observations from original data
df_model = df_model.sample(168000)
# removes those identical fraudulent data from df_model
df_model = df_model[df_model['isFraud']==0]
df_model = df_model.append(fraud)


x = df_model.drop(columns=['isFraud'])
x = pd.get_dummies(x, columns=['type'], drop_first=True) # create dummies
y = df_model['isFraud']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=4)

F1 = [0, -1]
best_model = None
# tests different depths and stores the best performing values
for i in range(1, 120):
    tree_classifier = tree.DecisionTreeClassifier(max_depth=i)
    tree_model = tree_classifier.fit(x_train, y_train)
    y_pred = tree_model.predict(x_test)
    f1 = f1_score(y_test, y_pred)
    if f1 > F1[0]:
        F1[0] = f1
        F1[1] = i
        best_model = tree_model
F1


  df_model = df_model.append(fraud)


[0.822429906542056, 68]

## Running the best model on HW3_test_input dataset

#### Preprocessing; both standard and MaxAbs scaling methods yielded similar performing models, achieving f1 score~ 0.87 with 175000 random samples at its highest where 'isFraud' == 0 and the 912 observations where 'isFraud' == 1

#### I think using MaxAbs model makes the most sense, so the saved prediction file is uses MaxAbs scaling with a decision Tree

In [15]:
test = pd.read_csv('HW3_test_input.csv')

df_model = test.copy()

scaler=MaxAbsScaler()
features = [['amount','oldbalanceOrg','newbalanceOrig', 'oldbalanceDest', 'newbalanceDest']]
for feature in features:
    df_model[feature] = scaler.fit_transform(df_model[feature])

x = df_model
x = pd.get_dummies(x, columns=['type'], drop_first=True) # create dummies

y_test_pred = best_MaxAbs_model.predict(x)
np.savetxt('/Users/ericwang/Desktop/ECON_148/HW3/maxAbsTree_prediction.csv',y_test_pred, delimiter=',')

In [19]:
df_model = train.copy()

scaler=MaxAbsScaler()
features = [['amount','oldbalanceOrg','newbalanceOrig', 'oldbalanceDest', 'newbalanceDest']]
for feature in features:
    df_model[feature] = scaler.fit_transform(df_model[feature])
x = df_model.drop(columns=['isFraud'])
x = pd.get_dummies(x, columns=['type'], drop_first=True) # create dummies
y = df_model['isFraud']

f1_score(y, best_MaxAbs_model.predict(x))

0.8337280909521555

In [7]:
df_model = train.copy()

scaler=MaxAbsScaler()
features = [['amount','oldbalanceOrg','newbalanceOrig', 'oldbalanceDest', 'newbalanceDest']]
for feature in features:
    df_model[feature] = scaler.fit_transform(df_model[feature])

x = df_model
x = pd.get_dummies(x, columns=['type'], drop_first=True) # create dummies
y = df_model['isFraud']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=4)
clf = tree.DecisionTreeClassifier()

param = {
    "criterion": ["gini", "entropy", "log_loss"],
    "max_depth": [12,13,14,15,16,17,18,19],
    "splitter": ["best", "random"]
    }
grid = GridSearchCV(clf, param_grid=param, cv=10)

In [8]:
grid.fit(x_train,y_train)
print(grid.best_params_)

{'criterion': 'gini', 'max_depth': 12, 'splitter': 'best'}


In [9]:
clf = tree.DecisionTreeClassifier(criterion='gini',max_depth=12,splitter='best')
tree_model = clf.fit(x_train, y_train)
y_pred = tree_model.predict(x_test)
f1_score(y_test, y_pred)

1.0