In [55]:
import pandas as pd
import numpy as np

import datetime
import matplotlib.pyplot as plt
import geopy.distance

### Use columns
- create month, weekday/weekend, hour from trans_date_trans_time
- amt
- lat
- long
- merch_lat
- merch_long
- dist: distance between credict card hold and merchant
- get age from dob
- categorical features
    - cc_num_prefix: the first digit from cc_num
    - city_pop_level: discretize city_pop
    - gender
    - category
    - job
    - state
    - zip

## feature engineering

In [51]:
def get_weekend(day):
    if day.weekday() > 4:
        return 1
    else:
        return 0
    
def get_distance(lat, long, merch_lat, merch_long):
    p1 = (lat, long)
    p2 = (merch_lat, merch_long)
    dist = geopy.distance.distance(p1, p2).km
    return dist

def get_age(dob):
    curr = datetime.datetime.strptime("2021-12-12", "%Y-%m-%d")
    age = round(abs((curr - dob).days / 365))
    return age

def get_city_level(x):
    if x > 500000:
        return "city_pop_larger_than_500000"
    elif x > 100000 and x <= 500000:
        return "city_pop_100000_to_500000"
    elif x > 50000 and x <= 100000:
        return "city_pop_50000_to_100000"
    elif x > 10000 and x <= 50000:
        return "city_pop_10000_to_50000"
    elif x > 5000 and x <= 10000:
        return "city_pop_5000_to_10000"
    elif x > 1000 and x <= 5000:
        return "city_pop_1000_to_5000"
    else:
        return "city_pop_smaller_than_1000"

In [40]:
data = pd.read_csv('fraudTrain.csv', index_col = 0, parse_dates=['trans_date_trans_time', 'dob'])

  mask |= (ar1 == a)


In [None]:
data["cc_num_prefix"] = data["cc_num"].apply(lambda x: str(x)[:1])
data["trans_month"] = data["trans_date_trans_time"].apply(lambda x:  x.month)
data["trans_hour"] = data["trans_date_trans_time"].apply(lambda x: x.hour)
data["trans_weekend"] = data["trans_date_trans_time"].apply(lambda x: get_weekend(x))

In [57]:
# data["dist"] = data.apply(lambda x: get_distance(x.lat, x.long, x.merch_lat, x.merch_long), axis=1)
data["age"] = data["dob"].apply(lambda x: get_age(x))
data["city_pop_level"] = data["city_pop"].apply(lambda x: get_city_level(x))

In [60]:
# the last 7 columns are categorical features
feature_cols = ["trans_month", "trans_weekend", "trans_hour", "amt",
                "age", "lat", "long", "merch_lat", "merch_long", 
                "cc_num_prefix", "gender", "category", "job",  "state", "zip", "city_pop_level"]

label_col = ["is_fraud"]

In [64]:
def convert_to_categorical(df, cate_col_list):
    for cate_col in cate_col_list:   
        df[cate_col] = df[cate_col].astype('category')
    return df

In [65]:
clean_data = data[feature_cols + label_col]
clean_data = convert_to_categorical(clean_data, feature_cols[-7:])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


## model

In [39]:
import lightgbm
from sklearn.model_selection import train_test_split, GridSearchCV
from lightgbm import LGBMClassifier
from sklearn.metrics import mean_absolute_error, accuracy_score, precision_score, recall_score, roc_auc_score, f1_score
import joblib
import re

In [67]:
X = clean_data.drop(["is_fraud"], axis=1)
y = clean_data["is_fraud"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [95]:
gbm = LGBMClassifier(learning_rate=0.05, n_estimators=100, max_depth=10, scale_pos_weight=120,
                     min_child_samples=300, subsample=0.6, colsample_bytree=0.6, reg_lambda=1e-3,
                     random_state=100, silence=True)

gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], 
        early_stopping_rounds=5, categorical_feature=feature_cols[-7:])

New categorical_feature is ['category', 'cc_num_prefix', 'city_pop_level', 'gender', 'job', 'state', 'zip']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))






[1]	valid_0's binary_logloss: 0.104758
Training until validation scores don't improve for 5 rounds
[2]	valid_0's binary_logloss: 0.101997
[3]	valid_0's binary_logloss: 0.103042
[4]	valid_0's binary_logloss: 0.0996905
[5]	valid_0's binary_logloss: 0.0989273
[6]	valid_0's binary_logloss: 0.0956954
[7]	valid_0's binary_logloss: 0.09458
[8]	valid_0's binary_logloss: 0.0911298
[9]	valid_0's binary_logloss: 0.0905591
[10]	valid_0's binary_logloss: 0.0901925
[11]	valid_0's binary_logloss: 0.0895654
[12]	valid_0's binary_logloss: 0.0877208
[13]	valid_0's binary_logloss: 0.0871847
[14]	valid_0's binary_logloss: 0.0862945
[15]	valid_0's binary_logloss: 0.0836698
[16]	valid_0's binary_logloss: 0.0806018
[17]	valid_0's binary_logloss: 0.0792931
[18]	valid_0's binary_logloss: 0.0778135
[19]	valid_0's binary_logloss: 0.0760097
[20]	valid_0's binary_logloss: 0.0754117
[21]	valid_0's binary_logloss: 0.0730006
[22]	valid_0's binary_logloss: 0.0727026
[23]	valid_0's binary_logloss: 0.0724393
[24]	valid_

LGBMClassifier(colsample_bytree=0.6, learning_rate=0.05, max_depth=10,
               min_child_samples=300, random_state=100, reg_lambda=0.001,
               scale_pos_weight=120, silence=True, subsample=0.6)

In [98]:
# save model
joblib.dump(gbm, 'Q4_output/lightgbm_model.pkl')

['Q4_output/lightgbm_model.pkl']

In [73]:
# load model
gbm = joblib.load('Q4_output/lightgbm_model.pkl')

In [99]:
def evaluate(model, X, y):
    y_pred = gbm.predict(X, num_iteration=gbm.best_iteration_)
    
    print('Precision: %.4f' % precision_score(y, y_pred))
    print('Recall: %.4f' % recall_score(y, y_pred))
    print('F1：', f1_score(y, y_pred))
    print('AUC：', roc_auc_score(y, y_pred))
    
    return y_pred

In [100]:
y_test_pred = evaluate(gbm, X_test, y_test)

Precision: 0.3212
Recall: 0.9236
F1： 0.4766170937108045
AUC： 0.9563556631574867


In [104]:
y_test_pred_df = pd.DataFrame(y_test_pred)
y_test_pred_df.to_csv("Q4_output/Q4 predicted results.csv", index=False, header=False)