In [1]:
import os
import numpy as np 
import pandas as pd 
from sklearn.model_selection import train_test_split
import seaborn as sns
from matplotlib import pyplot as plt
import math

from matplotlib.font_manager import FontProperties

from sklearn.ensemble import RandomForestRegressor
import lightgbm as lgb
import xgboost as xgb
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_percentage_error

from sklearn.preprocessing import LabelEncoder


In [2]:
# dataset
# Locations of the competition CSV files and of the submission to be written.
# NOTE(review): these are absolute paths under one user's home directory, so
# the notebook only runs unchanged on that machine — consider a configurable
# data directory.
training_data_file = '/Users/jellyfish/Desktop/玉山/dataset_1st/training.csv'
testing_data_file = '/Users/jellyfish/Desktop/玉山/dataset_1st/public_processed.csv'
testing_submit = '/Users/jellyfish/Desktop/玉山/31_範例繳交檔案.csv'
submit_file = '/Users/jellyfish/Desktop/玉山/submission/submission.csv'
private_file = '/Users/jellyfish/Desktop/玉山/dataset_2nd/private_1_processed.csv'


# Read data
# Each CSV is loaded fully into memory as its own DataFrame.
train_df = pd.read_csv(training_data_file)
test_df = pd.read_csv(testing_data_file)
private_df = pd.read_csv(private_file)
test_sb = pd.read_csv(testing_submit)

In [3]:
# Preview the first rows of the frame read from the sample-submission CSV.
test_sb.head()

Unnamed: 0,txkey,pred
0,a2c1209018e4e52e04f6fabb48f05f1b8bc09dc838ff6c...,0
1,16c4880500059e01553789be11bbb50753b7acaae7b95b...,0
2,623c56be3bee87724e3d119c271d9ed098eeda84233183...,0
3,250da12187059cf6e3a3066656a2919d08ceb8207efd55...,0
4,4b268e0da036f44cbbb056ddfac6a28ea336d9cf299843...,0


In [4]:
# Count how many distinct transaction keys ('txkey') occur in BOTH the private
# data set (private_df) and the sample-submission file (test_sb).
# pandas is already imported in the notebook's import cell, so it is not
# re-imported here.
# An inner merge keeps only rows whose 'txkey' is present in both frames.
merged_df = pd.merge(private_df, test_sb, on='txkey')

# Number of distinct 'txkey' values that survived the merge.
num_common_txkey = merged_df['txkey'].nunique()

# The message names the two frames that were actually merged.
print(f"private_df 和 test_sb 中相同的 'txkey' 數量為：{num_common_txkey}")


private_df 和 test_df 中相同的 'txkey' 數量為：754139


In [8]:
# Count how many distinct transaction keys ('txkey') occur in BOTH the public
# test set (test_df) and the sample-submission file (test_sb).
# pandas is already imported in the notebook's import cell, so it is not
# re-imported here.
# An inner merge keeps only rows whose 'txkey' is present in both frames.
merged_df = pd.merge(test_df, test_sb, on='txkey')

# Number of distinct 'txkey' values that survived the merge.
num_common_txkey = merged_df['txkey'].nunique()

# The message names the two frames that were actually merged.
print(f"test_df 和 test_sb 中相同的 'txkey' 數量為：{num_common_txkey}")

private_df 和 test_df 中相同的 'txkey' 數量為：600182


In [7]:
# (rows, columns) of the sample-submission frame.
test_sb.shape

(1354321, 2)

In [20]:
# Read the public test CSV and the sample-submission CSV a second time into
# separate frames. test2_df keeps the original column names and is the source
# of the 'txkey' column when the submission is written; test2_sb is not
# referenced again in the cells shown here.
test2_df = pd.read_csv(testing_data_file)
test2_sb = pd.read_csv(testing_submit)

In [21]:
# Descriptive column titles, listed in the positional order of the CSV columns.
new_column_titles = [
    "交易序號",
    "授權日期",
    "授權時間",
    "顧客ID",
    "交易卡號",
    "交易類別",
    "交易型態",
    "特店代號",
    "收單行代碼",
    "mcc_code",
    "交易金額-台幣",
    "網路交易註記",
    "是否分期交易",
    "分期期數",
    "是否紅利交易",
    "實付金額",
    "消費地國別",
    "消費城市",
    "狀態碼",
    "超額註記碼",
    "Fallback註記",
    "支付型態",
    "消費地幣別",
    "消費地金額",
    "3D交易註記",
    "盜刷與否",
]

# Pair every existing column with the title at the same position. zip() stops
# at the shorter sequence, so a frame with fewer columns than titles simply
# leaves the trailing titles unused.
train_rename_map = {old: new for old, new in zip(train_df.columns, new_column_titles)}
test_rename_map = {old: new for old, new in zip(test_df.columns, new_column_titles)}

train_df = train_df.rename(columns=train_rename_map)
test_df = test_df.rename(columns=test_rename_map)


In [22]:
# Number of missing values in each training column.
train_df.isnull().sum()

交易序號                0
授權日期                0
授權時間                0
顧客ID                0
交易卡號                0
交易類別                0
交易型態           203455
特店代號                0
收單行代碼               0
mcc_code         4550
交易金額-台幣             0
網路交易註記              0
是否分期交易              0
分期期數                0
是否紅利交易              0
實付金額                0
消費地國別             600
消費城市           266066
狀態碼           8665195
超額註記碼               0
Fallback註記          0
支付型態           286656
消費地幣別          498657
消費地金額               0
3D交易註記              0
盜刷與否                0
dtype: int64

In [23]:
# null process
# Columns whose missing values are replaced by the most frequent TRAINING value.
MODE_IMPUTED_COLUMNS = ['交易型態', 'mcc_code', '消費地國別', '支付型態']

for column in MODE_IMPUTED_COLUMNS:
    train_mode = train_df[column].mode()[0]
    # Assign the filled column back instead of calling
    # `df[col].fillna(..., inplace=True)`: that form is chained assignment,
    # which pandas deprecates and which does not modify the frame once
    # Copy-on-Write is enabled.
    train_df[column] = train_df[column].fillna(train_mode)
    # Apply the same training-derived value to the test frame so both frames
    # go through identical preprocessing.
    test_df[column] = test_df[column].fillna(train_mode)

# Training rows without a city are discarded. The test frame is left intact
# because every test row still needs a prediction.
train_df = train_df.dropna(subset=['消費城市'])

In [24]:
# Collapse the currency column into a binary flag: 1 where the code equals 70,
# 0 for every other value (missing values compare unequal and become 0).
for frame in (train_df, test_df):
    is_code_70 = frame['消費地幣別'] == 70
    frame['消費地幣別'] = np.where(is_code_70, 1, 0)

In [10]:
# Per-row count of how many rows in the same frame share that row's card number.
card_key = '交易卡號'
for frame in (train_df, test_df):
    frame['卡片交易次數'] = frame.groupby(card_key)[card_key].transform('count')

In [25]:
# Per-row count of how many rows in the same frame share that row's merchant code.
merchant_key = '特店代號'
train_merchant_counts = train_df.groupby(merchant_key)[merchant_key].transform('count')
test_merchant_counts = test_df.groupby(merchant_key)[merchant_key].transform('count')

train_df['特店交易次數'] = train_merchant_counts
test_df['特店交易次數'] = test_merchant_counts

In [26]:
# Per-row count of how many rows in the same frame share that row's customer ID.
for frame in (train_df, test_df):
    customer_groups = frame.groupby('顧客ID')
    frame['顧客交易次數'] = customer_groups['顧客ID'].transform('count')

In [27]:
# label encoding
# Fit a single encoder on the acquirer codes of train AND test together so
# both frames share one code -> integer mapping.
n_train_rows = len(train_df)
# ignore_index gives the combined frame a clean 0..n-1 index; without it the
# row labels of the two source frames are repeated in the result.
combined_df = pd.concat([train_df, test_df], ignore_index=True)

label_encoder = LabelEncoder()
encoded_acquirer = label_encoder.fit_transform(combined_df['收單行代碼'])
combined_df['收單行代碼'] = encoded_acquirer

# Split
# fit_transform returns a plain array in row order, so the first n_train_rows
# entries belong to train and the rest to test. Writing the array slices back
# is purely positional and does not depend on index alignment.
train_df['收單行代碼'] = encoded_acquirer[:n_train_rows]
test_df['收單行代碼'] = encoded_acquirer[n_train_rows:]


In [None]:
# Process the authorization time
def convert_to_hms(seconds):
    """Split a number packed as HHMMSS into its three components.

    Despite the parameter name, the arithmetic treats the input as decimal
    digits: everything above the last four digits is the hour, the next two
    digits are the minute and the last two digits are the second.

    Args:
        seconds: Numeric value in HHMMSS form (e.g. 123456).

    Returns:
        A tuple ``(hours, minutes, seconds)``.
    """
    hours, minute_second_part = divmod(seconds, 10000)
    minutes, secs = divmod(minute_second_part, 100)
    return hours, minutes, secs

# Hour component of the HHMMSS-packed authorization time. Floor-dividing the
# whole column by 10000 yields the same value as convert_to_hms(x)[0] for each
# row, without a Python-level call per row.
train_df['授權時間-小時'] = train_df['授權時間'] // 10000
test_df['授權時間-小時'] = test_df['授權時間'] // 10000

# The raw time column is not used after the hour has been extracted.
train_df = train_df.drop(columns=['授權時間'])
test_df = test_df.drop(columns=['授權時間'])

In [32]:
# Per-row count of rows in the same frame that share both the card number and
# the authorization date.
# NOTE(review): this cell needs '交易卡號' and '授權日期' to still be present,
# so it must run before the cell that drops them.
daily_card_keys = ['交易卡號', '授權日期']
for frame in (train_df, test_df):
    frame['卡片一日交易次數'] = frame.groupby(daily_card_keys)['授權日期'].transform('count')


KeyError: '授權日期'

In [None]:
# Per-row count of rows in the same frame that share both the card number and
# the mcc_code. The result is stored under its own column name: writing it to
# '卡片一日交易次數' would overwrite the per-day count with a feature that is
# grouped by a different key.
card_mcc_keys = ['交易卡號', 'mcc_code']
for frame in (train_df, test_df):
    frame['卡片同mcc交易次數'] = frame.groupby(card_mcc_keys)['mcc_code'].transform('count')

In [28]:
# Discarded features
# One shared list guarantees that train and test lose exactly the same
# columns. A single drop call per frame is also all-or-nothing: if any listed
# column is missing, the KeyError is raised before anything is removed.
DROPPED_FEATURES = [
    '交易序號',
    '消費城市',
    '狀態碼',
    '交易卡號',
    '顧客ID',
    '特店代號',
    '授權日期',
]

train_df = train_df.drop(columns=DROPPED_FEATURES)
test_df = test_df.drop(columns=DROPPED_FEATURES)



In [29]:
# Separate the label column from the feature columns.
target_column = '盜刷與否'
Y = train_df[target_column]
X = train_df.loc[:, train_df.columns != target_column]

In [30]:
# Shuffled 80/20 train/validation split with a fixed seed for repeatability.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    Y,
    train_size=0.8,
    test_size=0.2,
    random_state=42,
    shuffle=True,
)

In [31]:
# XGBoost
# Booster parameters passed to xgb.train.
# Two keys were removed because the native training API does not use them:
#   * 'n_estimators' belongs to the scikit-learn wrapper; xgb.train takes the
#     number of rounds from its own num_boost_round argument.
#   * 'silent' is the deprecated predecessor of 'verbosity', which is set below.
params = {
    'max_depth': 8,
    'gamma': 0.003,
    'eta': 0.025,
    'subsample': 0.9,
    'colsample_bytree': 0.7,
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'verbosity': 0,
    'random_state': 42,
}

def XGBmodel(X_train, X_valid, y_train, y_valid, params,
             num_boost_round=10000, early_stopping_rounds=500, verbose_eval=True):
    """Train a model with xgb.train, monitoring a validation set for early stopping.

    Args:
        X_train: Training features, wrapped in an xgb.DMatrix.
        X_valid: Validation features, wrapped in an xgb.DMatrix.
        y_train: Labels for ``X_train``.
        y_valid: Labels for ``X_valid``.
        params: Parameter dictionary forwarded unchanged to xgb.train.
        num_boost_round: Upper bound on the number of boosting rounds.
            Defaults to the previously hard-coded 10000.
        early_stopping_rounds: Forwarded to xgb.train as its early-stopping
            patience on the validation set. Defaults to the previously
            hard-coded 500.
        verbose_eval: Forwarded to xgb.train to control per-round logging.
            ``True`` matches the earlier behavior of printing every round.

    Returns:
        The model object returned by xgb.train.
    """
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dvalid = xgb.DMatrix(X_valid, label=y_valid)
    model = xgb.train(
        params=params,
        dtrain=dtrain,
        num_boost_round=num_boost_round,
        # Only the validation set is evaluated; it drives early stopping.
        evals=[(dvalid, 'valid')],
        early_stopping_rounds=early_stopping_rounds,
        verbose_eval=verbose_eval,
    )
    return model

# Train on the 80% split and evaluate on the 20% validation split.
xgbmodel = XGBmodel(X_train, X_valid, y_train, y_valid, params)

[0]	valid-logloss:0.66869
[1]	valid-logloss:0.64542
[2]	valid-logloss:0.62327
[3]	valid-logloss:0.60218
[4]	valid-logloss:0.58201
[5]	valid-logloss:0.56274
[6]	valid-logloss:0.54432
[7]	valid-logloss:0.52672
[8]	valid-logloss:0.50983
[9]	valid-logloss:0.49366
[10]	valid-logloss:0.47816
[11]	valid-logloss:0.46324
[12]	valid-logloss:0.44895
[13]	valid-logloss:0.43518


KeyboardInterrupt: 

In [None]:
# Testing data
# Select the test columns in exactly the order of the feature names recorded
# on the trained booster, so a column-order difference between the frames
# cannot reach the model.
X_test = test_df[xgbmodel.feature_names]

# Blend weights. Only the XGBoost model exists here, so it takes the full
# weight; w2 is kept for a second model and is not used in this cell.
w1 = 1.0
w2 = 0.0

# Keep the predictions as an array: no list round-trip is needed.
test_pred_xgb = xgbmodel.predict(xgb.DMatrix(X_test))
test_pred = w1 * np.asarray(test_pred_xgb)

# Scores strictly above the threshold are labelled 1, all others 0.
threshold = 0.5
y_pred_binary = (test_pred > threshold).astype(int)

# test2_df still carries the original 'txkey' column, which the processed
# test_df no longer has. Rows are matched purely by position, so the two
# lengths must agree.
if len(y_pred_binary) != len(test2_df):
    raise ValueError(
        f"prediction count ({len(y_pred_binary)}) does not match "
        f"the number of test rows ({len(test2_df)})"
    )

submission = pd.DataFrame({
    'txkey': test2_df['txkey'].to_numpy(),
    'pred': y_pred_binary,
})
submission.to_csv(submit_file, index=False)