In [49]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, KFold, cross_val_score

from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.utils import resample
import pickle
import hashlib
import time  
import xgboost as xgb
from xgboost import XGBClassifier
import math

In [50]:

# Training data: public.csv and training.csv stacked row-wise into one frame
# with a fresh 0..n-1 index.
data1 = pd.read_csv(r"public.csv")
data2 = pd.read_csv(r"training.csv")
data = pd.concat([data1, data2], ignore_index=True)

# Data to be scored: the two "processed" files, stacked the same way.
public1 = pd.read_csv(r"public_processed.csv")
public2 = pd.read_csv(r"private_1_processed.csv")
public_processed_df = pd.concat([public1, public2], ignore_index=True)

In [51]:
# Convert a given value into an integer.
def hash_to_int(value):
    """Return the SHA-256 digest of ``value`` as a non-negative integer.

    ``value`` must provide ``.encode()`` (e.g. a ``str``). The encoded bytes
    are hashed and the 32-byte digest is read as one big-endian integer, so
    the result can be as large as 2**256 - 1 (it does not fit in int64).
    """
    digest = hashlib.sha256(value.encode()).digest()
    return int.from_bytes(digest, "big")

In [52]:
# Encode the identifier columns as integers via their SHA-256 digest.
for id_col in ("chid", "cano", "mchno", "acqic"):
    data[id_col] = data[id_col].apply(hash_to_int)

# loctm: split the HHMMSS-style number into hour / minute / second parts.
# Column-wise integer arithmetic yields the same values (and int64 dtype) as
# the former per-row math.floor lambdas, without one Python call per row.
loctm_int = np.floor(data['loctm']).astype('int64')
data['loctm_hh'] = loctm_int // 10000
data['loctm_mm'] = (loctm_int // 100) % 100
data['loctm_ss'] = loctm_int % 100

# locdt modulo 7 (presumably a day-of-week proxy if locdt is a running day
# number -- TODO confirm).
data['weekday'] = data['locdt'] % 7
# 1 when conam is above 3000, otherwise 0.
data['conam3000'] = np.where(data['conam'] > 3000, 1, 0)


In [53]:
# List the names of the columns that contain NaN values, then replace those
# NaNs with -1.
nan_columns = data.isna().any()
columns_with_nan = data.columns[nan_columns.to_numpy()].tolist()
print(columns_with_nan)
data[columns_with_nan] = data[columns_with_nan].fillna(value=-1)

['etymd', 'mcc', 'stocn', 'scity', 'stscd', 'hcefg', 'csmcu']


In [54]:
# Split the frame into the target (label) and the features; txkey is removed
# from the features as well.
y = data['label']
X = data.drop(columns=['label', 'txkey'])
# Original author's note: up to here some features in X are still of object
# dtype; the transformation below is used to turn them into int/float.

# Step 5: feature selection
# NOTE(review): the selector is fitted on all rows before the train/test
# split; with k="all" nothing is filtered out, but a smaller k would let the
# test rows influence the selection.
selector = SelectKBest(score_func=f_classif, k="all")  # Change k value as needed
X_new = selector.fit(X, y).transform(X)
# Indices of the selected features
selected_feature_indices = selector.get_support(indices=True)
# Names of the selected features
selected_feature_names = X.columns[selected_feature_indices]

In [77]:
# Step 6: split into training and test sets (20% held out, fixed seed).
# X_new and y are the features and labels produced by the feature-selection step.
X_train, X_test, y_train, y_test = train_test_split(
    X_new,
    y,
    test_size=0.2,
    random_state=15,
)

# Step 7: feature standardisation, to ease training and even out the scale
# differences between feature dimensions. The scaler is fitted on the
# training rows only and then applied to both splits.
sc = StandardScaler().fit(X_train)
X_train, X_test = sc.transform(X_train), sc.transform(X_test)

In [78]:
# Earlier configuration, kept for reference only (not executed):
#   max_depth=20, min_child_weight=5, scale_pos_weight=5, n_estimators=200,
#   gamma=0, eval_metric="auc", tree_method='hist', seed=1440, device='cuda'
#   (max_delta_step=1, colsample_bytree=0.7, colsample_bylevel=1 were disabled)
xgb_params = dict(
    max_depth=20,
    min_child_weight=7.734890981711013,
    scale_pos_weight=5,
    gamma=3,
    random_state=42,
    eval_metric="auc",
    tree_method='hist',  # use "hist"
    device='cuda',  # use "cuda"
)
model = XGBClassifier(**xgb_params)

# Mean F1 score over a 4-fold cross-validation on the training split.
cv_f1_scores = cross_val_score(model, X_train, y_train, cv=4, scoring='f1')
print("CV score of XGB is ", cv_f1_scores.mean())

CV score of XGB is  0.7613669843261159


In [79]:
# Fit the classifier on the standardised training split.
model.fit(X=X_train, y=y_train)


In [80]:
# Predict labels for the held-out test split.
y_pred = model.predict(X=X_test)

In [81]:
# Compare the test-split predictions with the true labels: per-class
# precision/recall/F1 first, then the confusion matrix.
test_report = classification_report(y_test, y_pred)
test_confusion = confusion_matrix(y_test, y_pred)
print("y_pred Classification Report: ", test_report, sep="\n")
print("Confusion Matrix: ", test_confusion, sep="\n")

y_pred Classification Report: 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00   1852570
           1       0.83      0.74      0.79      6944

    accuracy                           1.00   1859514
   macro avg       0.92      0.87      0.89   1859514
weighted avg       1.00      1.00      1.00   1859514

Confusion Matrix: 
[[1851536    1034]
 [   1781    5163]]


In [82]:
# The two files to score (public_processed.csv and private_1_processed.csv)
# were already read, concatenated and index-reset into public_processed_df by
# the data-loading cell, and nothing has modified that frame since. Reuse it
# here instead of parsing both CSV files from disk a second time.
public_feature = public_processed_df.drop(columns=['txkey'])
public_name = public_processed_df['txkey']

# Encode the identifier columns exactly as was done for the training data.
for id_col in ("chid", "cano", "mchno", "acqic"):
    public_feature[id_col] = public_feature[id_col].apply(hash_to_int)

In [83]:
# loctm: split the HHMMSS-style number into hour / minute / second parts.
# Column-wise integer arithmetic yields the same values (and int64 dtype) as
# the former per-row math.floor lambdas, without one Python call per row.
public_loctm_int = np.floor(public_feature['loctm']).astype('int64')
public_feature['loctm_hh'] = public_loctm_int // 10000
public_feature['loctm_mm'] = (public_loctm_int // 100) % 100
public_feature['loctm_ss'] = public_loctm_int % 100

# locdt modulo 7 (presumably a day-of-week proxy if locdt is a running day
# number -- TODO confirm).
public_feature['weekday'] = public_feature['locdt'] % 7
# 1 when conam is above 3000, otherwise 0.
public_feature['conam3000'] = np.where(public_feature['conam'] > 3000, 1, 0)

In [84]:
# Custom data cleaning: find the columns that contain NaN values and replace
# those NaNs with -1.
public_feature_nan_columns = public_feature.isna().any()
public_feature_columns_with_nan = public_feature.columns[
    public_feature_nan_columns.to_numpy()
].tolist()
print(public_feature_columns_with_nan)
public_feature[public_feature_columns_with_nan] = (
    public_feature[public_feature_columns_with_nan].fillna(value=-1)
)
# Keep the columns picked by the feature-selection step, in that order.
public_training_5 = public_feature.loc[:, selected_feature_names]

['etymd', 'mcc', 'stocn', 'scity', 'stscd', 'hcefg', 'csmcu']


In [85]:
# Scale with the scaler (sc) that was fitted on the training set.
public_X_new = sc.transform(public_training_5)

# Predict with the XGBoost model trained above.
predicted_labels = model.predict(public_X_new)

# Build and save the CSV file that can be uploaded: txkey next to its
# prediction. Both pieces carry a fresh 0..n-1 index so they align row by row.
public_y_pred = pd.DataFrame({"pred": predicted_labels})
public_name = public_name.reset_index(drop=True)

result = pd.concat([public_name, public_y_pred], axis=1)
result.to_csv(r"result.csv", index=False)



In [86]:
# Sanity check: (rows, columns) of the submission frame, which holds the
# txkey column and the pred column.
result.shape

(1354321, 2)