In [142]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.decomposition import PCA
from sklearn.linear_model import Perceptron
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_val_score


def analyze_feat(data):
    feat_col = [col for col in data.columns if col != 'label']
    print(f"資料集基本資訊:")
    print(f"訓練集大小: {data.shape}")
    print(f"特徵數量: {len(feat_col)}")

    # 分析標籤分布
    print("\n標籤分布:")
    print(data['label'].value_counts())

    # 檢查基本統計資訊
    print("\n特徵統計摘要:")
    print(data[feat_col].describe())

def remove_zero_var_feat(data):
    zero_var_feat = []
    feature_columns = [col for col in data.columns if col != 'label']

    for feat in feature_columns:
        if data[feat].nunique() == 1:
            zero_var_feat.append(feat)

    data_cleaned = data.drop(columns=zero_var_feat)
    return data_cleaned

def trim_outliers(data, percentile=0.01):
    """裁剪極端值而非完全移除"""
    feature_columns = [col for col in data.columns if col != 'label']
    data_trimmed = data.copy()
    
    for feat in feature_columns:
        lower = data[feat].quantile(percentile)
        upper = data[feat].quantile(1-percentile)
        data_trimmed[feat] = data_trimmed[feat].clip(lower, upper)
    return data_trimmed

def identify_zero_var_feat(data):
    zero_var_feat = []
    feature_columns = [col for col in data.columns if col != 'label']
    for feat in feature_columns:
        if data[feat].nunique() == 1:
            zero_var_feat.append(feat)
    return zero_var_feat

In [149]:
train = pd.read_csv('data/train.csv')
# train

In [150]:
train = train.dropna()
train_cleaned = remove_zero_var_feat(train)
train_clean_and_trim = trim_outliers(train_cleaned)
# train_clean_and_trim

In [188]:
# 前處理並標準化
X = train_clean_and_trim.drop('label', axis=1)
Y = train_clean_and_trim[['label']]  # 使用同一個清理好的資料框獲取標籤

# 標準化並執行PCA
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
pca = PCA()
pca.fit(X_scaled)

# 獲取解釋方差
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
n_components = np.argmax(cumulative_variance >= 0.999) + 1  # 改回95%或根據需要調整

# 應用PCA降維
X_pca = pca.transform(X_scaled)[:, :n_components]
feature_names = [f'PC{i+1}' for i in range(n_components)]
X_pca_df = pd.DataFrame(X_pca, columns=feature_names, index=X.index)

# 合併標籤
train_pca = pd.concat([Y, X_pca_df], axis=1)
train_pca['label'] = train_pca['label'].map({0: -1, 1: 1})

train_pca

Unnamed: 0,label,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,...,PC68,PC69,PC70,PC71,PC72,PC73,PC74,PC75,PC76,PC77
0,1,0.891636,-0.551380,0.763197,-1.705115,0.503612,-1.290867,1.034565,1.952948,0.570574,...,0.091739,0.030523,-0.060181,0.013141,-0.129054,-0.109666,-0.250439,-0.048888,0.013114,-0.004277
1,-1,-0.648400,-0.671903,0.319544,-1.168103,-1.157278,-0.564319,2.255324,1.112096,1.649733,...,0.056364,-0.002835,0.050115,0.058307,0.223791,0.034248,0.067557,-0.038378,-0.050100,0.035227
2,-1,7.779782,-2.775658,-2.984205,2.237394,-4.175191,1.541019,-0.665241,-4.292417,1.920400,...,-0.077553,0.177987,-0.020185,0.020134,0.030449,-0.047858,-0.134861,0.160203,0.045374,-0.270493
3,-1,-0.993142,0.062101,0.966048,1.765415,1.188964,3.097109,1.183999,-1.496847,-0.016068,...,-0.010328,0.047674,0.094035,0.012736,0.005438,0.050830,-0.037208,-0.012051,0.064610,-0.151841
4,-1,-2.741324,0.822105,1.608229,0.992427,1.334669,0.597561,2.029869,0.190938,-1.155607,...,-0.032742,-0.013228,-0.014584,-0.016860,-0.044201,0.126317,-0.000882,0.021117,-0.003464,-0.024336
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7592,-1,-6.025926,-0.178534,0.031776,0.189696,-0.019145,-0.511366,-1.945743,-0.680624,-0.028814,...,0.041036,0.012827,-0.007174,0.000309,0.011178,-0.023759,-0.005371,-0.003055,-0.005612,0.006916
7593,-1,-1.347762,0.064158,-1.556552,-4.105600,-2.892081,-2.566729,-0.874057,-0.439323,-0.083176,...,0.096727,-0.004021,-0.036147,0.043945,0.146055,-0.176540,-0.020394,-0.004482,0.051724,-0.013379
7594,-1,-0.744512,-0.108301,-0.714964,-1.321755,-1.723064,3.041560,1.419419,1.535666,1.925135,...,0.194754,0.045216,-0.031984,0.140548,0.268245,0.052348,0.068755,-0.002507,-0.027687,-0.048845
7595,-1,-3.703554,2.590339,4.357971,0.259575,-2.270371,-1.713973,-4.245264,1.054712,0.494570,...,-0.021935,0.095566,-0.000888,-0.043632,-0.010393,-0.025836,0.054332,0.006666,0.023839,-0.057499


In [189]:
# Logistic
# 分割資料集
X_train, X_val, y_train, y_val = train_test_split(
    X_pca_df, 
    train_pca['label'], 
    test_size=0.2, 
    random_state=42
)

# 創建並訓練 Logistic Regression 模型
lr = LogisticRegression(max_iter=10000, random_state=42)
lr.fit(X_train, y_train)

# 評估模型
y_pred = lr.predict(X_val)
print(f"準確率: {accuracy_score(y_val, y_pred)}")
print(classification_report(y_val, y_pred))

# 如果要調整超參數
param_grid = {
    'C': [1000000, 100000],  # 1000000
    'solver': ['liblinear', 'saga'] #liblinear
}
grid = GridSearchCV(LogisticRegression(random_state=42), param_grid, cv=5)
grid.fit(X_pca_df, train_pca['label'])
print(f"最佳參數: {grid.best_params_}")
print(f"最佳分數: {grid.best_score_}")

準確率: 0.8296052631578947
              precision    recall  f1-score   support

          -1       0.91      0.82      0.86       993
           1       0.72      0.84      0.77       527

    accuracy                           0.83      1520
   macro avg       0.81      0.83      0.82      1520
weighted avg       0.84      0.83      0.83      1520





最佳參數: {'C': 1000000, 'solver': 'liblinear'}
最佳分數: 0.8395428086344895


In [190]:
train_zero_var_features = identify_zero_var_feat(train)

# 載入測試資料
test = pd.read_csv('data/test.csv')
test = test.dropna()  # 確保賦值回test變數

# 使用與訓練集相同的特徵集
test_cleaned = test.copy()
if set(train_features).issubset(set(test.columns)):
    test_cleaned = test[['label'] + train_features]
else:
    # 如果測試集缺少特徵，添加這些特徵並設為0
    for feat in train_features:
        if feat not in test.columns:
            test_cleaned[feat] = 0
    test_cleaned = test_cleaned[['label'] + train_features]

# 使用相同的前處理
test_clean_and_trim = trim_outliers(test_cleaned)

# 應用相同的轉換
X_test = test_clean_and_trim.drop('label', axis=1)
X_test_scaled = scaler.transform(X_test)
X_test_pca = pca.transform(X_test_scaled)[:, :n_components]

# 預測
test_predictions = lr.predict(X_test_pca)

# 計算準確率
test_accuracy = accuracy_score(y_test_true, test_predictions)
print(f"測試資料準確率: {test_accuracy:.4f}")

測試資料準確率: 0.8345


  data_trimmed[feat] = data_trimmed[feat].clip(lower, upper)
  data_trimmed[feat] = data_trimmed[feat].clip(lower, upper)
  data_trimmed[feat] = data_trimmed[feat].clip(lower, upper)
  data_trimmed[feat] = data_trimmed[feat].clip(lower, upper)
  data_trimmed[feat] = data_trimmed[feat].clip(lower, upper)
  data_trimmed[feat] = data_trimmed[feat].clip(lower, upper)


In [193]:
# sns.scatterplot(data=train_p1, x='PC11', y='PC12', hue='label', palette={-1:'blue', 1:'red'})

# 分割數據為特徵和標籤
X = train_pca.drop('label', axis=1)
y = train_pca['label'].map({-1: -1, 1: 1})  # 確保標籤是-1和1

# 分割訓練集和測試集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 創建並訓練感知器模型
perceptron = Perceptron(max_iter=1000, random_state=42)
perceptron.fit(X_train, y_train)

# 預測和評估
# y_pred = perceptron.predict(X_test)
# accuracy = accuracy_score(y_test, y_pred)
# report = classification_report(y_test, y_pred)

# Averaged Perceptron
avg_perceptron = SGDClassifier(
    loss="perceptron",      # 感知器損失函數
    average=True,           # 平均化權重（關鍵參數）
    class_weight='balanced',
    max_iter=400,
    random_state=42,
    alpha=0.00001,
    eta0=0.7,
)
avg_perceptron.fit(X_train, y_train)
y_pred_avg = avg_perceptron.predict(X_test)
accuracy_avg = accuracy_score(y_test, y_pred_avg)
report_avg = classification_report(y_test, y_pred_avg)

print(f"準確率：{accuracy_avg:.4f}")
print("\n分類報告：")
print(report_avg)

# 查看模型係數（重要性）
feature_importance = pd.DataFrame({
    'Feature': X.columns,
    'Coefficient': perceptron.coef_[0]
}).sort_values('Coefficient', key=abs, ascending=False)

print("\n特徵重要性（前10個）：")
print(feature_importance.head(10))

準確率：0.8217

分類報告：
              precision    recall  f1-score   support

          -1       0.95      0.77      0.85       993
           1       0.68      0.92      0.78       527

    accuracy                           0.82      1520
   macro avg       0.81      0.84      0.82      1520
weighted avg       0.85      0.82      0.83      1520


特徵重要性（前10個）：
   Feature  Coefficient
68    PC69   -55.082167
66    PC67    50.613907
64    PC65   -49.469115
12    PC13    45.322089
56    PC57   -45.178237
69    PC70   -41.651842
76    PC77    40.065570
18    PC19    37.362343
55    PC56   -34.889029
44    PC45    34.228908


In [194]:
# 5折交叉驗證
cv_scores = cross_val_score(avg_perceptron, X, y, cv=5)
print(f"交叉驗證準確率: {cv_scores.mean():.4f} (+/- {cv_scores.std()*2:.4f})")

# 使用GridSearchCV同時進行交叉驗證和參數調優
from sklearn.model_selection import GridSearchCV

param_grid = {
    'alpha': [0.00001, 0.0001],
    'max_iter': [300, 350, 400],
    'eta0': [0.6, 0.7, 0.8]
}

grid = GridSearchCV(
    SGDClassifier(loss="perceptron", average=True, class_weight='balanced'),
    param_grid,
    cv=5,
    scoring='accuracy'
)

grid.fit(X, y)
print(f"最佳參數: {grid.best_params_}")
print(f"最佳交叉驗證準確率: {grid.best_score_:.4f}")

交叉驗證準確率: 0.8144 (+/- 0.0223)
最佳參數: {'alpha': 1e-05, 'eta0': 0.7, 'max_iter': 300}
最佳交叉驗證準確率: 0.8155


In [195]:
train_zero_var_features = identify_zero_var_feat(train)

# 載入測試資料
test = pd.read_csv('data/test.csv')
test = test.dropna()  # 確保賦值回test變數

# 使用與訓練集相同的特徵集
test_cleaned = test.copy()
if set(train_features).issubset(set(test.columns)):
    test_cleaned = test[['label'] + train_features]
else:
    # 如果測試集缺少特徵，添加這些特徵並設為0
    for feat in train_features:
        if feat not in test.columns:
            test_cleaned[feat] = 0
    test_cleaned = test_cleaned[['label'] + train_features]

# 使用相同的前處理
test_clean_and_trim = trim_outliers(test_cleaned)

# 應用相同的轉換
X_test = test_clean_and_trim.drop('label', axis=1)
X_test_scaled = scaler.transform(X_test)
X_test_pca = pca.transform(X_test_scaled)[:, :n_components]

# 將測試集標籤轉換為-1和1（與訓練集一致）
y_test_true = test_clean_and_trim['label'].map({0: -1, 1: 1})

# 預測
test_predictions = avg_perceptron.predict(X_test_pca)

# 計算準確率
test_accuracy = accuracy_score(y_test_true, test_predictions)
print(f"測試資料準確率: {test_accuracy:.4f}")

測試資料準確率: 0.8198


  data_trimmed[feat] = data_trimmed[feat].clip(lower, upper)
  data_trimmed[feat] = data_trimmed[feat].clip(lower, upper)
  data_trimmed[feat] = data_trimmed[feat].clip(lower, upper)
  data_trimmed[feat] = data_trimmed[feat].clip(lower, upper)
  data_trimmed[feat] = data_trimmed[feat].clip(lower, upper)
  data_trimmed[feat] = data_trimmed[feat].clip(lower, upper)


In [196]:
def generate_submission(model, scaler, pca, n_components, output_file='submission_logistic.csv'):
    """
    生成Kaggle提交檔案
    
    Parameters:
    -----------
    model : 訓練好的模型
    scaler : 已經擬合的StandardScaler
    pca : 已經擬合的PCA
    n_components : PCA保留的組件數量
    output_file : 輸出檔案名稱，預設為'submission.csv'
    """
    # 讀取評估資料
    eval_data = pd.read_csv('data/eval.anon.csv')
    # 讀取ID對照表
    eval_ids = pd.read_csv('data/eval.ids', header=None)
    eval_ids.columns = ['example_id']
    
    # 提取特徵
    X_eval = eval_data.drop('label', axis=1)
    
    # 零方差特徵處理 (使用與訓練集相同的處理方式)
    train_zero_var_features = identify_zero_var_feat(train)
    train_features = [col for col in train.columns if col != 'label' and col not in train_zero_var_features]
    
    # 確保評估集有相同的特徵
    for feat in train_features:
        if feat not in X_eval.columns:
            X_eval[feat] = 0
    X_eval = X_eval[train_features]
    
    # 應用相同的預處理流程
    X_eval = trim_outliers(X_eval)
    
    # 標準化和PCA轉換
    X_eval_scaled = scaler.transform(X_eval)
    X_eval_pca = pca.transform(X_eval_scaled)[:, :n_components]
    
    # 預測
    predictions = model.predict(X_eval_pca)
    
    # 將-1轉換回0 (如果模型輸出是-1和1)
    predictions = np.where(predictions == -1, 0, predictions)
    
    # 創建提交檔案
    submission = pd.DataFrame({
        'example_id': eval_ids['example_id'],
        'label': predictions
    })
    
    # 保存為CSV
    submission.to_csv(output_file, index=False)
    print(f"提交檔案已保存至 {output_file}")
    
    # 檢查標籤分佈
    label_counts = pd.Series(predictions).value_counts()
    print("預測標籤分佈:")
    print(f"0 (非惡意): {label_counts.get(0, 0)}")
    print(f"1 (惡意): {label_counts.get(1, 0)}")
    
    return submission

# submission = generate_submission(
#     model=lr,  # 您的模型
#     scaler=scaler,         # 已擬合的標準化器
#     pca=pca,               # 已擬合的PCA
#     n_components=n_components  # PCA組件數量
# )
    
submission = generate_submission(
    model=avg_perceptron,  # 您的模型
    scaler=scaler,         # 已擬合的標準化器
    pca=pca,               # 已擬合的PCA
    n_components=n_components  # PCA組件數量
)

提交檔案已保存至 submission_logistic.csv
預測標籤分佈:
0 (非惡意): 1352
1 (惡意): 1180


