In [1]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import joblib
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import cross_val_score
import numpy as np
from sklearn import svm


# set device to gpu
device = (
"cuda"
if torch.cuda.is_available()
else "mps"
if torch.backends.mps.is_available()
else "cpu"
)
device

'cuda'

In [2]:
tfidf_train_data = pd.read_csv('../Data/TrainingData/tfidf_train.csv')
tfidf_val_data = pd.read_csv('../Data/ValidateData/tfidf_val.csv')
word2vec_train_data = pd.read_csv('../Data/TrainingData/word2vec_train.csv')
word2vec_val_data = pd.read_csv('../Data/ValidateData/word2vec_val.csv')
bert_train_data = pd.read_csv('../Data/TrainingData/bert_train.csv')
bert_val_data = pd.read_csv('../Data/ValidateData/bert_val.csv')

In [3]:
spike_cols = [col for col in tfidf_val_data.columns if '食記' in col]
spike_cols

[]

In [9]:
def separator(count = 50):
    print('-'*count)

def train_svm_model(df, scaler_path):
    data = df.copy()
    y = data.pop('source')  # 假設標籤列名為 'label'
    
    # 標準化數據
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(data.values)
    # 保存標準化器
    joblib.dump(scaler, scaler_path)

    # 分割數據集
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

    # # 將數據轉換為 PyTorch 張量並移動到 GPU
    # X_train_tensor = torch.tensor(X_train, dtype=torch.float32).to(device)
    # X_test_tensor = torch.tensor(X_test, dtype=torch.float32).to(device)
    # y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).view(-1, 1).to(device)
    # y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32).view(-1, 1).to(device)


    # 建立 kernel='linear' 模型
    model=svm.SVC(kernel='linear', C=1)
    # 使用訓練資料訓練模型

    print(data.shape)
    print(X_train.shape)

    model.fit(X_train, y_train)

    # y_score = model.fit(X_train, y_train).predict_proba(X_test)

    cv_scores = cross_val_score(model, X_train, y_train, cv=5)

    print(cv_scores)

    # score train and test sets
    scoreTrain = model.score(X_train, y_train)
    scoreTest = model.score(X_test, y_test)

    # predict the test data
    predict_test = model.predict(X_test)


    cm_result = confusion_matrix(y_test, predict_test)
    cr_result = classification_report(y_test,predict_test)

    model_name = str(model).split('(')[0]

    # print model name in blue color
    print('\033[1m' + model_name + '\033[0m')
    # print -----------------------------------
    separator()
    print('Train Score for '+str(model_name)+': ', (scoreTrain))
    separator()
    print('Test Score for '+str(model_name)+': ', (scoreTest))
    separator()
    print('Confusion Matrix for '+str(model_name)+' for test : \n', (cm_result))
    separator()
    print('Classification Report for '+str(model_name)+' for test : \n', str(cr_result))
    separator()

    # feature_importance = np.array(model.feature_importances_)
    # # print(len(list(data.iloc[:, :-1].columns)))
    # # print(len(feature_importance))
    # rank_of_coef = pd.DataFrame({'coef_name': list(data.columns), 'feature_importance': feature_importance })
    # print(rank_of_coef.sort_values(by=['feature_importance'], ascending=False).head(n=20))
    # 保存整個模型
    # torch.save(model, model_path)
    return model

In [10]:
model = train_svm_model(tfidf_train_data, '../Data/Model/tfidf_scaler.pkl')

(1610, 37675)
(1288, 37675)
[0.96899225 0.95348837 0.97674419 0.9844358  0.97276265]
[1mSVC[0m
--------------------------------------------------
Train Score for SVC:  1.0
--------------------------------------------------
Test Score for SVC:  0.984472049689441
--------------------------------------------------
Confusion Matrix for SVC for test : 
 [[157   0]
 [  5 160]]
--------------------------------------------------
Classification Report for SVC for test : 
               precision    recall  f1-score   support

           0       0.97      1.00      0.98       157
           1       1.00      0.97      0.98       165

    accuracy                           0.98       322
   macro avg       0.98      0.98      0.98       322
weighted avg       0.98      0.98      0.98       322

--------------------------------------------------


In [11]:
model = train_svm_model(word2vec_train_data,  '../Data/Model/tfidf_scaler.pkl')

(1610, 50)
(1288, 50)
[0.96899225 0.97674419 0.96124031 0.9844358  0.98054475]
[1mSVC[0m
--------------------------------------------------
Train Score for SVC:  0.9968944099378882
--------------------------------------------------
Test Score for SVC:  0.9813664596273292
--------------------------------------------------
Confusion Matrix for SVC for test : 
 [[154   3]
 [  3 162]]
--------------------------------------------------
Classification Report for SVC for test : 
               precision    recall  f1-score   support

           0       0.98      0.98      0.98       157
           1       0.98      0.98      0.98       165

    accuracy                           0.98       322
   macro avg       0.98      0.98      0.98       322
weighted avg       0.98      0.98      0.98       322

--------------------------------------------------


In [12]:
model = train_svm_model(bert_train_data,'../Data/Model/tfidf_scaler.pkl')

(1610, 768)
(1288, 768)
[1.         0.98837209 0.99224806 0.99610895 1.        ]
[1mSVC[0m
--------------------------------------------------
Train Score for SVC:  1.0
--------------------------------------------------
Test Score for SVC:  1.0
--------------------------------------------------
Confusion Matrix for SVC for test : 
 [[157   0]
 [  0 165]]
--------------------------------------------------
Classification Report for SVC for test : 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       157
           1       1.00      1.00      1.00       165

    accuracy                           1.00       322
   macro avg       1.00      1.00      1.00       322
weighted avg       1.00      1.00      1.00       322

--------------------------------------------------


In [20]:
def evaluate_on_new_data(model, scaler, new_data):
    data = new_data.copy()
    y_new = data.pop('source')  # 假設標籤列名為 'label'
    X_new_scaled = scaler.transform(data.values)

    # 將數據轉換為 PyTorch 張量並移動到 GPU
    X_new_tensor = torch.tensor(X_new_scaled, dtype=torch.float32).to(device)
    y_new_tensor = torch.tensor(y_new.values, dtype=torch.float32).view(-1, 1).to(device)

    # 測試模型並計算 R² Score
    model.eval()
    with torch.no_grad():
        y_pred_new = model(X_new_tensor).cpu().numpy()
        y_new_numpy = y_new_tensor.cpu().numpy()

        new_r2 = r2_score(y_new_numpy, y_pred_new)

    print(f'New Data R² Score: {new_r2:.4f}')

In [5]:
model, scaler = train_mlp_model(tfidf_train_data, '../Data/Model/tfidf_mlp_model.pth', '../Data/Model/tfidf_scaler.pkl')
evaluate_on_new_data(model, scaler, tfidf_val_data)

  from .autonotebook import tqdm as notebook_tqdm


Epoch [10/100], Loss: 0.0090
Epoch [20/100], Loss: 0.0000
Epoch [30/100], Loss: 0.0000
Epoch [40/100], Loss: 0.0000
Epoch [50/100], Loss: 0.0000
Epoch [60/100], Loss: 0.0000
Epoch [70/100], Loss: 0.0000
Epoch [80/100], Loss: 0.0000
Epoch [90/100], Loss: 0.0000
Epoch [100/100], Loss: 0.0000
Train R² Score: 1.0000
Test R² Score: 0.8957
New Data R² Score: 0.9193


In [6]:
model, scaler = train_mlp_model(word2vec_train_data, '../Data/Model/word2vec_mlp_model.pth', '../Data/Model/word2vec_scaler.pkl')
evaluate_on_new_data(model, scaler, word2vec_val_data)

Epoch [10/100], Loss: 0.0472
Epoch [20/100], Loss: 0.0051
Epoch [30/100], Loss: 0.0004
Epoch [40/100], Loss: 0.0001
Epoch [50/100], Loss: 0.0000
Epoch [60/100], Loss: 0.0000
Epoch [70/100], Loss: 0.0000
Epoch [80/100], Loss: 0.0000
Epoch [90/100], Loss: 0.0000
Epoch [100/100], Loss: 0.0000
Train R² Score: 1.0000
Test R² Score: 0.9806
New Data R² Score: 0.9357


In [7]:
model, scaler = train_mlp_model(bert_train_data, '../Data/Model/bert_mlp_model.pth', '../Data/Model/bert_scaler.pkl')
evaluate_on_new_data(model, scaler, bert_val_data)

Epoch [10/100], Loss: 0.0005
Epoch [20/100], Loss: 0.0000
Epoch [30/100], Loss: 0.0000
Epoch [40/100], Loss: 0.0000
Epoch [50/100], Loss: 0.0000
Epoch [60/100], Loss: 0.0000
Epoch [70/100], Loss: 0.0000
Epoch [80/100], Loss: 0.0000
Epoch [90/100], Loss: 0.0000
Epoch [100/100], Loss: 0.0000
Train R² Score: 1.0000
Test R² Score: 1.0000
New Data R² Score: 1.0000
