In [1]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score

# set device to gpu
device = (
"cuda"
if torch.cuda.is_available()
else "mps"
if torch.backends.mps.is_available()
else "cpu"
)
device

'mps'

In [2]:
tfidf_data = pd.read_csv('../Data/all_tfidf_vector.csv')
tfidf_data['source'] = tfidf_data['source'].apply(lambda x: 1 if x == 'ptt' else 0)
tfidf_data.set_index('Unnamed: 0', inplace=True)
tfidf_data.index.name = None
tfidf_data

Unnamed: 0,丁丁,丁丼,丁偏,丁入,丁勝,丁勻,丁味,丁咬感,丁塊,丁實,...,龜吼,龜山,龜山區,龜山島,龜意,龜氣,龜苓膏,龜萬,龜龜,source
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2008,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2009,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2010,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2011,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [3]:
word2vec_data = pd.read_csv('../Data/all_word2vec_vector.csv')
word2vec_data['source'] = word2vec_data['source'].apply(lambda x: 1 if x == 'ptt' else 0)
word2vec_data.set_index('Unnamed: 0', inplace=True)
word2vec_data.index.name = None
word2vec_data

Unnamed: 0,v0,v1,v2,v3,v4,v5,v6,v7,v8,v9,...,v41,v42,v43,v44,v45,v46,v47,v48,v49,source
0,-0.771740,-1.122951,-2.740019,-1.455619,-2.453897,2.584098,0.003290,-4.249466,0.038024,-2.036725,...,-2.575592,-2.157023,3.693957,1.180108,-0.119045,-0.375034,0.931739,-0.288399,-3.317509,1
1,-1.264917,-1.485122,-2.853180,-3.907526,-2.354339,1.849224,0.041251,-5.302798,-1.022643,-1.460521,...,-2.027088,-2.022411,3.619741,2.518206,0.164746,0.139362,0.242296,-0.612891,-2.046702,1
2,0.545562,-0.984973,-3.397347,-3.695013,-2.627015,3.009818,1.047864,-5.023787,0.177018,-2.660777,...,-3.480310,-0.298348,5.081692,1.393839,-0.308617,-0.088431,2.050133,0.326478,-2.584825,1
3,-0.676069,-1.015693,-2.595752,-4.186768,-3.405892,2.620121,0.655334,-5.261594,-0.151640,-2.821827,...,-2.867878,-1.007114,5.549149,1.384725,-0.094552,1.117431,1.516598,-0.739755,-2.702050,1
4,0.394582,-1.733394,-3.706931,-1.762239,-3.960566,2.377504,-0.202225,-2.133445,1.255376,-2.073871,...,-2.090236,-0.552351,4.927535,1.720784,1.235193,-0.830622,0.573635,-0.296566,-3.124098,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2008,-0.267541,-1.894767,-3.862698,-2.819665,-5.085258,2.828193,0.820895,-4.616461,-0.764121,-3.313679,...,-5.911887,-1.044043,4.024465,1.631647,0.225096,-0.390803,1.725868,-0.693092,-6.124576,0
2009,1.367300,0.478092,-3.086425,-2.912148,-2.887171,2.986663,1.178478,-5.338357,0.787550,-3.615940,...,-4.933023,-0.722975,5.045085,1.977219,0.723010,-1.789779,0.055758,2.353520,-4.092143,0
2010,-0.161211,-0.760100,-3.208897,-3.185342,-2.677257,4.249139,-0.219862,-5.229231,1.615627,-4.354275,...,-4.807690,-1.341681,4.329050,3.805667,1.489856,-3.245761,0.140854,0.615634,-3.973378,0
2011,1.811152,0.420892,-1.938110,-3.022245,-2.616310,4.921800,0.313476,-4.195392,1.484297,-2.114074,...,-5.735107,-0.350204,3.937828,1.022171,0.315242,0.051294,1.696289,2.280374,-5.254778,0


In [4]:
bert_data = pd.read_csv("../Data/all_bert_vector.csv")
bert_data['source'] = bert_data['source'].apply(lambda x: 1 if x == 'ptt' else 0)
bert_data

Unnamed: 0,v0,v1,v2,v3,v4,v5,v6,v7,v8,v9,...,v759,v760,v761,v762,v763,v764,v765,v766,v767,source
0,0.482289,0.002567,-0.588562,0.213909,-0.164422,-0.546130,0.359240,0.467617,-0.555582,-0.356817,...,-0.367303,0.120131,0.068178,-0.198242,0.108325,-0.449807,0.341683,0.003773,0.289636,1
1,0.569334,0.117720,-0.463536,0.238271,-0.190863,-0.493317,0.005829,0.358729,-0.309904,-0.576790,...,-0.556465,0.099134,0.002236,-0.173020,0.134463,-0.222570,0.392557,-0.023141,0.426285,1
2,0.476065,-0.089274,-0.577052,0.177087,-0.250644,-0.576189,0.178282,0.457851,-0.342015,-0.302689,...,-0.573780,0.145084,0.131299,-0.005712,0.151731,-0.485495,0.467338,0.036680,0.369509,1
3,0.699978,0.038673,-0.629142,0.254823,-0.150740,-0.511036,-0.049724,0.432495,-0.217387,-0.545995,...,-0.555327,0.139082,0.253673,-0.133184,0.238108,-0.440649,0.381738,0.005293,0.345910,1
4,0.474377,0.071660,-0.567027,0.117066,-0.088797,-0.665853,0.365925,0.620158,-0.264941,-0.338038,...,-0.447519,0.363528,0.054789,-0.006940,0.096666,-0.488630,0.556082,-0.021331,0.212194,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2008,0.520790,0.023278,-0.440306,0.030521,-0.082336,-0.448585,0.180265,0.690003,-0.374882,-0.413573,...,-0.448096,-0.031846,0.121534,-0.233416,0.240516,0.095192,0.611452,0.289429,0.256680,0
2009,0.438469,0.061487,-0.700103,0.153445,-0.274683,-0.301178,0.192171,0.611885,-0.553411,-0.381177,...,-0.378960,0.004228,0.087660,-0.258496,0.054177,-0.280153,0.513852,0.050605,0.373820,0
2010,0.734525,-0.143993,-0.402033,0.186052,-0.283591,-0.509124,0.107150,0.652794,-0.368750,-0.356416,...,-0.251269,0.112176,0.287556,0.052772,0.211563,-0.168681,0.332440,0.010061,0.293597,0
2011,0.577803,-0.199617,-0.484820,0.179146,-0.171064,-0.334219,0.271937,0.394994,-0.349607,-0.333682,...,-0.266281,0.098342,0.158844,-0.068416,0.130563,-0.590198,0.412623,0.093424,0.302986,0


In [5]:
def train_mlp_model(df, model_path):

    y = df.pop('source')  # 假設標籤列名為 'label'
    
    # 標準化數據
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(df.values)

    # 分割數據集
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

    # 將數據轉換為 PyTorch 張量並移動到 GPU
    X_train_tensor = torch.tensor(X_train, dtype=torch.float32).to(device)
    X_test_tensor = torch.tensor(X_test, dtype=torch.float32).to(device)
    y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).view(-1, 1).to(device)
    y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32).view(-1, 1).to(device)


    # 建立 MLP 模型
    input_size = X_train_tensor.shape[1]
    model = nn.Sequential(
        nn.Linear(input_size, 1024),
        nn.ReLU(),
        nn.Linear(1024, 512),
        nn.ReLU(),
        nn.Linear(512, 256),
        nn.ReLU(),
        nn.Linear(256, 1),
        nn.Sigmoid()
    ).to(device)

    # 定義損失函數和優化器
    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    # 訓練模型
    num_epochs = 100
    for epoch in range(num_epochs):
        model.train()
        optimizer.zero_grad()
        outputs = model(X_train_tensor)
        loss = criterion(outputs, y_train_tensor)
        loss.backward()
        optimizer.step()
        
        if (epoch+1) % 10 == 0:
            print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

    # 測試模型並計算 R² Score
    model.eval()
    with torch.no_grad():
        y_pred_train = model(X_train_tensor).cpu().numpy()
        y_pred_test = model(X_test_tensor).cpu().numpy()

        y_train_numpy = y_train_tensor.cpu().numpy()
        y_test_numpy = y_test_tensor.cpu().numpy()

        train_r2 = r2_score(y_train_numpy, y_pred_train)
        test_r2 = r2_score(y_test_numpy, y_pred_test)


    print(f'Train R² Score: {train_r2:.4f}')
    print(f'Test R² Score: {test_r2:.4f}')

    # 保存整個模型
    torch.save(model, model_path)
    return model

In [6]:
train_mlp_model(tfidf_data, '../Data/Model/tfidf_mlp_model.pth')

  from .autonotebook import tqdm as notebook_tqdm


Epoch [10/100], Loss: 0.0215
Epoch [20/100], Loss: 0.0001
Epoch [30/100], Loss: 0.0000
Epoch [40/100], Loss: 0.0000
Epoch [50/100], Loss: 0.0000
Epoch [60/100], Loss: 0.0000
Epoch [70/100], Loss: 0.0000
Epoch [80/100], Loss: 0.0000
Epoch [90/100], Loss: 0.0000
Epoch [100/100], Loss: 0.0000
Train R² Score: 1.0000
Test R² Score: 0.9863


Sequential(
  (0): Linear(in_features=37677, out_features=1024, bias=True)
  (1): ReLU()
  (2): Linear(in_features=1024, out_features=512, bias=True)
  (3): ReLU()
  (4): Linear(in_features=512, out_features=256, bias=True)
  (5): ReLU()
  (6): Linear(in_features=256, out_features=1, bias=True)
  (7): Sigmoid()
)

In [7]:
train_mlp_model(word2vec_data, '../Data/Model/word2vec_mlp_model.pth')

Epoch [10/100], Loss: 0.0882
Epoch [20/100], Loss: 0.0213
Epoch [30/100], Loss: 0.0024
Epoch [40/100], Loss: 0.0003
Epoch [50/100], Loss: 0.0000
Epoch [60/100], Loss: 0.0000
Epoch [70/100], Loss: 0.0000
Epoch [80/100], Loss: 0.0000
Epoch [90/100], Loss: 0.0000
Epoch [100/100], Loss: 0.0000
Train R² Score: 1.0000
Test R² Score: 0.9386


Sequential(
  (0): Linear(in_features=50, out_features=1024, bias=True)
  (1): ReLU()
  (2): Linear(in_features=1024, out_features=512, bias=True)
  (3): ReLU()
  (4): Linear(in_features=512, out_features=256, bias=True)
  (5): ReLU()
  (6): Linear(in_features=256, out_features=1, bias=True)
  (7): Sigmoid()
)

In [8]:
train_mlp_model(bert_data, '../Data/Model/bert_mlp_model.pth')

Epoch [10/100], Loss: 0.0002
Epoch [20/100], Loss: 0.0000
Epoch [30/100], Loss: 0.0000
Epoch [40/100], Loss: 0.0000
Epoch [50/100], Loss: 0.0000
Epoch [60/100], Loss: 0.0000
Epoch [70/100], Loss: 0.0000
Epoch [80/100], Loss: 0.0000
Epoch [90/100], Loss: 0.0000
Epoch [100/100], Loss: 0.0000
Train R² Score: 1.0000
Test R² Score: 1.0000


Sequential(
  (0): Linear(in_features=768, out_features=1024, bias=True)
  (1): ReLU()
  (2): Linear(in_features=1024, out_features=512, bias=True)
  (3): ReLU()
  (4): Linear(in_features=512, out_features=256, bias=True)
  (5): ReLU()
  (6): Linear(in_features=256, out_features=1, bias=True)
  (7): Sigmoid()
)