In [None]:
# import package
import numpy as np
import pandas as pd
from torch.utils.data import DataLoader, Dataset
import gensim
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression

In [None]:
# Fix NumPy's global RNG so that NumPy-based randomness is repeatable
SEED = 42
np.random.seed(SEED)

## Dataset

In [None]:
# Read the tab-separated train / test files into DataFrames
csv_options = {'sep': '\t', 'encoding': 'utf-8'}
train_df = pd.read_csv('dataset/train.csv', **csv_options)
test_df = pd.read_csv('dataset/test.csv', **csv_options)

# Quick look at the training data
print(f"Training data shape: {train_df.shape}")
print(train_df.head())

# Quick look at the test data (no labels)
print(f"Testing data shape: {test_df.shape}")
print(test_df.head())

In [None]:
# EDA and label clean-up

# Count missing values per column
print("Null values in training data:")
print(train_df.isnull().sum())

# Show which distinct label values occur
print("Unique labels in training data:")
print(train_df['label'].unique())

# Rows whose label is the literal string 'label' are not real samples
# (they look like stray header lines inside the data).
is_header_row = train_df['label'] == 'label'
print("Rows with label 'label':")
print(train_df[is_header_row])

# Build the cleaned frame in one non-mutating chain:
#   * drop the stray header rows,
#   * drop rows with a missing text or label (they would otherwise crash the
#     int cast below or the later `.split()` calls),
#   * store labels as int.
# `.assign` returns a new DataFrame, so no value is written into a filtered
# slice and pandas raises no SettingWithCopyWarning.
train_df = (
    train_df.loc[~is_header_row]
    .dropna(subset=['text', 'label'])
    .assign(label=lambda frame: frame['label'].astype(int))
)

In [None]:
# Hold out 20% of the labelled rows as a validation set (fixed random_state)
train_df, val_df = train_test_split(train_df, random_state=42, test_size=0.2)

# Sizes of the two partitions
print(train_df.shape, val_df.shape, sep='\n')

# First rows of each partition
print(train_df.head(), val_df.head(), sep='\n')

In [None]:
# Split every sentence into whitespace-separated words
train_sentences, val_sentences, test_sentences = (
    [text.split() for text in frame['text']]
    for frame in (train_df, val_df, test_df)
)

# Show the first tokenised sentence of each split
print(train_sentences[0], val_sentences[0], test_sentences[0], sep='\n')

In [None]:
# Train the word2vec model on the training sentences only.
# Passing the corpus to the constructor already builds the vocabulary and runs
# training, so a separate `.train()` call afterwards would add a second,
# unintended training pass on top of the constructor's own epochs. The epoch
# count is therefore given here, once.
# `seed` fixes the initial vectors; NOTE(review): with workers > 1 thread
# scheduling can still make runs differ slightly - use workers=1 if exact
# reproducibility is required.
word2vec_model = Word2Vec(
    sentences=train_sentences,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
    epochs=10,
    seed=42,
)

In [None]:
# Convert a sentence into a single fixed-size vector
def sentence_to_vector(sentence, model):
    """Embed a sentence as the mean of its in-vocabulary word vectors.

    Args:
        sentence: Raw text, split on whitespace. A non-string value (for
            example NaN from an empty CSV cell) is treated as an empty
            sentence.
        model: Object exposing ``wv`` (supports ``in`` and ``[]`` lookups by
            word) and ``vector_size``.

    Returns:
        A 1-D array of length ``model.vector_size``. It is all zeros when the
        sentence has no word known to ``model.wv``, so every result has the
        same shape and the results can be stacked into a 2-D feature matrix.
    """
    if not isinstance(sentence, str):
        return np.zeros(model.vector_size, dtype=np.float32)

    word_vectors = [model.wv[word] for word in sentence.split() if word in model.wv]
    if not word_vectors:
        # np.mean([]) would return a scalar NaN (plus a RuntimeWarning), which
        # makes the stacked feature array ragged.
        return np.zeros(model.vector_size, dtype=np.float32)
    return np.mean(word_vectors, axis=0)

In [None]:
# Convert the sentences of each split into vectors; NaN entries are replaced
# by np.nan_to_num
X_train, X_val, X_test = (
    np.nan_to_num(
        np.array([sentence_to_vector(text, word2vec_model) for text in split_df['text']])
    )
    for split_df in (train_df, val_df, test_df)
)

# Targets for the two labelled splits
y_train = train_df['label'].values
y_val = val_df['label'].values

# Show the first feature vector of each split
print(X_train[0], X_val[0], X_test[0], sep='\n')

In [None]:
# SVM with a linear kernel, fitted on the training features
clf = svm.SVC(kernel='linear')
clf.fit(X_train, y_train)

# Predict on the validation split and score against the true labels
y_val_pred = clf.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)

# Export the predictions through a copy: `assign` leaves `val_df` untouched,
# so its ground-truth 'label' column is not replaced by predictions and
# re-running earlier cells still sees the real labels.
val_df.assign(label=y_val_pred).to_csv('val_SVM.csv', index=False)
print(f'Validation Accuracy: {val_accuracy:.2f}')

In [None]:
# Logistic Regression, fitted on the training features
logreg_clf = LogisticRegression(max_iter=1000)
logreg_clf.fit(X_train, y_train)

# Predict on the validation split and score against the true labels
y_val_pred = logreg_clf.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)

# Export the predictions through a copy: `assign` leaves `val_df` untouched,
# so its ground-truth 'label' column is not replaced by predictions and
# re-running earlier cells still sees the real labels.
val_df.assign(label=y_val_pred).to_csv('val_LR.csv', index=False)
print(f'Validation Accuracy: {val_accuracy:.2f}')

In [None]:
# Predict the test data with each fitted model and write one CSV per model
# (Logistic Regression first, then SVM)
for fitted_model, output_path in ((logreg_clf, 'test_LR.csv'), (clf, 'test_SVM.csv')):
    y_test_pred = fitted_model.predict(X_test)
    test_df['label'] = y_test_pred
    test_df.to_csv(output_path, index=False)
