# Pipeline 1
Content -> Word2Vec -> Encoding -> ML Classification

In [1]:
# import package
import numpy as np
import pandas as pd
from torch.utils.data import DataLoader, Dataset
import gensim
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression

In [2]:
# set random seed
np.random.seed(42)

## Dataset

In [3]:
# load data
train_df = pd.read_csv('dataset/train.csv', sep='\t', encoding='utf-8')
test_df = pd.read_csv('dataset/test.csv', sep='\t', encoding='utf-8')
print(f"Training data shape: {train_df.shape}")
print(train_df.head())
print(f"Testing data shape: {test_df.shape}")
print(test_df.head())   # no labels

Training data shape: (4987, 2)
                                                text label
0  Get the latest from TODAY Sign up for our news...     1
1  2d  Conan On The Funeral Trump Will Be Invited...     1
2  It’s safe to say that Instagram Stories has fa...     0
3  Much like a certain Amazon goddess with a lass...     0
4  At a time when the perfect outfit is just one ...     0
Testing data shape: (1247, 2)
   id                                               text
0   2  The 2017 Teen Choice Awards ceremony was held ...
1   3  The concert, part of “The Joshua Tree Tour,” w...
2   4  Selena Gomez refuses to talk to her mother abo...
3   5  This is worse than a lump of coal in your stoc...
4   6  Luann De Lesseps is going to rehab after her a...


In [4]:
# EDA

# check NaN values
print(f"Null values in training data:")
print(train_df.isnull().sum())
# print unique labels
print(f"Unique labels in training data:")
print(train_df['label'].unique())
# find the row that label == 'label'
print(f"Rows with label 'label':")
print(train_df[train_df['label'] == 'label'])

# remove the row that label == 'label'
train_df = train_df[train_df['label'] != 'label']

# save labels as int type
train_df['label'] = train_df['label'].astype(int)

Null values in training data:
text     0
label    0
dtype: int64
Unique labels in training data:
['1' '0' 'label']
Rows with label 'label':
         text  label
1615  content  label


In [5]:
# train validation split
train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=42)
print(train_df.shape)
print(val_df.shape)

print(train_df.head())
print(val_df.head())

(3988, 2)
(998, 2)
                                                   text  label
2415  Singer Aaron Carter, who has previously been f...      0
3159  Nineteen years ago, a gay man and his straight...      0
3009  The mother! of all relationships is over.  Jen...      1
3612  The Republican war on women continues unabated...      1
4518  As Taylor Swift calls out the haters on her ne...      0
                                                   text  label
1489  George Timothy Clooney (born May 6, 1961) is a...      1
2755  Do you feel it in your fingers? Do you feel it...      0
465   Advertisement  The royal family gathered this ...      0
2489  Roger Ailes, Former Fox News CEO, Dies At 77  ...      0
676   American serial child sexual abuser and physic...      0


# Word2Vec

In [6]:
# 把句子分成單字
train_sentences = [sentence.split() for sentence in train_df['text']]
val_sentences = [sentence.split() for sentence in val_df['text']]
test_sentences = [sentence.split() for sentence in test_df['text']]

print(train_sentences[0])
print(val_sentences[0])
print(test_sentences[0])

['Singer', 'Aaron', 'Carter,', 'who', 'has', 'previously', 'been', 'forthcoming', 'about', 'his', 'substance', 'abuse', 'and', 'eating', 'disorder,', 'is', 'now', 'opening', 'up', 'about', 'his', 'sexuality.', 'The', '29-year-old', 'brother', 'of', 'Backstreet', 'Boys', 'alum', 'Nick', 'Carter', 'tweeted', 'on', 'Saturday', 'night', 'a', 'long', 'open', 'letter', 'in', 'which', 'he', 'acknowledges', 'his', 'attraction', 'to', 'both', 'men', 'and', 'women', 'since', 'he', 'was', '13.', '"There’s', 'something', 'I’d', 'like', 'to', 'say', 'that', 'I', 'feel', 'is', 'important', 'for', 'myself', 'and', 'my', 'identity', 'that', 'has', 'been', 'weighing', 'on', 'my', 'chest', 'for', 'nearly', 'half', 'of', 'my', 'life,"', 'he', 'wrote.', '"This', 'doesn’t', 'bring', 'me', 'shame,', 'just', 'a', 'weight', 'and', 'burden', 'I', 'have', 'held', 'onto', 'for', 'a', 'long', 'time', 'that', 'I', 'would', 'like', 'lifted', 'off', 'me."', 'He', 'explains,', '"I', 'grew', 'up', 'in', 'this', 'enter

In [7]:
# 訓練word2vec模型
word2vec_model = Word2Vec(train_sentences, vector_size=100, window=5, min_count=1, workers=4)
word2vec_model.train(train_sentences, total_examples=len(train_sentences), epochs=10)

(21615697, 26397440)

In [8]:
# 把句子轉換成向量
def sentence_to_vector(sentence, model):
    words = sentence.split()
    vector = np.mean([model.wv[word] for word in words if word in model.wv], axis=0)
    return vector

In [9]:
# 將句子轉換為向量
X_train = np.array([sentence_to_vector(sentence, word2vec_model) for sentence in train_df['text']])
y_train = train_df['label'].values

X_val = np.array([sentence_to_vector(sentence, word2vec_model) for sentence in val_df['text']])
y_val = val_df['label'].values

X_test = np.array([sentence_to_vector(sentence, word2vec_model) for sentence in test_df['text']])

X_train = np.nan_to_num(X_train)
X_val = np.nan_to_num(X_val)
X_test = np.nan_to_num(X_test)

print(X_train[0])
print(X_val[0])
print(X_test[0])

[ 0.35088202 -0.29509535 -0.5618353  -1.3412067   1.1911632   0.07781454
  0.5047654   0.36611196 -1.1751217  -1.1126319   0.18229185 -0.5232262
  0.35012913  0.20127165  0.47247872  0.4306528   0.6133306  -0.08717418
  0.6590974  -1.1371609   0.40045935  0.02766516  0.23296683  0.09252621
 -1.5692799   0.30900177 -0.35449332  0.05814846  0.1777359  -0.8193022
 -1.3566234  -0.19416773  1.1533852  -0.29473552  0.23983485 -1.3387115
  0.0182434  -0.03306995 -0.21799798 -0.07500646  0.43992034  0.16314623
 -0.45600447 -0.20230165 -0.32542163  0.43894076  0.53585833  0.72273946
 -0.16965775  0.3108655   0.30317724  0.15120286  0.6302182   0.23104557
  0.6564047   0.07569491  0.02747363  1.5145037   0.6233521  -0.37034655
 -0.32526258  0.58198303 -0.77887326  0.45506787 -0.5885206  -0.10961789
  0.49023247  0.5969614   0.82236403  0.20018351  0.01071951 -0.7073056
 -0.19569285 -0.01543058 -0.25014985 -0.88514036 -0.03991383  0.61903054
  0.8767525  -0.69850886 -0.6301532   0.13939852  0.403

# Validation

- Logistic Regression: 0.71
- SVM: 0.72

In [10]:
# SVM
clf = svm.SVC(kernel='linear')
clf.fit(X_train, y_train)

# predict
y_val_pred = clf.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
val_df['label'] = y_val_pred
val_df.to_csv('result/val_SVM.csv', index=False)
print(f'Validation Accuracy: {val_accuracy:.2f}')

Validation Accuracy: 0.72


In [11]:
# Logistic Regression
logreg_clf = LogisticRegression(max_iter=1000)
logreg_clf.fit(X_train, y_train)

# predict
y_val_pred = logreg_clf.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
val_df['label'] = y_val_pred
val_df.to_csv('result/val_LR.csv', index=False)
print(f'Validation Accuracy: {val_accuracy:.2f}')

Validation Accuracy: 0.71


# Test

In [12]:
# predict test data
y_test_pred = logreg_clf.predict(X_test)
test_df['label'] = y_test_pred
test_df.to_csv('result/test_LR.csv', index=False)

# predict test data
y_test_pred = clf.predict(X_test)
test_df['label'] = y_test_pred
test_df.to_csv('result/test_SVM.csv', index=False)
