##1. Tải bộ dữ liệu



In [1]:
!gdown 1dANzxlEjeCEbveFOWWqzSG-niVx-vCgK

Downloading...
From: https://drive.google.com/uc?id=1dANzxlEjeCEbveFOWWqzSG-niVx-vCgK
To: /content/sentiment_analysis.csv
  0% 0.00/1.10M [00:00<?, ?B/s]100% 1.10M/1.10M [00:00<00:00, 12.4MB/s]


##2. Import thư viện cần thiết

In [2]:
import pandas as pd
import numpy as np
import re
import nltk
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from nltk.tokenize import TweetTokenizer
from collections import defaultdict

##3. Đọc bộ dữ liệu

In [4]:
dataset_path = '/content/sentiment_analysis.csv'
df = pd.read_csv(dataset_path, index_col='id')
df

Unnamed: 0_level_0,label,tweet
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0,#fingerprint #Pregnancy Test https://goo.gl/h1...
2,0,Finally a transparant silicon case ^^ Thanks t...
3,0,We love this! Would you go? #talk #makememorie...
4,0,I'm wired I know I'm George I was made that wa...
5,1,What amazing service! Apple won't even talk to...
...,...,...
7916,0,Live out loud #lol #liveoutloud #selfie #smile...
7917,0,We would like to wish you an amazing day! Make...
7918,0,Helping my lovely 90 year old neighbor with he...
7919,0,Finally got my #smart #pocket #wifi stay conne...


##4. Tiền xử lý bộ dữ liệu

In [5]:
def text_normalize(text):
    # Retweet old acronum "RT" removal
    text = re.sub(r'^RT[\s]+', '', text)

    # Hyperlinks removal
    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text)

    # Hashtags removal
    text = re.sub(r'#', '', text)

    # Punctuation removal
    text = re.sub(r'[^\w\s]', '', text)

    # Tokenization
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
    return tokenizer.tokenize(text)

In [6]:
def get_freqs(df):
    freqs = defaultdict(lambda: 0)
    for idx, row in df.iterrows():
        tweet = row['tweet']
        label = row['label']

        tokens = text_normalize(tweet)
        for token in tokens:
            pair = (token, label)
            freqs[pair] += 1
    return freqs

In [7]:
freqs = get_freqs(df)
freqs

defaultdict(<function __main__.get_freqs.<locals>.<lambda>()>,
            {('fingerprint', 0): 4,
             ('pregnancy', 0): 1,
             ('test', 0): 8,
             ('finally', 0): 168,
             ('a', 0): 727,
             ('transparant', 0): 1,
             ('silicon', 0): 1,
             ('case', 0): 228,
             ('thanks', 0): 94,
             ('to', 0): 876,
             ('my', 0): 1227,
             ('uncle', 0): 4,
             ('yay', 0): 63,
             ('sony', 0): 701,
             ('xperia', 0): 54,
             ('s', 0): 38,
             ('sonyexperias', 0): 1,
             ('we', 0): 159,
             ('love', 0): 385,
             ('this', 0): 456,
             ('would', 0): 95,
             ('you', 0): 557,
             ('go', 0): 67,
             ('talk', 0): 12,
             ('makememories', 0): 1,
             ('unplug', 0): 2,
             ('relax', 0): 31,
             ('iphone', 0): 2905,
             ('smartphone', 0): 61,
             ('wifi',

In [8]:
def get_feature(text, freqs):
    tokens = text_normalize(text)

    X = np.zeros(3)
    X[0] = 1
    for token in tokens:
        X[1] += freqs[(token, 1)]
        X[2] += freqs[(token, 0)]
    return X

In [9]:
X = []
y = []

freqs = get_freqs(df)
for idx, row in df.iterrows():
    tweet = row['tweet']
    label = row['label']

    X.append(get_feature(tweet, freqs))
    y.append(label)

X = np.array(X)
y = np.array(y)
print(X)
print(X.shape)
print(y)
print(y.shape)

[[1.000e+00 2.000e+00 1.300e+01]
 [1.000e+00 2.788e+03 4.183e+03]
 [1.000e+00 1.425e+03 4.768e+03]
 ...
 [1.000e+00 4.635e+03 6.175e+03]
 [1.000e+00 1.959e+03 3.814e+03]
 [1.000e+00 3.181e+03 6.160e+03]]
(7920, 3)
[0 0 0 ... 0 0 0]
(7920,)


##5. Chia bộ train, val, test

In [10]:
val_size = 0.2
test_size = 0.125
random_state = 2
is_shuffle = True

X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    test_size=val_size,
    random_state=random_state,
    shuffle=is_shuffle
)

X_train, X_test, y_train, y_test = train_test_split(
    X_train, y_train,
    test_size=test_size,
    random_state=random_state,
    shuffle=is_shuffle
)

##6. Chuẩn hóa dữ liệu

In [11]:
from sklearn.preprocessing import StandardScaler

normalizer = StandardScaler()
X_train[:, 1:] = normalizer.fit_transform(X_train[:, 1:])
X_val[:, 1:] = normalizer.transform(X_val[:, 1:])
X_test[:, 1:] = normalizer.transform(X_test[:, 1:])

##7. Cài đặt các hàm quan trọng

In [12]:
# Hàm sigmoid:
def sigmoid(z):
    return 1 / (1 + np.exp(-z))

# Hàm dự đoán:
def predict(X, theta):
    dot_product = np.dot(X, theta)
    y_hat = sigmoid(dot_product)

    return y_hat

# Hàm tính loss:
def compute_loss(y_hat, y):
    y_hat = np.clip(y_hat, 1e-7, 1 - 1e-7)

    return (
        -y * np.log(y_hat) - (1 - y) * np.log(1 - y_hat)
    ).mean()

# Hàm tính gradient:
def compute_gradient(X, y, y_hat):
    return np.dot(X.T, (y_hat - y)) / y.size

# Hàm cập nhật trọng số:
def update_theta(theta, gradient, lr):
    return theta - lr * gradient

# Hàm tính độ chính xác:
def compute_accuracy(X, y, theta):
    y_hat = predict(X, theta).round()
    acc = (y_hat == y).mean()

    return acc

##8. Khai báo các siêu tham số và khởi tạo weights

In [13]:
lr = 0.01
epochs = 200
batch_size = 128

np.random.seed(random_state)
theta = np.random.uniform(size=X_train.shape[1])

##9. Huấn luyện mô hình

In [14]:
train_accs = []
train_losses = []
val_accs = []
val_losses = []

for epoch in range(epochs):
    train_batch_losses = []
    train_batch_accs = []
    val_batch_losses = []
    val_batch_accs = []

    for i in range(0, X_train.shape[0], batch_size):
        X_i = X_train[i:i + batch_size]
        y_i = y_train[i:i + batch_size]

        y_hat = predict(X_i, theta)

        train_loss = compute_loss(y_hat, y_i)
        gradient = compute_gradient(X_i, y_i, y_hat)
        theta = update_theta(theta, gradient, lr)

        train_batch_losses.append(train_loss)

    train_acc = compute_accuracy(X_train, y_train, theta)
    train_batch_accs.append(train_acc)

    y_val_hat = predict(X_val, theta)
    val_loss = compute_loss(y_val_hat, y_val)
    val_batch_losses.append(val_loss)

    val_acc = compute_accuracy(X_val, y_val, theta)
    val_batch_accs.append(val_acc)

    train_batch_loss = sum(train_batch_losses) / len(train_batch_losses)
    val_batch_loss = sum(val_batch_losses) / len(val_batch_losses)
    train_batch_acc = sum(train_batch_accs) / len(train_batch_accs)
    val_batch_acc = sum(val_batch_accs) / len(val_batch_accs)

    train_losses.append(train_batch_loss)
    val_losses.append(val_batch_loss)
    train_accs.append(train_batch_acc)
    val_accs.append(val_batch_acc)

    print(f'\nEPOCH {epoch + 1}:\tTraining loss: {train_batch_loss:.3f}\tValidation loss: {val_batch_loss:.3f}')


EPOCH 1:	Training loss: 0.772	Validation loss: 0.743

EPOCH 2:	Training loss: 0.724	Validation loss: 0.700

EPOCH 3:	Training loss: 0.684	Validation loss: 0.663

EPOCH 4:	Training loss: 0.651	Validation loss: 0.633

EPOCH 5:	Training loss: 0.624	Validation loss: 0.608

EPOCH 6:	Training loss: 0.601	Validation loss: 0.587

EPOCH 7:	Training loss: 0.581	Validation loss: 0.569

EPOCH 8:	Training loss: 0.565	Validation loss: 0.554

EPOCH 9:	Training loss: 0.551	Validation loss: 0.541

EPOCH 10:	Training loss: 0.539	Validation loss: 0.530

EPOCH 11:	Training loss: 0.528	Validation loss: 0.520

EPOCH 12:	Training loss: 0.519	Validation loss: 0.511

EPOCH 13:	Training loss: 0.511	Validation loss: 0.504

EPOCH 14:	Training loss: 0.504	Validation loss: 0.497

EPOCH 15:	Training loss: 0.497	Validation loss: 0.491

EPOCH 16:	Training loss: 0.492	Validation loss: 0.485

EPOCH 17:	Training loss: 0.486	Validation loss: 0.481

EPOCH 18:	Training loss: 0.482	Validation loss: 0.476

EPOCH 19:	Training

##10. Đánh giá mô hình

In [15]:
val_set_acc = compute_accuracy(X_val, y_val, theta)
test_set_acc = compute_accuracy(X_test, y_test, theta)

print('Evaluation on validation and test set:')
print(f'Accuracy (Validation): {val_set_acc}')
print(f'Accuracy (Test): {test_set_acc}')

Evaluation on validation and test set:
Accuracy (Validation): 0.8207070707070707
Accuracy (Test): 0.8396464646464646
