In [61]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
from sklearn.decomposition import TruncatedSVD
import torch
import torch.nn as nn


In [62]:
df_train = pd.read_csv('Sentiment-Analysis/train_data.csv', delimiter=',')[: 10000]
df_train['label'].value_counts()

label
2    4861
0    3943
1    1196
Name: count, dtype: int64

In [63]:
df_val = pd.read_csv('Sentiment-Analysis/val_data.csv', delimiter=',')[: 1000]
df_val['label'].value_counts()

label
2    444
0    408
1    148
Name: count, dtype: int64

In [64]:
df_test = pd.read_csv('Sentiment-Analysis/test_data.csv', delimiter=',')[: 1000]
df_test['label'].value_counts()

label
2    493
0    399
1    108
Name: count, dtype: int64

In [66]:
tfidf = TfidfVectorizer(use_idf=True, smooth_idf=True, analyzer='word')
pca = TruncatedSVD(n_components=500)

X_train, Y_train = tfidf.fit_transform(list(df_train['text'])), torch.tensor(list(df_train['label']))
X_train = torch.tensor(pca.fit_transform(X_train), dtype=torch.float32)

X_val, Y_val = tfidf.transform(list(df_val['text'])), torch.tensor(list(df_val['label']))
X_val = torch.tensor(pca.transform(X_val), dtype=torch.float32)

X_test, Y_test = tfidf.transform(list(df_test['text'])), torch.tensor(list(df_test['label']))
X_test = torch.tensor(pca.transform(X_test), dtype=torch.float32)


print(X_train.shape, X_val.shape, X_test.shape)         

torch.Size([10000, 500]) torch.Size([1000, 500]) torch.Size([1000, 500])


In [67]:
def compute_metrics(y_true, y_pred):
    precision = precision_score(y_true, y_pred, average='macro')
    recall = recall_score(y_true, y_pred, average='macro')
    f1 = f1_score(y_true, y_pred, average='macro')

    return precision, recall, f1

In [69]:
logistic_regression = RandomizedSearchCV(LogisticRegression(), param_distributions={
    'C': [0.5, 1, 2, 3, 4],
    'max_iter': [100, 500, 1000]
}, cv=5, scoring='f1_macro')

logistic_regression.fit(X_train, Y_train)
y_hat = logistic_regression.predict(X_test)

print(compute_metrics(Y_test, y_hat))

(0.6371364351656216, 0.5766583369248754, 0.5930820690541875)


In [70]:
class FFN(nn.Module):
    def __init__(self, in_feats, hidden_size, out_feats):
        super(FFN, self).__init__()
        self.lin1 = nn.Linear(in_feats, hidden_size)
        self.relu = nn.ReLU()
        self.lin2 = nn.Linear(hidden_size, out_feats)

    def forward(self, X):
        return self.lin2(self.relu(self.lin1(X)))


in_feats = X_train.shape[1]
hidden_size = in_feats * 8
out_feats = 3
epochs = 100
lr = 0.03

ffn = FFN(in_feats, hidden_size, out_feats)
optimizer = torch.optim.AdamW(ffn.parameters(), lr)
loss_fn = nn.CrossEntropyLoss()

for epoch in range(epochs):
    optimizer.zero_grad()
    y_hat = ffn.forward(X_train)
    loss = loss_fn(y_hat, Y_train)
    loss.backward()
    optimizer.step()

    with torch.no_grad():
        p, r, f1 = compute_metrics(Y_val, torch.argmax(ffn.forward(X_val), dim=1))
        print(f'epoch: {epoch + 1} loss: {loss.item()} dev f1-macro: {f1}')

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


epoch: 1 loss: 1.1024974584579468 dev f1-macro: 0.21574351832011998


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


epoch: 2 loss: 1.9625439643859863 dev f1-macro: 0.1931818181818182


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


epoch: 3 loss: 2.794063091278076 dev f1-macro: 0.33001907830283117


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


epoch: 4 loss: 0.8826376795768738 dev f1-macro: 0.38692810457516336


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


epoch: 5 loss: 0.7822012305259705 dev f1-macro: 0.28834102190258487
epoch: 6 loss: 0.813383936882019 dev f1-macro: 0.35040520639945383
epoch: 7 loss: 0.7523594498634338 dev f1-macro: 0.4226115397426027
epoch: 8 loss: 0.6647911071777344 dev f1-macro: 0.4805865907801283
epoch: 9 loss: 0.6343325972557068 dev f1-macro: 0.5350691976182196
epoch: 10 loss: 0.5584775805473328 dev f1-macro: 0.5848623498369797
epoch: 11 loss: 0.49861204624176025 dev f1-macro: 0.6085708034694683
epoch: 12 loss: 0.45301786065101624 dev f1-macro: 0.6478583169745037
epoch: 13 loss: 0.3963864743709564 dev f1-macro: 0.6491157522896854
epoch: 14 loss: 0.35223880410194397 dev f1-macro: 0.6445630355153572
epoch: 15 loss: 0.30504703521728516 dev f1-macro: 0.6429983422149473
epoch: 16 loss: 0.2637464702129364 dev f1-macro: 0.6403909107930839
epoch: 17 loss: 0.22303786873817444 dev f1-macro: 0.6437890082273877
epoch: 18 loss: 0.18889452517032623 dev f1-macro: 0.6382259514859359
epoch: 19 loss: 0.16079837083816528 dev f1-mac

In [71]:
with torch.no_grad():
    p, r, f1 = compute_metrics(Y_test, torch.argmax(ffn.forward(X_test), dim=1))
    print(f'precision-macro: {p} recall-macro: {r} dev f1-macro: {f1}')

precision-macro: 0.6158086562592433 recall-macro: 0.6072983808028823 dev f1-macro: 0.6102392381282622
