In [1]:
import pandas as pd
import numpy as np
import spacy
import re

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression


In [2]:
train = pd.read_csv('simplified_emotions.csv')
test = pd.read_csv('test.csv', delimiter='\t')

In [3]:
X_train = train['Sentences'].values
y_train = train['Emotions'].values

X_test = test['sentence'].values

In [4]:
vectorizer = CountVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# TF-IDF

When I implemented tf-idf in my model, I noticed there was an increase in the F1-score in the public board. When I used TF-IDF with logistic regression, it helps the model to start paying attention to words that are important in each sentence. This makes the model better at telling the difference between different emotions. TF-IDF also makes sure that words that show up a lot, like "the" or "and," don't have too much influence on the model. This way, the model can focus on the words that really matter for figuring out emotions in sentences.

In [5]:
tfidf_vectorizer = TfidfTransformer()
X_train = tfidf_vectorizer.fit_transform(X_train_vectorized)
X_test = tfidf_vectorizer.transform(X_test_vectorized)

In [6]:
lr = LogisticRegression()
lr.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [7]:
y_pred = lr.predict(X_test)

submission = pd.DataFrame({'id': test['id'], 'emotion': y_pred})

submission

Unnamed: 0,id,emotion
0,0,other
1,1,other
2,2,other
3,3,other
4,4,other
...,...,...
1431,1431,happiness
1432,1432,happiness
1433,1433,other
1434,1434,happiness


In [8]:
submission["emotion"].value_counts()

emotion
other        1202
happiness     234
Name: count, dtype: int64

In [9]:
submission.to_csv('submissions/submission_lr_tdif.csv', index=False)