## Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [2]:
# Remote archive holding the tweet sentiment dataset. The URL ends in ".zip";
# read_csv's default compression="infer" is relied on to unpack it.
TWEETS_URL = "https://github.com/murpi/wilddata/raw/master/quests/tweets.zip"

df = pd.read_csv(TWEETS_URL)

# Preview the first rows to check the columns that were loaded.
df.head()

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative


## Preprocessing

In [3]:
# Discard every tweet labelled 'neutral'; only the remaining labels are modelled.
is_not_neutral = df['sentiment'].ne('neutral')
df = df[is_not_neutral]

# Relative frequency of each remaining sentiment label.
df['sentiment'].value_counts(normalize=True)

positive    0.524476
negative    0.475524
Name: sentiment, dtype: float64

In [4]:
# Renumber the rows 0..n-1 after the neutral tweets were filtered out.
# drop=True discards the old index instead of keeping it as a column.
# The frame is rebound rather than mutated with inplace=True, which keeps the
# step chainable and avoids in-place mutation of a filtered frame.
df = df.reset_index(drop=True)

About 52% of the tweets are positive, after removing the neutral ones.

In [5]:
# Features are the raw tweet texts; the target is the sentiment label.
X, y = df['text'], df['sentiment']

# Keep 75% of the rows for training and hold out the rest for evaluation.
# The fixed random_state makes the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.75,
    random_state=32,
)

# Fit the TF-IDF vocabulary and weights on the training texts only, so that
# nothing from the held-out tweets leaks into the features.
vectorizer = TfidfVectorizer()
vectorizer.fit(X_train)

TfidfVectorizer()

In [6]:
# Encode the training tweets as TF-IDF vectors using the already-fitted vectorizer.
X_train_TV = vectorizer.transform(raw_documents=X_train)

# Display the matrix summary (shape and number of stored elements).
X_train_TV

<12272x15806 sparse matrix of type '<class 'numpy.float64'>'
	with 144578 stored elements in Compressed Sparse Row format>

In [7]:
# Encode the held-out tweets with the same fitted vectorizer (no refit), so both
# matrices share one feature space.
X_test_TV = vectorizer.transform(raw_documents=X_test)

# Display the matrix summary (shape and number of stored elements).
X_test_TV

<4091x15806 sparse matrix of type '<class 'numpy.float64'>'
	with 44633 stored elements in Compressed Sparse Row format>

## Classification

In [8]:
# Train a logistic regression classifier (default hyperparameters) on the
# TF-IDF features of the training tweets.
model = LogisticRegression()
model.fit(X_train_TV, y_train)

# Score the model on both splits; the gap between the two indicates how much it overfits.
train_accuracy = model.score(X_train_TV, y_train)
test_accuracy = model.score(X_test_TV, y_test)

print(f"Accuracy score on the train dataset: {train_accuracy}")
print(f"Accuracy score on the test dataset: {test_accuracy}")

Accuracy score on the train dataset: 0.9321219035202086
Accuracy score on the test dataset: 0.8731361525299438


The accuracy scores are as expected: about 93% on the train dataset and about 87% on the test dataset.

Compared to Bag of Words, there is a little less overfitting and fewer iterations are required, but the results look pretty similar.