Import all the necessary packages here:

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

load the dataset:

In [2]:
df = pd.read_csv("../datasets/sentiment_dataset.csv", encoding="ISO-8859-1")

Check the meta data:

In [3]:
df.columns

Index(['0', '1467810369', 'Mon Apr 06 22:19:45 PDT 2009', 'NO_QUERY',
       '_TheSpecialOne_',
       '@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D'],
      dtype='object')

Define column names.

In [4]:
df.columns = ["label", "id", "date", "query", "username", "comment"]
df.columns

Index(['label', 'id', 'date', 'query', 'username', 'comment'], dtype='object')

In [5]:
df.head()

Unnamed: 0,label,id,date,query,username,comment
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew


Check unique labels:

In [6]:
df["label"].unique()

array([0, 4], dtype=int64)

Downscale the label

In [7]:
df["label"] = df["label"].replace(4, 1)
df["label"].unique()

array([0, 1], dtype=int64)

Drop the irrelevant columns!

In [8]:
df = df.dropna()
df

Unnamed: 0,label,id,date,query,username,comment
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew
...,...,...,...,...,...,...
1599994,1,2193601966,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,AmandaMarie1028,Just woke up. Having no school is the best fee...
1599995,1,2193601969,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,TheWDBoards,TheWDB.com - Very cool to hear old Walt interv...
1599996,1,2193601991,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,bpbabe,Are you ready for your MoJo Makeover? Ask me f...
1599997,1,2193602064,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,tinydiamondz,Happy 38th Birthday to my boo of alll time!!! ...


In [9]:
df = df.drop(columns=["id", "date", "query", "username"])
df

Unnamed: 0,label,comment
0,0,is upset that he can't update his Facebook by ...
1,0,@Kenichan I dived many times for the ball. Man...
2,0,my whole body feels itchy and like its on fire
3,0,"@nationwideclass no, it's not behaving at all...."
4,0,@Kwesidei not the whole crew
...,...,...
1599994,1,Just woke up. Having no school is the best fee...
1599995,1,TheWDB.com - Very cool to hear old Walt interv...
1599996,1,Are you ready for your MoJo Makeover? Ask me f...
1599997,1,Happy 38th Birthday to my boo of alll time!!! ...


Create the tf-idfs for each comment:

In [10]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df["comment"])
X

<1599999x684357 sparse matrix of type '<class 'numpy.float64'>'
	with 18986976 stored elements in Compressed Sparse Row format>

In [11]:
y = df["label"]
y

0          0
1          0
2          0
3          0
4          0
          ..
1599994    1
1599995    1
1599996    1
1599997    1
1599998    1
Name: label, Length: 1599999, dtype: int64

Now let's split the test and training data!

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

Let's fit a new instance of LR to this data:

In [13]:
model = LogisticRegression(max_iter=1000)
model

In [14]:
model =model.fit(X_train, y_train)

In [15]:
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)


In [16]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.81      0.78      0.80    319728
           1       0.79      0.82      0.80    320272

    accuracy                           0.80    640000
   macro avg       0.80      0.80      0.80    640000
weighted avg       0.80      0.80      0.80    640000



In [17]:
def make_prediction(comment: str) -> None:
    """
    Uses the previosly trianed model to determine if a comment is positive
    or negative in terms of it's sentiment.
    Positive: 1
    Negative: 0.
    """
    comment = vectorizer.transform([comment])
    return model.predict(comment)[0]