Import all the necessary packages here:

In [15]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

load the dataset:

In [16]:
df = pd.read_csv("../datasets/sentiment_dataset.csv", encoding="ISO-8859-1")

Check the meta data:

In [17]:
df.columns

Index(['0', '1467810369', 'Mon Apr 06 22:19:45 PDT 2009', 'NO_QUERY',
       '_TheSpecialOne_',
       '@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D'],
      dtype='object')

Define column names.

In [18]:
df.columns = ["label", "id", "date", "query", "username", "comment"]
df.columns

Index(['label', 'id', 'date', 'query', 'username', 'comment'], dtype='object')

In [19]:
df.head()

Unnamed: 0,label,id,date,query,username,comment
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew


Check unique labels:

In [20]:
df["label"].unique()

array([0, 4], dtype=int64)

Downscale the label

In [21]:
df["label"] = df["label"].replace(4, 1)
df["label"].unique()

array([0, 1], dtype=int64)

Drop the irrelevant columns!

In [22]:
df = df.dropna()
df

Unnamed: 0,label,id,date,query,username,comment
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew
...,...,...,...,...,...,...
1599994,1,2193601966,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,AmandaMarie1028,Just woke up. Having no school is the best fee...
1599995,1,2193601969,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,TheWDBoards,TheWDB.com - Very cool to hear old Walt interv...
1599996,1,2193601991,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,bpbabe,Are you ready for your MoJo Makeover? Ask me f...
1599997,1,2193602064,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,tinydiamondz,Happy 38th Birthday to my boo of alll time!!! ...


In [23]:
df = df.drop(columns=["id", "date", "query", "username"])
df

Unnamed: 0,label,comment
0,0,is upset that he can't update his Facebook by ...
1,0,@Kenichan I dived many times for the ball. Man...
2,0,my whole body feels itchy and like its on fire
3,0,"@nationwideclass no, it's not behaving at all...."
4,0,@Kwesidei not the whole crew
...,...,...
1599994,1,Just woke up. Having no school is the best fee...
1599995,1,TheWDB.com - Very cool to hear old Walt interv...
1599996,1,Are you ready for your MoJo Makeover? Ask me f...
1599997,1,Happy 38th Birthday to my boo of alll time!!! ...


Create the tf-idfs for each comment:

In [24]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df["comment"])
X

<1599999x684357 sparse matrix of type '<class 'numpy.float64'>'
	with 18986976 stored elements in Compressed Sparse Row format>

In [26]:
y = df["label"]
y

0          0
1          0
2          0
3          0
4          0
          ..
1599994    1
1599995    1
1599996    1
1599997    1
1599998    1
Name: label, Length: 1599999, dtype: int64

Now let's split the test and training data!

In [27]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

Let's fit a new instance of LR to this data:

In [29]:
model = LogisticRegression(max_iter=1000)
model

In [30]:
model =model.fit(X_train, y_train)

In [31]:
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)


In [32]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.81      0.78      0.80    319728
           1       0.79      0.82      0.80    320272

    accuracy                           0.80    640000
   macro avg       0.80      0.80      0.80    640000
weighted avg       0.80      0.80      0.80    640000



In [46]:
def make_prediction(comment: str) -> None:
    """
    Uses the previosly trianed model to determine if a comment is positive
    or negative in terms of it's sentiment.
    Positive: 1
    Negative: 0.
    """
    comment = vectorizer.transform([comment])
    return model.predict(comment)[0]

So the model has an accuracy, which is acceptable! Hence we shall now use this to create the feature for our original set!

In [33]:
sarcasm_data = pd.read_csv("../main_datasets/train_sarcasm.csv")

In [34]:
sentiment = list() #this will be our list of predictions for each comment.

In [65]:
sarcasm_data.columns
sarcasm_data = sarcasm_data.dropna()

In [67]:
sarcasm_data.head()


Unnamed: 0.1,Unnamed: 0,label,comment,votes
0,0,0,NC and NH.,-1
1,1,0,You do know west teams play against west teams...,-1
2,2,0,"They were underdogs earlier today, but since G...",3
3,3,0,"This meme isn't funny none of the ""new york ni...",-1
4,4,0,I could use one of those tools.,-1


In [71]:
X_sarcasm = vectorizer.transform(sarcasm_data["comment"])
print(X_sarcasm)
sarcasm_data["sentiment"] = model.predict(X_sarcasm)


  (0, 445290)	0.7072034826428388
  (0, 440867)	0.6783834660457788
  (0, 68526)	0.1991459443060681
  (1, 675496)	0.07574128198007558
  (1, 652287)	0.4361413562162685
  (1, 599998)	0.15044181611263965
  (1, 595722)	0.7736257784401225
  (1, 517493)	0.13330012391602775
  (1, 485102)	0.1615333219070897
  (1, 424348)	0.1258677214308434
  (1, 349754)	0.11736647021481776
  (1, 203744)	0.22594655752877382
  (1, 192925)	0.11119219744610416
  (1, 54482)	0.21452116472798524
  (2, 652075)	0.1874501367818558
  (2, 639349)	0.26234424862944694
  (2, 632681)	0.43404212302913237
  (2, 614214)	0.1415826386397645
  (2, 613860)	0.07629992350123302
  (2, 607679)	0.130947959650511
  (2, 606958)	0.15914228167905345
  (2, 600796)	0.07863591009572915
  (2, 555701)	0.20817771216709205
  (2, 473519)	0.42223176351272124
  (2, 426009)	0.27473570814098747
  :	:
  (1010770, 299012)	0.1207725452115816
  (1010770, 157727)	0.4484411048863809
  (1010770, 146449)	0.46796720984694884
  (1010770, 89826)	0.20674423214992077


In [72]:
sarcasm_data.columns

Index(['Unnamed: 0', 'label', 'comment', 'votes', 'sentiment'], dtype='object')

In [73]:
sarcasm_data.head()

Unnamed: 0.1,Unnamed: 0,label,comment,votes,sentiment
0,0,0,NC and NH.,-1,0
1,1,0,You do know west teams play against west teams...,-1,1
2,2,0,"They were underdogs earlier today, but since G...",3,0
3,3,0,"This meme isn't funny none of the ""new york ni...",-1,0
4,4,0,I could use one of those tools.,-1,1


We have now achieved the dataset that we wanted!

In [81]:
sarcasm_data.loc[sarcasm_data["label"]==1]

Unnamed: 0.1,Unnamed: 0,label,comment,votes,sentiment
33,33,1,But they'll have all those reviews!,-1,0
44,44,1,wow it is totally unreasonable to assume that ...,-1,1
45,45,1,Ho ho ho... But Melania said that there is no ...,-1,0
66,66,1,I can't wait until @potus starts a twitter war...,-1,1
69,69,1,gotta love the teachers who give exams on the ...,-1,1
...,...,...,...,...,...
1010821,1010821,1,I'm sure that Iran and N. Korea have the techn...,2,0
1010822,1010822,1,"whatever you do, don't vote green!",1,1
1010823,1010823,1,Perhaps this is an atheist conspiracy to make ...,1,1
1010824,1010824,1,The Slavs got their own country - it is called...,1,1


Finally, we shall now export this dataset!  

In [82]:
sarcasm_data.to_csv("../main_datasets/sarcasm_data.csv")