# Naive Bayes

In [1]:
import numpy as np
import pandas as pd
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [2]:
# Load the line-delimited JSON dataset and discard the column that is not needed.
df = (
    pd.read_json("/content/Sarcasm_Headlines_Dataset_v2.json", lines=True)  # one JSON record per line
    .drop(columns=['article_link'])  # dropped as irrelevant to the analysis
)
df.head()

Unnamed: 0,is_sarcastic,headline
0,1,thirtysomething scientists unveil doomsday clo...
1,0,dem rep. totally nails why congress is falling...
2,0,eat your veggies: 9 deliciously different recipes
3,1,inclement weather prevents liar from getting t...
4,1,mother comes pretty close to using word 'strea...


In [3]:
# NLTK's English stop-word list, held in a set for fast membership checks.
stop_words = set(stopwords.words('english'))
def remove_stopwords(text):
    """Return `text` with English stop words removed.

    The text is split on whitespace and each token is compared, lowercased,
    against `stop_words`. Tokens keep their original casing and any attached
    punctuation, so a token such as "the," is not treated as a stop word.
    The surviving tokens are re-joined with single spaces.
    """
    kept = []
    for token in text.split():
        if token.lower() in stop_words:
            continue  # stop word: skip it
        kept.append(token)
    return ' '.join(kept)

In [4]:
# Derive a stop-word-free copy of every headline; the original column is kept as is.
df['headline_without_stopwords'] = [remove_stopwords(headline) for headline in df['headline']]
df.head()

Unnamed: 0,is_sarcastic,headline,headline_without_stopwords
0,1,thirtysomething scientists unveil doomsday clo...,thirtysomething scientists unveil doomsday clo...
1,0,dem rep. totally nails why congress is falling...,dem rep. totally nails congress falling short ...
2,0,eat your veggies: 9 deliciously different recipes,eat veggies: 9 deliciously different recipes
3,1,inclement weather prevents liar from getting t...,inclement weather prevents liar getting work
4,1,mother comes pretty close to using word 'strea...,mother comes pretty close using word 'streamin...


In [5]:
# Split the dataset into train / validation / test partitions.
SEED = 42  # fixed seed so the shuffle, and every metric computed downstream, is reproducible
fractions = np.array([0.6, 0.2, 0.2]) # 60% training, 20% validation, 20% testing

# Shuffle the dataset. Sorting by index first restores the loaded row order, so
# re-running this cell yields the same permutation instead of re-shuffling an
# already shuffled frame.
df = df.sort_index().sample(frac=1, random_state=SEED)

# Row positions at which the shuffled frame is cut (end of train, end of validation).
train_end, val_end = (fractions[:-1].cumsum() * len(df)).astype(int)

# Plain positional slices give the same partitions as np.array_split(df, [train_end, val_end])
# without routing the DataFrame through the deprecated DataFrame.swapaxes.
train = df.iloc[:train_end]
val = df.iloc[train_end:val_end]
test = df.iloc[val_end:]

In [6]:
# Bag-of-words features: the vocabulary is learned from the training split only,
# then the same fitted vectorizer is applied to the validation and test splits.
text_column = 'headline_without_stopwords'
vectorizer = CountVectorizer()
X_train_vectorized = vectorizer.fit_transform(train[text_column])
X_val_vectorized, X_test_vectorized = (
    vectorizer.transform(split[text_column]) for split in (val, test)
)

In [7]:
# Fit a multinomial Naive Bayes classifier on the training counts and labels
y_train = train['is_sarcastic']
model = MultinomialNB()
model.fit(X_train_vectorized, y_train)

In [8]:
# Score the fitted model on the validation split
val_labels = val['is_sarcastic']
val_predictions = model.predict(X_val_vectorized)
val_accuracy = accuracy_score(y_true=val_labels, y_pred=val_predictions)
print("Validation Accuracy:", val_accuracy)

Validation Accuracy: 0.7969951083158631


In [9]:
# Score the fitted model on the test split
test_labels = test['is_sarcastic']
test_predictions = model.predict(X_test_vectorized)
test_accuracy = accuracy_score(y_true=test_labels, y_pred=test_predictions)
print("Test Accuracy:", test_accuracy)

Test Accuracy: 0.8006638714185884


# Error Analysis

In [10]:
# Boolean mask over the test rows: True where the label and the prediction differ
is_misclassified = test['is_sarcastic'].to_numpy() != test_predictions
misclassified_indices = test.index[is_misclassified]

# Original headlines (stop words intact) of the misclassified rows
misclassified_headlines = test.loc[is_misclassified, 'headline']
print("Misclassified Headlines:")
print(misclassified_headlines)

Misclassified Headlines:
27624    obama returns from india with these gross cand...
474      recording academy reminds aging musicians to d...
25494    prince harry humiliates royal family yet again...
22576      stampede at concert in guinea kills at least 34
2167     4 dead, 12 critically injured in seattle bus c...
                               ...                        
5362     ravens take out rival steelers in playoff grud...
19728                         ozzy wins tickets to ozzfest
7431     george lucas loves art so much he's opening a ...
24401    supreme court legalizes gay marriage after lan...
11922                    most items at garage sale haunted
Name: headline, Length: 1141, dtype: object


In [11]:
# Print every misclassified test headline with its true and predicted label.
# test_predictions is positional (element k belongs to row k of `test`), so the
# headline and label columns are zipped with it directly; this replaces a manual
# counter that had to be kept in step with per-row .loc lookups.
for headline, truth, predicted in zip(test['headline'], test['is_sarcastic'], test_predictions):
  if truth != predicted:
    print(headline)
    print("Truth is ",truth,", Predicted value is ",predicted)

obama returns from india with these gross candies for everyone
Truth is  1 , Predicted value is  0
recording academy reminds aging musicians to die before december 15 to be included in 2017 grammy tributes
Truth is  1 , Predicted value is  0
prince harry humiliates royal family yet again as base invaded by afghan insurgents
Truth is  1 , Predicted value is  0
stampede at concert in guinea kills at least 34
Truth is  0 , Predicted value is  1
4 dead, 12 critically injured in seattle bus crash
Truth is  0 , Predicted value is  1
health experts urge parents to dramatically reduce childrens' on-screen time
Truth is  1 , Predicted value is  0
english teacher already armed with deadly weapon called shakespeare
Truth is  1 , Predicted value is  0
aclu stresses that it legal to film garbage men in all 50 states if you really need to
Truth is  1 , Predicted value is  0
87 killed in violent kerfuffle
Truth is  1 , Predicted value is  0
ladykiller gets life sentence
Truth is  1 , Predicted value 