In [5]:
import nltk
import pandas as pd

In [6]:
import spacy
from nltk import sent_tokenize

In [7]:
pwd


'C:\\path\\to\\new\\virtual\\NLP\\envir'

In [8]:
df = pd.read_csv(r"CommentsMay2017.csv", low_memory= False)


In [9]:
df['commentType'].unique()


array(['comment', 'userReply', 'reporterReply'], dtype=object)

In [10]:
df['sarcasm_label'] = None

In [11]:
df.drop("sarcasm_label", axis = 1, inplace=True)

In [12]:
df.dtypes[df.dtypes == "object"]

commentBody              object
commentTitle             object
commentType              object
parentUserDisplayName    object
permID                   object
picURL                   object
status                   object
userDisplayName          object
userLocation             object
userTitle                object
userURL                  object
articleID                object
sectionName              object
newDesk                  object
typeOfMaterial           object
dtype: object

In [13]:
df = df[["commentBody"]].copy()

In [14]:
print("duplicate :", df["commentBody"].duplicated().sum())
print("missing :" ,df["commentBody"].isnull().sum())

duplicate : 896
missing : 0


In [15]:
df.drop_duplicates(subset=["commentBody"], inplace= True)


In [16]:
text_column = df['commentBody']


In [17]:
words = text_column.str.split().explode()

In [18]:
word_counts = words.value_counts()


In [19]:
top_n = 150  #you may change this to whatever you want
most_common_words = word_counts.head(top_n)
##pd.options.display.max_rows = 150
print(most_common_words)

commentBody
the     928453
to      558491
and     481737
of      463690
a       379147
         ...  
US       13551
--       13404
In       13375
You      13364
back     13319
Name: count, Length: 150, dtype: int64


In [20]:
import re

def clean_text(text):
    text = text.lower()
    text = re.sub(r"http\S+|www\S+", "", text)  # Remove URLs
    text = re.sub(r"[^a-zA-Z0-9\s.,!?']", "", text)  # Keep only useful characters
    text = re.sub(r"\s+", " ", text).strip()  # Remove extra spaces
    return text
df["commentBody"] = df["commentBody"].apply(clean_text)


In [21]:
df["comment_length"] = df["commentBody"].apply(len)
df["comment_length"].describe()

count    275493.000000
mean        403.346270
std         357.497389
min           0.000000
25%         136.000000
50%         289.000000
75%         553.000000
max        1866.000000
Name: comment_length, dtype: float64

In [22]:
# Define common sarcastic phrases
sarcasm_keywords = [
    "oh great", "yeah sure", "just perfect", "wow amazing", "totally not", 
    "of course", "what a surprise", "love that", "brilliant move"
]

# Function to label sarcasm
def detect_sarcasm(text):
    for phrase in sarcasm_keywords:
        if phrase in text:
            return 1  # Sarcastic
    return 0  # Not Sarcastic

# Apply sarcasm detection
df["sarcasm_label"] = df["commentBody"].apply(detect_sarcasm)

# Check label distribution
print(df["sarcasm_label"].value_counts())

sarcasm_label
0    269013
1      6480
Name: count, dtype: int64


In [23]:
sarcastic_examples = df[df["sarcasm_label"] == 1]["commentBody"].sample(10, random_state=42)

# Print them one by one
for i, comment in enumerate(sarcastic_examples, 1):
    print(f"{i}. {comment}\n")

1. fallon talent is in comedic parodies and sketches and games. he is lost at sea when it comes to interviews. fallon's mistake was in making trump the toxic presidential candidate look normal and friendly, which he of course is not. nobody should expected fallon to talk politics or be politically savvy, his best approach would have been to be professionally aloof.

2. working for trump's administration would be a trip down the rabbit hole, a real donald in wonderland experience. trump, of course, is the queen of hearts yelling, off with their heads!. sean spicer and sarah sanders are twiddle dee and twiddle dum. do you eat a cookie and grow enormous or do you drink from a little bottle and shrink? queen of hearts trump does not like any grandstanding so you better shrink so small you won't be seen.

3. of the listed top tech companies, i can really live without the presence of apple. in fact, i already do! i don't use an iphone and instead use an android phone. i don't use macbooks, w

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Convert comments to TF-IDF vectors
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
X = vectorizer.fit_transform(df["commentBody"])
y = df["sarcasm_label"]

# Split into training & test set (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [25]:
# Train a Naïve Bayes classifier
model = MultinomialNB()
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate model performance
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.9753534546906477
              precision    recall  f1-score   support

           0       0.98      1.00      0.99     53706
           1       1.00      0.03      0.05      1393

    accuracy                           0.98     55099
   macro avg       0.99      0.51      0.52     55099
weighted avg       0.98      0.98      0.96     55099



In [26]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(sampling_strategy=0.3, random_state=42)  # 30% sarcastic data
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

# Train model again
model = MultinomialNB()
model.fit(X_resampled, y_resampled)

# Predict again
y_pred_resampled = model.predict(X_test)

# Print new results
print("Accuracy:", accuracy_score(y_test, y_pred_resampled))
print(classification_report(y_test, y_pred_resampled))

Accuracy: 0.985898110673515
              precision    recall  f1-score   support

           0       0.99      1.00      0.99     53706
           1       0.98      0.45      0.62      1393

    accuracy                           0.99     55099
   macro avg       0.98      0.73      0.81     55099
weighted avg       0.99      0.99      0.98     55099

