In [None]:
import string
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
pd.options.mode.chained_assignment = None

import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
!unzip /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora/

from wordcloud import WordCloud, STOPWORDS

from transformers import pipeline

# Load the article results from the Kaggle input dataset.
df = pd.read_csv("/kaggle/input/simplygo-results/results.csv")

# DataFrame.info() prints its report itself and returns None, so it must not be
# wrapped in print() (that emitted a stray "None"). The bare print() keeps the
# blank separator line before the frame preview.
df.info()
print()
df

In [None]:
# Collect every row that has a missing value in at least one column.
nullDf = df.loc[df.isnull().any(axis='columns')]
nullDf

In [None]:
# Discard incomplete rows, then parse 'Date' (abbreviated month name and
# two-digit year, e.g. "Jan-24") into datetime values.
df = (
    df.dropna()
      .assign(Date=lambda frame: pd.to_datetime(frame['Date'], format='%b-%y'))
)
df.info()

In [None]:
stop_words = stopwords.words('english')

def clean_text(text):
    """Normalise one article body into a space-separated string of lemmas.

    The text is lower-cased, stripped of backslashes and ASCII punctuation,
    tokenized, filtered against the English stop-word list and finally
    lemmatized with WordNet.

    Parameters
    ----------
    text : str
        Raw article content.

    Returns
    -------
    str
        The remaining lemmatized tokens joined by single spaces.
    """
    text = text.lower()
    text = text.replace("\\", "")
    text = text.translate(str.maketrans('', '', string.punctuation))

    tokens = word_tokenize(text)

    # NOTE(review): the stop-word list presumably contains negations such as
    # "not"/"no"; removing them can change the VADER scores computed from this
    # cleaned text in a later cell - confirm this is the intended trade-off.
    words = [word for word in tokens if word not in stop_words]

    lemmatizer = WordNetLemmatizer()
    # Lemmatize the stop-word-filtered tokens. Iterating over the unfiltered
    # token list here would silently discard the filtering done above.
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]

    return ' '.join(lemmatized_words)

df['Article Content'] = df['Article Content'].apply(clean_text)
df

In [None]:
sia = SentimentIntensityAnalyzer()

# Score every article; the dict is keyed by the article's index label in df so
# the scores can be lined up with their rows again afterwards.
results = {
    row_id: sia.polarity_scores(article)
    for row_id, article in df["Article Content"].items()
}

# The dict-of-dicts yields one column per article; transpose so that each
# article becomes a row and each score a column.
sentiment_score_df = pd.DataFrame(results).T
sentiment_score_df

In [None]:
# Attach the sentiment scores to the articles, using df's index values as the
# join key. The merge leaves a helper 'key_0' column behind, which is removed
# in the following cell.
df = pd.merge(df, sentiment_score_df, on=df.index)
df

In [None]:
# Remove the helper join-key column left over from the merge.
df = df.drop(columns='key_0')
df

In [None]:
# Persist the scored articles (row index included) to the working directory.
df.to_csv('my_data.csv', index=True)

In [None]:
# Split the articles on the sign of the compound score (zero counts as
# positive). reset_index(drop=True) renumbers the rows from 0 directly instead
# of materialising the old index as an 'index' column and dropping it again,
# which would remove a real column if one were ever named 'index'.
positive_df = df[df["compound"] >= 0].reset_index(drop=True)
negative_df = df[df["compound"] < 0].reset_index(drop=True)

print("Positive Mean: ", positive_df["compound"].mean())
print("Negative Mean: ", negative_df["compound"].mean())

In [None]:
# Display the articles held in negative_df for inspection.
negative_df

In [None]:
# List the source URL of each article in negative_df, one per line.
for _, article_url in negative_df["URL"].items():
    print(article_url)

print("\n")

In [None]:
# List the headline of each article in negative_df, one per line.
for _, headline in negative_df["Title"].items():
    print(headline)

In [None]:
# Tokens hidden from every cloud, on top of wordcloud's built-in STOPWORDS.
remove_words = ["mp", "attack", "teenager", "south africa", "international", "drug", "supremacist", "newsletter", "white", "pm", "mar", "s270", "u", "hong", "fourth", "ha", "wa", "one", "two", "t", ",", '"', "sg", "ll", "ng", "st", "chee", "s", "war", "fu", "prison", "jailed", "criminal", "murder", "violent", "bomb", "shit", "killed", "molested", "terrorist", "arrested", "brutally", "racism", "negative", "death", "perjury", "trauma"]

# The word-cloud configuration is the same for every article, so build it once
# instead of on every loop iteration.
wc_stopwords = list(STOPWORDS) + remove_words
wc_params = {
    'background_color': 'white',
    'width': 170,
    'height': 170,
    'stopwords': wc_stopwords,
    'colormap': 'tab10',
    'max_words': 35
}

# Number of title words placed on each line above a cloud.
TITLE_WORDS_PER_LINE = 5

# 3 x 4 grid: at most twelve articles are drawn (zip stops at the shorter input).
fig, axes = plt.subplots(nrows=3, ncols=4, figsize=(20, 10))
flat_axes = axes.flatten()

# NOTE(review): an earlier revision also scored every token with VADER and
# collected those with compound <= -0.3, but that list never reached WordCloud;
# the cloud has always been generated from the full article text. The unused
# scoring pass was removed. If negative-only clouds are wanted, rebuild that
# token list and pass the joined tokens to generate() instead of `text`.
panels = zip(negative_df["Article Content"], negative_df["Title"], flat_axes)
for index, (text, title, ax) in enumerate(panels):
    wordcloud = WordCloud(**wc_params).generate(text)
    ax.imshow(wordcloud, interpolation='bilinear')
    ax.axis('off')

    # Wrap the title by chunking the title's own words. The chunk size and the
    # step must match, and the range must span the title, otherwise words are
    # dropped or the title is cut short.
    split_title = title.split()
    wrapped_title = '\n'.join(
        ' '.join(split_title[i:i + TITLE_WORDS_PER_LINE])
        for i in range(0, len(split_title), TITLE_WORDS_PER_LINE)
    )

    ax.set_title(f"{index+1}: {wrapped_title}")
    ax.title.set_fontsize(9)

# Blank any panels left over when there are fewer articles than grid cells.
for unused_ax in flat_axes[len(negative_df):]:
    unused_ax.axis('off')

# Apply the layout before saving so the PNG matches what is shown on screen.
plt.tight_layout()
plt.savefig('wordclouds.png')
plt.show()