# **Sentiment Analysis**

In [4]:
# from google.colab import drive
# drive.mount('/content/drive')

In [5]:
import numpy as np
import pandas as pd
import textblob
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

**COMMENTS**

In [6]:
# Read the merged comment dataset from disk and preview its first rows.
DATA_PATH = r"/content/merged_singapore_data.csv"
new_df = pd.read_csv(DATA_PATH)
new_df.head()

Unnamed: 0,id,text,label,subreddit,created_utc,date,original_language(s)
0,sample_11379,"Oohh noo, Vicky Prasetyo is rumored to want to...",0,indonesia,1641812360,10-01-2022,Indonesian
1,sample_4296,"In my case, my family and friends never asked ...",1,indonesia,1543136805,25-11-2018,Indonesian
2,sample_6019,"When it was booming, I used to spend more than...",1,indonesia,1580290245,29-01-2020,Indonesian
3,sample_5254,"> In a response, he stated: ""If there is a hou...",0,malaysia,1642219040,15-01-2022,Malay
4,sample_4120,"HEH ELU YES, GO FOR IT NGABBB BANDUNG COLD PER...",0,indonesia,1640355228,24-12-2021,Indonesian


In [7]:
#REQUIRED FOR TWITTER DATA_SET
# new_df['label'] = new_df.apply(lambda row: 0 if row['hate_speech'] + row['offensive_language'] == 0 else 1, axis=1)
# new_df.rename(columns={'tweet': 'text'}, inplace=True)

In [8]:
new_df.shape

(15000, 7)

In [9]:
# Keep only the comment text and its label. The explicit .copy() makes `df` an
# independent frame rather than a slice of `new_df`, so the later cells that
# drop rows and add/overwrite columns on `df` no longer raise
# SettingWithCopyWarning.
df = new_df[['text', 'label']].copy()

In [10]:
# Remove exact duplicate (text, label) rows and renumber the index from 0.
# Rebinding `df` to the returned frame (instead of mutating with inplace=True)
# avoids the SettingWithCopyWarning and keeps the cell safe to re-run.
df = df.drop_duplicates().reset_index(drop=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop_duplicates(inplace=True)


In [11]:
df.shape

(14983, 2)

In [12]:
df['label'].value_counts()

0    8811
1    6172
Name: label, dtype: int64

In [None]:
from textblob import TextBlob


def _label_from_polarity(score):
    """Map a polarity score to "positive" (> 0), "neutral" (== 0) or "negative" (anything else)."""
    if score > 0:
        return "positive"
    if score == 0:
        return "neutral"
    return "negative"


# Polarity of every comment as reported by TextBlob, then its three-way label.
df['sentiment_score'] = df['text'].apply(lambda text: TextBlob(text).sentiment.polarity)
df['sentiment_label'] = df['sentiment_score'].apply(_label_from_polarity)

In [None]:
df.head()

In [None]:
# Lower-case every comment before tokenisation and stop-word removal.
df['text'] = df['text'].str.lower()

In [None]:
import nltk

# Corpora/models needed below: the stop-word lists and the Punkt sentence
# tokenizer used internally by word_tokenize.
nltk.download('stopwords')
nltk.download('punkt')
# Recent NLTK releases load the tokenizer from the 'punkt_tab' package instead
# of 'punkt'; without it word_tokenize raises LookupError. Fetching both keeps
# the notebook working on old and new NLTK versions.
nltk.download('punkt_tab')

In [None]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize

# Shared text-cleaning resources: the English stop-word list as a set (for fast
# membership tests) and a single Porter stemmer instance reused for every token.
stop_words = set(stopwords.words('english'))
porter = PorterStemmer()

In [None]:
def get_token(obj):
    """Normalise a piece of text into a space-separated string of stemmed tokens.

    The text is tokenised with ``word_tokenize``; tokens that are not purely
    alphabetic or that appear in ``stop_words`` are discarded; the remaining
    tokens are stemmed with the shared ``porter`` stemmer and joined with
    single spaces.

    Parameters
    ----------
    obj : str
        Text to normalise. The stop-word test is an exact string match, so
        callers are expected to lower-case the text first (the cells that call
        this function do so).

    Returns
    -------
    str
        The normalised text. An empty string is returned when no token
        survives the filters, or when ``obj`` is not a string (e.g. the
        ``NaN``/``None`` pandas uses for missing values), so that
        ``Series.apply(get_token)`` never fails on missing entries.
    """
    # Missing values reach this function as float NaN / None; word_tokenize
    # would raise on them, so treat them as "no tokens".
    if not isinstance(obj, str):
        return ""
    # Filter and stem in a single pass over the tokens.
    return " ".join(
        porter.stem(word)
        for word in word_tokenize(obj)
        if word.isalpha() and word not in stop_words
    )

In [None]:
# Replace each comment with its normalised form (alphabetic, stop-word-free, stemmed tokens).
df['text'] = df['text'].map(get_token)

In [None]:
df.head()

**HATE LEXICONS**

In [None]:
# Read the hate-keyword lexicon from disk.
LEXICON_PATH = r"/lexicon.csv"
df1 = pd.read_csv(LEXICON_PATH)

In [None]:
# Normalise the lexicon with the same pipeline as the comments (lower-case,
# then get_token) so that lexicon entries and comment tokens are comparable.
# Missing entries are dropped first because get_token expects text.
lexicon_terms = (
    df1['Hate_keywords']
    .dropna()
    .astype(str)
    .str.lower()
    .apply(get_token)
)

# De-duplicate AFTER normalisation: different surface forms can collapse to the
# same stem, and entries made only of stop words / non-alphabetic tokens
# normalise to an empty string, which can never match a comment token.
df1 = (
    df1.loc[lexicon_terms.index]
    .assign(Hate_keywords=lexicon_terms)
    .loc[lambda lexicon: lexicon['Hate_keywords'] != ""]
    .drop_duplicates(subset=['Hate_keywords'], keep='first')
    .reset_index(drop=True)
)

In [None]:
df1.shape

In [None]:
df1.head()

**FINAL DATA-FRAME**

In [None]:
# Hit counter for every normalised lexicon entry, all starting at zero.
# NOTE(review): get_token joins its tokens with spaces, so a multi-word lexicon
# entry is stored here as one space-separated key, while comments are matched
# one token at a time below -- such entries can never be hit. Confirm whether
# phrases should be split into words or matched as n-grams.
word_freq = dict.fromkeys(df1['Hate_keywords'], 0)

# For each comment, collect the tokens that appear in the lexicon (in order,
# repeats included), update the global hit counter, and record both the number
# of hits and the comma-joined list of matched keywords.
no_of_keywords = []
list_of_keywords = []
for comment in df['text']:
    matched = [token for token in word_tokenize(comment) if token in word_freq]
    for token in matched:
        word_freq[token] += 1
    no_of_keywords.append(len(matched))
    # Comments without any hit get a fixed placeholder instead of an empty string.
    list_of_keywords.append(",".join(matched) if matched else "No hate keywords")

df['no_of_hate_words'] = no_of_keywords
df['hate_keywords'] = list_of_keywords


In [None]:
df.head()

In [None]:
df['label'].value_counts()

In [None]:
df['sentiment_label'].value_counts()

In [None]:
# count = pd.DataFrame(df.groupby('label')['no_of_hate_words'].value_counts(sort = False))
# count.to_csv('no_of_hate_words', index=False)
# from google.colab import files
# files.download('no_of_hate_words')
df.groupby('label')['no_of_hate_words'].value_counts(sort = False)

In [None]:
# Number of comments in each sentiment class.
sns.countplot(data=df, x='sentiment_label')

In [None]:
# Number of comments per count of matched hate keywords, split by label.
sns.countplot(x='no_of_hate_words', data=df, hue='label')
# countplot places each category at integer positions 0, 1, 2, ... and dodges
# the hue bars around that position. A lower limit of exactly 0 therefore cuts
# off the left-hand bar of the first group; half-integer limits keep the first
# and last visible groups whole.
plt.xlim(-0.5, 18.5)

In [None]:
# Distribution of the sentiment polarity score, coloured by label.
sns.displot(data=df, x='sentiment_score', hue='label')

In [None]:
# Joint view of sentiment polarity against the number of matched hate keywords, coloured by label.
sns.jointplot(data=df, x='sentiment_score', y='no_of_hate_words', hue='label')

In [None]:
# Correlation heatmap of the numeric columns only. `df` also holds string
# columns (text, sentiment_label, hate_keywords); calling DataFrame.corr() on
# the whole frame raises on pandas >= 2.0, where non-numeric columns are no
# longer dropped silently. select_dtypes works on every pandas version.
sns.heatmap(df.select_dtypes(include='number').corr(), annot=True)

# **Count of Hate_Keywords**

In [None]:
# One row per (comment, matched keyword): split the comma-joined keyword string
# back into a list and explode it. Comments without a match contribute a single
# row carrying the "No hate keywords" placeholder.
df_exploded = (
    df
    .assign(hate_keywords=df['hate_keywords'].str.split(','))
    .explode('hate_keywords')
)

# Number of rows for every keyword / sentiment-label pair, in long form.
sentiment_counts = (
    df_exploded
    .groupby(['hate_keywords', 'sentiment_label'])
    .size()
    .reset_index(name='count')
)

# Wide table: one row per keyword, one column per sentiment label (0 where a
# pair never occurs), plus a row total used to rank keywords by frequency.
pivot_sentiment_counts = sentiment_counts.pivot_table(
    index='hate_keywords',
    columns='sentiment_label',
    values='count',
    fill_value=0,
)
pivot_sentiment_counts['Total'] = pivot_sentiment_counts.sum(axis=1)
pivot_sentiment_counts = pivot_sentiment_counts.sort_values(by='Total', ascending=False)

In [None]:
pivot_sentiment_counts.head(10)

In [None]:
# pivot_sentiment_counts.to_csv('sentiment_counts.csv')

# **Error Analysis**

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, log_loss

# Keyword baseline: predict 1 when the comment contains at least one lexicon keyword.
predicted_label_keyword = (df['no_of_hate_words'] > 0).astype(int).tolist()
# Sentiment baseline: predict 0 when the polarity is >= 0 and 1 otherwise.
predicted_label_sentiment = (~(df['sentiment_score'] >= 0)).astype(int).tolist()

In [None]:
df['label'].value_counts()

In [None]:
# Confusion matrix (rows = actual label, columns = predicted label) and
# per-class metrics for the keyword baseline.
keyword_confusion = pd.DataFrame(
    confusion_matrix(df['label'], predicted_label_keyword),
    index=['Actual 0', 'Actual 1'],
    columns=['Predicted 0', 'Predicted 1'],
)
print(keyword_confusion)
print('\n')
print(classification_report(df['label'], predicted_label_keyword))

In [None]:
# Confusion matrix (rows = actual label, columns = predicted label) and
# per-class metrics for the sentiment baseline.
sentiment_confusion = pd.DataFrame(
    confusion_matrix(df['label'], predicted_label_sentiment),
    index=['Actual 0', 'Actual 1'],
    columns=['Predicted 0', 'Predicted 1'],
)
print(sentiment_confusion)
print('\n')
print(classification_report(df['label'], predicted_label_sentiment))

In [None]:
# Log-loss of each baseline against the true labels, printed one per line.
# NOTE(review): log_loss is documented for probability estimates, but both
# prediction lists hold hard 0/1 labels -- confirm this is the intended metric,
# since the value then only reflects how often the prediction is wrong.
loss_key_words, loss_sentiment = (
    log_loss(df['label'], predictions)
    for predictions in (predicted_label_keyword, predicted_label_sentiment)
)
print(loss_key_words, loss_sentiment, sep="\n")