# SNS Sentiment Analysis

Analyze social media texts and measure potentially inflammatory or offensive language.

# Data Pre-processing

Extract the text and labels from each of the selected datasets, then combine them into one large CSV dataset.

(The NLTK data is downloaded first if it is not already installed.)

In [1]:
import nltk, os

# Use a project-local NLTK data directory: register it on NLTK's search path,
# then populate it only when the directory does not exist yet (an existing
# directory is taken as already populated and nothing is downloaded).
nltk.data.path.append('./nltk_data/')

if not os.path.exists('./nltk_data'):
    for resource in ('punkt', 'stopwords', 'words', 'brown'):
        nltk.download(resource, download_dir='./nltk_data/')

In [2]:
# Locations of the two raw source CSV files, given relative to the current
# working directory. Both are read further down when the datasets are processed.
hate_speech_dataset_path = "./datasets/hate_speech_detect/HateSpeechDatasetBalanced.csv"
malignant_dataset_path = "./datasets/malignant/train.csv"

In [None]:
import pandas as pd
# Reduce the malignant training data to a two-column (text, label) frame.
m_train_df = pd.read_csv(malignant_dataset_path)

# The leading column is discarded (presumably a row id -- confirm against the CSV).
leading_column = m_train_df.columns[0]
m_train_df_no_id = m_train_df.drop(columns=leading_column)

# Of what remains, the first column supplies the text and all later columns
# are collapsed into one label by taking the row-wise maximum.
# NOTE(review): assumes the later columns are 0/1 flags, so the maximum is 1
# when any flag is set -- confirm against the dataset.
remaining_columns = m_train_df_no_id.columns
text_column = remaining_columns[0]
flag_columns = remaining_columns[1:]

text_series = m_train_df_no_id[text_column]
label_series = m_train_df_no_id[flag_columns].max(axis=1)

processed_m_train_df = pd.DataFrame({"text": text_series, "label": label_series})

In [7]:
import data_util as du

In [None]:
# Run the project helper over the hate-speech CSV. The result is later indexed
# as [0] for the texts and [1] for the labels when the datasets are combined.
# NOTE(review): any preprocessing done by the helper lives in data_util -- confirm there.
hs_tuples = du.generate_tuples_from_file(hate_speech_dataset_path)

In [None]:
# Run the project helper over the processed malignant frame. The result is later
# indexed as [0] for the texts and [1] for the labels when the datasets are combined.
# NOTE(review): any preprocessing done by the helper lives in data_util -- confirm there.
m_tuples = du.generate_tuples_from_df(processed_m_train_df)

In [None]:
# Combine both sources into a single (text, label) table and save it as CSV.
import os

# DataFrame.to_csv does not create missing parent directories, so make sure
# the output directory exists before writing into it.
os.makedirs("./datasets/processed", exist_ok=True)

# Element [0] of each helper result supplies the texts and element [1] the
# labels; the hate-speech rows come first, followed by the malignant rows.
# NOTE(review): `+` is assumed to be list concatenation here -- confirm that
# data_util returns plain lists rather than arrays.
processed_data_save = pd.DataFrame({
    "text": hs_tuples[0] + m_tuples[0],
    "label": hs_tuples[1] + m_tuples[1],
})

processed_data_save.to_csv("./datasets/processed/all_data.csv", index=False)

In [None]:
# Reload the saved data; only needed when resuming from a previously saved state
import ast
import pandas as pd

complete_df = pd.read_csv("./datasets/processed/all_data.csv")
text_column, label_column = complete_df.columns[0], complete_df.columns[1]

# The first column is stored in the CSV as the string form of a Python literal;
# parse every cell back into the corresponding Python object.
# NOTE(review): presumably a list of tokens per row -- confirm against data_util.
parsed_text = complete_df[text_column].apply(ast.literal_eval)
complete_df[text_column] = parsed_text

# Force the second column (the labels) to integer dtype.
integer_labels = complete_df[label_column].astype(int)
complete_df[label_column] = integer_labels

In [None]:
from sklearn.model_selection import train_test_split

# Our current dataset is too large, so only a portion of it is used for the actual model.
# 5,000 rows are sampled per label (10,000 rows in total); the 80/20 split below
# then yields 8,000 training rows and 2,000 test rows.
SAMPLES_PER_LABEL = 5_000

# Columns are addressed by position throughout: the first is the text, the second the label.
text_col, label_col = complete_df.columns[0], complete_df.columns[1]

df_0 = complete_df[complete_df[label_col] == 0].sample(n=SAMPLES_PER_LABEL, random_state=1).reset_index(drop=True)
df_1 = complete_df[complete_df[label_col] == 1].sample(n=SAMPLES_PER_LABEL, random_state=1).reset_index(drop=True)

# Sanity checks: each sample is non-empty and contains only its own label.
assert len(df_0) > 0 and (df_0[label_col] == 0).all()
assert len(df_1) > 0 and (df_1[label_col] == 1).all()

# Interleave the two samples row by row (label 0, label 1, label 0, ...).
# Both samples hold exactly SAMPLES_PER_LABEL rows, so zip() drops nothing.
combined_data = []
for text_0, text_1 in zip(df_0[text_col], df_1[text_col]):
    combined_data.append([text_0, 0])
    combined_data.append([text_1, 1])

combined_df = pd.DataFrame(combined_data, columns=[text_col, label_col])

# NOTE: the split is not stratified, so the exact 50/50 label balance holds for
# the 10,000-row pool but only approximately within each of the two splits.
training_text, test_text, training_labels, test_labels = train_test_split(
    combined_df[text_col],
    combined_df[label_col],
    test_size=0.2,
    random_state=42,
)

training_data = pd.DataFrame({
    "text": training_text,
    "label": training_labels,
})

test_data = pd.DataFrame({
    "text": test_text,
    "label": test_labels,
})

training_data.to_csv("./datasets/processed/train.csv", index=False)
test_data.to_csv("./datasets/processed/test.csv", index=False)