# SNS Sentiment Analysis

Analyze social media texts and measure potentially inflammatory / offensive language.

# Data Pre-processing

Extract the text and labels from each of the selected datasets, then combine them into one large CSV dataset.

(Install NLTK data if not already installed).

In [6]:
import nltk, os

# Keep the NLTK resources in a project-local folder and tell NLTK to look there.
nltk.data.path.append('./nltk_data/')

# Download the required resources only when the local folder does not exist yet.
if not os.path.exists('./nltk_data'):
    for resource in ('punkt', 'stopwords', 'words', 'brown'):
        nltk.download(resource, download_dir='./nltk_data/')

In [2]:
# Locations of the two raw input CSV files, relative to the working directory.
# The hate-speech file is read through du.generate_tuples_from_file; the
# malignant training file is read with pd.read_csv.
hate_speech_dataset_path = "./datasets/hate_speech_detect/HateSpeechDatasetBalanced.csv"
malignant_dataset_path = "./datasets/malignant/train.csv"

In [3]:
import pandas as pd
# Reduce the malignant training set to a two-column (text, label) frame.
m_train_df = pd.read_csv(malignant_dataset_path)

# Discard the first column (presumably a row id, going by the variable name).
m_train_df_no_id = m_train_df.drop(columns=m_train_df.columns[0])

remaining_columns = m_train_df_no_id.columns
text_series = m_train_df_no_id[remaining_columns[0]]
# Row-wise maximum across every column after the text column; with 0/1 flag
# columns this marks a row as positive when any flag is set (TODO confirm the
# flag columns are binary).
label_series = m_train_df_no_id[remaining_columns[1:]].max(axis=1)

processed_m_train_df = pd.DataFrame({"text": text_series, "label": label_series})

In [1]:
import data_util as du

In [8]:
# Load the hate-speech CSV through the project helper. Later cells index the
# result as [0] for the texts and [1] for the labels.
hs_tuples = du.generate_tuples_from_file(hate_speech_dataset_path)

In [9]:
# Convert the processed malignant frame through the project helper. Later cells
# index the result as [0] for the texts and [1] for the labels.
m_tuples = du.generate_tuples_from_df(processed_m_train_df)

In [20]:
# Concatenate the texts and labels of both sources, then persist them as one CSV.
combined_texts = hs_tuples[0] + m_tuples[0]
combined_labels = hs_tuples[1] + m_tuples[1]

processed_data_save = pd.DataFrame({"text": combined_texts, "label": combined_labels})

# index=False keeps the row index out of the file so it reloads with two columns.
processed_data_save.to_csv("./datasets/processed/all_data.csv", index=False)

In [2]:
# Load the saved data
import ast
import pandas as pd

complete_df = pd.read_csv("./datasets/processed/all_data.csv")

text_column = complete_df.columns[0]
label_column = complete_df.columns[1]

# The text column comes back from CSV as strings holding Python literals
# (presumably token lists); parse each one back into the original object.
complete_df[text_column] = complete_df[text_column].apply(ast.literal_eval)
# Force the labels to plain integers.
complete_df[label_column] = complete_df[label_column].astype(int)

In [3]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    complete_df["text"],
    complete_df["label"],
    test_size=0.2,
    random_state=42,
)

# Vectorize Data

In [4]:
# Each row is an iterable of string tokens; glue them back into one
# space-separated string per document, as the vectorizers expect raw text.
train_text = list(map(" ".join, X_train))
test_text = list(map(" ".join, X_test))

In [33]:
# Sanity check: sizes of the two splits and the fraction each one represents.
n_train, n_test = len(train_text), len(test_text)
n_total = n_train + n_test

print(n_train, n_test)
print(n_train / n_total)
print(n_test / n_total)

708552 177138
0.8
0.2


In [None]:
# Build the vocabulary from every text in the combined dataset (train and test
# rows alike) using the project helper. Later cells slice it and take its
# len(), so it is presumably a list of words — confirm against data_util.
vocabulary = du.create_vocabulary(complete_df.text.to_list())

In [None]:
from nltk.corpus import brown

# Further processing (optional): keep only the vocabulary entries that also
# occur in the Brown corpus, stored in lower case.
#
# The Brown tokens are collected into a set once, so each membership test is a
# hash lookup instead of a scan over the whole corpus for every vocabulary word.
# NOTE(review): as before, the lower-cased word is compared against the Brown
# tokens exactly as the corpus reader yields them — confirm whether the Brown
# side should be lower-cased too.
brown_tokens = set(brown.words())
vocabulary = [word.lower() for word in vocabulary if word.lower() in brown_tokens]
print(len(vocabulary))

In [ ]:
# Persist the vocabulary so later sessions can reload it instead of rebuilding it.
print(len(vocabulary))

# Stored as the repr() of the Python object so it can be parsed back with
# ast.literal_eval.
serialized_vocabulary = repr(vocabulary)
with open("./datasets/processed/vocabulary.txt", "w") as vocab_file:
    vocab_file.write(serialized_vocabulary)

In [11]:
# Load vocabulary from file if needed
# The file holds a Python literal, which ast.literal_eval parses without
# executing code. The `with` block guarantees the file handle is closed once
# the contents have been read.
with open("./datasets/processed/vocabulary.txt", "r") as vocab_file:
    vocabulary = ast.literal_eval(vocab_file.read())
print(len(vocabulary))

265004


In [5]:
from sklearn.feature_extraction.text import CountVectorizer
# Work on the first 10% of each split and the first 10% of the vocabulary.
train_partial_index = len(train_text) * 10 // 100
test_partial_index = len(test_text) * 10 // 100
vocabulary_e = vocabulary[:len(vocabulary) * 10 // 100]

train_subset = train_text[:train_partial_index]
test_subset = test_text[:test_partial_index]

# Options shared by both vectorizers; only the `binary` flag differs.
shared_vectorizer_options = dict(
    input='content',
    stop_words='english',
    vocabulary=vocabulary_e,
    tokenizer=None,
    preprocessor=None,
    lowercase=False,
)

# Binary: each feature records presence/absence of a vocabulary word.
bin_vectorizer = CountVectorizer(binary=True, **shared_vectorizer_options)
X_train_bin = bin_vectorizer.fit_transform(train_subset)
X_test_bin = bin_vectorizer.transform(test_subset)

# Multinomial: each feature records how often a vocabulary word occurs.
mul_vectorizer = CountVectorizer(binary=False, **shared_vectorizer_options)
X_train_mul = mul_vectorizer.fit_transform(train_subset)
X_test_mul = mul_vectorizer.transform(test_subset)

In [10]:
import numpy as np

# Report the full text split sizes, then the shape of each feature matrix
# (train binary, train multinomial, test binary, test multinomial).
print(len(train_text), len(test_text))
for feature_matrix in (X_train_bin, X_train_mul, X_test_bin, X_test_mul):
    print(np.shape(feature_matrix))

708552 177138
(70855, 26500)
(70855, 26500)
(17713, 26500)
(17713, 26500)


In [7]:
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers.legacy import Adam

def create_neural_network(X_training_data):
    """Build and compile a feed-forward binary classifier.

    The network stacks four ReLU Dense layers (256, 128, 64 and 32 units),
    each followed by Dropout(0.5), and ends in a single sigmoid unit. It is
    compiled with binary cross-entropy loss, an Adam optimizer (learning
    rate 0.001) and the accuracy metric.

    Args:
        X_training_data: Feature matrix; only ``shape[1]`` is read, to set
            the input width of the first layer.

    Returns:
        The compiled ``Sequential`` model (not yet trained).
    """
    n_features = X_training_data.shape[1]
    model = Sequential()

    # The first hidden layer is the only one that declares the input width.
    model.add(Dense(units=256, input_dim=n_features, activation='relu'))
    model.add(Dropout(0.5))

    # Remaining hidden layers, each followed by the same dropout rate.
    for width in (128, 64, 32):
        model.add(Dense(units=width, activation='relu'))
        model.add(Dropout(0.5))

    # Output layer: one sigmoid unit for the binary label.
    model.add(Dense(units=1, activation='sigmoid'))
    model.compile(
        loss='binary_crossentropy',
        optimizer=Adam(learning_rate=0.001),
        metrics=['accuracy'],
    )
    return model

# One network per feature representation: binary first, then multinomial.
bin_model = create_neural_network(X_train_bin)
mul_model = create_neural_network(X_train_mul)

# Print both architectures, binary model first.
for network in (bin_model, mul_model):
    network.summary()



Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 256)               6784256   
                                                                 
 dropout (Dropout)           (None, 256)               0         
                                                                 
 dense_1 (Dense)             (None, 128)               32896     
                                                                 
 dropout_1 (Dropout)         (None, 128)               0         
                                                                 
 dense_2 (Dense)             (None, 64)                8256      
                                                                 
 dropout_2 (Dropout)         (None, 64)                0         
                                                                 
 dense_3 (Dense)             (None, 32)                2

In [8]:
def _percent_of(count, percent):
    """Return `percent` percent of `count`, rounded down to an integer."""
    return count * percent // 100


# Use 10% of the vectorized rows. The feature matrices were built from the
# first 10% of the text splits, while y_train / y_test still cover the full
# splits, so the matching label count is 1% of the labels.
X_train_bin_partial_index = _percent_of(X_train_bin.shape[0], 10)
X_train_mul_partial_index = _percent_of(X_train_mul.shape[0], 10)
X_test_bin_partial_index = _percent_of(X_test_bin.shape[0], 10)
X_test_mul_partial_index = _percent_of(X_test_mul.shape[0], 10)
y_train_partial_index = _percent_of(len(y_train), 1)
y_test_partial_index = _percent_of(len(y_test), 1)

print(X_train_bin.shape)
print(X_test_bin.shape)
for labels, cutoff in ((y_train, y_train_partial_index), (y_test, y_test_partial_index)):
    print(labels.shape, cutoff)

(70855, 26500)
(17713, 26500)
(708552,) 7085
(177138,) 1771


In [9]:
import numpy as np

print("Binary Neural Network Model:")
# Slice the sparse matrices BEFORE calling toarray(): only the rows actually
# used are densified, rather than materializing the whole matrix as a dense
# array and then discarding most of it. The resulting arrays are identical.
bin_model.fit(
    X_train_bin[:X_train_bin_partial_index].toarray(),
    np.array(y_train[:y_train_partial_index]),
    epochs=5, batch_size=64,
    validation_data=(
        X_test_bin[:X_test_bin_partial_index].toarray(),
        np.array(y_test)[:y_test_partial_index],
    ),
)

Binary Neural Network Model:
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x35e65aa90>

In [None]:
# Densify only the evaluated rows, once, and reuse the array for both calls
# (slicing the sparse matrix first avoids materializing every row).
X_test_bin_dense = X_test_bin[:X_test_bin_partial_index].toarray()
y_test_partial = np.array(y_test[:y_test_partial_index])

# Keep the predictions instead of discarding the result of predict().
bin_predictions = bin_model.predict(X_test_bin_dense)

bin_loss, bin_accuracy = bin_model.evaluate(X_test_bin_dense, y_test_partial)
print("Binary Loss on Dev set:", bin_loss)
print("Binary Accuracy on Dev set:", bin_accuracy)

In [ ]:
print(" Multinomial Neural Network Model:")
# Mirrors the binary training cell:
#  - the training features are cut to X_train_mul_partial_index rows so their
#    count matches the y_train[:y_train_partial_index] labels;
#  - validation uses the vectorized X_test_mul matrix, not the raw X_test text
#    split, which has no toarray();
#  - rows are sliced on the sparse matrix before densifying to limit memory.
mul_model.fit(
    X_train_mul[:X_train_mul_partial_index].toarray(),
    np.array(y_train[:y_train_partial_index]),
    epochs=5, batch_size=64,
    validation_data=(
        X_test_mul[:X_test_mul_partial_index].toarray(),
        np.array(y_test)[:y_test_partial_index],
    ),
)

In [ ]:
# Persist both models in the working directory, binary model first.
for network, target_path in (
    (bin_model, 'sns_bin_model.keras'),
    (mul_model, 'sns_mul_model.keras'),
):
    network.save(target_path)