# SNS Sentiment Analysis

Analyze social media texts and measure potentially inflammatory / offensive language.

# Data Pre-processing

From the selected datasets, extract the text and labels from all of them, then combine into one large CSV dataset.

(Install NLTK data if not already installed).

In [13]:
import nltk, os

# Run this if you are locally accessing the NLTK data.
# The project-local directory is registered with NLTK; when it does not exist
# yet, every resource this notebook relies on is downloaded into it.
nltk.data.path.append('./nltk_data/')
if not os.path.exists('./nltk_data'):
    for package in ('punkt', 'stopwords', 'words', 'brown'):
        nltk.download(package, download_dir='./nltk_data/')

In [2]:
hate_speech_dataset_path = "./datasets/hate_speech_detect/HateSpeechDatasetBalanced.csv"
malignant_dataset_path = "./datasets/malignant/train.csv"

In [3]:
import pandas as pd
# Process malignant train data:
# drop the first (id) column, keep the next column as the text and collapse
# all remaining columns into one label by taking their row-wise maximum.
m_train_df = pd.read_csv(malignant_dataset_path)
id_column = m_train_df.columns[0]
m_train_df_no_id = m_train_df.drop(columns=id_column)

text_column = m_train_df_no_id.columns[0]
flag_columns = m_train_df_no_id.columns[1:]
processed_m_train_df = pd.DataFrame({
    "text": m_train_df_no_id[text_column],
    "label": m_train_df_no_id[flag_columns].max(axis=1),
})

In [1]:
import data_util as du

In [8]:
# Build the hate-speech data through the project helper. Element 0 of the
# result is later used as the text column and element 1 as the label column.
hs_tuples = du.generate_tuples_from_file(hate_speech_dataset_path)

In [9]:
# Same structure for the malignant data, built from the processed DataFrame:
# element 0 is later used as the text column and element 1 as the label column.
m_tuples = du.generate_tuples_from_df(processed_m_train_df)

In [20]:
# Combine and save the data to a CSV
# (hate-speech rows first, followed by the malignant rows).
all_text = hs_tuples[0] + m_tuples[0]
all_labels = hs_tuples[1] + m_tuples[1]
processed_data_save = pd.DataFrame({"text": all_text, "label": all_labels})

processed_data_save.to_csv("./datasets/processed/all_data.csv", index=False)

# Load data and split

In [2]:
# Load the saved data
import ast
import pandas as pd

# Re-read the merged dataset. The first column holds Python-literal strings
# that are parsed back into objects; the second column is coerced to int.
complete_df = pd.read_csv("./datasets/processed/all_data.csv")
text_col = complete_df.columns[0]
label_col = complete_df.columns[1]
complete_df[text_col] = complete_df[text_col].apply(ast.literal_eval)
complete_df[label_col] = complete_df[label_col].astype(int)

# Use portion of data

In [39]:
# Our current dataset is too large, so we'll be using a portion of it in our actual model.
# Training data is the first 10,000 rows.
# Test data is the following 5,000 rows.
TRAIN_SIZE = 10_000
TEST_SIZE = 5_000

text_column = complete_df[complete_df.columns[0]]
label_column = complete_df[complete_df.columns[1]]

training_text = text_column[:TRAIN_SIZE]
test_text = text_column[TRAIN_SIZE:TRAIN_SIZE + TEST_SIZE]

training_labels = label_column[:TRAIN_SIZE]
test_labels = label_column[TRAIN_SIZE:TRAIN_SIZE + TEST_SIZE]

training_data = pd.DataFrame({
    "text": training_text,
    "label": training_labels
})

test_data = pd.DataFrame({
    "text": test_text,
    "label": test_labels
})

# Persist both splits so later cells can reload them without the full dataset
training_data.to_csv("./datasets/processed/train.csv", index=False)
test_data.to_csv("./datasets/processed/test.csv", index=False)

In [40]:
def _read_split(csv_path):
    """Read one saved split and return (raw frame, parsed first column, int second column)."""
    frame = pd.read_csv(csv_path)
    parsed_text = frame[frame.columns[0]].apply(ast.literal_eval)
    int_labels = frame[frame.columns[1]].astype(int)
    return frame, parsed_text, int_labels


train, X_train, y_train = _read_split("./datasets/processed/train.csv")

test, X_test, y_test = _read_split("./datasets/processed/test.csv")

In [7]:
# from sklearn.model_selection import train_test_split
# 
# # Split the data up into training and testing
# X_train, X_test, y_train, y_test = train_test_split(complete_df.text, complete_df.label, test_size=0.2, random_state=42)

In [41]:
# Collapse every sample's sequence of tokens into one space-separated string
train_text = list(map(" ".join, X_train))
test_text = list(map(" ".join, X_test))

In [42]:
# Check for distribution of training vs testing data:
# absolute sizes first, then each split's share of the combined total.
n_train, n_test = len(train_text), len(test_text)
n_total = n_train + n_test
print(n_train, n_test)
print(n_train / n_total)
print(n_test / n_total)

10000 5000
0.6666666666666666
0.3333333333333333


In [43]:
import data_util as du

# The vocabulary is built from the samples of BOTH splits (train followed by test)
vocabulary = du.create_vocabulary(X_train.to_list() + X_test.to_list())

In [None]:
from nltk.corpus import brown

# Further processing (do not run this unless you really need refined vocabulary):
# keep only the entries whose lowercase form occurs in the Brown corpus, and
# store them lowercased. This overwrites `vocabulary`.
# The corpus is materialized once into a set so every membership test is O(1);
# rebuilding `list(brown.words())` per vocabulary entry is prohibitively slow.
brown_words = set(brown.words())
vocabulary = [word.lower() for word in vocabulary if word.lower() in brown_words]
print(len(vocabulary))

In [45]:
# Save vocabulary to file to save resources.
# The list is written as its Python repr, which the loading cell parses back
# with ast.literal_eval.
print(len(vocabulary))
with open("./datasets/processed/vocabulary.txt", "w") as vocab_file: 
    vocab_file.write(repr(vocabulary))

11560


In [46]:
# Load vocabulary from file if needed.
# The file contains the repr of a Python list, parsed safely with literal_eval;
# the context manager guarantees the file handle is closed afterwards.
with open("./datasets/processed/vocabulary.txt", "r") as vocab_file:
    vocabulary = ast.literal_eval(vocab_file.read())
print(len(vocabulary))

11560


# Train Neural Network with BERT

In [47]:
import tensorflow as tf
from transformers import TFBertModel, BertTokenizer
from tensorflow.keras.layers import Dense, Dropout

In [None]:
# Load the pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = TFBertModel.from_pretrained('bert-base-uncased', output_attentions=False, output_hidden_states=False)
# Freeze the BERT weights; only layers stacked on top of it remain trainable
bert_model.trainable = False

## Create a Data Generator

In [49]:
import numpy as np
import tensorflow as tf

# define a batch size for our experiments
BATCH_SIZE = 4

# Define the maximum sequence length
MAX_LENGTH = 128

In [50]:
def data_generator(sentences: np.array, labels: np.array, batch_size: int) -> (dict, tf.Tensor):
    """Yield an endless stream of randomly sampled, tokenized batches.

    Every batch is drawn independently and uniformly *with replacement* from
    the whole data set, so the generator never terminates and a sample can
    appear more than once in a batch or an epoch.

    Relies on the module-level ``tokenizer`` and ``MAX_LENGTH``.

    Args:
        sentences: Raw text strings, one per sample.
        labels: Labels aligned position-by-position with ``sentences``
            (list, NumPy array or pandas Series).
        batch_size: Number of samples in each yielded batch.

    Yields:
        A ``(features, targets)`` pair: ``features`` is the tokenizer output
        as a plain dict of tensors padded/truncated to ``MAX_LENGTH`` tokens,
        ``targets`` is a tensor holding the sampled labels.
    """
    # Work on positional copies: `[]` on a pandas Series is label-based and
    # would pick the wrong rows (or raise) for a non-default index.
    texts = list(sentences)
    label_values = np.asarray(labels)
    num_samples = len(texts)

    while True:
        selected_indices = np.random.choice(num_samples, size=batch_size, replace=True)
        batch_x = [texts[j] for j in selected_indices]
        batch_y = label_values[selected_indices]

        batch_x = tokenizer(batch_x, return_tensors="tf", max_length=MAX_LENGTH, padding='max_length', truncation=True)
        batch_y = tf.convert_to_tensor(batch_y)
        yield dict(batch_x), batch_y

In [51]:
# Batch generators over the joined train / test texts and their labels,
# each yielding batches of BATCH_SIZE samples
train_data_generator = data_generator(train_text, y_train,BATCH_SIZE)
test_data_generator = data_generator(test_text, y_test,BATCH_SIZE)

In [52]:
def create_bert_model(learning_rate=1e-3):
    """Build and compile a binary classifier on top of the module-level BERT encoder.

    The model takes the three tokenizer outputs as named inputs
    (``input_ids``, ``attention_mask``, ``token_type_ids``), so it can be fed
    either a list in that order or a dict keyed by those names. The [CLS]
    position of BERT's last hidden state is passed through a small dense head
    that ends in a single sigmoid unit.

    Args:
        learning_rate: Step size for the Adam optimizer. Defaults to 1e-3,
            the value this file uses for its dense baseline network.

    Returns:
        A compiled ``tf.keras.Model`` trained with binary cross-entropy and
        reporting accuracy.
    """
    input_ids = tf.keras.layers.Input(shape=(None,), dtype=tf.int32, name='input_ids')
    attention_mask = tf.keras.layers.Input(shape=(None,), dtype=tf.int32, name='attention_mask')
    token_type_ids = tf.keras.layers.Input(shape=(None,), dtype=tf.int32, name='token_type_ids')

    bert_output = bert_model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
    sequence_output = bert_output[0]
    pooled_output = sequence_output[:, 0, :]  # Take the [CLS] token representation

    hidden = tf.keras.layers.Dense(units=256, activation='relu')(pooled_output)
    hidden = tf.keras.layers.Dropout(0.2)(hidden)
    # A single output unit needs a sigmoid: softmax over one unit is always 1.0,
    # which makes binary cross-entropy training meaningless.
    output = tf.keras.layers.Dense(units=1, activation='sigmoid')(hidden)

    model = tf.keras.Model(inputs=[input_ids, attention_mask, token_type_ids], outputs=output)
    model.compile(
        optimizer=tf.keras.optimizers.legacy.Adam(learning_rate=learning_rate),
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )

    return model

In [54]:
# Tokenize the training and the testing data with identical settings:
# every sample is padded / truncated to MAX_LENGTH and returned as TF tensors.
encode_options = dict(
    max_length=MAX_LENGTH,
    padding='max_length',
    truncation=True,
    return_tensors='tf',
)

X_train_tokens = tokenizer.batch_encode_plus(train_text, **encode_options)
X_test_tokens = tokenizer.batch_encode_plus(test_text, **encode_options)

In [55]:
# Write the tokenizer's files into ./models/transformer_tokenizer_state1
tokenizer.save_pretrained('./models/transformer_tokenizer_state1')

('./models/transformer_tokenizer_state1/tokenizer_config.json',
 './models/transformer_tokenizer_state1/special_tokens_map.json',
 './models/transformer_tokenizer_state1/vocab.txt',
 './models/transformer_tokenizer_state1/added_tokens.json')

In [57]:
# Create the model
model_1 = create_bert_model()

# Collect the tokenizer outputs in the order the model is fed
feature_keys = ('input_ids', 'attention_mask', 'token_type_ids')
train_features = [X_train_tokens[key] for key in feature_keys]
test_features = [X_test_tokens[key] for key in feature_keys]

# Train the model on the pre-tokenized tensors, validating on the test split
model_1.fit(
    train_features,
    y_train,
    validation_data=(test_features, y_test),
    epochs=3,
    batch_size=BATCH_SIZE,
)



Epoch 1/3

KeyboardInterrupt: 

In [53]:
# Create the model
model_1 = create_bert_model()

# Train the model from the batch generators.
# `batch_size` / `validation_batch_size` are intentionally not passed: the
# generators already yield complete batches of BATCH_SIZE samples, and Keras
# documents that batch_size must not be specified for generator input.
model_1.fit(
    train_data_generator,
    epochs=2,
    steps_per_epoch=len(train_text) // BATCH_SIZE,
    validation_data=test_data_generator,
    # NOTE(review): this validates on only BATCH_SIZE*4 batches per epoch -
    # confirm such a small validation sample is intended.
    validation_steps=BATCH_SIZE*4,
)

Epoch 1/2
 137/2500 [>.............................] - ETA: 8:45 - loss: 0.4862 - accuracy: 0.1259

KeyboardInterrupt: 

In [None]:
# Predict on the tokenized test split (inputs in the same order used for training)
preds = model_1.predict([X_test_tokens['input_ids'], X_test_tokens['attention_mask'], X_test_tokens['token_type_ids']])


In [46]:
# Compare the first five test inputs and their true labels with the predictions.
# The labels live in `y_test`; `y_testing` is not defined anywhere in this notebook.
print(X_test_tokens[:5], y_test[:5])
print(preds[:5])

{'input_ids': <tf.Tensor: shape=(5, 128), dtype=int32, numpy=
array([[  101,  3374,  2442,  1997,  4771,  2008,  2028,  1012,  4283,
         2005,  2115,  3319,  1010,  1045,  2018,  2356,  2178,  3559,
         2000,  2079,  1037,  4248,  1039,  1013,  1041,  1997,  1996,
         5512,  2930,  1012,  4283,  1010,   102,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0, 

# Build the neural networks and train them

In [7]:
import nltk

nltk.data.path.append('./nltk_data/')

# Custom tokenizer (currently disabled). While it stays commented out, the
# global name `tokenizer` keeps whatever value an earlier cell assigned to it.
# def tokenizer(text):
#     tokens = nltk.word_tokenize(text)
#     return tokens

In [8]:
# Initialize number of epochs
# Set number of chunks for training
epochs = 1
chunks = 20

In [9]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.optimizers.legacy import Adam

def create_neural_network(input_dim):
    """Return a compiled feed-forward binary classifier.

    The network stacks four ReLU layers (256, 128, 64 and 32 units), each
    followed by 50% dropout, and ends in a single sigmoid unit. It is compiled
    with binary cross-entropy, Adam (learning rate 0.001) and accuracy.

    Args:
        input_dim: Number of input features per sample.
    """
    stack = [Dense(units=256, input_dim=input_dim, activation='relu'), Dropout(0.5)]
    for width in (128, 64, 32):
        stack.append(Dense(units=width, activation='relu'))
        stack.append(Dropout(0.5))

    # Output layer
    stack.append(Dense(units=1, activation='sigmoid'))

    network = Sequential(stack)
    network.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=0.001), metrics=['accuracy'])
    return network

# Both networks take len(vocabulary) // chunks input features, i.e. the size
# of one vocabulary chunk as computed for the chunked training below.
bin_model = create_neural_network(len(vocabulary) // chunks)
bin_model.summary()

mul_model = create_neural_network(len(vocabulary) // chunks)
mul_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 256)               1357056   
                                                                 
 dropout (Dropout)           (None, 256)               0         
                                                                 
 dense_1 (Dense)             (None, 128)               32896     
                                                                 
 dropout_1 (Dropout)         (None, 128)               0         
                                                                 
 dense_2 (Dense)             (None, 64)                8256      
                                                                 
 dropout_2 (Dropout)         (None, 64)                0         
                                                                 
 dense_3 (Dense)             (None, 32)                2

In [10]:
def _split_every(items, size):
    """Cut ``items`` into consecutive slices of ``size`` elements (the last may be shorter)."""
    return [items[start:start + size] for start in range(0, len(items), size)]


# Slice length for each collection: its total length divided by `chunks`
vocab_chunk_size = len(vocabulary) // chunks
train_text_chunk_size = len(train_text) // chunks
y_train_chunk_size = len(y_train) // chunks
test_text_chunk_size = len(test_text) // chunks
y_test_chunk_size = len(y_test) // chunks

vocab_chunks = _split_every(vocabulary, vocab_chunk_size)

train_text_chunks = _split_every(train_text, train_text_chunk_size)
y_train_chunks = _split_every(y_train, y_train_chunk_size)

test_text_chunks = _split_every(test_text, test_text_chunk_size)
y_test_chunks = _split_every(y_test, y_test_chunk_size)

In [11]:
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

def train_by_chunk(model: Sequential, chunked_vocab, chunked_train_text, chunked_test_text, chunked_y_train, chunked_y_test, training_epochs, training_chunks, binary=False, verbose=False):
    """Fit ``model`` chunk by chunk on bag-of-words features.

    For chunk ``i`` the ``i``-th train/test texts are vectorized against the
    ``i``-th vocabulary slice and the model is fitted on the result, using the
    matching test chunk as validation data.

    Args:
        model: Compiled Keras model whose input width equals the size of one
            vocabulary chunk.
        chunked_vocab: List of vocabulary slices (lists of words).
        chunked_train_text: List of training-text chunks.
        chunked_test_text: List of test-text chunks.
        chunked_y_train: List of training-label chunks.
        chunked_y_test: List of test-label chunks.
        training_epochs: Epochs to run on every chunk.
        training_chunks: Number of leading chunks to train on.
        binary: If True, features record presence/absence instead of counts.
        verbose: If True, print the shapes of every chunk before fitting.

    NOTE(review): every chunk uses a different vocabulary slice, so input
    column ``k`` stands for a different word in each chunk while the same
    weights keep being trained - confirm this is the intended scheme.
    """
    for chunk_idx in range(training_chunks):
        # Vectorize data with the current vocabulary chunk.
        # Documents are assumed to be pre-tokenized strings joined by spaces
        # (as built for train_text / test_text), so splitting on whitespace
        # recovers the tokens. The function must not depend on a global
        # `tokenizer`, which may be the BERT tokenizer or undefined.
        vectorizer = CountVectorizer(vocabulary=chunked_vocab[chunk_idx], tokenizer=str.split, token_pattern=None, preprocessor=None, lowercase=False, binary=binary)

        # Densify once; Keras is fed plain NumPy arrays
        X_train_chunk = vectorizer.fit_transform(chunked_train_text[chunk_idx]).toarray()
        X_test_chunk = vectorizer.transform(chunked_test_text[chunk_idx]).toarray()
        y_train_chunk = np.asarray(chunked_y_train[chunk_idx])
        y_test_chunk = np.asarray(chunked_y_test[chunk_idx])

        if verbose:
            print(X_train_chunk.shape, y_train_chunk.shape, X_test_chunk.shape, y_test_chunk.shape)
            print(np.shape(X_train_chunk))
            print(np.shape(y_train_chunk))
            print(np.shape(X_test_chunk), np.shape(y_test_chunk))

        model.fit(x=X_train_chunk,
                  y=y_train_chunk,
                  epochs=training_epochs,
                  batch_size=64,
                  validation_data=(X_test_chunk, y_test_chunk))

In [12]:
# Train the binary (presence/absence features) network over all chunks
print("Binary Neural Network Model:")
train_by_chunk(bin_model, vocab_chunks, train_text_chunks, test_text_chunks, y_train_chunks, y_test_chunks, epochs, chunks, binary=True)

Binary Neural Network Model:






KeyboardInterrupt: 

In [28]:
# Persist the binary model three ways: the full Keras archive, the
# architecture alone as JSON, and the weights alone as HDF5.
bin_model.save('./models/sns_bin_model_2.keras')

# The with-block closes the file on exit; no explicit close() is needed
with open("./models/sns_bin_model_2.json", "w") as json_file:
    json_file.write(bin_model.to_json())

# Serialize weights to HDF5
bin_model.save_weights("./models/sns_bin_model_2_weights.h5")

In [30]:
import keras
import json

# bin_model = keras.models.load_model('./models/sns_bin_model_2.keras')

# Rebuild the architecture from its JSON description, then restore the weights.
# The JSON text is handed to model_from_json as-is: parsing it and dumping it
# again yields an equivalent string, and the with-block closes the file.
with open('./models/sns_bin_model_2.json', 'r') as json_model:
    bin_model = keras.models.model_from_json(json_model.read())
bin_model.load_weights('./models/sns_bin_model_2_weights.h5')

In [92]:
# Binary: features record presence/absence of each vocabulary word.
# Both vectorizers use the FULL vocabulary (not a chunk) and English stop words.
bin_vectorizer = CountVectorizer(input='content', stop_words='english', binary=True, vocabulary=vocabulary, tokenizer=None, preprocessor=None, lowercase=False)
X_train_bin = bin_vectorizer.fit_transform(train_text)
X_test_bin = bin_vectorizer.transform(test_text)

# Multinomial: features are per-word occurrence counts
mul_vectorizer = CountVectorizer(input='content', stop_words='english', binary=False, vocabulary=vocabulary, tokenizer=None, preprocessor=None, lowercase=False)
X_train_mul = mul_vectorizer.fit_transform(train_text)
X_test_mul = mul_vectorizer.transform(test_text)

In [38]:
from sklearn.feature_extraction.text import CountVectorizer
# Smoke test: vectorize three toy strings against the first 5300 vocabulary
# entries and print the loaded model's predictions for them.
v = CountVectorizer(input='content', stop_words='english', binary=False, vocabulary=vocabulary[:5300], tokenizer=None, preprocessor=None, lowercase=False)
vec2 = v.fit_transform(["test", "test2", "test3"])

# bin_preds = bin_model.predict(np.random.sample(np.array(X_test_bin.toarray())), 100)
print(bin_model.predict(np.array(vec2.toarray())))
#bin_loss, bin_accuracy = bin_model.evaluate(X_test_bin.toarray(), np.array(y_test))
#print("Binary Loss on Dev set:", bin_loss)
#print("Binary Accuracy on Dev set:", bin_accuracy)

# OUTPUT BELOW SHOULD NOT BE HAPPENING
# NOTE(review): identical predictions are what the model returns when all three
# strings vectorize to the same row, e.g. all-zero rows because none of the
# strings is in vocabulary[:5300] - check vec2.toarray().sum() before blaming the model.

[[0.4368525]
 [0.4368525]
 [0.4368525]]


In [ ]:
# Train the multinomial (count features) network. This must be `mul_model`:
# passing `bin_model` here would keep training the binary network and leave
# `mul_model` untrained.
print(" Multinomial Neural Network Model:")
train_by_chunk(mul_model, vocab_chunks, train_text_chunks, test_text_chunks, y_train_chunks, y_test_chunks, epochs, chunks, binary=False)

In [ ]:
# Save both networks as .keras files; the relative paths resolve against the
# current working directory (not ./models/ like the earlier save cell).
bin_model.save('sns_bin_model.keras')
mul_model.save('sns_mul_model.keras')