# Importing libraries

In [1]:
import pandas as pd 
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import string
import numpy as np
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,f1_score,confusion_matrix,recall_score, precision_score
from sklearn.feature_extraction.text import CountVectorizer


# Fetch the NLTK data packages used by the tokenizer, the stop-word list
# and the WordNet lemmatizer imported above.
for resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(resource)

# Reading the YouTube CSV that contains the text field

In [3]:
# Location of the raw comment collection, loaded into the working DataFrame
path = '/merged_yt_collection_text.csv'
df = pd.read_csv(filepath_or_buffer=path)

# Defining text preprocessing function

In [4]:
# Cache for the NLTK objects that are expensive to build; filled on first use.
_TEXT_RESOURCES = {}


def _get_text_resources():
    """Return the shared (stop-word set, lemmatizer) pair, building it once."""
    if not _TEXT_RESOURCES:
        _TEXT_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _TEXT_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _TEXT_RESOURCES['stop_words'], _TEXT_RESOURCES['lemmatizer']


def preprocess_text(text):
    """Normalise a raw text into a space-separated string of lemmas.

    Steps: lower-case and tokenize, keep purely alphabetic tokens, drop
    English stop words, lemmatize with WordNet, then join with spaces.

    Args:
        text: The raw text. ``None`` and float NaN (how pandas represents a
            missing cell) are treated as empty text; any other non-string
            value is converted with ``str()``.

    Returns:
        The preprocessed text; an empty string when nothing survives.
    """
    # Missing values would otherwise crash on ``.lower()``.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    # Tokenize the text into individual lower-cased words
    tokens = word_tokenize(text.lower())

    # isalpha() rejects every token that contains punctuation or digits, so
    # no separate punctuation-stripping pass is needed afterwards.
    tokens = [token for token in tokens if token.isalpha()]

    # Stop words and lemmatizer are built once and reused across calls,
    # instead of being re-created for every row.
    stop_words, lemmatizer = _get_text_resources()

    return ' '.join(
        lemmatizer.lemmatize(token) for token in tokens if token not in stop_words
    )

# Creation of a new column with the preprocessed text

In [5]:
# Clean every raw text; the function is passed directly, no lambda wrapper needed
df['preprocessed_text'] = df['tweet_text'].apply(preprocess_text)

# Saving the new csv with the preprocessed text

In [14]:
# Persist the frame (index column omitted) so it can be reloaded further down
df.to_csv(path_or_buf="merged_yt_collection_preprocessedtxt.csv", index=False)

# Check script mood.py

# Reading the csv with the preprocessed text

In [2]:

# Reload the frame written by the preprocessing step
df = pd.read_csv(filepath_or_buffer="merged_yt_collection_preprocessedtxt.csv")

# Concatenate the five new mood columns to the dataframe

In [3]:
# Mood files, one per mood; the file stem becomes the column name
files = ['hate.txt', 'irony.txt', 'neutral.txt', 'negative.txt', 'offensive.txt']

# Read each text file into a pandas Series (one value per line)
new_columns = []
for file in files:
    with open(file, 'r') as f:
        values = f.read().splitlines()

    # pd.concat(axis=1) aligns on the index and pads with NaN, so a file whose
    # line count differs from the DataFrame would be merged silently misaligned.
    if len(values) != len(df):
        raise ValueError(
            f"{file} has {len(values)} lines but the DataFrame has {len(df)} rows"
        )

    # The columns are fed straight to the classifiers below, so convert the
    # text lines to numbers here; a non-numeric line fails early and loudly.
    # NOTE(review): assumes each line holds a single numeric value - confirm
    # against the script that produces these files.
    col = pd.to_numeric(pd.Series(values, name=file.split(".")[0], index=df.index))
    new_columns.append(col)

# Concatenate the existing DataFrame with the new mood columns
combined_data = pd.concat([df] + new_columns, axis=1)


In [22]:
# Display the column index of the merged frame
combined_data.columns

Index(['id', 'moderationStatus', 'tweet_ids', 'tweet_text',
       'preprocessed_text', 'hate', 'irony', 'neutral', 'negative',
       'offensive'],
      dtype='object')

# Extract input and target features

In [6]:
# Inputs are the five mood columns; the target is the moderation status
feature_columns = ["hate", "irony", "offensive", "negative", "neutral"]
x = combined_data[feature_columns].to_numpy()
y = combined_data["moderationStatus"].to_numpy()

# Split data in train and test

In [7]:
# Hold out 10% of the rows, stratified on the target, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.1,
    stratify=y,
    random_state=42,
)

# XGBoost

In [8]:
# Wrap both splits in DMatrix, the container XGBoost trains and predicts on
dtest = xgb.DMatrix(X_test, label=y_test)
dtrain = xgb.DMatrix(X_train, label=y_train)

# Booster configuration: shallow trees, binary logistic objective, fixed seed
params = dict(
    max_depth=3,
    eta=0.1,
    objective='binary:logistic',
    eval_metric='logloss',
    seed=42,
)

# Number of boosting rounds, then fit the booster
num_rounds = 100
model = xgb.train(params, dtrain, num_rounds)

# The booster outputs one score per test row; round each to a 0/1 label
y_pred = model.predict(dtest)
predictions = list(map(round, y_pred))

In [11]:
# Evaluate the model on the held-out split.
# zero_division=0 makes the fallback explicit: when the model predicts no
# positive sample, precision (and the F-score built on it) is undefined, and
# scikit-learn would otherwise emit an UndefinedMetricWarning while using 0.
accuracy = accuracy_score(y_test, predictions)
fscore = f1_score(y_test, predictions, average="macro", zero_division=0)
recall = recall_score(y_test, predictions)
precision = precision_score(y_test, predictions, zero_division=0)
conf_matr = confusion_matrix(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))
print("MACRO f1_Score: ",fscore)
print("recall: ",recall)
print("precision: ",precision)
print("confusion matrix: ",conf_matr)

Accuracy: 80.21%
MACRO f1_Score:  0.44509232264334303
recall:  0.0
precision:  0.0
confusion matrix:  [[1374    0]
 [ 339    0]]


  _warn_prf(average, modifier, msg_start, len(result))


# Random forest

In [13]:
# Fresh 80/20 split of the mood features.
# NOTE(review): unlike the split used for XGBoost, this one is not stratified
# and holds out 20% instead of 10%, so the scores are not directly comparable.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

# Seeded forest of 100 trees, fitted on the training part
random_forest = RandomForestClassifier(n_estimators=100, random_state=42)
random_forest.fit(X_train, y_train)

# Predicted labels for the held-out part
y_pred = random_forest.predict(X_test)

# Score the predictions
conf_matr = confusion_matrix(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
fscore = f1_score(y_test, y_pred, average="macro")
accuracy = accuracy_score(y_test, y_pred)

print("Accuracy: %.2f%%" % (accuracy * 100.0))
print("macro f1_Score: ", fscore)
print("recall: ", recall)
print("precision: ", precision)
print("confusion matrix: ", conf_matr)

Accuracy: 80.41%
macro f1_Score:  0.47347880065862835
recall:  0.030721966205837174
precision:  0.3333333333333333
confusion matrix:  [[2734   40]
 [ 631   20]]


# SVM

In [15]:
# 80/20 split of the mood features with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

# Support vector classifier with a sigmoid kernel, fitted on the training part
svm = SVC(kernel='sigmoid')
svm.fit(X_train, y_train)

# Predicted labels for the held-out part
y_pred = svm.predict(X_test)

# Score the predictions
conf_matr = confusion_matrix(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
fscore = f1_score(y_test, y_pred, average="macro")
accuracy = accuracy_score(y_test, y_pred)

print("accuracy: %.2f%%" % (accuracy * 100.0))
print("macro f1_Score: ", fscore)
print("recall: ", recall)
print("precision: ", precision)
print("confusion matrix: ", conf_matr)

Accuracy: 67.94%
macro f1_Score:  0.48834526806618145
recall:  0.17972350230414746
precision:  0.17180616740088106
confusion matrix:  [[2210  564]
 [ 534  117]]


# Logistic regression

In [16]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Create an instance of the logistic regression classifier (default settings)
lr = LogisticRegression()

# Fit the model to the training data
lr.fit(X_train, y_train)

# Make predictions on the test data
y_pred = lr.predict(X_test)

# Evaluate the model's performance.
# zero_division=0 makes the fallback explicit: when the model predicts no
# positive sample, precision (and the F-score built on it) is undefined, and
# scikit-learn would otherwise emit an UndefinedMetricWarning while using 0.
accuracy = accuracy_score(y_test, y_pred)
fscore = f1_score(y_test, y_pred, average="macro", zero_division=0)
recall = recall_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, zero_division=0)
conf_matr = confusion_matrix(y_test, y_pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))
print("macro f1_Score: ",fscore)
print("recall: ",recall)
print("precision: ",precision)
print("confusion matrix: ",conf_matr)

Accuracy: 80.99%
macro f1_Score:  0.4474915308920793
recall:  0.0
precision:  0.0
confusion matrix:  [[2774    0]
 [ 651    0]]


  _warn_prf(average, modifier, msg_start, len(result))


# Random Forest with the addition of the preprocessed text

In [18]:
# Rows whose preprocessed text is empty are written to the CSV as empty
# fields and read back as NaN, which CountVectorizer rejects; treat them as
# empty documents instead.
text = combined_data["preprocessed_text"].fillna("")

# Text encoding: bag-of-words counts
vect = CountVectorizer()
x_text = vect.fit_transform(text)

In [19]:
# Training and test set split.
# The labels are unpacked into y_train / y_test, the names used below, so this
# cell no longer depends on label arrays left over from an earlier cell.
X_train, X_test, y_train, y_test = train_test_split(x_text, y, test_size=0.2, random_state=42)

# Create an instance of the random forest classifier; seeded so that
# re-running the cell gives the same scores
model = RandomForestClassifier(random_state=42)

# Fit the model to the training data
model.fit(X_train, y_train)

# Make predictions on the test data
y_pred = model.predict(X_test)

# Evaluate the model's performance
accuracy = accuracy_score(y_test, y_pred)
fscore = f1_score(y_test,y_pred,average="macro")
recall = recall_score(y_test,y_pred)
precision = precision_score(y_test,y_pred)
conf_matr = confusion_matrix(y_test,y_pred)

print("Accuracy: %.2f%%" % (accuracy * 100.0))
print("f1_Score: ",fscore)
print("recall: ",recall)
print("precision: ",precision)
print("confusion matrix: ",conf_matr)

Accuracy: 82.22%
f1_Score:  0.5233141995497386
recall:  0.07987711213517665
precision:  0.8387096774193549
confusion matrix:  [[2764   10]
 [ 599   52]]
