## Set up

In [None]:
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import fasttext
import numpy as np
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix


## Get data

The data was already cleaned and pre-processed in `data_preprocess_eric`.

In [None]:
# Load the pre-processed comments and, in the same step, discard every row
# whose lemmatised text is missing; that column is the text input used by
# the rest of this notebook.
df = pd.read_csv('data/merged_pp_df.csv').dropna(subset=['stopwords_punct_lemma'])
df.head()

In [None]:
# Text feature column and target label column, bound in one statement
X, y = df['stopwords_punct_lemma'], df['toxic']

### Create labels following the fastText `__label__` convention

In [None]:
# fastText recognises a class label by the "__label__" prefix, so prepend it
# to the string form of each toxicity value.
toxic_as_text = df['toxic'].astype(str)
df['toxic_label_ft'] = "__label__" + toxic_as_text
df.head(3)

In [None]:
# Build the training example for each row: the label, a single space, then
# the lemmatised comment text.
label_part = df['toxic_label_ft']
text_part = df['stopwords_punct_lemma']
df['toxic_label_comment_text'] = label_part + " " + text_part
df.head(3)

## Split the data and train the fastText model

In [None]:
# 80/20 split of the full frame, stratified on the toxicity labels so both
# parts keep the same class proportions; the fixed seed makes it repeatable.
train, test = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [None]:
# Write one "__label__<class> <text>" example per line, which is the layout
# fastText's supervised trainer reads. The files are written directly rather
# than through DataFrame.to_csv because the CSV writer wraps any value that
# contains a comma, a quote or a line break in double quotes: that would put
# a quote in front of the "__label__" prefix and let a single comment spill
# over several lines. Line breaks inside a comment are collapsed to a single
# space for the same reason.
for split_df, split_path in ((train, "data/fasttext_train"), (test, "data/fasttext_test")):
    examples = split_df["toxic_label_comment_text"].str.replace(r"[\r\n]+", " ", regex=True)
    with open(split_path, "w", encoding="utf-8") as split_file:
        split_file.writelines(f"{example}\n" for example in examples)

In [None]:
# Hyper-parameters for the supervised fastText classifier, gathered in one
# mapping so they are easy to inspect and adjust.
fasttext_params = {
    "lr": 0.5,
    "epoch": 15,
    "wordNgrams": 2,
    "t": 0.0001,
}
model = fasttext.train_supervised(input="data/fasttext_train", **fasttext_params)

# Score the trained model on the held-out file; as the last expression of the
# cell, the result is displayed in the notebook output.
model.test("data/fasttext_test")

### Get vectors

In [None]:
def clean_text(text):
    """
    Return the given text with every newline character swapped for a space.
    """
    # Splitting on newlines and re-joining with spaces turns each newline
    # into exactly one space, leaving all other characters untouched.
    return ' '.join(text.split('\n'))

In [None]:
# Register progress_apply on pandas objects so the long passes show a bar
tqdm.pandas()

# First strip newlines from the lemmatised text, then embed each cleaned
# comment with the trained fastText model.
newline_free_text = df['stopwords_punct_lemma'].progress_apply(clean_text)
df['stopwords_punct_lemma'] = newline_free_text
df['vector_fast_text'] = newline_free_text.progress_apply(model.get_sentence_vector)

In [None]:
# Preview the first three rows, which now include the vector_fast_text column
df.head(3)

In [None]:
# Remove, in place, the POS-tag columns and the fastText label helper columns
helper_columns = ["pos_tags", "pos_tags_str", "toxic_label_ft", "toxic_label_comment_text"]
df.drop(columns=helper_columns, inplace=True)

In [None]:
# Preview the first three rows after dropping the POS-tag and label helper columns
df.head(3)

In [None]:
# Remove the vector_spacy column in place
df.drop(columns=["vector_spacy"], inplace=True)

In [None]:
# Preview the first three rows after dropping the vector_spacy column
df.head(3)

### Temporarily store the fastText vectors to use them with other models

In [None]:
# Only needs to be run once; uncomment the line below to export the vectors
#df.to_csv('data/alldata_fast_text_vectors.csv')

## fastText with SMOTE and Logistic Regression

In [None]:

# Stack the per-row fastText sentence vectors into one 2D feature matrix
X = np.stack(df['vector_fast_text'].values)
y = df['toxic'].values

# Hold out 20% of the rows for testing. The split is stratified on the labels
# so both parts keep the same toxic / non-toxic ratio, in line with the
# stratified split that produced the fastText training file. An unstratified
# split with the same seed yields a different partition, which can place
# comments the supervised fastText model was trained on into this test set
# and make the scores look better than they are.
# NOTE(review): with the same rows, seed and labels this is expected to
# reproduce the earlier train/test partition - confirm before relying on it.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Apply SMOTE to the training portion only, so the test set stays untouched
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Train a model (Logistic Regression); max_iter is raised for convergence
clf = LogisticRegression(max_iter=2500)
clf.fit(X_train_smote, y_train_smote)

# Evaluate the model on the original (non-resampled) test set
predictions = clf.predict(X_test)
print(classification_report(y_test, predictions))




In [None]:
# Embed one hand-written comment with the fastText model and shape it as a
# single-row matrix, which is the input layout the classifier expects.
# NOTE(review): the sentence is embedded as typed; it does not appear to go
# through the pre-processing applied to the training text - confirm intent.
sample_comment = "I love all kind of people, black, gays, muslims, Christians"
sentence_vector = model.get_sentence_vector(sample_comment)
sentence_vector_reshaped = np.array(sentence_vector).reshape(1, -1)

# Predicted class and per-class probabilities for that single row
prediction = clf.predict(sentence_vector_reshaped)
prediction_proba = clf.predict_proba(sentence_vector_reshaped)

# Class 1 is reported as toxic, anything else as non-toxic
prediction_text = 'Toxic' if prediction[0] == 1 else 'Non-Toxic'

# Column 0 is reported as the non-toxic probability, column 1 as the toxic one
non_toxic_probability = prediction_proba[0][0]
toxic_probability = prediction_proba[0][1]

# Output the prediction
print(f'This comment is {prediction_text}')
print(f'The probability of being Non-Toxic is: {non_toxic_probability}')
print(f'The probability of being Toxic is: {toxic_probability}')

In [None]:
# Compute the confusion matrix
cm = confusion_matrix(y_test, predictions)

# Create a heatmap
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Non-Toxic', 'Toxic'], yticklabels=['Non-Toxic', 'Toxic'])
plt.title('Confusion Matrix')
plt.ylabel('Actual Labels')
plt.xlabel('Predicted Labels')
plt.show()