In [1]:
from nltk import download, word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix

import pickle
import gensim

import string
import re

import pandas as pd

import plotly.express as px

In [2]:
content_moderation_folder = 'C:/Users/witko/TwitterTwo/TwitterTwo/ContentModeration'

In [3]:
df = pd.read_csv(f'{content_moderation_folder}/data/labeled_data.csv', index_col=0)

In [4]:
df['labels'] = df['class'].map(
    {
        0: 'Hate Speech', 
        1: 'Offensive Language', 
        2: 'No Hate and Offensive',
    }
)

In [5]:
df= df[['tweet', 'labels']]

In [7]:
def clean(text):
    text = str(text).lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub(r"\@w+|\#",'',text)
    text = re.sub(r"[^\w\s]",'',text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    tweet_tokens = word_tokenize(text)
    download('stopwords')
    filtered_tweets = [w for w in tweet_tokens if not w in set(stopwords.words('english'))] #removing stopwords
    return ' '.join(filtered_tweets)

In [8]:
df['tweet'] = df['tweet'].apply(lambda x: clean(x))

In [9]:
df = df.drop_duplicates("tweet")

In [10]:
download('wordnet')
lemmatizer=WordNetLemmatizer()
def lemmatizing(tweet):
    words = tweet.split()
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
    lemmatized_tweet = ' '.join(lemmatized_words)
    return lemmatized_tweet

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\witko\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [11]:
df['tweet'] = df['tweet'].apply(lambda x: lemmatizing(x))

In [28]:
X = df['tweet']
Y = df['labels']

In [29]:
vect = TfidfVectorizer(ngram_range=(1, 3)).fit(df['tweet'])


In [47]:
# with open(f'{content_moderation_folder}/models/tfidf_vectorizer.pkl', 'wb') as file:
#     pickle.dump(vect, file)

# with open(f'{content_moderation_folder}/models/tfidf_vectorizer.pkl', 'rb') as file:
#     vect = pickle.load(file)

In [30]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [31]:
logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_train, Y_train) #loading x_train and y_train data on model
logreg_predict = logreg.predict(X_test) #predicting the value for test data
logreg_acc = accuracy_score(logreg_predict, Y_test)

In [33]:
print("Test accuracy: {:.2f}%".format(logreg_acc*100))

Test accuracy: 83.21%


In [34]:
cm = confusion_matrix(Y_test, logreg_predict, labels=logreg.classes_)

confusion_matrix_df = pd.DataFrame(cm, columns=logreg.classes_, index=logreg.classes_)

fig = px.imshow(
    confusion_matrix_df,
    x=logreg.classes_,
    y=logreg.classes_,
    labels=dict(x="Predicted", y="Actual"),
    color_continuous_scale="blues",
    title="Confusion Matrix",
)

fig.update_layout(
    xaxis=dict(title="Predicted", side="top"),
    yaxis=dict(title="Actual", autorange="reversed"),
)

for i in range(len(logreg.classes_)):
    for j in range(len(logreg.classes_)):
        fig.add_annotation(
            text=str(cm[i, j]),
            x=logreg.classes_[j],
            y=logreg.classes_[i],
            showarrow=False,
            font=dict(color="white" if cm[i, j] > cm.max() / 2 else "black"),  # Adjust text color
        )

fig.show()

In [None]:
param_grid = {'C': [100, 10, 1.0, 0.1, 0.01], 'solver': ['newton-cg', 'lbfgs', 'liblinear'], 'max_iter': [1000]}
grid = GridSearchCV(LogisticRegression(), param_grid, cv = 5)
grid.fit(X_train, Y_train)
print("Best Cross validation score: {:.2f}".format(grid.best_score_))
print("Best parameters: ", grid.best_params_)

In [None]:
with open(f'{content_moderation_folder}/models/logreg_model_with_word2vec.pkl', 'wb') as file:
    pickle.dump(logreg, file)

In [19]:
with open(f'{content_moderation_folder}/models/logreg_model_with_word2vec.pkl', 'rb') as file:
    logreg = pickle.load(file)

In [45]:
X = [
    "Just received some amazing news today!"
]
print(X)
X = [clean(x) for x in X]
print(X)
X = [lemmatizing(x) for x in X]
print(X)
X = vect.transform(X)

['Just received some amazing news today!']
['received amazing news today']
['received amazing news today']


In [46]:
logreg.predict(X)

array(['No Hate and Offensive'], dtype=object)