In [1]:
# importing the necessary libraries

import pandas as pd
import numpy as np
import nltk
import re
import string
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [2]:
# Download stopwords and punkt (for word tokenization)
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [3]:
# loading the dataset

df = pd.read_csv('twitter.csv')
print(df.head(10))

   Unnamed: 0  count  hate_speech  offensive_language  neither  class  \
0           0      3            0                   0        3      2   
1           1      3            0                   3        0      1   
2           2      3            0                   3        0      1   
3           3      3            0                   2        1      1   
4           4      6            0                   6        0      1   
5           5      3            1                   2        0      1   
6           6      3            0                   3        0      1   
7           7      3            0                   3        0      1   
8           8      3            0                   3        0      1   
9           9      3            1                   2        0      1   

                                               tweet  
0  !!! RT @mayasolovely: As a woman you shouldn't...  
1  !!!!! RT @mleew17: boy dats cold...tyga dwn ba...  
2  !!!!!!! RT @UrKindOfBrand Da

In [4]:
df['labels'] = df['class'].map({0: 'Hate Speech', 1: 'Offensive Language', 2: 'Normal'})
print(df.head(10))

   Unnamed: 0  count  hate_speech  offensive_language  neither  class  \
0           0      3            0                   0        3      2   
1           1      3            0                   3        0      1   
2           2      3            0                   3        0      1   
3           3      3            0                   2        1      1   
4           4      6            0                   6        0      1   
5           5      3            1                   2        0      1   
6           6      3            0                   3        0      1   
7           7      3            0                   3        0      1   
8           8      3            0                   3        0      1   
9           9      3            1                   2        0      1   

                                               tweet              labels  
0  !!! RT @mayasolovely: As a woman you shouldn't...              Normal  
1  !!!!! RT @mleew17: boy dats cold...tyga dwn

In [5]:
#splitting the columns
df = df[['tweet', 'labels']]
print(df.head())

                                               tweet              labels
0  !!! RT @mayasolovely: As a woman you shouldn't...              Normal
1  !!!!! RT @mleew17: boy dats cold...tyga dwn ba...  Offensive Language
2  !!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...  Offensive Language
3  !!!!!!!!! RT @C_G_Anderson: @viva_based she lo...  Offensive Language
4  !!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...  Offensive Language


In [6]:
# cleaning the text

stemmer = PorterStemmer()
stopwords = stopwords.words('english')

def clean(text):
    text = str(text).lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    text = [word for word in text.split(' ') if word not in stopwords]
    text = ' '.join(text)
    text = [stemmer.stem(word) for word in text.split(' ')]
    text = ' '.join(text)
    return text

df['tweet'] = df['tweet'].apply(clean)

In [7]:
# split data into train, validation, and test sets
X = np.array(df['tweet'])
y = np.array(df['labels'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [8]:
# Vectorize text data
cv = CountVectorizer()
X_train = cv.fit_transform(X_train)
X_val = cv.transform(X_val)
X_test = cv.transform(X_test)

# **Model 1 : Decision Tree**

In [9]:
from sklearn.tree import DecisionTreeClassifier

dt_model = DecisionTreeClassifier(random_state=42)
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 5, 10],
    'max_features': [None, 'sqrt', 'log2'],  # Use None instead of 'auto'
}

from sklearn.model_selection import GridSearchCV

grid_search = GridSearchCV(estimator=dt_model,
                           param_grid=param_grid,
                           scoring='f1_weighted',  # You can also use 'accuracy', 'precision', 'recall', etc.
                           cv=5,  # Number of cross-validation folds
                           verbose=1,
                           n_jobs=-1)  # Use all available cores

grid_search.fit(X_train, y_train)
print("Best Parameters: ", grid_search.best_params_)
print("Best F1 Score: ", grid_search.best_score_)
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred))


Fitting 5 folds for each of 432 candidates, totalling 2160 fits
Best Parameters:  {'criterion': 'gini', 'max_depth': 50, 'max_features': None, 'min_samples_leaf': 10, 'min_samples_split': 2}
Best F1 Score:  0.8793256680285086
                    precision    recall  f1-score   support

       Hate Speech       0.55      0.12      0.19       465
            Normal       0.75      0.94      0.83      1379
Offensive Language       0.94      0.94      0.94      6335

          accuracy                           0.89      8179
         macro avg       0.75      0.66      0.65      8179
      weighted avg       0.88      0.89      0.88      8179



In [11]:
# Save the model

In [10]:
!pip install joblib



In [12]:
import joblib

# Save the model
joblib.dump(best_model, 'decision_tree_model.pkl')

['decision_tree_model.pkl']

In [13]:
import joblib
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Assuming df['tweet'] has already been cleaned and preprocessed

# Split data
X = np.array(df['tweet'])
y = np.array(df['labels'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

from sklearn.feature_extraction.text import TfidfVectorizer

# Use TF-IDF with unigrams and bigrams
tfidf = TfidfVectorizer(ngram_range=(1, 2), stop_words='english')
X_train = tfidf.fit_transform(X_train)
X_val = tfidf.transform(X_val)
X_test = tfidf.transform(X_test)


# Save the vectorizer for later use
joblib.dump(cv, 'tfidf_count_vectorizer.pkl')


['tfidf_count_vectorizer.pkl']

# **Model 2 : Logistic Regression**

In [14]:
from sklearn.linear_model import LogisticRegression

# Train the model with Logistic Regression
clf = LogisticRegression(max_iter=200)
clf.fit(X_train, y_train)

# Save the Logistic Regression model
joblib.dump(clf, 'logistic_regression_model.pkl')


['logistic_regression_model.pkl']

In [15]:
# Evaluate on validation set
from sklearn.metrics import classification_report
y_pred = clf.predict(X_val)
print("Validation Report:")
print(classification_report(y_val, y_pred, target_names=['Hate Speech', 'Offensive Language', 'Normal']))

# Evaluate on test set
y_pred = clf.predict(X_test)
print("Test Report:")
print(classification_report(y_test, y_pred, target_names=['Hate Speech', 'Offensive Language', 'Normal']))

Validation Report:
                    precision    recall  f1-score   support

       Hate Speech       0.61      0.21      0.31       288
Offensive Language       0.88      0.74      0.81       814
            Normal       0.90      0.97      0.94      3855

          accuracy                           0.89      4957
         macro avg       0.80      0.64      0.69      4957
      weighted avg       0.88      0.89      0.88      4957

Test Report:
                    precision    recall  f1-score   support

       Hate Speech       0.57      0.18      0.27       290
Offensive Language       0.86      0.73      0.79       835
            Normal       0.89      0.97      0.93      3832

          accuracy                           0.88      4957
         macro avg       0.77      0.63      0.66      4957
      weighted avg       0.87      0.88      0.87      4957



# **Model 3 : Random Forest**

In [16]:
from sklearn.ensemble import RandomForestClassifier

# Train the model
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# Save the model
joblib.dump(clf, 'random_forest_model.pkl')


['random_forest_model.pkl']

In [17]:
# Evaluate on validation set
from sklearn.metrics import classification_report
y_pred = clf.predict(X_val)
print("Validation Report:")
print(classification_report(y_val, y_pred, target_names=['Hate Speech', 'Offensive Language', 'Normal']))

# Evaluate on test set
y_pred = clf.predict(X_test)
print("Test Report:")
print(classification_report(y_test, y_pred, target_names=['Hate Speech', 'Offensive Language', 'Normal']))

Validation Report:
                    precision    recall  f1-score   support

       Hate Speech       0.60      0.17      0.26       288
Offensive Language       0.86      0.77      0.82       814
            Normal       0.90      0.97      0.93      3855

          accuracy                           0.89      4957
         macro avg       0.79      0.64      0.67      4957
      weighted avg       0.88      0.89      0.88      4957

Test Report:
                    precision    recall  f1-score   support

       Hate Speech       0.54      0.17      0.26       290
Offensive Language       0.83      0.76      0.79       835
            Normal       0.90      0.96      0.93      3832

          accuracy                           0.88      4957
         macro avg       0.76      0.63      0.66      4957
      weighted avg       0.87      0.88      0.87      4957



# **Model 4 : Naive Bayes**

In [18]:
from sklearn.naive_bayes import MultinomialNB

# Train the model
clf = MultinomialNB()
clf.fit(X_train, y_train)

# Save the model
joblib.dump(clf, 'naive_bayes_model.pkl')


['naive_bayes_model.pkl']

In [19]:
# Evaluate on validation set
from sklearn.metrics import classification_report
y_pred = clf.predict(X_val)
print("Validation Report:")
print(classification_report(y_val, y_pred, target_names=['Hate Speech', 'Offensive Language', 'Normal']))

# Evaluate on test set
y_pred = clf.predict(X_test)
print("Test Report:")
print(classification_report(y_test, y_pred, target_names=['Hate Speech', 'Offensive Language', 'Normal']))

Validation Report:


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


                    precision    recall  f1-score   support

       Hate Speech       0.00      0.00      0.00       288
Offensive Language       0.98      0.07      0.13       814
            Normal       0.79      1.00      0.88      3855

          accuracy                           0.79      4957
         macro avg       0.59      0.36      0.34      4957
      weighted avg       0.77      0.79      0.71      4957

Test Report:
                    precision    recall  f1-score   support

       Hate Speech       0.00      0.00      0.00       290
Offensive Language       1.00      0.06      0.12       835
            Normal       0.78      1.00      0.88      3832

          accuracy                           0.78      4957
         macro avg       0.59      0.35      0.33      4957
      weighted avg       0.77      0.78      0.70      4957



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# **Model 5 : KNN**

In [20]:
from sklearn.neighbors import KNeighborsClassifier

# Train the model
clf = KNeighborsClassifier(n_neighbors=5)
clf.fit(X_train, y_train)

# Save the model
joblib.dump(clf, 'knn_model.pkl')


['knn_model.pkl']

In [21]:
# Evaluate on validation set
from sklearn.metrics import classification_report
y_pred = clf.predict(X_val)
print("Validation Report:")
print(classification_report(y_val, y_pred, target_names=['Hate Speech', 'Offensive Language', 'Normal']))

# Evaluate on test set
y_pred = clf.predict(X_test)
print("Test Report:")
print(classification_report(y_test, y_pred, target_names=['Hate Speech', 'Offensive Language', 'Normal']))

Validation Report:
                    precision    recall  f1-score   support

       Hate Speech       0.31      0.32      0.31       288
Offensive Language       0.18      0.96      0.30       814
            Normal       0.93      0.08      0.16      3855

          accuracy                           0.24      4957
         macro avg       0.47      0.45      0.26      4957
      weighted avg       0.77      0.24      0.19      4957

Test Report:
                    precision    recall  f1-score   support

       Hate Speech       0.33      0.36      0.34       290
Offensive Language       0.18      0.94      0.31       835
            Normal       0.94      0.09      0.17      3832

          accuracy                           0.25      4957
         macro avg       0.49      0.46      0.27      4957
      weighted avg       0.78      0.25      0.20      4957

