<a href="https://www.kaggle.com/code/nibakh/hate-speech-part-1-model-comparisons?scriptVersionId=170139338" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense

# Load the labeled tweets: the text is read from the 'tweet' column and the target from 'class'.
data = pd.read_csv('/kaggle/input/datset15/labeled_data.csv')  # Kaggle input path; point this at your own copy when running elsewhere
X = data['tweet']
y = data['class']

# data = pd.read_csv('/kaggle/input/dataset101/HateSpeechDataset.csv')  # Replace 'your_data.csv' with your dataset path
# X = data['Content']
# y = data['Label']


# Three labels: hate speech, offensive, neither.
# A second dataset with two labels (1 = offensive, 0 = non-offensive) was also tried; see the commented-out loader above.

In [None]:
len(data)

In [None]:
# Split data into training and testing sets: 20% is held out for testing,
# and the fixed random_state makes the shuffle reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# NLTK must be installed (e.g. `pip install nltk`); the downloads below fetch the stopword list and the Punkt tokenizer data.
import nltk
nltk.download('stopwords')
nltk.download('punkt')

# Lazily-built cache of the English stopword set, so the NLTK corpus is read
# once instead of once per tweet.
_ENGLISH_STOP_WORDS = None


def _english_stop_words():
    """Return the NLTK English stopword set, loading it on first use."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


def preprocess_text(text):
    """Clean a single tweet for the bag-of-words models.

    Steps, in order: lowercase, drop URLs, drop every character that is not
    an ASCII letter or whitespace, tokenize with NLTK, remove English
    stopwords, and join the remaining tokens with single spaces.

    Args:
        text: The raw tweet as a string.

    Returns:
        The cleaned tweet as a space-separated string (possibly empty).
    """
    # Convert text to lowercase
    text = text.lower()
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    # Remove special characters and punctuation
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Remove stopwords (set membership test; the set itself is cached)
    stop_words = _english_stop_words()
    kept_tokens = [word for word in word_tokenize(text) if word not in stop_words]
    # Join the words back into a string
    return ' '.join(kept_tokens)



# Model training and evaluation steps...


In [None]:
# Preprocess the tweets: run preprocess_text on every element of both splits.
X_train_preprocessed = X_train.apply(preprocess_text)
X_test_preprocessed = X_test.apply(preprocess_text)



In [None]:
# Data cleaning and preprocessing



# TF-IDF Vectorization, capped at 5000 features.
# The vectorizer is fitted on the training split only; the test split is
# transformed with that same fitted vectorizer.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train_preprocessed)
X_test_tfidf = tfidf_vectorizer.transform(X_test_preprocessed)



In [None]:
# Train a Decision Tree with default hyperparameters on the TF-IDF features
# and record its accuracy on the test split.
dt_classifier = DecisionTreeClassifier()
dt_classifier.fit(X_train_tfidf, y_train)
dt_preds = dt_classifier.predict(X_test_tfidf)
dt_accuracy = accuracy_score(y_test, dt_preds)



In [None]:
# dt_accuracy

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Define the parameter grid to search (3 * 3 * 3 * 3 = 81 combinations)
param_grid = {
    'n_estimators': [100, 200, 300],  # Number of trees in the forest
    'max_depth': [None, 10, 20],       # Maximum depth of the trees
    'min_samples_split': [2, 5, 10],   # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 4]      # Minimum number of samples required to be at a leaf node
}

# Create a Random Forest classifier
# NOTE(review): no random_state is passed, so results can differ between runs.
rf_classifier = RandomForestClassifier()

# Grid search with 5-fold cross-validation, scored by accuracy, on all cores (n_jobs=-1).
# 81 candidates x 5 folds = 405 forest fits, so this cell is expensive.
grid_search = GridSearchCV(estimator=rf_classifier, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train_tfidf, y_train)

# Best hyperparameters
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# Evaluate the best model (the estimator GridSearchCV kept for the winning parameters) on the test split
best_rf_model = grid_search.best_estimator_
best_rf_model_preds = best_rf_model.predict(X_test_tfidf)
best_rf_model_accuracy = accuracy_score(y_test, best_rf_model_preds)


In [None]:
best_rf_model_accuracy

In [None]:

from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# Assuming lr_preds and y_test are already defined
import pandas as pd
import numpy as np
# Score the tuned Random Forest predictions (best_rf_model_preds) against y_test.
conf_matrix = confusion_matrix(y_test, best_rf_model_preds)
accuracy = accuracy_score(y_test, best_rf_model_preds)
precision, recall, f1score = (
    metric(y_test, best_rf_model_preds, average='weighted')
    for metric in (precision_score, recall_score, f1_score)
)

# Append a bottom row of column sums, then a right-hand column of row sums
# (the bottom-right cell ends up holding the grand total).
conf_matrix_with_total = np.vstack([conf_matrix, conf_matrix.sum(axis=0)])
conf_matrix_with_total = np.hstack(
    [conf_matrix_with_total, conf_matrix_with_total.sum(axis=1, keepdims=True)]
)

# Labelled confusion matrix: rows are true classes, columns are predicted classes.
conf_matrix_df = pd.DataFrame(
    conf_matrix_with_total,
    index=['Hate Speech', 'Offensive', 'Neither', 'Total'],
    columns=['Predicted 0', 'Predicted 1', 'Predicted 2', 'Total'],
)

# One-row table with the summary scores.
score_metrics_df = pd.DataFrame(
    [[accuracy, precision, recall, f1score]],
    columns=['Accuracy', 'Precision', 'Recall', 'F1 Score'],
)

print("Confusion Matrix:")
print(conf_matrix_df)
print("\nScore Metrics:")
print(score_metrics_df)


In [None]:
best_rf_model_accuracy

In [None]:
# Untuned Random Forest baseline (disabled; the grid-searched forest is used instead)
# rf_classifier = RandomForestClassifier()
# rf_classifier.fit(X_train_tfidf, y_train)
# rf_preds = rf_classifier.predict(X_test_tfidf)
# rf_accuracy = accuracy_score(y_test, rf_preds)

# Train a baseline Logistic Regression (max_iter raised to 1000) and record
# its accuracy on the test split.
lr_classifier = LogisticRegression(max_iter=1000)
lr_classifier.fit(X_train_tfidf, y_train)
lr_preds = lr_classifier.predict(X_test_tfidf)
lr_accuracy = accuracy_score(y_test, lr_preds)






In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Define the logistic regression classifier used as the base estimator of the search
lr_classifier = LogisticRegression(max_iter=1000, multi_class='multinomial', solver='lbfgs')

# Define the hyperparameters grid (6 * 1 * 2 * 2 = 24 combinations)
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],  # Regularization parameter
    'penalty': ['l2'],                     # L2 regularization only for multinomial
    'class_weight': [None, 'balanced'],    # Weights associated with classes
    'fit_intercept': [True, False],        # Whether to calculate the intercept
}

# Perform grid search with 5-fold cross-validation
grid_search = GridSearchCV(estimator=lr_classifier, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_tfidf, y_train)

# Get the best hyperparameters
best_params = grid_search.best_params_

# GridSearchCV refits the winning configuration on the whole training split
# (refit=True is the default), so reuse that estimator instead of training an
# identical LogisticRegression a second time.
best_lr_classifier = grid_search.best_estimator_

# Make predictions on the test set using the best classifier
best_lr_preds = best_lr_classifier.predict(X_test_tfidf)

# Calculate accuracy
best_lr_accuracy = accuracy_score(y_test, best_lr_preds)

print("Best Hyperparameters:", best_params)
print("Accuracy with Best Hyperparameters:", best_lr_accuracy)


In [None]:

from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# Assuming lr_preds and y_test are already defined
import pandas as pd
import numpy as np
# Score the tuned Logistic Regression predictions (best_lr_preds) against y_test.
conf_matrix = confusion_matrix(y_test, best_lr_preds)
accuracy = accuracy_score(y_test, best_lr_preds)
precision, recall, f1score = (
    metric(y_test, best_lr_preds, average='weighted')
    for metric in (precision_score, recall_score, f1_score)
)

# Append a bottom row of column sums, then a right-hand column of row sums
# (the bottom-right cell ends up holding the grand total).
conf_matrix_with_total = np.vstack([conf_matrix, conf_matrix.sum(axis=0)])
conf_matrix_with_total = np.hstack(
    [conf_matrix_with_total, conf_matrix_with_total.sum(axis=1, keepdims=True)]
)

# Labelled confusion matrix: rows are true classes, columns are predicted classes.
conf_matrix_df = pd.DataFrame(
    conf_matrix_with_total,
    index=['Hate Speech', 'Offensive', 'Neither', 'Total'],
    columns=['Predicted 0', 'Predicted 1', 'Predicted 2', 'Total'],
)

# One-row table with the summary scores.
score_metrics_df = pd.DataFrame(
    [[accuracy, precision, recall, f1score]],
    columns=['Accuracy', 'Precision', 'Recall', 'F1 Score'],
)

print("Confusion Matrix:")
print(conf_matrix_df)
print("\nScore Metrics:")
print(score_metrics_df)


In [None]:
lr_accuracy

In [None]:
# from sklearn.model_selection import GridSearchCV
# from sklearn.metrics import accuracy_score
# from sklearn.tree import DecisionTreeClassifier

# # Create an instance of the Decision Tree Classifier
# dt_classifier = DecisionTreeClassifier()

# # Define the parameter grid to search
# param_grid = {
#     'max_depth': [None, 5, 10, 15],                 # Maximum depth of the tree
#     'min_samples_split': [2, 5, 10],                 # Minimum number of samples required to split an internal node
#     'min_samples_leaf': [1, 2, 4],                  # Minimum number of samples required to be at a leaf node
#     'max_features': ['auto', 'sqrt', 'log2', None]  # Number of features to consider when looking for the best split
# }

# # Initialize GridSearchCV
# grid_search = GridSearchCV(estimator=dt_classifier, param_grid=param_grid, cv=5, scoring='accuracy')

# # Perform Grid Search cross-validation
# grid_search.fit(X_train_tfidf, y_train)

# # Get the best hyperparameters
# best_params = grid_search.best_params_

# # Train Decision Tree with the best hyperparameters
# best_dt_classifier = DecisionTreeClassifier(**best_params)
# best_dt_classifier.fit(X_train_tfidf, y_train)

# # Make predictions
# dt_preds = best_dt_classifier.predict(X_test_tfidf)

# # Calculate accuracy
# dt_accuracy = accuracy_score(y_test, dt_preds)
# print("Accuracy:", dt_accuracy)


In [None]:
from sklearn.svm import SVC

# Train a linear-kernel SVM on the TF-IDF features and record its accuracy on the test split.
svm_classifier = SVC(kernel='linear')
svm_classifier.fit(X_train_tfidf, y_train)
svm_preds = svm_classifier.predict(X_test_tfidf)
svm_accuracy = accuracy_score(y_test, svm_preds)

In [None]:

from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# Assuming lr_preds and y_test are already defined
import pandas as pd
import numpy as np
# Score the linear SVM predictions (svm_preds) against y_test.
conf_matrix = confusion_matrix(y_test, svm_preds)
accuracy = accuracy_score(y_test, svm_preds)
precision, recall, f1score = (
    metric(y_test, svm_preds, average='weighted')
    for metric in (precision_score, recall_score, f1_score)
)

# Append a bottom row of column sums, then a right-hand column of row sums
# (the bottom-right cell ends up holding the grand total).
conf_matrix_with_total = np.vstack([conf_matrix, conf_matrix.sum(axis=0)])
conf_matrix_with_total = np.hstack(
    [conf_matrix_with_total, conf_matrix_with_total.sum(axis=1, keepdims=True)]
)

# Labelled confusion matrix: rows are true classes, columns are predicted classes.
conf_matrix_df = pd.DataFrame(
    conf_matrix_with_total,
    index=['Hate Speech', 'Offensive', 'Neither', 'Total'],
    columns=['Predicted 0', 'Predicted 1', 'Predicted 2', 'Total'],
)

# One-row table with the summary scores.
score_metrics_df = pd.DataFrame(
    [[accuracy, precision, recall, f1score]],
    columns=['Accuracy', 'Precision', 'Recall', 'F1 Score'],
)

print("Confusion Matrix:")
print(conf_matrix_df)
print("\nScore Metrics:")
print(score_metrics_df)


In [None]:
# ## bagging and boosting
# from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier
# from sklearn.metrics import accuracy_score
# from sklearn.svm import SVC
# base_dt_classifier = DecisionTreeClassifier()

# # Create base SVM classifier
# # base_svm_classifier = SVC(kernel='sigmoid')

# # AdaBoost
# adaboost_classifier = AdaBoostClassifier(base_estimator=base_dt_classifier,n_estimators=50, learning_rate=1.0)
# adaboost_classifier.fit(X_train_tfidf, y_train)
# adaboost_preds = adaboost_classifier.predict(X_test_tfidf)
# adaboost_accuracy = accuracy_score(y_test, adaboost_preds)
# print("AdaBoost Accuracy:", adaboost_accuracy)

# # Bagging
# bagging_classifier = BaggingClassifier(base_estimator=base_dt_classifier, n_estimators=50, max_samples=1.0, max_features=1.0, bootstrap=True, bootstrap_features=False)
# bagging_classifier.fit(X_train_tfidf, y_train)
# bagging_preds = bagging_classifier.predict(X_test_tfidf)
# bagging_accuracy = accuracy_score(y_test, bagging_preds)
# print("Bagging Accuracy:", bagging_accuracy)


In [None]:
svm_accuracy

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Create and train a k-nearest-neighbours classifier on the TF-IDF features
knn_classifier = KNeighborsClassifier(n_neighbors=2)  # You can adjust the number of neighbors (n_neighbors)
knn_classifier.fit(X_train_tfidf, y_train)

# Predict on test set
knn_preds = knn_classifier.predict(X_test_tfidf)

# Calculate accuracy on the test split and report it
knn_accuracy = accuracy_score(y_test, knn_preds)
print("KNN Accuracy:", knn_accuracy)


In [None]:

from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# Assuming lr_preds and y_test are already defined
import pandas as pd
import numpy as np
# Score the KNN predictions (knn_preds) against y_test.
conf_matrix = confusion_matrix(y_test, knn_preds)
accuracy = accuracy_score(y_test, knn_preds)
precision, recall, f1score = (
    metric(y_test, knn_preds, average='weighted')
    for metric in (precision_score, recall_score, f1_score)
)

# Append a bottom row of column sums, then a right-hand column of row sums
# (the bottom-right cell ends up holding the grand total).
conf_matrix_with_total = np.vstack([conf_matrix, conf_matrix.sum(axis=0)])
conf_matrix_with_total = np.hstack(
    [conf_matrix_with_total, conf_matrix_with_total.sum(axis=1, keepdims=True)]
)

# Labelled confusion matrix: rows are true classes, columns are predicted classes.
conf_matrix_df = pd.DataFrame(
    conf_matrix_with_total,
    index=['Hate Speech', 'Offensive', 'Neither', 'Total'],
    columns=['Predicted 0', 'Predicted 1', 'Predicted 2', 'Total'],
)

# One-row table with the summary scores.
score_metrics_df = pd.DataFrame(
    [[accuracy, precision, recall, f1score]],
    columns=['Accuracy', 'Precision', 'Recall', 'F1 Score'],
)

print("Confusion Matrix:")
print(conf_matrix_df)
print("\nScore Metrics:")
print(score_metrics_df)


In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

# Define the Multinomial Naive Bayes classifier used as the base estimator of the search
nb_classifier = MultinomialNB()

# Define the hyperparameter grid (3 * 2 = 6 combinations)
param_grid = {
    'alpha': [0.1, 0.5, 1.0],  # Add more values if needed
    'fit_prior': [True, False]
}

# Initialize GridSearchCV: 5-fold cross-validation scored by accuracy
grid_search = GridSearchCV(estimator=nb_classifier, param_grid=param_grid, cv=5, scoring='accuracy')

# Perform Grid Search
grid_search.fit(X_train_tfidf, y_train)

# Get the best hyperparameters
best_params = grid_search.best_params_

# GridSearchCV refits the winning configuration on the entire training split
# (refit=True is the default), so reuse that estimator instead of fitting an
# identical MultinomialNB a second time.
best_nb_classifier = grid_search.best_estimator_

# Make predictions on the test set
nb_preds = best_nb_classifier.predict(X_test_tfidf)

# Calculate accuracy
nb_accuracy = accuracy_score(y_test, nb_preds)

print("Best Hyperparameters:", best_params)
print("Accuracy with Best Hyperparameters:", nb_accuracy)


In [None]:

from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# Assuming lr_preds and y_test are already defined
import pandas as pd
import numpy as np
# Score the tuned Naive Bayes predictions (nb_preds) against y_test.
conf_matrix = confusion_matrix(y_test, nb_preds)
accuracy = accuracy_score(y_test, nb_preds)
precision, recall, f1score = (
    metric(y_test, nb_preds, average='weighted')
    for metric in (precision_score, recall_score, f1_score)
)

# Append a bottom row of column sums, then a right-hand column of row sums
# (the bottom-right cell ends up holding the grand total).
conf_matrix_with_total = np.vstack([conf_matrix, conf_matrix.sum(axis=0)])
conf_matrix_with_total = np.hstack(
    [conf_matrix_with_total, conf_matrix_with_total.sum(axis=1, keepdims=True)]
)

# Labelled confusion matrix: rows are true classes, columns are predicted classes.
conf_matrix_df = pd.DataFrame(
    conf_matrix_with_total,
    index=['Hate Speech', 'Offensive', 'Neither', 'Total'],
    columns=['Predicted 0', 'Predicted 1', 'Predicted 2', 'Total'],
)

# One-row table with the summary scores.
score_metrics_df = pd.DataFrame(
    [[accuracy, precision, recall, f1score]],
    columns=['Accuracy', 'Precision', 'Recall', 'F1 Score'],
)

print("Confusion Matrix:")
print(conf_matrix_df)
print("\nScore Metrics:")
print(score_metrics_df)


In [None]:
!pip install spacy
#python -m spacy download en_core_web_sm
import spacy

# Load English language model
nlp = spacy.load('en_core_web_sm')

# Define a function for preprocessing text with lemmatization
def preprocess_text_with_lemmatization(tweet):
    """Clean a tweet and replace every token with its spaCy lemma.

    Steps, in order: remove URLs, pic.twitter links and @mentions; remove
    every character that is not an ASCII letter or whitespace; lowercase and
    collapse runs of whitespace; lemmatize with the module-level `nlp`
    pipeline.

    Args:
        tweet: The raw tweet as a string.

    Returns:
        The lemmas joined by single spaces (possibly an empty string).
    """
    # Remove URLs, picture links and mentions. The dot in "pic.twitter" is
    # escaped so it matches only a literal '.', not any character
    # (otherwise text such as "epic twitters" would be partly deleted).
    tweet = re.sub(r'http\S+|www\S+|pic\.twitter\S+|@\S+', '', tweet)
    # Remove special characters, digits and punctuation
    tweet = re.sub(r'[^a-zA-Z\s]', '', tweet)

    # Remove extra spaces and convert to lowercase
    tweet = ' '.join(tweet.lower().split())

    # Lemmatize and join the lemmas back into a single string
    return ' '.join(token.lemma_ for token in nlp(tweet))


In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import LearningRateScheduler
from tensorflow.keras.layers import LSTM, Dropout


# Rebuild the features from the raw tweets, this time with the lemmatizing preprocessor
# (instead of the raw text: X = data['tweet']).
y = data['class']
X = data['tweet'].apply(preprocess_text_with_lemmatization)
#X_test_preprocessed = X_test.apply(preprocess_text)



# Fit the Keras tokenizer on all preprocessed tweets and turn each tweet into a sequence of word indices.
# NOTE(review): the vocabulary is fitted before the train/test split below, so it also sees test tweets.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)
sequences = tokenizer.texts_to_sequences(X)

# Pad sequences so every sample has exactly max_len entries
max_len = 20  # Max sequence length
X_padded = pad_sequences(sequences, maxlen=max_len)

# Split data into training and test sets (this rebinds X_train/X_test/y_train/y_test used by earlier cells)
X_train, X_test, y_train, y_test = train_test_split(X_padded, y, test_size=0.20, random_state=42)

# Define LSTM model: embedding -> single LSTM layer -> softmax over the classes
model = tf.keras.Sequential([
    # input_dim is the vocabulary size plus one
    tf.keras.layers.Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=100, input_length=max_len),
    tf.keras.layers.LSTM(64),
    tf.keras.layers.Dense(3, activation='softmax')  # 3 output classes
])

# Compile the model; the sparse loss takes integer class labels directly
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Stop once val_loss has not improved for 3 epochs and restore the best weights seen
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Train the model with early stopping; 20% of the training split is used for validation
model.fit(X_train, y_train, epochs=30, batch_size=32, validation_split=0.2, callbacks=[early_stopping])

# Evaluate the model on the test set
_, test_accuracy = model.evaluate(X_test, y_test)
print("Test Accuracy:", test_accuracy)

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import LearningRateScheduler
from tensorflow.keras.layers import LSTM, Dropout

# Prepare the inputs BEFORE building the network, so the embedding layer is
# sized from the tokenizer and max_len defined in this cell rather than from
# whatever an earlier cell left in the namespace.
y = data['class']
X = data['tweet'].apply(preprocess_text_with_lemmatization)

tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)
sequences = tokenizer.texts_to_sequences(X)

# Pad sequences
max_len = 20  # Max sequence length
X_padded = pad_sequences(sequences, maxlen=max_len)

# Split the freshly padded data. The parameters match the single-layer LSTM
# cell, so the same rows end up in each split, but this cell no longer
# depends on that cell having been run.
X_train, X_test, y_train, y_test = train_test_split(X_padded, y, test_size=0.20, random_state=42)

# Stacked LSTM: embedding -> LSTM (full sequence) -> dropout -> LSTM -> softmax
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=100, input_length=max_len),
    tf.keras.layers.LSTM(64, return_sequences=True),  # pass the whole sequence to the next LSTM
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.LSTM(64),
    tf.keras.layers.Dense(3, activation='softmax')  # 3 output classes
])

# Learning rate scheduler
def lr_scheduler(epoch, lr):
    """Return the learning rate for `epoch`, given the current rate `lr`.

    The rate is multiplied by 0.9 at every non-zero epoch index divisible by
    10 (10, 20, 30, ...) and returned unchanged otherwise.
    """
    at_decay_epoch = epoch != 0 and epoch % 10 == 0
    return lr * 0.9 if at_decay_epoch else lr

optimizer = Adam(learning_rate=0.001)  # Set initial learning rate

# Compile the model with the explicit Adam optimizer; the sparse loss takes integer class labels
model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Learning rate scheduler callback wrapping lr_scheduler (defined above)
lr_callback = LearningRateScheduler(lr_scheduler)

# Train the model with early stopping and learning rate scheduler.
# early_stopping is not created in this cell; it is the callback object left by an earlier cell.
model.fit(X_train, y_train, epochs=30, batch_size=16, validation_split=0.2, callbacks=[early_stopping, lr_callback])


# Evaluate the model on the test set
_, test_accuracy = model.evaluate(X_test, y_test)
print("Test Accuracy:", test_accuracy)

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix

# Predict class scores for the test set and take the highest-scoring class per sample.
y_pred_probs = model.predict(X_test)
y_pred = y_pred_probs.argmax(axis=1)

# Macro-averaged precision, recall and F1 (every class weighted equally).
precision, recall, f1 = (
    metric(y_test, y_pred, average='macro')
    for metric in (precision_score, recall_score, f1_score)
)

print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

# Raw confusion matrix (rows: true class, columns: predicted class).
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)


In [None]:
# Custom hard-voting ensemble: every model casts one vote per test sample.
model_predictions = [best_rf_model_preds, best_lr_preds, svm_preds, knn_preds, nb_preds, y_pred]

# All prediction vectors must describe the same test rows; fail loudly
# instead of silently voting over misaligned or truncated data.
if len({len(preds) for preds in model_predictions}) != 1:
    raise ValueError("All classifiers must predict the same number of test samples")

voting_preds = []
for predictions in zip(*model_predictions):
    # Take the majority vote. With six voters ties are possible; candidates
    # are visited in ascending label order and max() keeps the first maximum,
    # so a tie always resolves to the smallest class label.
    majority_vote = max(sorted(set(predictions)), key=predictions.count)
    voting_preds.append(majority_vote)

# Calculate accuracy
voting_accuracy = accuracy_score(y_test, voting_preds)
print("Voting Classifier Accuracy:", voting_accuracy)


In [None]:

from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# Assuming lr_preds and y_test are already defined
import pandas as pd
import numpy as np
# Score the majority-vote predictions (voting_preds) against y_test.
conf_matrix = confusion_matrix(y_test, voting_preds)
accuracy = accuracy_score(y_test, voting_preds)
precision, recall, f1score = (
    metric(y_test, voting_preds, average='weighted')
    for metric in (precision_score, recall_score, f1_score)
)

# Append a bottom row of column sums, then a right-hand column of row sums
# (the bottom-right cell ends up holding the grand total).
conf_matrix_with_total = np.vstack([conf_matrix, conf_matrix.sum(axis=0)])
conf_matrix_with_total = np.hstack(
    [conf_matrix_with_total, conf_matrix_with_total.sum(axis=1, keepdims=True)]
)

# Labelled confusion matrix: rows are true classes, columns are predicted classes.
conf_matrix_df = pd.DataFrame(
    conf_matrix_with_total,
    index=['Hate Speech', 'Offensive', 'Neither', 'Total'],
    columns=['Predicted 0', 'Predicted 1', 'Predicted 2', 'Total'],
)

# One-row table with the summary scores.
score_metrics_df = pd.DataFrame(
    [[accuracy, precision, recall, f1score]],
    columns=['Accuracy', 'Precision', 'Recall', 'F1 Score'],
)

print("Confusion Matrix:")
print(conf_matrix_df)
print("\nScore Metrics:")
print(score_metrics_df)


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import tensorflow as tf
from transformers import BertTokenizer, TFBertForSequenceClassification
from tensorflow.keras.optimizers import Adam
import numpy as np
from tensorflow.keras.callbacks import EarlyStopping
# Reload the dataset so this cell starts from the raw (unpreprocessed) tweets
data = pd.read_csv('/kaggle/input/datset15/labeled_data.csv')  # Update with your file path
X = data['tweet']
y = data['class']

# Tokenize text data with the BERT WordPiece tokenizer (padded, truncated to at most 64 tokens).
# Only 'input_ids' is kept.
# NOTE(review): the attention mask is discarded, so the model is not told which positions are
# padding — consider passing the attention mask as well; confirm the impact.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
X_encoded = tokenizer(X.tolist(), padding=True, truncation=True, max_length=64, return_tensors='tf')['input_ids']

# Convert the tensor to a numpy array (so train_test_split can index it)
X_encoded = np.array(X_encoded)

# Split data into training and test sets (rebinds X_train/X_test/y_train/y_test)
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

# Disabled experiment: freeze all but the last four layers
# model.trainable = True
# for layer in model.layers[:-4]:
#     layer.trainable = False

# Disabled experiment: exponential learning-rate decay
# lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
#     initial_learning_rate=3e-5,
#     decay_steps=10000,
#     decay_rate=0.9
# )

# Disabled: optimizer using the schedule above
# optimizer = Adam(learning_rate=lr_schedule)

# Load the pre-trained BERT model for sequence classification
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)  # Assuming 3 output classes

early_stopping = EarlyStopping(monitor='val_loss', patience=3)  # Stop training when validation loss stops decreasing for 3 epochs
# Fixed learning rate (the schedule above is disabled)
optimizer = Adam(learning_rate=3e-5)
# Compile the model with optimizer; from_logits=True because the loss is fed the model's raw logits
model.compile(optimizer=optimizer, loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), metrics=['accuracy'])

# Train the model with early stopping (no learning rate scheduling is active); 20% of the training split is used for validation
model.fit(X_train, y_train, epochs=20, batch_size=16, validation_split=0.2, callbacks=[early_stopping])

# Evaluate the model on the test set
loss, accuracy = model.evaluate(X_test, y_test)
print("Test Loss:", loss)
print("Test Accuracy:", accuracy)


In [None]:
import pickle

# Save the fine-tuned model and its tokenizer in Hugging Face format.
# The reload cell below reads '/kaggle/working/my_bert_model' with
# from_pretrained(..., from_tf=True), which expects the config, TF weights and
# vocabulary files written by save_pretrained — a pickle of the Keras model
# does not produce them (and pickling such models is unreliable).
save_dir = '/kaggle/working/my_bert_model'
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)


In [52]:
from transformers import BertForSequenceClassification, BertTokenizer
import torch

# Load the saved model and tokenizer as PyTorch objects for a quick sanity check.
# They are bound to their own names (pt_model / pt_tokenizer) so the TensorFlow
# `model` and `tokenizer` that later cells still use (e.g. model.predict(X_test))
# are not overwritten.
pt_model = BertForSequenceClassification.from_pretrained('/kaggle/working/my_bert_model',from_tf=True)
pt_tokenizer = BertTokenizer.from_pretrained('/kaggle/working/my_bert_model')

# Test the model and tokenizer on a single sentence
text = "Hello, world!"
inputs = pt_tokenizer(text, return_tensors='pt')
with torch.no_grad():  # inference only, no gradients needed
    outputs = pt_model(**inputs)

# The outputs are logits; to convert to probabilities, apply the softmax function
logits = outputs.logits
probabilities = torch.nn.functional.softmax(logits, dim=1)

# Get the predicted class (the one with the highest probability)
predicted_class = probabilities.argmax(dim=1)

print(predicted_class)

All TF 2.0 model weights were used when initializing BertForSequenceClassification.

All the weights of BertForSequenceClassification were initialized from the TF 2.0 model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertForSequenceClassification for predictions without further training.


tensor([2])


In [57]:
!zip -r Bert_model.zip /kaggle/working/my_bert_model


  adding: kaggle/working/my_bert_model/ (stored 0%)
  adding: kaggle/working/my_bert_model/config.json (deflated 50%)
  adding: kaggle/working/my_bert_model/special_tokens_map.json (deflated 42%)
  adding: kaggle/working/my_bert_model/tf_model.h5 (deflated 8%)
  adding: kaggle/working/my_bert_model/tokenizer_config.json (deflated 75%)
  adding: kaggle/working/my_bert_model/outputname.tar.gz (deflated 0%)
  adding: kaggle/working/my_bert_model/vocab.txt (deflated 53%)


In [48]:
!tar -zcvf outputname.tar.gz /kaggle/working/my_bert_model.zip

tar: Removing leading `/' from member names
/kaggle/working/my_bert_model.zip


In [None]:
from sklearn.metrics import confusion_matrix, f1_score, recall_score, precision_score

# Make predictions on the test set; the predicted class is the index of the largest logit
y_pred_probs = model.predict(X_test)
y_pred = np.argmax(y_pred_probs.logits, axis=1)

# Compute confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

# Compute F1-score (macro average)
f1 = f1_score(y_test, y_pred, average='macro')
print("F1-score:", f1)

# Compute recall (macro average)
recall = recall_score(y_test, y_pred, average='macro')
print("Recall:", recall)

# Compute precision (macro average)
precision = precision_score(y_test, y_pred, average='macro')
print("Precision:", precision)


In [None]:
 threshold = 0.5  # Adjust this threshold based on your requirements

# Obtain predicted logits from the model
y_pred_probs = model.predict(X_test)
# Predicted class ids (argmax of the logits); the variable keeps its existing name.
y_pred_logits = np.argmax(y_pred_probs.logits, axis=1)

# Split predicted classes 0 and 1 into a low-confidence and a high-confidence
# sub-class, using `threshold` on the softmax probability of the winning class:
#   predicted 0 -> 0 (prob < threshold) or 1 (prob >= threshold)
#   predicted 1 -> 2 (prob < threshold) or 3 (prob >= threshold)
#   anything else -> 4
# NOTE(review): the report tables in this notebook label class 0 'Hate Speech'
# and class 1 'Offensive' — confirm which naming is intended for the sub-classes.
new_labels = []

for logits in y_pred_probs.logits:
    max_score_class = np.argmax(logits)
    # Numerically stable softmax: subtracting the max avoids overflow in exp()
    exp_shifted = np.exp(logits - np.max(logits))
    probs = exp_shifted / np.sum(exp_shifted)
    if max_score_class == 0:
        new_labels.append(0 if probs[0] < threshold else 1)
    elif max_score_class == 1:
        new_labels.append(2 if probs[1] < threshold else 3)
    else:
        new_labels.append(4)  # Neither

# Convert new labels to array
new_labels = np.array(new_labels)


In [None]:
X_test

In [None]:
 np.unique(new_labels)

In [None]:
len(new_labels)

In [None]:
# Re-split the previous test set: 70% becomes the new training data, 30% the new test data.
# new_labels was derived from the 3-class model's own predictions on X_test.
# NOTE(review): the targets are therefore model outputs, not human annotations — confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(X_test, new_labels, test_size=0.3, random_state=42)

In [None]:
np.unique(y_train)

In [None]:


# Load a fresh pre-trained BERT model for sequence classification with a 5-way head
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=5)  # Assuming 5 output classes

early_stopping = EarlyStopping(monitor='val_loss', patience=3)  # Stop training when validation loss stops decreasing for 3 epochs
# Fixed learning rate
optimizer = Adam(learning_rate=3e-5)
# Compile the model with optimizer; from_logits=True because the loss is fed the model's raw logits
model.compile(optimizer=optimizer, loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), metrics=['accuracy'])

# Train the model with early stopping (no learning rate scheduling is active); 20% of the training split is used for validation
model.fit(X_train, y_train, epochs=20, batch_size=16, validation_split=0.2, callbacks=[early_stopping])

# Evaluate the model on the test set
loss, accuracy = model.evaluate(X_test, y_test)
print("Test Loss:", loss)
print("Test Accuracy:", accuracy)

In [None]:
np.unique(y_test)

In [None]:
from sklearn.metrics import confusion_matrix, f1_score, recall_score, precision_score

# Predict on the test set; the predicted class is the index of the largest logit.
y_pred_probs = model.predict(X_test)
y_pred = np.argmax(y_pred_probs.logits, axis=1)

# Confusion matrix first.
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

# Then the macro-averaged scores, each computed and printed in turn.
macro_metrics = (("F1-score:", f1_score), ("Recall:", recall_score), ("Precision:", precision_score))
macro_scores = []
for heading, metric in macro_metrics:
    macro_scores.append(metric(y_test, y_pred, average='macro'))
    print(heading, macro_scores[-1])
f1, recall, precision = macro_scores


In [None]:
np.unique(y_test)

In [None]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# Summarise the current y_test / y_pred pair.
conf_matrix = confusion_matrix(y_test, y_pred)
cls_report = classification_report(y_test, y_pred)  # precision, recall, F1-score and support
accuracy = accuracy_score(y_test, y_pred)

# sep="\n" puts each heading and its value on separate lines.
print("Confusion Matrix:", conf_matrix, sep="\n")
print("\nClassification Report:", cls_report, sep="\n")
print("\nAccuracy:", accuracy)

In [None]:
import matplotlib.pyplot as plt

# Classifier names (same order as accuracy_scores below)
classifiers = ['Logistic Regression', 'Random Forest', 'SVM', 'KNN', 'Naive Bayes', 'LSTM', 'Voting Classifier', 'Bert Based Uncased']

# Accuracy scores, hard-coded (presumably copied from earlier runs of the cells above — they are not recomputed here)
accuracy_scores = [0.89187, 0.895501, 0.899133, 0.785758, 0.849102, 0.891063, 0.892879, 0.904579]

# Sort classifiers and accuracy scores in ascending order of accuracy
sorted_indices = sorted(range(len(accuracy_scores)), key=lambda k: accuracy_scores[k])
classifiers_sorted = [classifiers[i] for i in sorted_indices]
accuracy_scores_sorted = [accuracy_scores[i] for i in sorted_indices]

# Create horizontal bar chart
plt.figure(figsize=(10, 6))
plt.barh(classifiers_sorted, accuracy_scores_sorted, color='skyblue')
plt.xlabel('Accuracy')
plt.title('Accuracy of Different Classifiers (Increasing Order)')
plt.xlim(0.7, 1.0)  # Set x-axis limit
plt.gca().invert_yaxis()  # Invert y-axis so the first (lowest-accuracy) bar is at the top and accuracy increases downwards
plt.show()



In [None]:
import matplotlib.pyplot as plt

# Model names and their hard-coded accuracies, in matching order.
classifiers = ['Logistic Regression', 'Random Forest', 'SVM', 'KNN', 'Naive Bayes', 'LSTM', 'Voting Classifier', 'Bert Based Uncased']
accuracy_scores = [0.89187, 0.895501, 0.899133, 0.785758, 0.849102, 0.891063, 0.892879, 0.904579]

# Positions ordered by ascending accuracy, then both lists rearranged accordingly.
sorted_indices = sorted(range(len(accuracy_scores)), key=accuracy_scores.__getitem__)
classifiers_sorted = [classifiers[pos] for pos in sorted_indices]
accuracy_scores_sorted = [accuracy_scores[pos] for pos in sorted_indices]

# Horizontal bar chart of the sorted accuracies.
plt.figure(figsize=(10, 6))
plt.barh(classifiers_sorted, accuracy_scores_sorted, color='skyblue')
plt.xlabel('Accuracy')
plt.title('Accuracy of Different Classifiers (Increasing Order)')
plt.xlim(0.7, 1.0)  # zoom in on the range where the scores lie
plt.gca().invert_yaxis()  # flip the y-axis so the first bar is drawn at the top
plt.show()
