In [1]:
# importing necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import re

In [4]:
# Load the dataset
dataset = pd.read_csv('labeled_data.csv', encoding='latin-1')
dataset.shape

(24783, 7)

In [18]:
dataset

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...
...,...,...,...,...,...,...,...
24778,25291,3,0,2,1,1,you's a muthaf***in lie &#8220;@LifeAsKing: @2...
24779,25292,3,0,1,2,2,"you've gone and broke the wrong heart baby, an..."
24780,25294,3,0,3,0,1,young buck wanna eat!!.. dat nigguh like I ain...
24781,25295,6,0,6,0,1,youu got wild bitches tellin you lies


Data Preprocessing

In [5]:
dataset.head()

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...


In [6]:
dataset.tail()

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
24778,25291,3,0,2,1,1,you's a muthaf***in lie &#8220;@LifeAsKing: @2...
24779,25292,3,0,1,2,2,"you've gone and broke the wrong heart baby, an..."
24780,25294,3,0,3,0,1,young buck wanna eat!!.. dat nigguh like I ain...
24781,25295,6,0,6,0,1,youu got wild bitches tellin you lies
24782,25296,3,0,0,3,2,~~Ruffled | Ntac Eileen Dahlia - Beautiful col...


In [7]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24783 entries, 0 to 24782
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Unnamed: 0          24783 non-null  int64 
 1   count               24783 non-null  int64 
 2   hate_speech         24783 non-null  int64 
 3   offensive_language  24783 non-null  int64 
 4   neither             24783 non-null  int64 
 5   class               24783 non-null  int64 
 6   tweet               24783 non-null  object
dtypes: int64(6), object(1)
memory usage: 1.3+ MB


Exploratory Data Analysis

In [8]:
# check statistical measures
dataset.describe(include = 'all')

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
count,24783.0,24783.0,24783.0,24783.0,24783.0,24783.0,24783
unique,,,,,,,24783
top,,,,,,,!!! RT @mayasolovely: As a woman you shouldn't...
freq,,,,,,,1
mean,12681.192027,3.243473,0.280515,2.413711,0.549247,1.110277,
std,7299.553863,0.88306,0.631851,1.399459,1.113299,0.462089,
min,0.0,3.0,0.0,0.0,0.0,0.0,
25%,6372.5,3.0,0.0,2.0,0.0,1.0,
50%,12703.0,3.0,0.0,3.0,0.0,1.0,
75%,18995.5,3.0,0.0,3.0,0.0,1.0,


In [9]:
# check for null values
dataset.isnull().sum()

Unnamed: 0            0
count                 0
hate_speech           0
offensive_language    0
neither               0
class                 0
tweet                 0
dtype: int64

In [10]:
# check unique values
dataset.nunique()

Unnamed: 0            24783
count                     5
hate_speech               8
offensive_language       10
neither                  10
class                     3
tweet                 24783
dtype: int64

In [15]:
# Download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rutik\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rutik\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [19]:
# Preprocessing function
def preprocess_text(text):
    # Convert text to lowercase
    text = text.lower()

    # Remove special characters and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenization
    tokens = word_tokenize(text)

    # Remove stop words
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]

    # Stemming
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(word) for word in tokens]

    # Join tokens back into text
    preprocessed_text = ' '.join(tokens)

    return preprocessed_text

# Apply preprocessing to the dataset
dataset['preprocessed_text'] = dataset['tweet'].apply(preprocess_text)

In [20]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(dataset['preprocessed_text'], dataset['class'], test_size=0.2, random_state=42)

In [21]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(19826,)
(4957,)
(19826,)
(4957,)


In [24]:
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
import torch

# Load BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)

  from .autonotebook import tqdm as notebook_tqdm
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [26]:
# Convert text data to input format for BERT
inputs_train = tokenizer(X_train.tolist(), padding=True, truncation=True, return_tensors="pt")
inputs_test = tokenizer(X_test.tolist(), padding=True, truncation=True, return_tensors="pt")

In [27]:
inputs_train

{'input_ids': tensor([[  101, 19387,  4569,  ...,     0,     0,     0],
        [  101,  4569,  3490,  ...,     0,     0,     0],
        [  101, 19387, 16837,  ...,     0,     0,     0],
        ...,
        [  101, 22555,  5685,  ...,     0,     0,     0],
        [  101, 19387,  8437,  ...,     0,     0,     0],
        [  101,  7743,  7110,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}

In [28]:
inputs_test

{'input_ids': tensor([[  101,  2288,  3335,  ...,     0,     0,     0],
        [  101, 19387,  2332,  ...,     0,     0,     0],
        [  101, 19387, 19413,  ...,     0,     0,     0],
        ...,
        [  101,  7743,  2589,  ...,     0,     0,     0],
        [  101,  6638, 29461,  ...,     0,     0,     0],
        [  101,  2196, 12171,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}

In [29]:
# # Tokenize input texts
# tokenized_texts = [tokenizer.encode(text, max_length=512, truncation=True) for text in X_train]

# # Pad tokenized sequences
# max_len = max([len(text) for text in tokenized_texts])
# padded_texts = [text + [0]*(max_len-len(text)) for text in tokenized_texts]

# # Convert to tensors
# input_ids = torch.tensor(padded_texts)
# labels = torch.tensor(y_train_numerical)

# # Prepare optimizer and loss function
# optimizer = AdamW(model.parameters(), lr=5e-5)

# # Train the model
# model.train()
# optimizer.zero_grad()
# outputs = model(input_ids, labels=labels)
# loss = outputs.loss
# loss.backward()
# optimizer.step()

# # Evaluate the model
# model.eval()
# tokenized_texts_test = [tokenizer.encode(text, max_length=512, truncation=True) for text in X_test]
# padded_texts_test = [text + [0]*(max_len-len(text)) for text in tokenized_texts_test]
# input_ids_test = torch.tensor(padded_texts_test)
# labels_test = torch.tensor(y_test)

# with torch.no_grad():
#     outputs_test = model(input_ids_test)
#     predictions = torch.argmax(outputs_test.logits, dim=1)
#     accuracy = torch.sum(predictions == labels_test).item() / len(predictions)

# print("Accuracy:", accuracy)

In [30]:
# from sklearn.preprocessing import LabelEncoder
# import torch
# # Initialize LabelEncoder
# label_encoder = LabelEncoder()

# # Fit label encoder and transform labels
# y_train_numerical = label_encoder.fit_transform(y_train)

# # Convert to tensor
# labels = torch.tensor(y_train_numerical)

In [31]:
# Model selection and training
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report

# Define the model pipeline
model_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('svm', SVC(kernel='linear'))
])

# Train the model
model_pipeline.fit(X_train, y_train)

# Predict on the test set
y_pred = model_pipeline.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:\n", report)

Accuracy: 0.9023602985676821
Classification Report:
               precision    recall  f1-score   support

           0       0.54      0.20      0.29       290
           1       0.93      0.96      0.94      3832
           2       0.84      0.89      0.86       835

    accuracy                           0.90      4957
   macro avg       0.77      0.68      0.70      4957
weighted avg       0.89      0.90      0.89      4957



In [32]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# y_pred contains the predicted labels and y_test contains the true labels
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

Accuracy: 0.9023602985676821
Precision: 0.8887885737696777
Recall: 0.9023602985676821
F1-score: 0.8906022363899996


In [33]:
from sklearn.ensemble import RandomForestClassifier

# Define the model pipeline with Random Forest classifier
model_pipeline_rf = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('rf', RandomForestClassifier())
])

# Train the model
model_pipeline_rf.fit(X_train, y_train)

# Predict on the test set
y_pred_rf = model_pipeline_rf.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:\n", report)

Accuracy: 0.9023602985676821
Classification Report:
               precision    recall  f1-score   support

           0       0.54      0.20      0.29       290
           1       0.93      0.96      0.94      3832
           2       0.84      0.89      0.86       835

    accuracy                           0.90      4957
   macro avg       0.77      0.68      0.70      4957
weighted avg       0.89      0.90      0.89      4957



In [34]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Define the model pipeline with Logistic Regression
model_pipeline_lr = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('lr', LogisticRegression())
])

# Train the model
model_pipeline_lr.fit(X_train, y_train)

# Predict on the test set
y_pred_lr = model_pipeline_lr.predict(X_test)

# Evaluate the model
accuracy_lr = accuracy_score(y_test, y_pred_lr)
precision_lr = precision_score(y_test, y_pred_lr, average='weighted')
recall_lr = recall_score(y_test, y_pred_lr, average='weighted')
f1_score_lr = f1_score(y_test, y_pred_lr, average='weighted')
classification_report_lr = classification_report(y_test, y_pred_lr)

# Print evaluation metrics
print("Evaluation Metrics for Logistic Regression Model:")
print("Accuracy:", accuracy_lr)
print("Precision:", precision_lr)
print("Recall:", recall_lr)
print("F1 Score:", f1_score_lr)
print("\nClassification Report:")
print(classification_report_lr)

Evaluation Metrics for Logistic Regression Model:
Accuracy: 0.8930804922332055
Precision: 0.8797297876344393
Recall: 0.8930804922332055
F1 Score: 0.8816000202883858

Classification Report:
              precision    recall  f1-score   support

           0       0.55      0.21      0.31       290
           1       0.91      0.96      0.94      3832
           2       0.84      0.82      0.83       835

    accuracy                           0.89      4957
   macro avg       0.77      0.66      0.69      4957
weighted avg       0.88      0.89      0.88      4957



In [35]:
from joblib import dump

dump(model_pipeline_lr, 'model.pkl')

['model.pkl']

In [36]:
from google.colab import files

files.download('model.pkl')

ModuleNotFoundError: No module named 'google.colab'