In [None]:
# Importing the necessary libraries for data handling and text processing

import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

In [None]:
# Read the raw cyberbullying CSV into a DataFrame.
# NOTE(review): this is an absolute path on one machine; the notebook will not
# run elsewhere until it is pointed at a local copy of the file.
# NOTE(review): the preview output below shows a title row above the real
# header, so the true column names ('No', 'Text', ...) arrive as data row 0.
csv_path = '/Users/roopasreesubramanyam/Desktop/msba265-finalstorage/data_storage/CyberBullying.csv'
df = pd.read_csv(csv_path)


In [None]:
# Peek at the first rows and at the column labels to understand the raw layout

for preview in (df.head(), df.columns):
    print(preview)

  Text-based data (Cyberbullying)  \
0                              No   
1                               1   
2                               2   
3                               3   
4                               4   

                                          Unnamed: 1 Unnamed: 2    Unnamed: 3  \
0                                               Text     Emojis  Social Media   
1                           u0 lmao wow fuck you too        😂 😂       YouTube   
2  a white dress and red lipstick make everything...        NaN           NaN   
3  this has been a trend since <number> of course...        NaN       YouTube   
4  <user> <user> babies in cages destroying envir...        NaN       YouTube   

       Unnamed: 4 Unnamed: 5  Unnamed: 6  Unnamed: 7  Unnamed: 8  Unnamed: 9  \
0            Type      Label         NaN         NaN         NaN         NaN   
1  neutral/normal          0         NaN         NaN         NaN         NaN   
2  neutral/normal          0         NaN         N

In [None]:
# Give the first six columns readable names (assigned by position) and keep
# generic 'Unnamed: i' names for whatever columns follow them.
named_columns = ['No', 'Text', 'emoji', 'social media', 'type', 'label']
leftover_columns = [f'Unnamed: {i}' for i in range(6, len(df.columns))]
df.columns = named_columns + leftover_columns

In [10]:
# Remove the 'emoji' column, which is not used in this analysis
df = df.drop('emoji', axis=1)

In [None]:

# Keep only the columns we care about for this analysis.
# .copy() makes the result an independent frame, so the column assignments in
# the following cells (filling 'Text', adding 'processed_text') do not write
# into a slice of the previous frame and trigger SettingWithCopyWarning.
df = df[['No', 'Text', 'social media', 'type', 'label']].copy()

In [None]:
# Missing text becomes an empty string so the text-processing step that
# follows never receives a NaN value.
text_column = df['Text']
df['Text'] = text_column.fillna('')

In [None]:
# Build a set of NLTK's English stop words for fast membership tests.
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to be
# downloaded already; no download step is visible in this notebook.
english_stopwords = stopwords.words('english')
stop_words = set(english_stopwords)

In [15]:
def preprocess_text(text):
    """Clean one piece of raw text so it is ready for vectorization.

    The text is lowercased and tokenized with NLTK's ``word_tokenize``. Tokens
    that are not purely alphanumeric (punctuation, symbols) or that appear in
    the notebook-level ``stop_words`` set are discarded.

    Parameters
    ----------
    text : str
        Raw text. Missing values (``None``/NaN) are treated as empty text and
        any other non-string value is converted with ``str`` first.

    Returns
    -------
    str
        The surviving tokens joined by single spaces; an empty string when no
        token survives.
    """
    if not isinstance(text, str):
        # A missing or numeric cell has no .lower(); normalise it to a string
        # instead of letting the call below raise AttributeError.
        is_missing = pd.api.types.is_scalar(text) and pd.isna(text)
        text = '' if is_missing else str(text)
    # Tokenize the text (split it into words) after making everything lowercase
    tokens = word_tokenize(text.lower())
    # Drop punctuation and stop words (common words like 'the', 'is')
    kept = [word for word in tokens if word.isalnum() and word not in stop_words]
    # Join the remaining tokens back into a single string
    return ' '.join(kept)

In [16]:
# Run every entry of 'Text' through preprocess_text and store the cleaned
# result in a new 'processed_text' column
raw_text = df['Text']
df['processed_text'] = raw_text.map(preprocess_text)

In [17]:
# Show the raw text, the cleaned text and the label side by side as a sanity check
preview_columns = ['Text', 'processed_text', 'label']
print(df[preview_columns].head())

                                                Text  \
0                                               Text   
1                           u0 lmao wow fuck you too   
2  a white dress and red lipstick make everything...   
3  this has been a trend since <number> of course...   
4  <user> <user> babies in cages destroying envir...   

                                      processed_text  label  
0                                               text  Label  
1                                   u0 lmao wow fuck      0  
2    white dress red lipstick make everything better      0  
3  trend since number course wall street assumed ...      0  
4  user user babies cages destroying environment ...      0  


In [18]:
# Ensure that 'label' is a number and drop any rows where it isn't.
# The numeric parse is computed once and reused for both the filter and the
# cast, so a value the parser accepts (e.g. the string '1.0') cannot pass the
# filter and then crash the integer conversion.
label_numeric = pd.to_numeric(df['label'], errors='coerce')
is_numeric = label_numeric.notna()
# .copy() gives an independent frame; assigning 'label' on a bare boolean
# slice is what triggers pandas' SettingWithCopyWarning.
df = df.loc[is_numeric].copy()
# astype(int) truncates any fractional part of the parsed value.
df['label'] = label_numeric.loc[is_numeric].astype(int)

In [19]:
# Turn the cleaned text into a sparse TF-IDF matrix built from unigrams and
# bigrams, capped at 5000 features.
# NOTE(review): the vectorizer is fitted on every row here, before the
# train/test split performed in a later cell, so test rows contribute to the
# vocabulary and IDF weights.
corpus = df['processed_text']
tfidf_vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=5000,
)
X = tfidf_vectorizer.fit_transform(corpus)

In [None]:
# Report (rows, features) of the TF-IDF matrix to confirm the feature count
tfidf_shape = X.shape
print("TF-IDF matrix shape:", tfidf_shape)

TF-IDF matrix shape: (8499, 5000)


In [21]:
# The 'label' column is the prediction target
y = df.loc[:, 'label']

In [22]:
# Hold out 30% of the rows for testing; the fixed random_state makes the
# split repeatable
split_parts = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts
print("Training set size:", X_train.shape, "Test set size:", X_test.shape)

Training set size: (5949, 5000) Test set size: (2550, 5000)


In [23]:
# Two resamplers, both seeded for repeatability: SMOTE synthesises extra
# examples for under-represented classes, RandomUnderSampler trims
# over-represented ones
smote, under_sampler = SMOTE(random_state=42), RandomUnderSampler(random_state=42)

In [24]:
# Chain the two resamplers: oversampling runs first, undersampling second.
# NOTE(review): with default settings SMOTE presumably already equalises the
# class counts, which would leave the under-sampler nothing to remove — confirm.
resampling_steps = [
    ('smote', smote),
    ('under_sampler', under_sampler),
]
pipeline = Pipeline(resampling_steps)

In [25]:
# Balance the classes in the training split only; the test split is left as it is
resampled = pipeline.fit_resample(X_train, y_train)
X_train_res, y_train_res = resampled

In [26]:
# Report the feature-matrix and label shapes after resampling
features_shape = X_train_res.shape
labels_shape = y_train_res.shape
print("Resampled training set size:", features_shape, "Resampled labels size:", labels_shape)

Resampled training set size: (9426, 5000) Resampled labels size: (9426,)
