In [18]:
# Importing the necessary libraries for data handling and text processing

import pandas as pd
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize


In [19]:

import ssl

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context
# Ensure NLTK resources are downloaded
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/roopasreesubramanyam/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/roopasreesubramanyam/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [20]:
# Load the dataset

df = pd.read_csv('/Users/roopasreesubramanyam/Desktop/msba265-finalstorage/data_storage/CyberBullying.csv')


In [21]:
# Check the first few rows to get a sense of the data structure

df = df.iloc[1:]
# Rename columns to make sure 'text' and 'label' are properly identified
df.columns = [f'col_{i}' if 'Unnamed' in col else col for i, col in enumerate(df.columns)]
print(df.head())

  Text-based data (Cyberbullying)  \
1                               1   
2                               2   
3                               3   
4                               4   
5                               5   

                                               col_1 col_2    col_3  \
1                           u0 lmao wow fuck you too   😂 😂  YouTube   
2  a white dress and red lipstick make everything...   NaN      NaN   
3  this has been a trend since <number> of course...   NaN  YouTube   
4  <user> <user> babies in cages destroying envir...   NaN  YouTube   
5  <user> more good neighbours yes that working o...   NaN  YouTube   

            col_4 col_5  col_6  col_7  col_8  col_9  ...  col_16  col_17  \
1  neutral/normal     0    NaN    NaN    NaN    NaN  ...     NaN     NaN   
2  neutral/normal     0    NaN    NaN    NaN    NaN  ...     NaN     NaN   
3  neutral/normal     0    NaN    NaN    NaN    NaN  ...     NaN     NaN   
4         neutral     0    NaN    NaN    NaN  

In [22]:
# Select only the columns for text and label (replace 'col_1' and 'col_5' with the actual names if needed)
cleaned_dataset = df[['col_1', 'col_5']]
cleaned_dataset.columns = ['text', 'label']

# Drop rows with missing data in 'text' or 'label'
cleaned_dataset.dropna(subset=['text', 'label'], inplace=True)
cleaned_dataset.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_dataset.dropna(subset=['text', 'label'], inplace=True)


Unnamed: 0,text,label
1,u0 lmao wow fuck you too,0
2,a white dress and red lipstick make everything...,0
3,this has been a trend since <number> of course...,0
4,<user> <user> babies in cages destroying envir...,0
5,<user> more good neighbours yes that working o...,0


In [23]:
# Check for null values after dropping
print("Null values in each column after dropping:")
print(cleaned_dataset.isnull().sum())

# Display a preview of the cleaned dataset
print("Cleaned dataset preview:")
cleaned_dataset.head()

Null values in each column after dropping:
text     0
label    0
dtype: int64
Cleaned dataset preview:


Unnamed: 0,text,label
1,u0 lmao wow fuck you too,0
2,a white dress and red lipstick make everything...,0
3,this has been a trend since <number> of course...,0
4,<user> <user> babies in cages destroying envir...,0
5,<user> more good neighbours yes that working o...,0


In [24]:
# Function to preprocess text
def preprocess_text(text):
    # Convert text to lowercase
    text = text.lower()
    # Remove words like <xyz> using regex
    text = re.sub(r'<[^>]+>', '', text)
    # Tokenize the text
    tokens = word_tokenize(text)
    # Remove punctuation and stopwords, and filter out numbers
    stop_words = set(stopwords.words('english'))
    clean_tokens = [token for token in tokens if token.isalpha() and token not in stop_words]
    # Join the tokens back into a clean string
    return ' '.join(clean_tokens)

# Apply preprocessing to the 'text' column
cleaned_dataset['processed_text'] = cleaned_dataset['text'].apply(preprocess_text)

# Display a preview of the cleaned dataset
print(cleaned_dataset[['text', 'processed_text']].head())


                                                text  \
1                           u0 lmao wow fuck you too   
2  a white dress and red lipstick make everything...   
3  this has been a trend since <number> of course...   
4  <user> <user> babies in cages destroying envir...   
5  <user> more good neighbours yes that working o...   

                                      processed_text  
1                                      lmao wow fuck  
2    white dress red lipstick make everything better  
3  trend since course wall street assumed eternal...  
4  babies cages destroying environment rolling ba...  
5  good neighbours yes working well crime skyrock...  


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_dataset['processed_text'] = cleaned_dataset['text'].apply(preprocess_text)


In [25]:
# Ensure that 'label' is a number and drop any rows where it isn't
cleaned_dataset = cleaned_dataset[pd.to_numeric(cleaned_dataset['label'], errors='coerce').notnull()]
cleaned_dataset['label'] = cleaned_dataset['label'].astype(int)
cleaned_dataset.head()

Unnamed: 0,text,label,processed_text
1,u0 lmao wow fuck you too,0,lmao wow fuck
2,a white dress and red lipstick make everything...,0,white dress red lipstick make everything better
3,this has been a trend since <number> of course...,0,trend since course wall street assumed eternal...
4,<user> <user> babies in cages destroying envir...,0,babies cages destroying environment rolling ba...
5,<user> more good neighbours yes that working o...,0,good neighbours yes working well crime skyrock...
