#### Import required libraries

In [1]:
import pandas as pd
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from textblob import TextBlob
import textstat
import re
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

#### Data Collection

In [2]:
data = pd.read_csv('../data/train.csv')

In [3]:
data.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [4]:
new_data = data.copy()

#### Drop id column

In [5]:
new_data.drop(columns=['id'], axis=1, inplace=True)

#### Data Cleaning

In [6]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [7]:
important_stopwords = [
    "I", "you", "he", "she", "we", "they",
    "not", "no", "never", "can't", "won't", 
    "don't", "didn't", "hasn't", "haven't", "isn't", "aren't", "wasn't", "weren't",
    "this", "that", "these", "those",
    "on", "in", "at", "for", "with",
    "and", "or", "but",
    "is", "are", "was", "were",
    "very", "too"
]

custom_stopwords = set(word for word in stop_words if word not in important_stopwords)

In [8]:
def clean_data(text):
    text = text.lower()  
    text = re.sub('[^a-zA-Z0-9/s]', ' ', text)
    words = word_tokenize(text)
    words = [lemmatizer.lemmatize(word) for word in words if word not in custom_stopwords]
    return ' '.join(words)

In [9]:
new_data['clean_text'] = new_data['comment_text'].apply(clean_data)

#### Check for empty strings and drop if any

In [10]:
new_data[new_data['clean_text'] == '']

Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,clean_text
64304,WHAT DID I D!!!!!!!!!!!!!!!!!!!!!O,0,0,0,0,0,0,
140477,what should i do? \n\n?????,0,0,0,0,0,0,


In [11]:
new_data = new_data[new_data['clean_text'] != '']

#### Calculate text length

In [12]:
new_data['text_length'] = new_data['comment_text'].apply(len)

#### Remove long texts

In [13]:
max_length = 500
new_data = new_data[new_data['text_length'] <= max_length]

In [14]:
data_loss_pct = (1-(new_data.shape[0]/data.shape[0]))*100
data_loss = data.shape[0] - new_data.shape[0]

print('Shape of new data:', new_data.shape)
print('Number of rows removed:', data_loss)
print('Percentage of data removed:', data_loss_pct)

Shape of new data: (125624, 9)
Number of rows removed: 33947
Percentage of data removed: 21.273915686434254


#### Calculate Sentiment Analysis features

In [15]:
new_data['sentiment_polarity'] = new_data['comment_text'].apply(
    lambda x: TextBlob(x).sentiment.polarity)

In [16]:
new_data['sentiment_subjectivity'] = new_data['comment_text'].apply(
    lambda x: TextBlob(x).sentiment.subjectivity)

#### Calculate Readability metrics 

In [17]:
new_data['flesch_kincaid_grade'] = new_data['comment_text'].apply(
    lambda x: textstat.flesch_kincaid_grade(x))

In [18]:
new_data['gunning_fog'] = new_data['comment_text'].apply(
    lambda x: textstat.gunning_fog(x))

#### Split data for training and testing

In [19]:
X = new_data.drop(columns=['toxic','severe_toxic','obscene','threat','insult','identity_hate'], axis=1)
y = new_data[['toxic','severe_toxic','obscene','threat','insult','identity_hate']]

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=23)

In [21]:
print('X_train:', X_train.shape)
print('y_train:', y_train.shape)
print('X_test:', X_test.shape)
print('y_test:', y_test.shape)

X_train: (100499, 7)
y_train: (100499, 6)
X_test: (25125, 7)
y_test: (25125, 6)


#### Standardize columns

In [22]:
scaler = StandardScaler()
selected_columns = ['text_length','sentiment_polarity','sentiment_subjectivity','flesch_kincaid_grade',
       'gunning_fog']

In [23]:
X_train[selected_columns] = scaler.fit_transform(X_train[selected_columns])
X_test[selected_columns] = scaler.transform(X_test[selected_columns])

#### Save new data

In [24]:
X_train.to_csv('../data/processed/X_train.csv', index=False)
y_train.to_csv('../data/processed/y_train.csv', index=False)
X_test.to_csv('../data/processed/X_test.csv', index=False)
y_test.to_csv('../data/processed/y_test.csv', index=False)