In [None]:
# Importing libraries
import numpy as np
import pandas as pd

# Data visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Unzip file
import zipfile

# Text preprocessing related libraries
import re
import nltk

# Punctuation marks
import string

# Stop words
from nltk.corpus import stopwords
# Stemming and stop words.
# The stop-word corpus is not bundled with nltk. Fetch it on first use so the
# notebook survives "Restart Kernel -> Run All" on a fresh environment;
# once the corpus is present the download is skipped.
try:
    stopwords.words("english")
except LookupError:
    nltk.download('stopwords', quiet=True)

stemming = nltk.SnowballStemmer('english')
stopword = set(stopwords.words("english"))

In [None]:
# Load both datasets. The paths are relative, so the notebook must be run from
# the directory that contains the ./data folder.
imbalanced_data = pd.read_csv("./data/imbalanced_data.csv")
raw_data = pd.read_csv("./data/raw_data.csv")

In [None]:
# Display top & last 5 rows of imbalanced data 
imbalanced_data

In [None]:
# Display top & last 5 rows of raw data 
raw_data

In [None]:
# Remove unnecessary columns from both datasets.
# Rebinding the result (instead of inplace=True) together with errors='ignore'
# keeps this cell idempotent: re-running it after the columns are already gone
# no longer raises KeyError.
imbalanced_data = imbalanced_data.drop(columns=['id'], errors='ignore')
raw_data = raw_data.drop(columns=['Unnamed: 0'], errors='ignore')

# The work on the data is organised into five steps.

### 1. Understanding data
### 2. EDA
### 3. Data preprocessing
### 4. Model building, and evaluation
### 5. Model testing

## 1. Understanding Data

In [None]:
class DataExplain:
    """Print quick, human-readable summaries of a pandas DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        The frame to describe. It is stored by reference (not copied) as
        ``self.data``.

    All methods write to stdout and return ``None``.
    """

    def __init__(self, df):
        self.data = df

    def data_looks(self):
        """Print the first 5 rows, up to 5 random rows, and the last 5 rows."""
        print("\t\t\t How data does looks like")
        print("\t\t\t Top 5 rows of data: \n")
        print(self.data.head())
        print('-'*90, '\n')
        print("\t\t\t Random 5 rows of data: \n")
        # sample(5) raises ValueError on frames with fewer than 5 rows,
        # so cap the sample size at the number of rows available.
        print(self.data.sample(min(5, len(self.data))))
        print('-'*90, '\n')
        print("\t\t\t Last 5 rows of data: \n")
        print(self.data.tail())
        print('-'*90, '\n')

    def columns_and_types(self):
        """Print the shape, the column names and the per-column dtype summary."""
        print("\t\t\t Data shape (rows & columns): \n")
        print(self.data.shape)
        print('-'*45, '\n')

        print("\t\t\t Data columns name: \n")
        print(self.data.columns)
        print('-'*45, '\n')

        print("\t\t\t Data columns types: \n")
        # info() writes its report to stdout itself and returns None;
        # wrapping it in print() would emit a stray "None" line.
        self.data.info()
        print('-'*45, '\n')

    def missing_and_duplicate(self):
        """Print the percentage of missing values per column and the number of duplicated rows."""
        print("\t\t\t Data missing values (%): \n")
        # The mean of the boolean null-mask is the fraction of missing entries
        # per column; multiplying by 100 turns it into a percentage.
        print(self.data.isnull().mean() * 100)
        print('-'*45, '\n')

        print("\t\t\t Data duplicate values: \n")
        print(self.data.duplicated().sum())
        print('-'*45, '\n')


In [None]:
# class object of imbalanced_data
dataexplian = DataExplain(imbalanced_data)

In [None]:
# Display top, random, and last 5 rows of imbalanced_data
dataexplian.data_looks()

In [None]:
# Display shape, columns_name and columns types of imbalanced_data
dataexplian.columns_and_types()

In [None]:
# Display missing and duplicate values in imbalanced_data
dataexplian.missing_and_duplicate()

In [None]:
# class object of raw_data
dataexplian = DataExplain(raw_data)

In [None]:
# Display top, random, and last 5 rows of raw_data
dataexplian.data_looks()

In [None]:
# Display shape, columns_name and columns types of raw_data
dataexplian.columns_and_types()

In [None]:
# Display missing and duplicate values in raw_data
dataexplian.missing_and_duplicate()

## 2. EDA

In [None]:
# imbalanced_data label values
imbalanced_data['label'].value_counts()

In [None]:
# Bar chart of imbalanced_data column 'label'
imbalanced_data['label'].value_counts().plot(kind= 'bar')
plt.show()

In [None]:
# Pie chart: share of each 'label' value in imbalanced_data
label_counts = imbalanced_data['label'].value_counts()
labels = label_counts.index
plt.pie(label_counts, labels=labels, autopct="%.2f%%")
plt.show()

### Observation:

- label 0: no hate
- label 1: hate

In [None]:
# raw_data values
raw_cols = ['count', 'hate_speech', 'offensive_language', 'neither', 'class']
raw_data[raw_cols].value_counts()

In [None]:
# Drop columns which are not required.
# errors='ignore' plus rebinding makes the cell safe to re-run: a second run
# after the columns are gone is a no-op instead of a KeyError.
raw_data = raw_data.drop(
    columns=['count', 'hate_speech', 'offensive_language', 'neither'],
    errors='ignore',
)

In [None]:
raw_data.head()

In [None]:
# raw_data class unique values
raw_data['class'].unique()

In [None]:
# raw_data class values
raw_data['class'].value_counts()

In [None]:
# Plotting the countplot on raw_data 'class' columns
sns.countplot(data= raw_data, x='class')
plt.show()

In [None]:
# Pie chart: share of each 'class' value in raw_data
class_counts = raw_data['class'].value_counts()
labels = class_counts.index
plt.pie(class_counts, labels=labels, autopct="%.2f%%")
plt.show()

### Observation:

- class 0: hate
- class 1: abusive
- class 2: no hate

In [None]:
# NOTE: this line does NOT modify raw_data. It is a comparison (==), not an
# assignment: it selects the 'class' values of the rows where class == 0 and
# tests them against 1, so the cell only displays a boolean Series.
# The actual 0 -> 1 relabelling is done in a later cell with replace().
raw_data[raw_data['class']==0]['class']==1

In [None]:
raw_data.head()

In [None]:
raw_data['class'].unique()

In [None]:
# Check the value in class 0
raw_data[raw_data['class']==0]

In [None]:
# Replace class 0 (hate) with class 1 (abusive) so both share one label.
# Assign the result back instead of calling replace(inplace=True) on the
# selected column: that chained in-place form warns on pandas 2.x and does not
# update raw_data at all when Copy-on-Write is enabled.
# NOTE: run once and in order -- after the later "2 -> 0" cell has run,
# re-running this cell would turn the new 0 (no hate) rows into 1.
raw_data['class'] = raw_data['class'].replace({0: 1})

In [None]:
raw_data['class'].unique()

In [None]:
# Rename class 2 (no hate) to 0.
# Assign the result back rather than using replace(inplace=True) on the
# selected column, which warns on pandas 2.x and leaves raw_data unchanged
# when Copy-on-Write is enabled.
raw_data['class'] = raw_data['class'].replace({2: 0})

In [None]:
# Change the class name into label
raw_data.rename(columns= {'class': 'label'}, inplace=True)

In [None]:
# Merge both datasets into one frame.
# ignore_index=True gives the result a fresh 0..n-1 index. Without it each
# input keeps its own index, the labels repeat, and label-based lookups such
# as final_df['tweet'][1] return several rows instead of one.
final_df = pd.concat([imbalanced_data, raw_data], ignore_index=True)

In [None]:
final_df

In [None]:
# class object of final_df
dataexplian = DataExplain(final_df)

In [None]:
# Display top, random, and last 5 rows of final_df
dataexplian.data_looks()

In [None]:
# Display shape, columns_name and columns types of final_df
dataexplian.columns_and_types()

In [None]:
# Display missing and duplicate values in final_df
dataexplian.missing_and_duplicate()

In [None]:
# final_df label unique values
final_df['label'].unique()

In [None]:
# final_df label values
final_df['label'].value_counts()

In [None]:
# Bar chart of final_df column 'label'
final_df['label'].value_counts().plot(kind= 'bar')
plt.show()

In [None]:
# Pie chart: share of each 'label' value in the merged final_df
final_label_counts = final_df['label'].value_counts()
labels = final_label_counts.index
plt.pie(final_label_counts, labels=labels, autopct="%.2f%%")
plt.show()

## 3. Data preprocessing

### Basic Cleanup
- Remove HTML tags
- Convert Emoji
- Spell Checking
    - Fast-typing errors
    - Fat-finger typos

### Basic Text Preparation
- Fundamental Basic
    - Number of Characters
    - Word Tokenization
    - Sentence Tokenization

- Optional
    - Lower Case
    - Remove Special Characters, Stop Words, Punctuation, and Digits
    - Stemming, Lemmatization
    - Language Detection

In [None]:
# Character count of every raw tweet (punctuation, digits and special characters included)
final_df['num_characters'] = final_df['tweet'].map(len)

In [None]:
# Clean the text data
def data_clean(words):
    """Normalise one tweet into a space-separated string of stemmed tokens.

    Steps, in order:
      1. cast to ``str`` and lower-case;
      2. remove URLs, HTML tags and ``[...]`` spans while their delimiters
         are still present in the text;
      3. delete every character that is not a letter, digit or whitespace;
      4. delete every token that contains a digit;
      5. drop English stop words and stem the remaining tokens.

    Parameters
    ----------
    words : Any
        Raw tweet text; non-string values are converted with ``str``.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces (may be empty).

    Notes
    -----
    Uses the notebook-level ``stopword`` set and ``stemming`` stemmer.
    """
    words = str(words).lower()
    # Structured patterns must run first: stripping punctuation before this
    # point destroys the '://', '<...>' and '[...]' markers they rely on, so
    # URLs and tags would survive as fused tokens such as 'httpstcoabc'.
    words = re.sub(r'https?://\S+|www\.\S+', ' ', words)
    # Require a tag name after '<' so text like "<3 ... >" is not swallowed.
    words = re.sub(r'</?[a-z][^<>]*>', ' ', words)
    words = re.sub(r'\[.*?\]', ' ', words)
    # The text is already lower-cased; this also removes all punctuation.
    words = re.sub(r'[^a-z0-9\s]', '', words)
    words = re.sub(r'\w*\d\w*', '', words)
    # split() without an argument splits on any whitespace run (newlines
    # included) and never yields empty tokens, so words on adjacent lines are
    # not fused and the result has no doubled spaces.
    tokens = [stemming.stem(word) for word in words.split() if word not in stopword]
    return " ".join(tokens)

In [None]:
# Show the raw tweet in the second row. Positional access returns exactly one
# string even when index labels repeat after concatenation.
final_df['tweet'].iloc[1]

In [None]:
data_clean(" #model   i love u take with u all the time in urÃ°ÂŸÂ“Â±!!! Ã°ÂŸÂ˜Â™Ã°ÂŸÂ˜ÂŽÃ°ÂŸÂ‘Â„Ã°ÂŸÂ‘Â…Ã°ÂŸÂ’Â¦Ã°ÂŸÂ’Â¦Ã°ÂŸÂ’Â¦  ")

In [None]:
# Apply data_clean function on tweet data
final_df['tweet_transformed'] = final_df['tweet'].apply(data_clean)

In [None]:
# Display the whole dataset
final_df

In [None]:
# Make wordCloud
from wordcloud import WordCloud
wc = WordCloud(width = 700, height = 700, min_font_size = 10, background_color = 'white')

In [None]:
# Generate wordcloud on hate tweets.
# generate() returns the shared `wc` object itself, so keep a rendered snapshot
# (an image array, which plt.imshow accepts) that a later wc.generate() call
# cannot overwrite.
hate_wc = wc.generate(final_df[final_df['label'] == 1]['tweet_transformed'].str.cat(sep= " ")).to_array()

In [None]:
# Graph the word cloud of hate tweets
# NOTE(review): plt.style.use() is called after the figure is created and is
# never reset; presumably it also restyles every later plot in the notebook --
# confirm this is intended.
plt.figure(figsize=(25,12))
plt.imshow(hate_wc)
plt.style.use('classic')
plt.axis('off')
plt.show()

In [None]:
# Generate wordcloud on no-hate tweets.
# generate() returns the shared `wc` object itself; store a rendered snapshot
# (an image array, which plt.imshow accepts) so this result does not alias any
# other cloud produced from the same `wc`.
no_hate_wc = wc.generate(final_df[final_df['label'] == 0]['tweet_transformed'].str.cat(sep= " ")).to_array()

In [None]:
# Graph the word cloud of no-hate tweets
plt.figure(figsize=(25,12))
plt.imshow(no_hate_wc)
plt.style.use('classic')
plt.axis('off')
plt.show()

In [None]:
# Top occurring words in hate tweets: flatten every cleaned hate tweet
# into one list of word tokens
hate_corpus = [
    word
    for tweet in final_df[final_df['label'] == 1]['tweet_transformed']
    for word in tweet.split()
]

In [None]:
# Length of hate tweets words
len(hate_corpus)

In [None]:
from collections import Counter

# 50 most frequent tokens in hate tweets: column 0 holds the token,
# column 1 its count
top_hate = pd.DataFrame(Counter(hate_corpus).most_common(50))
a, b = top_hate[0], top_hate[1]

sns.barplot(x=a, y=b)
plt.xticks(rotation= 'vertical')
plt.show()

In [None]:
# Top occurring words in no-hate tweets: flatten every cleaned no-hate tweet
# into one list of word tokens
no_hate_corpus = [
    word
    for tweet in final_df[final_df['label'] == 0]['tweet_transformed']
    for word in tweet.split()
]

In [None]:
# Length of no-hate tweets words
len(no_hate_corpus)

In [None]:
# 50 most frequent tokens in no-hate tweets: column 0 holds the token,
# column 1 its count
top_no_hate = pd.DataFrame(Counter(no_hate_corpus).most_common(50))
c, d = top_no_hate[0], top_no_hate[1]

sns.barplot(x=c, y=d)
plt.xticks(rotation= 'vertical')
plt.show()

In [None]:
# Split data into X,y features
X = final_df['tweet_transformed']
y = final_df['label']

In [None]:
# Split arrays or matrices into random train and test subsets.
from sklearn.model_selection import train_test_split

In [None]:
# Split data into train and test.
# The labels are imbalanced, so stratify on y to keep the same class ratio in
# both subsets; random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
print(len(X_train), len(y_train))
print(len(X_test), len(y_test))

## 4. Model building, and evaluation

In [None]:
# Import keras library
import keras

# Tokenization with keras
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences

# Metrics measure
from sklearn.metrics import confusion_matrix

In [None]:
# Creating a Sequential model
from keras.models import Sequential
from keras.layers import LSTM, Dense, Embedding, SpatialDropout1D
from keras.optimizers import RMSprop

In [None]:
# Perform Tokenization
max_words = 50000
max_len = 300

tokenizer = Tokenizer(num_words= max_words)
tokenizer.fit_on_texts(X_train)

sequences = tokenizer.texts_to_sequences(X_train)
sequences_matrix = pad_sequences(sequences, maxlen= max_len)

In [None]:
# Creating model architecture: an embedding layer, spatial dropout, two
# stacked LSTM layers and a single sigmoid output unit for the binary label.
model = Sequential()
# The embedding is sized by the tokenizer's vocabulary cap (max_words) and the
# padded sequence length (max_len) defined in the tokenization cell.
model.add(Embedding(max_words, 100, input_length= max_len))
model.add(SpatialDropout1D(0.2))
# return_sequences=True so the second LSTM receives the whole sequence.
model.add(LSTM(50, dropout= 0.2, recurrent_dropout= 0.2, return_sequences=True))
model.add(LSTM(50, dropout=0.1, recurrent_dropout=0.1))
model.add(Dense(1, activation='sigmoid'))
# Binary cross-entropy matches the single sigmoid output; accuracy is tracked as the metric.
model.compile(loss= 'binary_crossentropy', optimizer= RMSprop(), metrics= ['accuracy'])

In [None]:
# Detailed model summary
model.summary()

In [None]:
# Train model
history = model.fit(sequences_matrix, y_train, batch_size=64, epochs=2, verbose=1, validation_split=0.2)

In [None]:
# Test 
test_sequence = tokenizer.texts_to_sequences(X_test)
test_sequence_matrix = pad_sequences(test_sequence, maxlen= max_len)

In [None]:
# Model evaluation
model_acc = model.evaluate(test_sequence_matrix, y_test)

In [None]:
model_acc

In [None]:
# Prediction
y_pred = model.predict(test_sequence_matrix)

In [None]:
# Turn each sigmoid score into a class label: below 0.5 -> 0, otherwise -> 1
res = [0 if prediction[0] < 0.5 else 1 for prediction in y_pred]

In [None]:
print(confusion_matrix(y_test, res))

In [None]:
plt.plot(history.history['loss'], label='loss')
plt.plot(history.history['val_loss'], label='val_loss')
plt.legend()
plt.show()

In [None]:
plt.plot(history.history['accuracy'], label='accuracy')
plt.plot(history.history['val_accuracy'], label='val_accuracy')
plt.legend()
plt.show()

In [None]:
# Save tokenizer and also save the model if you want
import pickle
with open('./data/tokenizer.pickle', 'wb') as f:
    pickle.dump(tokenizer, f, protocol= pickle.HIGHEST_PROTOCOL)

In [None]:
# Save model
model.save('./data/model.h5')

## 5. Model Testing

In [None]:
# Reload the model and the tokenizer that the earlier cells saved under ./data.
# NOTE(security): pickle.load can execute arbitrary code while loading; only
# unpickle a tokenizer file that this notebook wrote itself.
load_clf = keras.models.load_model('./data/model.h5')
with open('./data/tokenizer.pickle', 'rb') as f:
    load_token = pickle.load(f)

In [None]:
user_txt = "!!!!&#8220;@selfiequeenbri: cause I'm tired of you big bitches coming for us skinny girls!!&#8221;"
user_txt

In [None]:
user_test = [data_clean(user_txt)]
print(user_test)

In [None]:
# Encode the cleaned text with the reloaded tokenizer and pad it to the same
# length the model was trained on (max_len), instead of repeating the literal
# 300, so the two cannot drift apart.
seq = load_token.texts_to_sequences(user_test)
padded = pad_sequences(seq, maxlen= max_len)

In [None]:
seq

In [None]:
pred = load_clf.predict(padded)

In [None]:
pred

In [None]:
# The prediction holds one row per input text and one sigmoid score per row.
# Read the scalar explicitly instead of relying on the truth value of an
# array, which raises ValueError as soon as more than one text is predicted.
score = pred[0][0]
if score < 0.5:
    print("No Hate")
else:
    print("Hate and Abusive")