In [None]:
# Ignore  the warnings
import warnings
warnings.filterwarnings('always')
warnings.filterwarnings('ignore')

# data visualisation and manipulation
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import style
import seaborn as sns
import nltk

#preprocessing
from nltk.corpus import stopwords  #stopwords
from nltk import word_tokenize,sent_tokenize # tokenizing
from nltk.tokenize import TweetTokenizer
from nltk.stem import WordNetLemmatizer  # lemmatizer from WordNet

from nltk import pos_tag

import re # regex

#model_selection
from sklearn.model_selection import train_test_split,cross_validate
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

#evaluation
from sklearn.metrics import accuracy_score,roc_auc_score 
from sklearn.metrics import classification_report
from mlxtend.plotting import plot_confusion_matrix

#preprocessing scikit
from sklearn.preprocessing import MinMaxScaler,StandardScaler,LabelEncoder
from sklearn.impute import SimpleImputer

#classification
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC,SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier,AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB,MultinomialNB

import nltk
# Fetch the NLTK data packages this notebook relies on.
for resource in ('stopwords', 'punkt', 'wordnet', 'omw-1.4'):
    nltk.download(resource)

# English stop-words, held in a set for fast membership checks
stop_words = set(stopwords.words('english'))

#keras
import keras
from keras.preprocessing.text import one_hot,Tokenizer
from keras_preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense , Flatten ,Embedding,Input,CuDNNLSTM,LSTM
from keras.models import Model
from keras.preprocessing.text import text_to_word_sequence

#gensim w2v
#word2vec
from gensim.models import Word2Vec

In [None]:
# Mount Google Drive at /content/drive (relies on the google.colab package,
# so this cell only runs inside Google Colab).
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Switch the working directory to the Drive root; relative file paths in later cells resolve against it.
%cd /content/drive/My Drive/

In [None]:
# Load the tweets CSV from the current working directory and preview the first rows.
df = pd.read_csv('cyberbullying_tweets.csv')
df.head()

In [None]:
# Shorter column names: tweet_text -> tweet, cyberbullying_type -> label
column_aliases = {"tweet_text": "tweet", "cyberbullying_type": "label"}
df = df.rename(columns=column_aliases)

In [None]:
# Donut chart of the share of tweets per label, with the total count in the centre.
sorted_counts = df['label'].value_counts()

fig, ax = plt.subplots(figsize = (7,7))
ax.pie(
    sorted_counts,
    labels = sorted_counts.index,
    startangle = 90,
    counterclock = False,
    wedgeprops = {'width' : 0.6},
    autopct='%1.1f%%',
    pctdistance = 0.7,
    textprops = {'color': 'black', 'fontsize' : 15},
    colors = sns.color_palette("pastel")[4:],
)
ax.text(x = -0.40, y = 0, s = 'Total number of Tweets:')
ax.text(x = -0.15, y = -0.1, s = format(df.shape[0]))
ax.set_title('Pie chart for : Tweets in the Dataset', fontsize = 16);

In [None]:
# Missing-value count for each of the two columns, then the frame's shape.
for column in ('tweet', 'label'):
    print(df[column].isnull().sum())
print(df.shape)

In [None]:
# Print the first five raw tweets, each followed by blank lines.
for tweet in df['tweet'].head(5):
    print(tweet + 2 * '\n')

In [None]:
# List the distinct values present in the label column.
print(df['label'].unique())

In [None]:
def rename_label(label):
    """Collapse a cyberbullying category into a binary target.

    Parameters
    ----------
    label : str or int
        Raw category name from the ``label`` column, or a value that has
        already been encoded as 0/1.

    Returns
    -------
    int
        0 for ``'not_cyberbullying'`` (or an already-encoded 0), 1 otherwise.
    """
    # Pass already-encoded values through unchanged. Without this, applying the
    # function a second time (e.g. re-running the encoding cell) would turn every
    # 0 into 1, because 0 != 'not_cyberbullying', and erase the negative class.
    if not isinstance(label, str) and label in (0, 1):
        return int(label)
    return 0 if label == 'not_cyberbullying' else 1

In [None]:
# Swap the category names for the binary target and show the resulting distinct values.
df['label'] = df['label'].map(rename_label)
print(df['label'].unique())

In [None]:
# Same donut chart as before, now drawn from whatever values the label column currently holds.
sorted_counts = df['label'].value_counts()

donut_style = dict(
    startangle = 90,
    counterclock = False,
    wedgeprops = {'width' : 0.6},
    autopct='%1.1f%%',
    pctdistance = 0.7,
    textprops = {'color': 'black', 'fontsize' : 15},
    colors = sns.color_palette("pastel")[4:],
)

fig, ax = plt.subplots(figsize = (7,7))
ax.pie(sorted_counts, labels = sorted_counts.index, **donut_style)
ax.text(x = -0.40, y = 0, s = 'Total number of Tweets:')
ax.text(x = -0.15, y = -0.1, s = format(df.shape[0]))
ax.set_title('Pie chart for : Tweets in the Dataset', fontsize = 16);

In [None]:
# Number of rows per label value.
df['label'].value_counts()

In [None]:
# The lemmatizer and the stop-word set are built once, on first use, and then reused.
_CLEANUP_CACHE = {}


def _cleanup_tools():
    """Return the shared ``(lemmatizer, stop_words)`` pair, creating it on the first call."""
    if not _CLEANUP_CACHE:
        _CLEANUP_CACHE["lemmatizer"] = WordNetLemmatizer()
        _CLEANUP_CACHE["stop_words"] = frozenset(stopwords.words("english"))
    return _CLEANUP_CACHE["lemmatizer"], _CLEANUP_CACHE["stop_words"]


def cleanup(tweet):
    """Normalise a raw tweet into a space-separated string of lemmatized tokens.

    Steps, in order: drop anything starting with ``http`` up to the next
    whitespace, replace every non-letter with a space, lower-case, split on
    whitespace, discard English stop words, lemmatize the remaining tokens.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    str
        Cleaned tokens joined by single spaces ("" when nothing survives).

    Raises
    ------
    TypeError
        If ``tweet`` is not a string (raised by ``re.sub``).
    """
    tweet = re.sub(r'http\S+', '', tweet)
    tweet = re.sub("[^a-zA-Z]", " ", tweet)
    word_tokens = tweet.lower().split()

    # Reuse the cached tools: this function is applied to every row, and
    # rebuilding the lemmatizer / reloading the stop-word list per call is wasted work.
    lemmatizer, stop_words = _cleanup_tools()
    return " ".join(lemmatizer.lemmatize(w) for w in word_tokens if w not in stop_words)

In [None]:
# Shuffle the rows with a fixed seed. Without the seed, the later
# train_test_split(random_state=42) would still produce a different split on every
# run, because its result depends on the order of the rows it is given.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
print(df.shape)

In [None]:
# Build the Word2Vec training corpus: each cleaned tweet is split into sentences
# by the punkt tokenizer and every sentence is stored as a list of tokens.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
sentences=[]

# Character totals before / after cleaning. The second counter is named
# `sumaftercleaning` so that the built-in `sum` is not overwritten.
sumbeforecleaning=0
sumaftercleaning=0
for tweet in df['tweet']:
    sumbeforecleaning += len(tweet)
    cleaned_sents = cleanup(tweet.strip())
    sumaftercleaning += len(cleaned_sents)
    sentences.extend(sent.split() for sent in tokenizer.tokenize(cleaned_sents))

print(sumbeforecleaning)
print(sumaftercleaning)
print(len(sentences))

In [None]:
  # trying to print few sentences
for token_list in sentences[:5]:
    # one token list per entry, followed by a blank line
    print(token_list, "\n")

In [None]:
import gensim
# Fit 300-dimensional word vectors on the tokenised tweets (window of 10 tokens;
# min_count=1 keeps words that occur only once).
# NOTE(review): `size=` is the keyword used by gensim releases before 4.0; newer
# releases call it `vector_size` — confirm the installed version before upgrading.
w2v_model=gensim.models.Word2Vec(sentences=sentences,size=300,window=10,min_count=1)

In [None]:
# Train for 10 epochs over the same sentences.
# NOTE(review): the model was constructed with `sentences=...`; per the gensim docs
# that already trains it, so this looks like additional training — confirm intended.
w2v_model.train(sentences,epochs=10,total_examples=len(sentences))

In [None]:
# Total number of words in the Word2Vec vocabulary.
# NOTE(review): `wv.vocab` is the pre-4.0 gensim attribute — confirm the installed version.
vocab=w2v_model.wv.vocab
print("The total number of words are : ",len(vocab))

In [None]:
# Vocabulary as a plain list of words. It is read straight from the model so the
# cell can be re-run safely: calling .keys() on the already-converted list would
# raise AttributeError.
vocab=list(w2v_model.wv.vocab.keys())
print(len(vocab))

In [None]:
# peek at the first few vocabulary entries
for entry in vocab[:5]:
    print(entry, "\n")

In [None]:
# Look-up table from each vocabulary word to the vector the Word2Vec model holds for it.
word_vec_dict = {word: w2v_model.wv.get_vector(word) for word in vocab}
print("The no of key-value pairs : ",len(word_vec_dict)) # expected to match the vocab size
  

In [None]:
# Overwrite the raw tweet text with its cleaned form
df['tweet'] = df['tweet'].map(cleanup)

In [None]:
# Find the maximum length (in whitespace-separated tokens) of any document.
# The result is stored as `max_tokens` so that the built-in `max` is not overwritten;
# default=-1 keeps the value reported for an empty column.
max_tokens = max((len(rev.split()) for rev in df['tweet']), default=-1)
print(max_tokens)

In [None]:
# Build the word -> integer index from the cleaned tweets and encode every tweet with it.
tok = Tokenizer()
tok.fit_on_texts(df['tweet']) # updates the internal vocabulary; word_index then gives an index to each word
vocab_size = len(tok.word_index) + 1
print(vocab_size)
encd_rev = tok.texts_to_sequences(df['tweet']) # transforms each tweet into a sequence of integers

In [None]:
# Sequence length = longest encoded tweet, derived from the data instead of a
# hard-coded number so it stays correct if the dataset or the cleaning changes.
# np.max is used because it keeps working even if the name `max` has been rebound
# in the notebook namespace.
max_len = int(np.max([len(seq) for seq in encd_rev]))
vocab_size = len(tok.word_index) + 1  # total no of words
embed_dim=300  # must equal the vector size the Word2Vec model was built with (300)

In [None]:
# Bring every encoded tweet to length max_len, padding at the end ('post');
# the resulting array shape is displayed as a sanity check.
pad_rev= pad_sequences(encd_rev, maxlen=max_len, padding='post')
pad_rev.shape

In [None]:
# Show the first five (word, index) pairs of the tokenizer vocabulary.
print(list(tok.word_index.items())[:5])

In [None]:
# Embedding matrix: row i holds the Word2Vec vector of the word whose tokenizer
# index is i. Words without a learned vector keep their all-zero row.
embed_matrix = np.zeros((vocab_size, embed_dim))
for word, i in tok.word_index.items():
    if word in word_vec_dict:
        embed_matrix[i] = word_vec_dict[word]

In [None]:
# One-hot encode the label and hold out 20% of the rows, stratified on the target.
# NOTE(review): the held-out part (x_test / y_test) is passed as validation_data
# during fit and is also the data the final classification report is computed on.
Y=keras.utils.to_categorical(df['label'])  # one hot target as required by NN.
x_train,x_test,y_train,y_test=train_test_split(pad_rev,Y,test_size=0.20,random_state=42, stratify = Y)

In [None]:
from keras.initializers import Constant
from keras.layers import ReLU
from keras.layers import Dropout
from keras.layers import LSTM

In [None]:
# Architecture: Embedding (initialised from the Word2Vec matrix) -> dropout ->
# LSTM(64) -> dropout -> 2-way softmax.
model=Sequential()
model.add(Embedding(input_dim=vocab_size,output_dim=embed_dim,input_length=max_len,embeddings_initializer=Constant(embed_matrix)))
model.add(Dropout(0.25))
# No input_shape on the LSTM: it is not the first layer, so its input is whatever
# the Embedding emits, and an explicit (10,1) shape would only mislead the reader.
# NOTE(review): sequences are post-padded and the Embedding does not set mask_zero,
# so the LSTM also steps over the padding positions — consider mask_zero=True after
# confirming it behaves well with the Keras version in use.
model.add(LSTM(64))
model.add(Dropout(0.50))
model.add(Dense(2,activation='softmax'))

model.compile(optimizer='adam',loss='categorical_crossentropy',metrics=['accuracy'])

epochs=5
batch_size=64

model.fit(x_train,y_train,epochs=epochs,batch_size=batch_size,validation_data=(x_test,y_test))

In [None]:
from sklearn.metrics import classification_report
# Predict on the held-out rows, then reduce both the predictions and the one-hot
# targets to class ids (index of the largest entry per row) for the report.
y_pred = model.predict(x_test, verbose=1)
y_pred_bool = y_pred.argmax(axis=1)
y_test_bool = y_test.argmax(axis=1)

print(classification_report(y_test_bool, y_pred_bool))