
#Copyright (c) DeepSphere.AI 2021

# All rights reserved

# We are sharing this partial code for learning and research; the idea behind sharing the source code is to stimulate ideas and thoughts that help learners develop their ML knowledge.

# Author: # DeepSphere.AI | deepsphere.ai | dsschoolofai.com | info@deepsphere.ai

# Release: Initial release


In [None]:
# Importing Libraries

import pandas as pd
import numpy as np
import itertools
import re
import tensorflow as tf

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.model_selection import train_test_split

from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier

from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense

from sklearn import metrics


In [3]:

class TextClassification:
    """Pre-processing, vectorization and evaluation helpers for text classification.

    The instance holds a DataFrame with a ``comment_text`` column plus the
    label column(s) named by ``target_column``. Every method that takes a
    ``vAR_test_data`` argument works on ``self.data`` when it is ``None`` and
    on the supplied DataFrame otherwise.
    """

    def __init__(self,data,target_column):
        """Store the training DataFrame and the name(s) of the label column(s)."""
        self.data = data
        self.target_column = target_column
        # Feature extractors are cached so that held-out data is encoded in
        # the same feature space as the training data.
        self._vAR_cv = None
        self._vAR_embedding_model = None

    def _select_targets(self,vAR_test_data):
        """Return the label column(s) of ``vAR_test_data``, or of ``self.data`` when it is None."""
        vAR_source = self.data if vAR_test_data is None else vAR_test_data
        return vAR_source[self.target_column]

    def display_column(self):
        """Print the column names of the training DataFrame."""
        print('*'*30+'COLUMN NAMES'+'*'*30+'\n\t\t' ,self.data.columns)

    def data_preprocessing(self,vAR_test_data):
        """Clean every ``comment_text`` entry and return the list of cleaned strings.

        Each comment has non-letters replaced by spaces, is lower-cased and
        split on whitespace; English stop words are dropped and the remaining
        words are Porter-stemmed and re-joined with single spaces.
        """
        print('*'*30+'DATA PRE-PROCESSING'+'*'*30+'\n\t\t1.Remove Stop Words\n\t\t2.Stemming/Lemmatization')
        vAR_ps = PorterStemmer()
        # Load the stop-word list once; looking it up per word re-reads the
        # corpus and scans a list for every token.
        vAR_stop_words = set(stopwords.words('english'))
        data = self.data if vAR_test_data is None else vAR_test_data
        vAR_corpus = []
        # Iterate the values directly so a non-default index (e.g. after
        # filtering or sampling) does not break label-based lookups.
        for vAR_text in data['comment_text']:
            vAR_words = re.sub('[^a-zA-Z]', ' ', vAR_text).lower().split()
            vAR_review = ' '.join(
                vAR_ps.stem(word) for word in vAR_words if word not in vAR_stop_words
            )
            vAR_corpus.append(vAR_review)
        return vAR_corpus

    def bagofwords_vectorization(self,vAR_corpus,vAR_test_data):
        """Encode ``vAR_corpus`` as 1-3 gram counts (at most 5000 features).

        With ``vAR_test_data`` None the vectorizer is fitted on the corpus and
        kept on the instance. With held-out data the previously fitted
        vectorizer is reused so the columns match the training features; if
        none has been fitted yet, a fresh one is fitted on the given corpus.

        Returns:
            (vAR_X, vAR_y): dense count matrix and the matching label column(s).
        """
        vAR_fitted_cv = getattr(self, '_vAR_cv', None)
        if vAR_test_data is not None and vAR_fitted_cv is not None:
            vAR_X = vAR_fitted_cv.transform(vAR_corpus).toarray()
        else:
            vAR_cv = CountVectorizer(max_features=5000,ngram_range=(1,3))
            vAR_X = vAR_cv.fit_transform(vAR_corpus).toarray()
            if vAR_test_data is None:
                self._vAR_cv = vAR_cv
        return vAR_X,self._select_targets(vAR_test_data)

    def tfidf_vectorization(self):
        """Placeholder: TF-IDF vectorization is not implemented; returns None."""
        pass

    def word_embedding_vectorization(self,vAR_corpus,vAR_test_data):
        """Encode ``vAR_corpus`` as sequences of embedding vectors.

        Each document is hashed word-by-word into a 10000-entry vocabulary,
        left-padded/truncated to 8 tokens and passed through an untrained
        10-dimensional Embedding layer.

        The embedding model is created on first use and reused afterwards:
        its weights are randomly initialised, so building a new one per call
        would map training and held-out text through unrelated projections.

        Returns:
            (vAR_X, vAR_y): embedded documents and the matching label column(s).
        """
        vAR_voc_size=10000
        vAR_sent_length=8
        vAR_onehot_repr=[one_hot(words,vAR_voc_size)for words in vAR_corpus]
        vAR_embedded_docs=pad_sequences(vAR_onehot_repr,padding='pre',maxlen=vAR_sent_length)
        vAR_model = getattr(self, '_vAR_embedding_model', None)
        if vAR_model is None:
            vAR_model=Sequential()
            vAR_model.add(Embedding(vAR_voc_size,10,input_length=vAR_sent_length))
            vAR_model.compile('adam','mse')
            self._vAR_embedding_model = vAR_model
        vAR_X = vAR_model.predict(vAR_embedded_docs)
        return vAR_X,self._select_targets(vAR_test_data)

    def train_test_split(self,vAR_X,vAR_y):
        """Split features and labels 70/30 with ``random_state=0``.

        Returns:
            (vAR_X_train, vAR_y_train, vAR_X_test, vAR_y_test) - note the
            order differs from scikit-learn's own return order.
        """
        vAR_X_train, vAR_X_test, vAR_y_train, vAR_y_test = train_test_split(vAR_X, vAR_y, test_size=0.3, random_state=0)
        return vAR_X_train,vAR_y_train,vAR_X_test,vAR_y_test

    def test_model(self,vAR_model,vAR_X_test):
        """Return ``vAR_model.predict(vAR_X_test)``."""
        vAR_prediction = vAR_model.predict(vAR_X_test)
        return vAR_prediction

    def accuracy_score(self,vAR_prediction,vAR_y_test):
        """Return ``sklearn.metrics.accuracy_score`` of the predictions against the true labels."""
        score = metrics.accuracy_score(vAR_y_test, vAR_prediction)
        return score
        

In [4]:
class ClassificationModels(TextClassification):
    """Model trainers (Naive Bayes, Random Forest, LSTM) built on TextClassification."""

    def __init__(self,data,target_column):
        """Forward the DataFrame and label column name(s) to the base class."""
        super().__init__(data,target_column)

    def train_model_naivebayes(self,vAR_X_train,vAR_y_train):
        """Fit and return a MultinomialNB wrapped in MultiOutputClassifier (one estimator per label)."""
        vAR_model = MultiOutputClassifier(MultinomialNB())
        vAR_model.fit(vAR_X_train, vAR_y_train)
        return vAR_model

    def train_model_random_forest(self,vAR_X_train,vAR_y_train):
        """Fit and return a RandomForestClassifier with default hyper-parameters.

        The forest is fitted on the label matrix directly, without a
        MultiOutputClassifier wrapper.
        """
        vAR_model=RandomForestClassifier()
        vAR_model.fit(vAR_X_train, vAR_y_train)
        return vAR_model

    def train_model_lstm(self,vAR_X_train,vAR_y_train,vAR_X_test,vAR_y_test):
        """Train and return an LSTM classifier on already-embedded sequences.

        Architecture: LSTM(100) -> Dense(20, relu) -> Dense(20, relu) ->
        Dense(n_labels, sigmoid), trained for 10 epochs with batch size 64
        using Adam and binary cross-entropy. ``vAR_X_test``/``vAR_y_test`` are
        used as validation data during fitting.
        """
        # One output unit per label column; a 1-D target means a single label.
        vAR_y_shape = np.shape(vAR_y_train)
        vAR_n_outputs = vAR_y_shape[1] if len(vAR_y_shape) > 1 else 1

        vAR_model=Sequential()
        vAR_model.add(LSTM(100))
        vAR_model.add(Dense(units=20, activation="relu"))
        vAR_model.add(Dense(units=20, activation="relu"))
        # binary_crossentropy expects per-label probabilities in [0, 1];
        # an unbounded relu output makes the loss ill-defined, so use sigmoid.
        vAR_model.add(Dense(vAR_n_outputs,activation='sigmoid'))
        vAR_model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
        vAR_model.fit(vAR_X_train,vAR_y_train,validation_data=(vAR_X_test,vAR_y_test),epochs=10,batch_size=64)
        return vAR_model

In [8]:
# Driver: pre-process the first 3000 training rows, train the LSTM model on
# embedded text, then score a separate labelled CSV and write the predictions
# to disk. The Naive Bayes and Random Forest pipelines are kept below as
# commented-out alternatives.
if __name__ == "__main__":
    vAR_data = pd.read_csv('/home/jupyter/DSAI_DMV_Text_Analyzer/DSAI_Dataset/train.csv').head(3000)
    vAR_target_columns = ['toxic','severe_toxic','obscene','threat','insult','identity_hate']
    vAR_model_obj = ClassificationModels(vAR_data,vAR_target_columns)
    vAR_model_obj.display_column()
    # None tells the helper methods to work on the training DataFrame.
    vAR_test_data = None
    vAR_corpus = vAR_model_obj.data_preprocessing(vAR_test_data)
    print('Data Preprocessing Completed')
    
############## *****Execute Below Code For NaiveBayes Classification***** ##############

#     vAR_X,vAR_y = vAR_model_obj.bagofwords_vectorization(vAR_corpus,vAR_test_data)
#     print('Vectorization Completed Using Bag of Words')
#     vAR_X_train,vAR_y_train,vAR_X_test,vAR_y_test = vAR_model_obj.train_test_split(vAR_X,vAR_y)
#     print('Train & Test Data Splitted Successfully')

#     vAR_model = vAR_model_obj.train_model_naivebayes(vAR_X_train,vAR_y_train)
#     print('Naive Bayes Model Trained successfully')
#     vAR_prediction = vAR_model_obj.test_model(vAR_model,vAR_X_test)
#     print('Naive Bayes Model Tested successfully')
#     accuracy = vAR_model_obj.accuracy_score(vAR_prediction,vAR_y_test)
#     print('Naive Bayes Model Accuracy - ',accuracy)

############## ******************************************************** ###############

############## *****Execute Below Code For Random Forest Classification***** ##############
    
    # vAR_X,vAR_y = vAR_model_obj.bagofwords_vectorization(vAR_corpus,vAR_test_data)
    # print('Vectorization Completed Using Bag of Words')
    # vAR_X_train,vAR_y_train,vAR_X_test,vAR_y_test = vAR_model_obj.train_test_split(vAR_X,vAR_y)
    # print('Train & Test Data Splitted Successfully')
    # vAR_model = vAR_model_obj.train_model_random_forest(vAR_X_train,vAR_y_train)
    # print('Random Forest Model Trained successfully')
    # vAR_prediction = vAR_model_obj.test_model(vAR_model,vAR_X_test)
    # print('Random Forest Model Tested successfully')
    # accuracy = vAR_model_obj.accuracy_score(vAR_prediction,vAR_y_test)
    # print('Random Forest Model Accuracy - ',accuracy)

############## ******************************************************** ###############
    
############## *****Execute Below Code For LSTM RNN Deep Learning Model***** ############## 

    vAR_X,vAR_y = vAR_model_obj.word_embedding_vectorization(vAR_corpus,vAR_test_data)
    print('Vectorization Completed Using Word Embedding')
    vAR_X_train,vAR_y_train,vAR_X_test,vAR_y_test = vAR_model_obj.train_test_split(vAR_X,vAR_y)
    print('Train & Test Data Splitted Successfully')
    
    vAR_model = vAR_model_obj.train_model_lstm(vAR_X_train,vAR_y_train,vAR_X_test,vAR_y_test)
    print('LSTM Model Trained successfully')
    vAR_prediction = vAR_model_obj.test_model(vAR_model,vAR_X_test)
    print('LSTM Model Tested successfully')
    
############## ******************************************************** ###############


############## *****Execute Below Code When You want to test the model with custom text data***** ##############
    
    vAR_test_data = pd.read_csv('/home/jupyter/DSAI_DMV_Text_Analyzer/DSAI_Dataset/test-compress-all-labels.csv')
    # Strip the existing label columns; they are refilled with predictions below.
    vAR_X_test_data = vAR_test_data.drop(vAR_target_columns,axis=1)
    print('Xtest length - ',len(vAR_test_data))
    vAR_corpus = vAR_model_obj.data_preprocessing(vAR_test_data)
    print('Data Preprocessing Completed')
    vAR_X,vAR_y = vAR_model_obj.word_embedding_vectorization(vAR_corpus,vAR_test_data)
    print('Vectorization Completed Using Word Embedding')
    vAR_prediction = vAR_model_obj.test_model(vAR_model,vAR_X)
    # The model scored here is the LSTM trained above, so report it as such.
    print('LSTM Model Tested successfully')
    print('ypred length - ',len(vAR_prediction))
    # Prediction column i corresponds to the i-th entry of vAR_target_columns.
    for vAR_col_idx, vAR_col_name in enumerate(vAR_target_columns):
        vAR_X_test_data[vAR_col_name] = vAR_prediction[:,vAR_col_idx]
    vAR_X_test_data.to_csv('/home/jupyter/DSAI_DMV_Text_Analyzer/DSAI_Model_Outcome/DSAI_Model_Outcome.csv')
    print(vAR_X_test_data.tail(20))
    
############## ******************************************************** ###############


******************************COLUMN NAMES******************************
		 Index(['id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat',
       'insult', 'identity_hate'],
      dtype='object')
******************************DATA PRE-PROCESSING******************************
		1.Remove Stop Words
		2.Stemming/Lemmatization
Data Preprocessing Completed
Vectorization Completed Using Word Embedding
Train & Test Data Splitted Successfully
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
LSTM Model Trained successfully
LSTM Model Tested successfully
Xtest length -  109
******************************DATA PRE-PROCESSING******************************
		1.Remove Stop Words
		2.Stemming/Lemmatization
Data Preprocessing Completed
Vectorization Completed Using Word Embedding
Naive Bayes Model Tested successfully
ypred length -  109
                   id                                       comment_text  \
89   b794843f3d604