# Dataset Preprocessing and Model Training
### Contents
1. Python Libraries
2. Function containing Text Preprocessing Techniques done
> * Case Folding
> * Removal of Punctuations
> * Removal of Stopwords
> * Removal of Emojis
> * Word Stemming
3. Model Training
> * Holdout Method
> * Metrics

## Python Libraries

In [None]:
import pandas as pd
import re
import numpy as np
import string
import seaborn as sns
import demoji
import joblib

from collections import Counter
from IPython.core.interactiveshell import InteractiveShell
from nltk.corpus import stopwords, words
from nltk.stem.porter import PorterStemmer
from nltk.stem.lancaster import LancasterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

## Function containing Text Preprocessing Techniques done

In [None]:
def fxn_case_folding(var_input):
    """
    Preprocessing: Case Folding

    Calls ``lower()`` on ``var_input`` and returns the lower-cased result.
    """
    var_lowered = var_input.lower()
    return var_lowered

# Lazily-filled cache of the NLTK English word list (see _english_vocab).
_ENGLISH_VOCAB_CACHE = None


def _english_vocab():
    """Return the NLTK ``words`` corpus as a frozenset, loading it only once."""
    global _ENGLISH_VOCAB_CACHE
    if _ENGLISH_VOCAB_CACHE is None:
        # Loaded on first use (not at definition time) so that a missing
        # corpus still only fails when the function is actually called.
        _ENGLISH_VOCAB_CACHE = frozenset(words.words())
    return _ENGLISH_VOCAB_CACHE


def fxn_remove_non_english(input_text):
    """
    Preprocessing: Removing non-english words

    Splits ``input_text`` on whitespace and keeps only the tokens that appear
    verbatim (exact, case-sensitive match) in the NLTK ``words`` corpus.

    Parameters:
        input_text: the string to filter.

    Returns:
        The kept tokens joined by single spaces ("" when nothing is kept).
    """
    # Set membership is O(1); the original re-read the whole corpus list and
    # scanned it linearly for every single token.
    english_vocab = _english_vocab()
    return " ".join(w for w in input_text.split() if w in english_vocab)

# Compiled once: any single character listed in string.punctuation.
_PUNCTUATION_RE = re.compile("[%s]" % re.escape(string.punctuation))
# Compiled once: a run of word characters that contains at least one digit.
_DIGIT_TOKEN_RE = re.compile(r"\w*\d\w*")


def fxn_punctuation(var_input_text):
    """
    Preprocessing: Punctuation Removal

    Replaces every ``string.punctuation`` character with a single space, then
    deletes every run of word characters that contains a digit.

    Parameters:
        var_input_text: the string to clean.

    Returns:
        The cleaned string. Whitespace is not collapsed, so consecutive
        spaces can remain where characters were replaced or removed.
    """
    # One pass is enough: after it no punctuation is left, so the second
    # identical substitution in the original could never match anything.
    var_output_text = _PUNCTUATION_RE.sub(" ", var_input_text)
    # Raw-string pattern avoids the invalid "\w" / "\d" escape warnings.
    var_output_text = _DIGIT_TOKEN_RE.sub("", var_output_text)
    return var_output_text

# Lazily-filled cache of the NLTK English stopword list (see _english_stopwords).
_ENGLISH_STOPWORDS_CACHE = None


def _english_stopwords():
    """Return NLTK's English stopwords as a frozenset, loading them only once."""
    global _ENGLISH_STOPWORDS_CACHE
    if _ENGLISH_STOPWORDS_CACHE is None:
        # Loaded on first use (not at definition time) so that a missing
        # corpus still only fails when the function is actually called.
        _ENGLISH_STOPWORDS_CACHE = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS_CACHE


def fxn_stopwords(var_input_text):
    """
    Preprocessing: Stopwords Removal

    Splits ``var_input_text`` on whitespace and drops every token that is an
    exact match for an entry of NLTK's English stopword list.

    Parameters:
        var_input_text: the string to filter.

    Returns:
        The remaining tokens joined by single spaces.
    """
    # The original called stopwords.words('english') once per token and
    # scanned the resulting list; a cached set gives the same answer cheaply.
    var_etd_stopwords = _english_stopwords()
    var_etd_stop = " ".join(
        var_etd_word for var_etd_word in var_input_text.split()
        if var_etd_word not in var_etd_stopwords
    )
    return var_etd_stop

def fxn_demoji(text):
    """
    Preprocessing: Emoji Removal

    Deletes every run of characters that falls inside the code-point ranges
    listed below and returns the remaining text.

    NOTE(review): the last two ranges together span U+24C2 through U+10FFFF,
    so this strips every character at or above U+24C2 (not only emoji) -
    confirm that is intended.
    """
    code_point_ranges = (
        u"\U0001F600-\U0001F64F",  # emoticons
        u"\U0001F300-\U0001F5FF",  # symbols & pictographs
        u"\U0001F680-\U0001F6FF",  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF",  # flags (iOS)
        u"\U00002702-\U000027B0",
        u"\U000024C2-\U0001F251",
        u"\U00010000-\U0010ffff",
    )
    # Assemble one character class from the ranges; "+" matches whole runs.
    matcher = re.compile("[" + "".join(code_point_ranges) + "]+", flags=re.UNICODE)
    stripped = matcher.sub(r'', text)
    return stripped

def fxn_stem(var_input_text):
    """
    Preprocessing: Stemming

    Splits ``var_input_text`` on whitespace, runs each token through a
    ``LancasterStemmer``, and returns the stems joined by single spaces.
    """
    stem_token = LancasterStemmer().stem
    return " ".join(map(stem_token, var_input_text.split()))

In [None]:
# Load the comma-separated file train.csv into a DataFrame.
pd_dataset = pd.read_csv('train.csv', sep = ',')

In [None]:
# Preview the first 10 rows of the loaded data.
pd_dataset.head(10)

In [None]:
# Keep all rows but only the columns at positions 2 and 3.
# Later cells read 'toxic' and 'comment_text' from this frame, so those are
# presumably the two columns kept - confirm against train.csv.
pd_dataset = pd_dataset.iloc[:,2:4]

In [None]:
# Preview the first rows after the column selection.
pd_dataset.head()

In [None]:
# Number of rows in the dataset.
len(pd_dataset)

In [None]:
# Keep the 'toxic' column aside; it is used as the target labels further down.
y_array = pd_dataset['toxic']

In [None]:
# Remove the 'toxic' column so the frame no longer contains the labels.
pd_dataset = pd_dataset.drop(['toxic'], axis = 1)

In [None]:
# Preview the frame after dropping the label column.
pd_dataset.head()

In [None]:
# Step 1: lower-case every comment, then preview the result.
pd_dataset['comment_text'] = pd_dataset['comment_text'].apply(fxn_case_folding)
pd_dataset.head()

In [None]:
# Step 2: run fxn_punctuation on every comment (punctuation and
# digit-containing tokens are removed), then preview the result.
pd_dataset['comment_text'] = pd_dataset['comment_text'].apply(fxn_punctuation)
pd_dataset.head()

In [None]:
# Step 3: drop English stopwords from every comment, then preview the result.
pd_dataset['comment_text'] = pd_dataset['comment_text'].apply(fxn_stopwords)
pd_dataset.head()

In [None]:
# Step 4: run fxn_demoji on every comment to strip emoji characters,
# then preview the result.
pd_dataset['comment_text'] = pd_dataset['comment_text'].apply(fxn_demoji)
pd_dataset.head()

In [None]:

# Step 5: replace every token of every comment with its stem (fxn_stem),
# then preview the result.
pd_dataset['comment_text'] = pd_dataset['comment_text'].apply(fxn_stem)
pd_dataset.head()

In [None]:
# Row count after preprocessing (the steps above rewrite text, they do not drop rows).
len(pd_dataset)

In [None]:
# TF-IDF vectorizer configuration: keep at most 534 terms (max_features),
# ignore terms appearing in fewer than 5 documents (min_df) or in more than
# 70% of the documents (max_df).
train_data_transformation_tfidv = TfidfVectorizer(max_features=534, min_df=5, max_df=0.7)

In [None]:
# Fit the TF-IDF vectorizer on the preprocessed comments and transform them
# in one call; the returned matrix is kept for the following cells.
# (The original ran fit_transform twice and discarded the first result,
# doing the whole fit a second time for nothing.)
pd_dataset_transformed = train_data_transformation_tfidv.fit_transform(pd_dataset['comment_text'])

In [None]:
# Show the terms kept by the fitted vectorizer.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer get_feature_names_out() and only fall back on older versions.
(
    train_data_transformation_tfidv.get_feature_names_out()
    if hasattr(train_data_transformation_tfidv, "get_feature_names_out")
    else train_data_transformation_tfidv.get_feature_names()
)

In [None]:
# Number of terms kept by the fitted vectorizer. vocabulary_ maps each term to
# its column index, so its size is the feature count; unlike
# get_feature_names() it is available on every scikit-learn version.
len(train_data_transformation_tfidv.vocabulary_)

In [None]:
# Column labels for the TF-IDF matrix. get_feature_names() was deprecated in
# scikit-learn 1.0 and removed in 1.2, so prefer get_feature_names_out() and
# only fall back on older versions.
tfidf_feature_names = (
    train_data_transformation_tfidv.get_feature_names_out()
    if hasattr(train_data_transformation_tfidv, "get_feature_names_out")
    else train_data_transformation_tfidv.get_feature_names()
)

# Densify the TF-IDF matrix into a DataFrame with one column per term.
pd_dataset_feature_names = pd.DataFrame(pd_dataset_transformed.toarray(), columns=tfidf_feature_names)

pd_dataset_feature_names

In [None]:
# Persist the TF-IDF feature DataFrame to training_dataset.pkl.
# NOTE(review): none of the visible cells saves the fitted vectorizer itself,
# which would be needed to transform new text the same way - confirm.
joblib.dump(pd_dataset_feature_names, 'training_dataset.pkl')

In [None]:
# Use the TF-IDF feature DataFrame as the model input.
X = pd_dataset_feature_names

In [None]:
# Use the 'toxic' labels as the training target, converted to a NumPy array.
y = y_array.to_numpy()

In [None]:
# Holdout method: split the dataset into training and testing parts.
# test_size=0.2 holds out 20% of the rows for testing; random_state=0 fixes
# the shuffle so the same split is produced on every run.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [None]:
# Create a multinomial Naive Bayes classifier with its default settings.
from sklearn.naive_bayes import MultinomialNB

classifier = MultinomialNB()

In [None]:
# Train the classifier on the training part of the split.
classifier.fit(X_train, y_train)

In [None]:
# Predict labels for the held-out test rows.
y_pred = classifier.predict(X_test)

In [None]:
# Metrics: compare the predictions with the true test labels and print the
# confusion matrix, the classification report and the accuracy score.
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test, y_pred))

In [None]:
# Serialize the trained classifier to the file 'text_classifier' with pickle.
import pickle

with open('text_classifier', 'wb') as picklefile:
    pickle.dump(classifier,picklefile)

In [None]:
# Load the classifier back from the 'text_classifier' file.
# NOTE: unpickling can execute arbitrary code, so only load files that were
# produced by a trusted source (here, the cell that writes this file).
with open('text_classifier', 'rb') as training_model:
    model = pickle.load(training_model)

In [None]:
# Predict on the test rows with the reloaded model and print the same three
# metrics, so the output can be compared with the in-memory classifier's.
y_pred2 = model.predict(X_test)

print(confusion_matrix(y_test, y_pred2))
print(classification_report(y_test, y_pred2))
print(accuracy_score(y_test, y_pred2)) 