# Importing Tools

In [2]:
import pandas as pd
import re
import imblearn
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import Normalizer
import warnings
import pickle
import re
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score,\
f1_score, roc_auc_score, roc_curve, precision_recall_curve,classification_report,label_ranking_average_precision_score
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from imblearn.over_sampling import RandomOverSampler
from numba import jit, cuda
import imblearn


# Loading Data and Cleaning

In [None]:
# Read the training CSV into the working frame `data`.
data = pd.read_csv(
    filepath_or_buffer="./input/traindata/updated_train_data.csv",
)

## Function to clean the text.

In [None]:
# Clean a corpus of raw comments with a fixed sequence of regex substitutions.
def apply_regex(corpus):
    """Return a cleaned copy of ``corpus``.

    Every entry is passed through the following steps, in this order:

    1. tokens containing a digit are removed (e.g. ``h4ck3r``);
    2. tokens containing ``@`` are removed (e-mail addresses, mentions);
    3. tokens containing ``#`` are removed (hashtags);
    4. anything starting with ``http`` up to the next whitespace is removed;
    5. every character other than ASCII letters, digits and spaces becomes a space;
    6. runs of spaces are collapsed and leading/trailing whitespace is trimmed.

    Parameters
    ----------
    corpus : pandas.Series
        Series whose values are all ``str``. A non-string value (e.g. NaN)
        makes ``re.sub`` raise ``TypeError``.

    Returns
    -------
    pandas.Series
        New Series with the same index holding the cleaned strings; the
        input Series is not modified.
    """
    def _clean(text):
        text = re.sub(r"\S*\d\S*", " ", text)        # words containing digits
        text = re.sub(r"\S*@\S*\s?", " ", text)      # e-mails and mentions
        text = re.sub(r"\S*#\S*\s?", " ", text)      # hashtags
        text = re.sub(r"http\S+", " ", text)         # URLs
        text = re.sub(r"[^a-zA-Z0-9 ]", " ", text)   # keep only letters, digits, spaces
        # NOTE(review): this replacement can never match, because the step above
        # has already turned U+FFFD (the Unicode replacement character) into a
        # space. Kept so the pipeline order is unchanged; move it above the
        # previous step if the substitution is actually wanted.
        text = text.replace("\ufffd", "8")
        text = re.sub(" +", " ", text)               # collapse repeated spaces
        # Trim each string. The previous `corpus.strip()` was called on the
        # Series itself, which has no such method and raised AttributeError.
        return text.strip()

    return corpus.apply(_clean)

In [None]:
# Run the cleaning routine over the comment column and store the result back in place.
feature = "comment_text"
data[feature] = data[feature].pipe(apply_regex)

# Sentiment Analysis

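The cells in this section are kept for reference only: they show how the original dataset (`traindata.csv`) was augmented with sentiment columns to produce the `updated_train_data.csv` file loaded above. **Do not execute this section.**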

In [None]:
# Import the original comment data.
# `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0. Its
# replacement is `on_bad_lines`; "warn" matches the old error_bad_lines=False
# behaviour (malformed rows are skipped and a warning is emitted for each).
comm = pd.read_csv(
    './input/traindata/traindata.csv',
    encoding='utf8',
    on_bad_lines='warn',
    index_col=False,
)

In [None]:
# The analyzer class was used without ever being imported (NameError on a fresh kernel).
# Constructing it needs the NLTK `vader_lexicon` resource to be installed, e.g. via
# nltk.download('vader_lexicon').
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Module-level analyzer instance; its polarity_scores() method is used to score each comment.
sid = SentimentIntensityAnalyzer()

In [None]:
def get_pol(df):
    """Add polarity columns to ``df`` and return it.

    Each value of ``df['comment_text']`` is converted with ``str()`` and scored
    by the module-level analyzer ``sid``. The ``'neg'``, ``'neu'``, ``'pos'``
    and ``'compound'`` entries of the returned score dict are stored in
    columns of the same names.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``comment_text`` column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the four score columns added or overwritten.
    """
    # Score every comment exactly once; previously the analyzer was re-run for
    # each of the four columns, quadrupling the work for identical results.
    scores = df['comment_text'].apply(
        lambda comment_text: sid.polarity_scores(str(comment_text))
    )
    for key in ('neg', 'neu', 'pos', 'compound'):
        # `key=key` binds the current column name into the lambda.
        df[key] = scores.apply(lambda score, key=key: score[key])
    return df

In [None]:
from numba import jit, cuda
import numpy as np
    
# Label each row from its compound score using vectorised NumPy/pandas operations.
# The former numba @jit decorator was dropped: numba does not understand pandas
# objects, so it could not compile this function (and it never ran on a GPU).
def sentiment_score(df):
    """Add a ``sentiment`` label column derived from ``df['compound']``.

    A compound score greater than 0 is labelled ``'positive'``, a score equal
    to 0 ``'neutral'``, and everything else (negative or NaN) ``'negative'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``compound`` column whose values are convertible to
        float. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the ``sentiment`` column added or overwritten.
    """
    compound = df['compound'].astype(float)
    # Assigning the whole column at once creates it when missing. The old
    # element-wise `df['sentiment'][i] = ...` raised KeyError because the column
    # did not exist yet, and relied on chained assignment with positional labels.
    df['sentiment'] = np.select(
        [compound > 0, compound == 0],
        ['positive', 'neutral'],
        default='negative',
    )
    return df

In [None]:
# Add the polarity columns first, then derive the sentiment label from them.
df = sentiment_score(get_pol(comm))

In [None]:
# Write the augmented frame to the CSV that the "Loading Data" cell reads.
filename = './input/traindata/updated_train_data.csv'
# The original call passed the undefined name `file_name`, which raised NameError.
df.to_csv(filename, encoding='utf-8', index=False)

# Data Manipulation

## Creating a Toxicity Rating Scale

In [None]:
# Take an explicit copy so that adding `score` below writes to an independent
# frame instead of a slice of `data` (avoids pandas' SettingWithCopyWarning).
df = data[['comment_text', 'sentiment']].copy()

# Toxicity rating: the sum of the five label columns (0-5 if each flag is 0/1).
# NOTE(review): a plain `toxic` column, if the dataset has one, is not part of
# this sum; confirm that is intended.
df['score'] = (
    data['severe_toxic']
    + data['obscene']
    + data['threat']
    + data['insult']
    + data['identity_hate']
)
df.head()

## Combining Text Columns

In [None]:
# Prefix every comment with its sentiment label and keep the result as a
# single-column frame named 'text'.
X = (df['sentiment'] + ' ' + df['comment_text']).to_frame(name='text')

## Splitting and Encoding

In [None]:
# The target stays a one-column frame; the split uses a fixed seed so it is repeatable.
y = df.loc[:, ['score']]
X_train, X_test, y_train, y_test = train_test_split(
    X['text'],
    y,
    random_state=42,
)

In [None]:
# Encode the text as TF-IDF vectors: original casing is kept and the vocabulary
# is capped at 500 terms.
tfid = TfidfVectorizer(max_features=500, lowercase=False)

# The vectorizer is fitted on the training split only and then reused, unchanged,
# on the test split. Both results are converted to dense arrays.
train_vectors_tfidf = tfid.fit_transform(raw_documents=X_train).toarray()
test_vectors_tfidf = tfid.transform(raw_documents=X_test).toarray()

## Oversampling and Normalizing

In [None]:
# Oversample the minority class of the TRAINING split only. The sampler is seeded
# so the duplicated rows are the same on every run.
oversample = RandomOverSampler(sampling_strategy='minority', random_state=42)
train_vectors_tfidf, y_train_tfidf = oversample.fit_resample(train_vectors_tfidf, y_train.values.ravel())

# The test split is deliberately NOT resampled: duplicating test rows changes the
# class distribution the model is evaluated on and makes the reported metrics
# misleading. Only the labels are flattened to the 1-D shape used for the train labels.
y_test_tfidf = y_test.values.ravel()

# Scale every sample (row) to unit norm. With copy=False the normalisation may be
# done in place, so the *_vectors_tfidf arrays can be modified as well.
norm_TFIDF = Normalizer(copy=False)
norm_train_tfidf = norm_TFIDF.fit_transform(train_vectors_tfidf)
norm_test_tfidf = norm_TFIDF.transform(test_vectors_tfidf)


# Logistic Regression Model

In [None]:
# Logistic-regression classifier for the toxicity score; classes are re-weighted
# inversely to their frequency and the seed is fixed for repeatable fits.
model = LogisticRegression(
    max_iter=300,
    n_jobs=6,
    class_weight='balanced',
    random_state=42,
)
model.fit(norm_train_tfidf, y_train_tfidf)


## Testing Model

In [None]:
# Predict a toxicity score for every row of the normalised test matrix.
prediction = model.predict(X=norm_test_tfidf)


In [None]:
# Show the predicted labels as this cell's output.
prediction

In [None]:
# Per-class precision / recall / F1 for the six possible scores (0 through 5).
print(classification_report(y_test_tfidf, prediction, labels=list(range(6))))


In [None]:
# Serialise the fitted model to disk.
filename = 'logist_model.sav'
# Use a context manager so the file handle is flushed and closed even if
# pickling fails; the original `open(...)` result was never closed.
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)