# Creating extra NLP features

- **Spell error ratio** using 'autocorrect' package
- **Polarity** and **subjectivity** from TextBlob
- Toxicity probabilities from a model trained on labeled Wikipedia comments:  **['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']**
- Readability: **Flesch-Kincaid** and **Automated Readability Index**
- **Spam** classification probability is added using a Naive Bayes model trained w/1600-spam-corpus


# Read files

### Previous data set w/ Stanford NLP features will be loaded.

In [1]:
import os
import numpy as np
import pandas as pd
import time
import sys
from sklearn.linear_model import LogisticRegression
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.stem import SnowballStemmer
# nltk.download('stopwords')
from nltk.corpus import stopwords
from autocorrect import spell
from textblob import TextBlob
import re, string
from readability_score.calculators.fleschkincaid import *
from readability_score.calculators.ari import *
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report
# try:
#     sys.getwindowsversion()
# except AttributeError:
#     isWindows = False
# else:
#     isWindows = True
#use more RAM
# if isWindows:
#     import win32api,win32process,win32con
#     pid = win32api.GetCurrentProcessId()
#     handle = win32api.OpenProcess(win32con.PROCESS_ALL_ACCESS, True, pid)
#     win32process.SetPriorityClass(handle, win32process.HIGH_PRIORITY_CLASS)
# Load the two review tables produced by the earlier NLP feature step:
# the regular reviews and the not-recommended reviews.
regular_reviews, not_recommended_reviews = (
    pd.read_csv(csv_path)
    for csv_path in ('data/reg_reviews_NLP.csv', 'data/not_reviews_NLP.csv')
)

In [2]:
len(regular_reviews)

230530

In [3]:
len(not_recommended_reviews)

27057

In [4]:
not_recommended_reviews.describe()

Unnamed: 0,friends,photos,rating,restaurant_id,reviews,numSentence,numWords,totSentiment,avgSentiment,Sfreq0,...,identity_hate,readability_FK,readability_AR,spam,toxic.1,severe_toxic.1,obscene.1,threat.1,insult.1,identity_hate.1
count,27057.0,27057.0,27057.0,27057.0,27057.0,27057.0,27057.0,27057.0,27057.0,27057.0,...,27057.0,27057.0,27057.0,27057.0,27057.0,27057.0,27057.0,27057.0,27057.0,27057.0
mean,33.113058,12.326829,4.122815,339.546993,17.022212,5.193961,62.696714,10.962856,2.299413,0.059208,...,0.000897,8.843109,9.733636,0.539512,0.033635,0.000657,0.008331,0.00022,0.008097,0.000897
std,142.656716,282.878244,1.28637,186.130745,57.924208,4.921907,75.617481,9.043593,0.694986,0.272098,...,0.001806,4.404453,4.938015,0.15168,0.083904,0.002269,0.041856,0.000334,0.025203,0.001806
min,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,3e-05,0.0,0.0,0.086098,1e-05,2.7e-05,5.5e-05,3.3e-05,8e-05,3e-05
25%,0.0,0.0,4.0,207.0,2.0,2.0,19.0,5.0,1.833333,0.0,...,0.000439,7.0,7.0,0.439331,0.008095,0.000316,0.002861,0.000121,0.002902,0.000439
50%,0.0,0.0,5.0,326.0,5.0,4.0,39.0,9.0,2.333333,0.0,...,0.000613,9.0,9.0,0.52046,0.014025,0.000434,0.004017,0.000156,0.004393,0.000613
75%,11.0,1.0,5.0,512.0,11.0,6.0,78.0,14.0,3.0,0.0,...,0.000954,10.0,11.0,0.622585,0.027332,0.000629,0.00608,0.00023,0.007328,0.000954
max,5000.0,24931.0,5.0,665.0,2835.0,101.0,942.0,202.0,4.0,5.0,...,0.18495,437.0,208.0,0.999493,1.0,0.247176,1.0,0.022564,0.984059,0.184952


# spellErrorRatio function

In [5]:
# Pattern that matches digits (despite its name it does not match punctuation);
# spellErrorRatio uses it to strip numbers before tokenizing.
punctuation = re.compile(r'[0-9]')

# Tokenizer that keeps runs of word characters and discards everything else
tokenizer = RegexpTokenizer(r'\w+')

# English stop words, held in a frozenset so that each membership test is a
# hash lookup instead of a scan over the NLTK list.
stop_words = frozenset(stopwords.words('english'))

def spellErrorRatio(input_text):
    """Return the share of non-stop-word tokens that the spell checker would change.

    Digits are stripped, the text is split into word tokens, stop words are
    dropped, and each remaining token is compared with ``spell(token)``.

    Parameters
    ----------
    input_text : str
        Raw review text.  Any non-string value (for example the NaN pandas
        uses for an empty CSV cell) is treated as unscorable.

    Returns
    -------
    float
        Number of tokens that differ from their spell-checked form divided by
        the number of tokens kept, or ``np.nan`` when fewer than two tokens
        remain or the input is not a string.
    """
    # Without this guard the regex substitution below raises TypeError on
    # non-string input.
    if not isinstance(input_text, str):
        return np.nan

    # Remove the numbers
    input_text = punctuation.sub("", input_text)

    # Tokenize the input string, then remove the stop words
    tokens = [tok for tok in tokenizer.tokenize(input_text) if tok not in stop_words]

    # Texts with fewer than two remaining tokens are not scored
    if len(tokens) <= 1:
        return np.nan

    # Query the spell checker once per distinct token; repeated words reuse the answer
    is_wrong = {tok: tok != spell(tok) for tok in set(tokens)}
    cntWrong = sum(is_wrong[tok] for tok in tokens)
    return cntWrong / len(tokens)

# calculate spell error ratio on both files and save

In [6]:
# Score every review in both tables and write each table to its own CSV.
# The first message reports the elapsed time after the regular file only,
# the second one the elapsed time after both files.
t0 = time.time()

regular_reviews['spellErrorRatio'] = regular_reviews['review'].apply(spellErrorRatio)
regular_reviews.to_csv('data/regular_reviews_spell.csv', index=False)
elapsed_hours = (time.time() - t0) / 3600
print('hours processed for one file:', elapsed_hours)

not_recommended_reviews['spellErrorRatio'] = not_recommended_reviews['review'].apply(spellErrorRatio)
not_recommended_reviews.to_csv('data/not_reviews_spell.csv', index=False)
elapsed_hours = (time.time() - t0) / 3600
print('total hours processed:', elapsed_hours)

hours processed for one file: 3.2670867806010775
total hours processed: 3.5838604907857046


## 2. Sentiment Features using 'Text Blob'

In [7]:
# # for regular review file
# regular_reviews['TB_polarity'] = regular_reviews.review.apply(lambda x:TextBlob(x).polarity)
# regular_reviews['TB_subjectivity'] = regular_reviews.review.apply(lambda x:TextBlob(x).subjectivity)
# regular_reviews.to_csv('data/regular_reviews_TB.csv', index=False)

In [8]:
# # for not-recommended review file
# not_recommended_reviews['TB_polarity'] = not_recommended_reviews.review.apply(lambda x:TextBlob(x).polarity)
# not_recommended_reviews['TB_subjectivity'] = not_recommended_reviews.review.apply(lambda x:TextBlob(x).subjectivity)
# not_recommended_reviews.to_csv('data/not_reviews_TB.csv', index=False)

# 3. ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

### Source: Kaggle/ 'Toxic Comment Classification Challenge'
Kaggle objective: Detecting toxic comments that are rude, disrespectful or otherwise likely to make someone leave a discussion.

Kaggle supporter: Conversation AI team by Jigsaw and Google.

### Train data: Wikipedia comments that are labeled by human raters for toxic behavior.  



## Training

In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# Labeled comments used to train the toxicity models
train = pd.read_csv('data/train_Toxic.csv')
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
# 'none' is 1 minus the row-wise maximum of the six label columns
train['none'] = 1-train[label_cols].max(axis=1)
COMMENT = 'comment_text'
# Assign the filled column back instead of calling fillna(inplace=True) on the
# train[COMMENT] selection: that chained in-place form acts on a temporary
# object and does not update `train` when pandas Copy-on-Write is active.
train[COMMENT] = train[COMMENT].fillna("unknown")
train.describe()

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate,none
count,159571.0,159571.0,159571.0,159571.0,159571.0,159571.0,159571.0
mean,0.095844,0.009996,0.052948,0.002996,0.049364,0.008805,0.898321
std,0.294379,0.099477,0.223931,0.05465,0.216627,0.09342,0.302226
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,1.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [10]:
# Remove toxicity scores left over from an earlier run so they can be recomputed.
# The loaded CSV can also carry pandas-renamed duplicates of them ('toxic.1', ...),
# so those are removed as well.  errors='ignore' lets this cell run when some of
# the columns are absent (a fresh input file, or a second execution of the cell).
stale_cols = label_cols + [col + '.1' for col in label_cols]
regular_reviews = regular_reviews.drop(columns=stale_cols, errors='ignore')
not_recommended_reviews = not_recommended_reviews.drop(columns=stale_cols, errors='ignore')

In [11]:
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,none
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,1
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,1
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,1
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0,1
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,1


## Building the model

In [12]:
import re, string
# Matches any single ASCII punctuation character or one of the listed
# typographic symbols; the capture group lets the substitution re-emit it.
re_tok = re.compile(f'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])')


def tokenize(s):
    """Pad every character matched by ``re_tok`` with spaces, then split on whitespace."""
    spaced = re_tok.sub(r' \1 ', s)
    return spaced.split()

def pr(x, y_i, y):
    """Return the add-one-smoothed column sums of ``x`` over rows where ``y == y_i``.

    The column sums of the selected rows are incremented by one and divided by
    the number of selected rows plus one.
    """
    in_class = (y == y_i)
    feature_totals = x[in_class].sum(0)
    n_rows = in_class.sum()
    return (feature_totals + 1) / (n_rows + 1)

def get_mdl(x, y):
    """Fit a logistic regression on features scaled by the log ratio of ``pr`` for the two classes.

    Parameters
    ----------
    x : sparse matrix
        Feature matrix; it must support boolean row selection and ``.multiply``.
    y : pandas.Series
        Labels compared against 1 and 0; only ``y.values`` is used.

    Returns
    -------
    tuple
        ``(fitted LogisticRegression, r)`` where ``r`` is
        ``log(pr(x, 1, y) / pr(x, 0, y))``.  New data must be multiplied by
        ``r`` before calling the model.
    """
    y = y.values
    r = np.log(pr(x, 1, y) / pr(x, 0, y))
    # dual=True is only implemented by the liblinear solver.  Naming the solver
    # keeps this working on scikit-learn versions whose default solver is lbfgs,
    # which raises ValueError for dual=True.
    m = LogisticRegression(C=4, dual=True, solver='liblinear')
    x_nb = x.multiply(r)
    return m.fit(x_nb, y), r

# TF-IDF over unigrams and bigrams of the tokens produced by `tokenize`.
# The three flags are real booleans: recent scikit-learn releases validate
# them as bool and reject the integer 1.
vec = TfidfVectorizer(ngram_range=(1,2), tokenizer=tokenize,
               min_df=3, max_df=0.9, strip_accents='unicode', use_idf=True,
               smooth_idf=True, sublinear_tf=True )

# Model cross-validation w/ Stratified k-fold (within Toxic data set)

In [13]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report

skf = StratifiedKFold(n_splits = 3, shuffle = True, random_state = 1)

# The comment text is the same for every label, so select it once
X = train[COMMENT]

for j in label_cols:

    y = train[j]

    print('Model Stratified 3-Ford Cross-Validation for ', j,'\n')
    for train_index, test_index in skf.split(X, y):
        # split() yields integer positions, so select with .iloc; plain []
        # would look the numbers up as index labels.
        X_train0, X_test0 = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # The vectorizer is fitted on the training fold only
        X_train = vec.fit_transform(X_train0)
        X_test = vec.transform(X_test0)

        m,r = get_mdl(X_train, y_train)
        preds = m.predict_proba(X_test.multiply(r))[:,1]

        # Rounding the probability turns it into a 0/1 prediction
        print(classification_report(y_test, np.round(preds)))

Model Stratified 3-Ford Cross-Validation for  toxic 

             precision    recall  f1-score   support

          0       0.97      0.99      0.98     48093
          1       0.90      0.69      0.78      5098

avg / total       0.96      0.96      0.96     53191

             precision    recall  f1-score   support

          0       0.97      0.99      0.98     48092
          1       0.90      0.69      0.78      5098

avg / total       0.96      0.96      0.96     53190

             precision    recall  f1-score   support

          0       0.97      0.99      0.98     48092
          1       0.91      0.68      0.78      5098

avg / total       0.96      0.96      0.96     53190

Model Stratified 3-Ford Cross-Validation for  severe_toxic 

             precision    recall  f1-score   support

          0       0.99      1.00      0.99     52659
          1       0.48      0.26      0.34       532

avg / total       0.99      0.99      0.99     53191

             precision   

# Re-Model w/ all Toxic Train Data and Apply to Yelp Reviews

## Part 1. Fit to recommended file

In [14]:
n = train.shape[0]  # not referenced by any visible cell

# Refit the vectorizer on all training comments, then vectorize the regular reviews
x = vec.fit_transform(train[COMMENT])

test_x = vec.transform(regular_reviews['review'])

# One model per label.  Each probability column is assigned by name, so the
# values follow the frame's own row order whatever its index is, and running
# the cell again overwrites the columns instead of appending duplicates.
for j in label_cols:
    print('fit', j)
    m,r = get_mdl(x, train[j])
    regular_reviews[j] = m.predict_proba(test_x.multiply(r))[:,1]
# regular_reviews.to_csv('data/regular_NBSVM.csv', index=False)

fit toxic
fit severe_toxic
fit obscene
fit threat
fit insult
fit identity_hate


## Part 2. Fit to not-recommended file

In [15]:
# Vectorize the not-recommended reviews with `vec` as left by the preceding cell
test_x = vec.transform(not_recommended_reviews['review'])

# One model per label, fitted on the training matrix `x` from the preceding cell.
# Each probability column is assigned by name, so the values follow the frame's
# own row order whatever its index is, and running the cell again overwrites
# the columns instead of appending duplicates.
for j in label_cols:
    print('fit', j)
    m,r = get_mdl(x, train[j])
    not_recommended_reviews[j] = m.predict_proba(test_x.multiply(r))[:,1]
# not_recommended_reviews.to_csv('data/not_reviews_NBSVM.csv', index=False)

fit toxic
fit severe_toxic
fit obscene
fit threat
fit insult
fit identity_hate


# 4. Readability

There are multiple readability score formulas, as listed below.  They use counts of words, sentences, syllables, and/or characters.  Each formula differs slightly and outputs the grade level at which a reader can easily understand the text.  


 - Flesch-Kincaid
 - Coleman-Liau
 - Dale-Chall
 - SMOG
 - Automated Readability Index
 - Flesch Reading Ease (does not have min_age)

source <https://github.com/wimmuskee/readability-score>

### Readability Score Calculation

<https://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests>

https://en.wikipedia.org/wiki/Automated_readability_index

## Part 1. For recommended file

In [16]:
# Store the `min_age` attribute of the FleschKincaid and ARI calculators for
# every regular review, then save the table.
for column, calculator in (('readability_FK', FleschKincaid), ('readability_AR', ARI)):
    regular_reviews[column] = regular_reviews['review'].apply(
        lambda text, calc=calculator: calc(text).min_age
    )
regular_reviews.to_csv('data/reg_reviews_Readability.csv', index=False)

## Part 2. For not-recommended file

In [17]:
# Store the `min_age` attribute of the FleschKincaid and ARI calculators for
# every not-recommended review, then save the table.
for column, calculator in (('readability_FK', FleschKincaid), ('readability_AR', ARI)):
    not_recommended_reviews[column] = not_recommended_reviews['review'].apply(
        lambda text, calc=calculator: calc(text).min_age
    )
not_recommended_reviews.to_csv('data/not_reviews_Readability.csv', index=False)

# 5. Spam classification

Spam Corpus Data : http://myleott.com/op-spam.html


## Objective :  Classifying Yelp review into spam index [1=spam, 0=not spam].


## Method: 

1. The raw training data set is processed to have a train and a valid data set.  
Code to process: <https://drive.google.com/drive/folders/1VkS0TkjoeQp-vLnXQsw25YeUEiIcbANe>

2. Logistic regression model will be tested with the corpus data itself to check the accuracy of model.

3. Merge the corpus train and validation sets so the final model is trained on the whole corpus (1,600 reviews).

4. The trained model will be used to classify spam from the Yelp data set. 

In [18]:
# Read the two parts of the spam corpus.
# NOTE(review): `train` previously held the Toxic comments; from this cell on
# it refers to the spam corpus training part.
train, valid = (
    pd.read_csv(csv_path)
    for csv_path in ('data/train_SpamCorpus.csv', 'data/valid_SpamCorpus.csv')
)

In [19]:
# Stack the train and validation parts into one frame with a fresh 0..n-1 index.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0.
trainAll = pd.concat([train, valid], ignore_index=True)
len(trainAll)

1600

# Modeling/Validation within Spam Corpus Data Set

##  <span style="color:blue">Create a bag of words representation, as a term document matrix.</span>
##  <span style="color:blue">Use ngrams, as suggested in the NBSVM paper.</span>

In [20]:
import re, string
# Matches any single ASCII punctuation character or one of the listed
# typographic symbols; the capture group lets the substitution re-emit it.
# (The same pattern and function are also defined earlier in this notebook.)
re_tok = re.compile(f'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])')


def tokenize(s):
    """Pad every character matched by ``re_tok`` with spaces, then split on whitespace."""
    spaced = re_tok.sub(r' \1 ', s)
    return spaced.split()

### TF-IDF priors

In [21]:
n = train.shape[0]  # not referenced by any visible cell
# TF-IDF over unigrams and bigrams of the tokens produced by `tokenize`.
# The three flags are real booleans: recent scikit-learn releases validate
# them as bool and reject the integer 1.
vec = TfidfVectorizer(ngram_range=(1,2), tokenizer=tokenize,
               min_df=3, max_df=0.9, strip_accents='unicode', use_idf=True,
               smooth_idf=True, sublinear_tf=True )

### Basic naive bayes feature equation:

In [22]:
def pr(x, y_i, y):
    """Return the add-one-smoothed column sums of ``x`` over rows where ``y == y_i``.

    The column sums of the selected rows are incremented by one and divided by
    the number of selected rows plus one.
    """
    row_mask = (y == y_i)
    column_sums = x[row_mask].sum(0)
    return (column_sums + 1) / (row_mask.sum() + 1)

# Fit a model for one dependent at a time:
This function is designed for multiple dependent features/labels

In [23]:
def get_mdl(x, y):
    """Fit a logistic regression on features scaled by the log ratio of ``pr`` for the two classes.

    Parameters
    ----------
    x : sparse matrix
        Feature matrix; it must support boolean row selection and ``.multiply``.
    y : pandas.Series
        Labels compared against 1 and 0; only ``y.values`` is used.

    Returns
    -------
    tuple
        ``(fitted LogisticRegression, r)`` where ``r`` is
        ``log(pr(x, 1, y) / pr(x, 0, y))``.  New data must be multiplied by
        ``r`` before calling the model.
    """
    y = y.values
    r = np.log(pr(x, 1, y) / pr(x, 0, y))
    # dual=True is only implemented by the liblinear solver.  Naming the solver
    # keeps this working on scikit-learn versions whose default solver is lbfgs,
    # which raises ValueError for dual=True.
    m = LogisticRegression(C=4, dual=True, solver='liblinear')
    x_nb = x.multiply(r)
    return m.fit(x_nb, y), r

In [24]:
trainAll.head()

Unnamed: 0,class,polarity,source,fold,file,review
0,0,positive_polarity,deceptive_from_MTurk,2,d_talbott_9.txt,"excellent staff and customer service, very cle..."
1,0,positive_polarity,deceptive_from_MTurk,2,d_talbott_8.txt,my stay at this hotel was one of the best i ha...
2,0,positive_polarity,deceptive_from_MTurk,2,d_affinia_20.txt,we just got back from a trip to chicago for my...
3,0,positive_polarity,deceptive_from_MTurk,2,d_hardrock_18.txt,i have to say that the hard rock hotel in chic...
4,0,positive_polarity,deceptive_from_MTurk,2,d_hardrock_19.txt,my husband and i recently stayed at the hard r...


# Model cross-validation w/ Stratified k-fold (within spam corpus data set)

In [25]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report

skf = StratifiedKFold(n_splits = 3, shuffle = True, random_state = 1)

X = trainAll['review']
y = trainAll['class']

print('Model Stratified 3-Ford Cross-Validation:')
for train_index, test_index in skf.split(X, y):
    # split() yields integer positions, so select with .iloc; plain []
    # would look the numbers up as index labels.
    X_train0, X_test0 = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # The vectorizer is fitted on the training fold only
    X_train = vec.fit_transform(X_train0)
    X_test = vec.transform(X_test0)

    m,r = get_mdl(X_train, y_train)
    preds = m.predict_proba(X_test.multiply(r))[:,1]

    # Rounding the probability turns it into a 0/1 prediction
    print(classification_report(y_test, np.round(preds)))

Model Stratified 3-Ford Cross-Validation:
             precision    recall  f1-score   support

          0       0.87      0.89      0.88       267
          1       0.89      0.87      0.88       267

avg / total       0.88      0.88      0.88       534

             precision    recall  f1-score   support

          0       0.85      0.89      0.87       267
          1       0.88      0.84      0.86       267

avg / total       0.87      0.87      0.87       534

             precision    recall  f1-score   support

          0       0.85      0.90      0.87       266
          1       0.89      0.84      0.86       266

avg / total       0.87      0.87      0.87       532



# Re-Modeling w/ all 1600 spam corpus reviews to apply to Yelp reviews

In [26]:
n = trainAll.shape[0]  # not referenced by any visible cell
# Fresh vectorizer fitted on the whole spam corpus.  The three flags are real
# booleans: recent scikit-learn releases validate them as bool and reject the
# integer 1.
vec = TfidfVectorizer(ngram_range=(1,2), tokenizer=tokenize,
               min_df=3, max_df=0.9, strip_accents='unicode', use_idf=True,
               smooth_idf=True, sublinear_tf=True )
x = vec.fit_transform(trainAll.review)

# NOTE(review): the corpus preview shows 'deceptive_from_MTurk' rows with
# class 0, while the text above describes 1 as spam -- confirm which class
# value denotes spam before interpreting the predicted probability.
m,r = get_mdl(x,trainAll['class'])

# 4a. Prediction for recommended reviews

In [27]:
# Vectorize the regular reviews with the spam-corpus vectorizer and store the
# predicted probability of class 1 as the 'spam' column.
# NOTE(review): the output path is the same file this notebook reads at the
# start, so the input CSV is overwritten.
test_x = vec.transform(regular_reviews['review'])
spam_proba = m.predict_proba(test_x.multiply(r))
regular_reviews['spam'] = spam_proba[:, 1]
regular_reviews.to_csv('data/reg_reviews_NLP.csv', index=False)

# 4b. Predictions for not-recommended reviews

In [28]:
# Vectorize the not-recommended reviews with the spam-corpus vectorizer and
# store the predicted probability of class 1 as the 'spam' column.
# NOTE(review): the output path is the same file this notebook reads at the
# start, so the input CSV is overwritten.
test_x = vec.transform(not_recommended_reviews['review'])
spam_proba = m.predict_proba(test_x.multiply(r))
not_recommended_reviews['spam'] = spam_proba[:, 1]
not_recommended_reviews.to_csv('data/not_reviews_NLP.csv', index=False)