### This small project is meant to build rule-based and machine learning-based models to detect passive sentences.  It takes data from The Third Bush-Kerry Presidential Debat corpus, annotates it and classify the sentences based on n gram pos tags as feature. The pipeline involves data preprcessing, cleaning, preparation, labeling, applying feature selection, data imbalance handling (SMOTE). Three classifiers were used, Xgboost, Catboost and logistic regression. Additional code is also added to make the machine classifier available as a simple web API using Flask (a POST request with the text content as a payload).  
The accuracy metrics for the logistic regression classifier were as follows: accuracy 0.92, precision 1.00, recall 0.90, f1 0.95



In [427]:
import re
import nltk
from nltk.tag import pos_tag, map_tag
from itertools import dropwhile
import pandas as pd
import numpy as np
import collections


from nltk.corpus import CategorizedPlaintextCorpusReader

import random
from nltk.stem.snowball import SnowballStemmer
import numpy as np
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

from xgboost import XGBClassifier
from catboost import CatBoostClassifier, Pool, cv
from sklearn.metrics import confusion_matrix

from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import make_scorer, f1_score, roc_auc_score, precision_score, recall_score, confusion_matrix
from sklearn.model_selection import train_test_split

from sklearn.feature_selection import SelectKBest, mutual_info_classif

from sklearn.feature_selection import chi2
import pickle
############################################################################### 
import matplotlib.pyplot as plt

from imblearn.over_sampling import SMOTE

In [341]:
nltk.download('averaged_perceptron_tagger')
nltk.download('universal_tagset')
nltk.download('punkt')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\majed.aljefri\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     C:\Users\majed.aljefri\AppData\Roaming\nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\majed.aljefri\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [343]:
def over_sample_SMOTE(X_train, y_train):
    sm=SMOTE(sampling_strategy='not majority', random_state=10) # 'minority'
    X_train_ovr, y_train_ovr=sm.fit_sample(X_train, y_train)

    #print(X_train_ovr.shape, y_train_ovr.shape)
    return X_train_ovr, y_train_ovr

In [344]:
def evaluation(y_test, y_pred):
    accuracy = accuracy_score(y_test, y_pred)
    bl_accuracy = balanced_accuracy_score(y_test, y_pred)
    precision=precision_score(y_test, y_pred)
    recall=recall_score(y_test, y_pred)
    f1=f1_score(y_test, y_pred)
    
    return accuracy, precision, recall, f1

In [345]:
def print_evlauation(y_test, y_pred):
    accuracy, precision, recall, f1 = evaluation(y_test, y_pred)
    print('accuracy {:.2f}, precision {:.2f}, recall {:.2f}, f1 {:.2f}'.format(accuracy, precision, recall, f1))
    

In [346]:
def select_k_features(X_train,X_test,y_train,k):
    selection = SelectKBest(chi2, k)
    X_train = selection.fit_transform(X_train,y_train)
    X_test = selection.transform(X_test)
    
    return selection, X_train, X_test

In [381]:
def isPassive(sentence):
    '''
    For the sake of quick implementation, this function was utilized from 
    https://github.com/flycrane01/nltk-passive-voice-detector-for-English/blob/master/Passive-voice.py
    '''
    beforms = ['am', 'is', 'are', 'been', 'was', 'were', 'be', 'being']               # all forms of "be"
    aux = ['do', 'did', 'does', 'have', 'has', 'had']                                  # NLTK tags "do" and "have" as verbs, which can be misleading in the following section.
    words = nltk.word_tokenize(sentence)
    tokens = nltk.pos_tag(words)
    tags = [i[1] for i in tokens]
    if tags.count('VBN') == 0:                                                            # no PP, no passive voice.
        return False
    elif tags.count('VBN') == 1 and 'been' in words:                                    # one PP "been", still no passive voice.
        return False
    else:
        pos = [i for i in range(len(tags)) if tags[i] == 'VBN' and words[i] != 'been']  # gather all the PPs that are not "been".
        for end in pos:
            chunk = tags[:end]
            start = 0
            for i in range(len(chunk), 0, -1):
                last = chunk.pop()
                if last == 'NN' or last == 'PRP':
                    start = i                                                             # get the chunk between PP and the previous NN or PRP (which in most cases are subjects)
                    break
            sentchunk = words[start:end]
            tagschunk = tags[start:end]
            verbspos = [i for i in range(len(tagschunk)) if tagschunk[i].startswith('V')] # get all the verbs in between
            if verbspos != []:                                                            # if there are no verbs in between, it's not passive
                for i in verbspos:
                    if sentchunk[i].lower() not in beforms and sentchunk[i].lower() not in aux:  # check if they are all forms of "be" or auxiliaries such as "do" or "have".
                        break
                else:
                    return True
    return False

1.	Splitting the corpus into sentences, printing each sentence on a different line

In [380]:
corpus = 'C:/Side Projects/Learning Branch/2nd_Gore-Bush.txt'
f = open(corpus)
sentences = f.read()

sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', sentences)

for sent in sentences:
    print(sent)
    print('')



 Let's welcome the candidates, Governor Bush and Vice President Gore.

Good evening, from Wake Chapel at Wake Forest University at Winston-Salem, North Carolina.

I'm Jim Lehrer of the News Hour on PBS.

Welcome to this second election 2000 debate between the Republican candidate for president, George W.

Bush of Texas, and the Democratic candidate, Vice President Al Gore.

These debates are sponsored by the Commission on Presidential Debates.

The format and the rules are those negotiated by representatives of the two campaigns.

Only the subjects tonight and the questions are mine.

The format tonight is that of a conversation.

The only prevailing rule is that no single response can ever, ever exceed two minutes.

The prevailing rule for the audience here in the hall is as always, absolute quiet, please.

Good evening, Governor Bush, Vice President Gore.

The end of our 90 minutes last week in Boston, the total time each of you took was virtually the same.

Let's see if we can do 

### 2.	For each sentence, extract the part-of-speech (POS) tag of each word (Note: use one of the open-source automatic POS taggers that exist out there).


In [384]:
for sent in sentences:
    
    words = nltk.word_tokenize(sent)
    tokens = nltk.pos_tag(words)
    tags = [i[1] for i in tokens]
    print(tags)

['VB', 'POS', 'VB', 'DT', 'NNS', ',', 'NNP', 'NNP', 'CC', 'NNP', 'NNP', 'NNP', '.']
['JJ', 'NN', ',', 'IN', 'NNP', 'NNP', 'IN', 'NNP', 'NNP', 'NNP', 'IN', 'NNP', ',', 'NNP', 'NNP', '.']
['PRP', 'VBP', 'NNP', 'NNP', 'IN', 'DT', 'NNP', 'NNP', 'IN', 'NNP', '.']
['VB', 'TO', 'DT', 'JJ', 'NN', 'CD', 'NN', 'IN', 'DT', 'NNP', 'NN', 'IN', 'NN', ',', 'NNP', 'NNP', '.']
['NNP', 'IN', 'NNP', ',', 'CC', 'DT', 'JJ', 'NN', ',', 'NNP', 'NNP', 'NNP', 'NNP', '.']
['DT', 'NNS', 'VBP', 'VBN', 'IN', 'DT', 'NNP', 'IN', 'NNP', 'NNP', '.']
['DT', 'NN', 'CC', 'DT', 'NNS', 'VBP', 'DT', 'VBN', 'IN', 'NNS', 'IN', 'DT', 'CD', 'NNS', '.']
['RB', 'DT', 'NNS', 'NN', 'CC', 'DT', 'NNS', 'VBP', 'JJ', '.']
['DT', 'NN', 'NN', 'VBZ', 'IN', 'IN', 'DT', 'NN', '.']
['DT', 'JJ', 'NN', 'NN', 'VBZ', 'IN', 'DT', 'JJ', 'NN', 'MD', 'RB', ',', 'RB', 'VBP', 'CD', 'NNS', '.']
['DT', 'NN', 'NN', 'IN', 'DT', 'NN', 'RB', 'IN', 'DT', 'NN', 'VBZ', 'IN', 'RB', ',', 'JJ', 'NN', ',', 'NN', '.']
['JJ', 'NN', ',', 'NNP', 'NNP', ',', 'NNP', 'NN

## 3.	Extract only passive sentences, meaning only sentences where a participle (POS tag: VBN) is preceded by a form of the verb "to be" (am, is, are, be, been, being, was, were. Make sure to include contractions like 's, 're, and 'm as well). 


In [392]:
passive = []
active = []
for s in sentences:
    if isPassive(s):
        passive.append(s)
    else:
        active.append(s)

In [393]:
print('Passsive',len(passive))
print('Active',len(active))

Passsive 118
Active 970


### 4. Use the sentences that were extracted in the previous item (and those that were not) to create a machine learning based text classifier. The classifier gets a new sentence as an input and determines whether it is in the passive voice or not.

### I also added additional sentences for both classes from an additional resource

In [394]:
file = open('C:/Side Projects/Learning Branch/Passive.txt',"r+")  
additional_passive = file.read() 
file.close()

file = open('C:/Side Projects/Learning Branch/Active.txt',"r+")  
additional_active = file.read() 
file.close()

In [395]:
for s in additional_passive.split('.'):
    passive.append(s)

for s in additional_active.split('.'):
    active.append(s)

In [396]:
print('Passsive',len(passive))
print('Active',len(active))

Passsive 211
Active 1043


Build a dataframe df with all sentences and corresponsing labels

In [397]:
passive_sents = {'Sentence': passive,
        'Label': np.zeros(len(passive))
        }

active_sents = {'Sentence': active,
        'Label': np.ones(len(active))
        }
df1 = pd.DataFrame(passive_sents, columns = ['Sentence','Label'])
df2 = pd.DataFrame(active_sents, columns = ['Sentence','Label'])

df = pd.concat([df1, df2])


In [398]:
df.head()

Unnamed: 0,Sentence,Label
0,These debates are sponsored by the Commission ...,0.0
1,The format and the rules are those negotiated ...,0.0
2,One of you is about to be elected the leader o...,0.0
3,I also understand that an administration is no...,0.0
4,I haven't started naming names except for one ...,0.0


In [399]:
df.tail()

Unnamed: 0,Sentence,Label
1038,\\nI may not buy the computer,1.0
1039,\\nHe may sell the house,1.0
1040,\\nMay I buy the computer?\\nRisky can not buy...,1.0
1041,\\nShe can sell the car every time,1.0
1042,"\\nCan she play a violin?""\n",1.0


Some funtions for feature extraction. Mainly the features will be n-grams of POS tags as these are believed to reveal the passive voices in a given sentence if found

In [400]:
import string 
def get_tages (row):
    words_ = nltk.word_tokenize(row['Sentence'])
    words = [] 
    for word in words_: # Go through every word in your tokens list
        if (word not in string.punctuation):  # remove punctuation
            words.append(word)
            
    tokens = nltk.pos_tag(words)
    tags = [i[1] for i in tokens]
    
    return ' '.join(tags)

In [401]:
def get_tages2 (sent):
    
    words_ = nltk.word_tokenize(sent)
    words = [] 
    for word in words_: # Go through every word in your tokens list
        if (word not in string.punctuation):  # remove punctuation
            words.append(word)
        
    tokens = nltk.pos_tag(words)
    tags = [i[1] for i in tokens]
    
    return ' '.join(tags)

In [402]:
# add a new colums with pos tags to the dataframe
df['pos_tags'] = df.apply(lambda row: get_tages(row), axis=1)


In [404]:
# apply tfidf for feature extraction 
cv = TfidfVectorizer(ngram_range=(2, 4), sublinear_tf  =True)
        
X = np.array(df['pos_tags'])
y = np.array(df['Label'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=1, stratify=y)

X_train = cv.fit_transform(X_train) 
X_test = cv.transform(X_test)


In [405]:
#apply feature selection, here the top 200 n-grams will be used by the classifiers
selection, X_train, X_test = select_k_features(X_train,X_test,y_train,200)
X_train, y_train = over_sample_SMOTE(X_train, y_train)



In [406]:
cb_clf = CatBoostClassifier(verbose= False)
cb_clf.fit(X_train, y_train)

<catboost.core.CatBoostClassifier at 0x2760bdc9fd0>

In [407]:
y_pred = cb_clf.predict(X_test)
print(confusion_matrix(y_test, y_pred) )
print_evlauation(y_test, y_pred)

[[21  0]
 [ 7 98]]
accuracy 0.94, precision 1.00, recall 0.93, f1 0.97


In [408]:
rl_clf = LogisticRegression()
rl_clf.fit(X_train, y_train)
y_pred = rl_clf.predict(X_test)
print(confusion_matrix(y_test, y_pred) )
print_evlauation(y_test, y_pred)

[[21  0]
 [10 95]]
accuracy 0.92, precision 1.00, recall 0.90, f1 0.95


### Predict new sentences

In [410]:
def new_prediction(sentence,selection):
   
    tags = np.array(str(get_tages2(sentence)))
    features = cv.transform(tags.ravel())
   
    features = selection.transform(features[0]) 
    if rl_clf.predict(features):
        return 'Active'
    else:
        return 'Passive'
    

In [416]:
new_prediction('The checken was eaten by Mike',selection)

'Passive'

In [420]:
new_prediction('I built this cool passive detector',selection)

'Active'

In [419]:
new_prediction('My home has been built by a builder',selection)

'Passive'

## 5.	Make the machine classifier available as a simple web API using Flask (a POST request with the text content as a payload).  The API must be optimized for performance.
   

train a model on all data 

In [413]:
X_ = cv.fit_transform(X) 
#apply feature selection, here the top 200 n-grams will be used by the classifiers
selection = SelectKBest(chi2, 200)
X_ = selection.fit_transform(X_,y)
# apply oversampling
X_, y = over_sample_SMOTE(X_, y)



In [414]:
rl_clf = LogisticRegression()
rl_clf.fit(X_, y)

LogisticRegression()

In [418]:
model_file = 'C:/Side Projects/Learning Branch/model.pickle'
pickle.dump([rl_clf, cv, selection], open(model_file, 'wb'))

In [376]:
# This is meant to be in a separate python file
#app.py
from flask import Flask, request, redirect, url_for, flash, jsonify
import numpy as np
import pickle as p
import json
import string

app = Flask(__name__)
model,selection = p.load(open(model_file, 'rb'))


@app.route('/')
def home():
    return render_template('index.html')

@app.route('/predict_api/', methods=['POST'])
def predict_api():
    new_sentence = request.get_json(force = True)
    prediction = new_prediction(new_sentence, model, selection)
    
    return jsonify(prediction)

def get_tages2 (sent):
    
    words_ = nltk.word_tokenize(sent)
    words = [] 
    for word in words_: # Go through every word in your tokens list
        if (word not in string.punctuation):  # remove punctuation
            words.append(word)
        
    tokens = nltk.pos_tag(words)
    tags = [i[1] for i in tokens]
    
    return ' '.join(tags)

def new_prediction(sentence,model, selection):
   
    tags = np.array(str(get_tages2(sentence)))
    features = cv.transform(tags.ravel())
   
    features = selection.transform(features[0]) 
    if model.predict(features):
        return 'Active'
    else:
        return 'Passive'
    
if __name__ == '__main__':
    app.run(debug=True)

 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
   Use a production WSGI server instead.
 * Debug mode: on


 * Restarting with windowsapi reloader


SystemExit: 1

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [None]:
# This is also meant to be in a separate file (the request file)
# -*- coding: utf-8 -*-
"""
Created on Tue Dec  8 16:43:23 2020

@author: majed.aljefri
"""
import requests
import json

url = 'http://127.0.0.1:5000/predict_api/'

data = 'The cake was eaten by Adam'
j_data = json.dumps(data)
headers = {'content-type': 'application/json', 'Accept-Charset': 'UTF-8'}
r = requests.post(url, data=j_data, headers=headers)
print('Sentence: ',data)
print(r.text)



## 6.	Some Unittest for all functions created


In [421]:
import unittest

In [426]:
class TestPassvie(unittest.TestCase):
    def test_passive1(self):
        # test if sentence is not string
        self.assertRaises(ValueError, new_prediction, ' 1223 is going 15')
        
    def test_passive2(self):
        # test if sentence is not string
        self.assertAlmostEqual(new_prediction('I built this cool passive detector',selection),'Active')
        self.assertAlmostEqual(new_prediction('The checken was eaten by Mike',selection),'Passive')
        
    