In [None]:
# Imports and one-time resource downloads for this notebook; takes a few seconds to run
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import csv
import numpy as np
import spacy
import sklearn
import pickle
import re
from sklearn.preprocessing import MinMaxScaler
nltk.download('averaged_perceptron_tagger')
stop_words = stopwords.words('english')
lemmatizer = WordNetLemmatizer()
string_punctuation = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
stop_words = set(stopwords.words('english'))
!python -m spacy download en_core_web_md

# Spacy model imported
nlp = spacy.load("en_core_web_md")

In [2]:
# Put the Anvil sentence text to score in this list (one string per sentence)
sentences = [
    "How's it going?",
    "It is our honest opinion that our organization is not properly positioned to invest in that platform.",
    "We have reviewed your application and, unfortunately, we have decided to move forward with another applicant for this position.",
    "Hello I am very nice to meet you!",
    "I need help with my project.",
]

# One row per sentence, held in a single 'text' column
data = pd.DataFrame({'text': sentences})

In [3]:


# Active/passive voice detection

def checkForSentType(inputSentence):
    """Classify a sentence as passive (0) or active (1).

    The sentence is parsed with the module-level spaCy pipeline `nlp`. If the
    dependency label of any token contains 'agent' or 'nsubjpass', the
    sentence is reported as passive; otherwise it is reported as active.

    Parameters
    ----------
    inputSentence : text handed to `nlp`.

    Returns
    -------
    int : 0 for passive, 1 for active.
    """
    passive_markers = ('agent', 'nsubjpass')

    for token in nlp(inputSentence):
        dependency = token.dep_
        # Substring test, so a label merely containing a marker also counts
        if any(marker in dependency for marker in passive_markers):
            return 0

    # No passive marker anywhere in the parse
    return 1


# Load the formal / informal word lists. Each workbook is read without a header
# row and the words are taken from its first column (column 0).
formal_words = pd.read_excel('formal_words.xlsx', header=None)
formal_list = formal_words[0].values.tolist()
informal_words = pd.read_excel('informal_words.xlsx', header=None)
# Built from informal_words (it was previously read from formal_words, which
# made informal_list a duplicate of formal_list)
informal_list = informal_words[0].values.tolist()


# Formal pronouns
# Often third person
Formal_Pronouns = [
    "one","oneself","one's",
    "who","whom","whomst","whose",
    "they","them","their","theirs","themself","themselves","theirself","theirselves",
    "it","its","itself",
    "he","him","himself","his",
    "she","her","herself","hers"
]

# Informal pronouns
# Often first or second person
# The feature extractor lowercases every token before looking it up here, so
# the lower-case "i" is the form that actually matches; the capitalised "I" is
# kept for any case-sensitive lookup.
Informal_Pronouns = [
    "i","I","me","mine","mines","my","myself",
    "we","us","ourself","ourselves","our","ours",
    "you","your","yourself","yours","yourselves","y'all","yall","y'all's"
]

#Feature Extracter
def anvil_cleaner(data):
    """Add the formality features to `data` (in place) and return it.

    `data` must have a 'text' column; every value is converted with str().
    A "word" is a whitespace-separated chunk of the text, and a "token" is the
    same chunk lowercased (punctuation stays attached). Columns are written in
    this order:

    - 'Word Count': number of words.
    - 'Formal Pronoun' / 'Informal Pronoun': share of tokens found in
      Formal_Pronouns / Informal_Pronouns (compared case-insensitively).
    - 'Contractions': apostrophes per word.
    - 'Active Voice' / 'Passive Voice': checkForSentType(text) and 1 minus it.
    - 'clean_text': text with apostrophes removed, every non-letter replaced
      by a space, lowercased, and words equal to a stop word dropped.
    - 'Character count': len(text).
    - 'average characters per word': 'Character count' / 'Word Count'.
    - 'stopword count': share of tokens found in stop_words.
    - 'adverb count': share of tokens found in the adverb vocabulary, which is
      built by POS-tagging ALL rows of `data` together, so a row's value
      depends on the other rows in the batch.
    - 'punctuation count': characters of string_punctuation per word.
    - 'mean sentance length': mean length of the words in the text (the
      column name is kept as-is because callers select columns by name).

    Rows with zero words get 0.0 for every per-word ratio and for the mean
    word length, instead of NaN/inf.

    NOTE(review): the pickled model consumes these columns; confirm that its
    training features were produced with the same definitions.

    Parameters
    ----------
    data : pandas.DataFrame with a 'text' column. It is mutated: the columns
        above are added or overwritten.

    Returns
    -------
    pandas.DataFrame : the same `data` object.
    """
    raw = data['text'].map(str)
    words = raw.map(lambda sentence: sentence.split())
    tokens = raw.map(lambda sentence: sentence.lower().split())

    data['Word Count'] = words.map(len)

    # NaN divisor for zero-word rows so the ratio can be mapped to 0.0 below
    words_or_nan = data['Word Count'].where(data['Word Count'] > 0)

    def per_word(counts):
        """Divide per-row counts by the word count (0.0 where there are no words)."""
        return (counts / words_or_nan).fillna(0.0)

    def share_of(vocabulary):
        """Share of each row's lowercased tokens that appear in `vocabulary`."""
        return per_word(tokens.map(lambda row: sum(tok in vocabulary for tok in row)))

    # Tokens are lowercased, so the pronoun lists are lowercased as well;
    # otherwise a capitalised entry such as "I" can never match.
    data['Formal Pronoun'] = share_of({p.lower() for p in Formal_Pronouns})
    data['Informal Pronoun'] = share_of({p.lower() for p in Informal_Pronouns})

    # Apostrophes are used as a proxy for contractions
    data['Contractions'] = per_word(raw.map(lambda sentence: sentence.count("'")))

    # Parse each sentence once; passive is defined as the complement of active
    data['Active Voice'] = raw.map(checkForSentType)
    data['Passive Voice'] = 1 - data['Active Voice']

    def strip_and_filter(sentence):
        """Build the 'clean_text' value for one sentence."""
        # Apostrophes go first so a contraction stays a single word
        without_apostrophes = re.sub("'", '', sentence)
        lowered = (re.sub(r'[^a-zA-Z]', ' ', word).lower()
                   for word in without_apostrophes.split())
        return ' '.join(word for word in lowered if word not in stop_words)

    data['clean_text'] = raw.map(strip_and_filter)

    data['Character count'] = raw.map(len)
    data['average characters per word'] = per_word(data['Character count'])
    data['stopword count'] = share_of(stop_words)

    # POS-tag the whole batch at once. Sentences are joined with a space so the
    # last word of one row is not glued to the first word of the next.
    tagged_words = nltk.pos_tag(nltk.word_tokenize(' '.join(raw)))

    adverb_tags = {'RB', 'RBR', 'RBS', 'WRB'}  # POS tags of adverbs
    # Lowercased to match the lowercased tokens it is compared against
    adverb_vocabulary = {word.lower() for word, tag in tagged_words if tag in adverb_tags}
    data['adverb count'] = share_of(adverb_vocabulary)

    # Counted on the raw text: 'clean_text' has had every non-letter replaced
    # by a space, so counting there always gives 0.
    data['punctuation count'] = per_word(
        raw.map(lambda sentence: sum(ch in string_punctuation for ch in sentence)))

    data['mean sentance length'] = words.map(
        lambda row: float(np.mean([len(word) for word in row])) if row else 0.0)
    return data



In [4]:
#Load in trained pickle model (Logistic regression)
# NOTE(security): pickle.load can execute arbitrary code stored in the file;
# only load formality_model.pkl from a source you trust.
with open('formality_model.pkl', 'rb') as f:
    lr_loaded = pickle.load(f)
clean_data = anvil_cleaner(data)

#Transforms data to be inserted into model (drops text and scales)
clean_data = clean_data.drop(['text', 'clean_text'], axis=1)
cols = clean_data.columns

# NOTE(review): the scaler is fitted on this batch, so each value is scaled
# relative to the other sentences being scored (a single sentence scales to
# all zeros). If the scaler fitted at training time was saved, load it and call
# transform() instead of fit_transform() - TODO confirm.
scaler = MinMaxScaler()
clean_data = scaler.fit_transform(clean_data)

# Pass the Index itself: wrapping it in a list (columns=[cols]) builds a
# one-level MultiIndex whose labels are tuples rather than the feature names.
clean_data = pd.DataFrame(clean_data, columns=cols)

#Get probabilities of formalities (0 is informal, 1 is formal)
lr_loaded.predict_proba(clean_data)[:,1]

