In [250]:
from nltk.tokenize import word_tokenize
import nltk
import re
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [253]:
# Load the spreadsheet, then keep only the rows whose PRONOUNS cell contains
# neither "its" nor "it's".
df = pd.read_excel("dataset.xlsx")
mentions_its = df['PRONOUNS'].str.contains('its')
mentions_it_s = df['PRONOUNS'].str.contains('it\'s')
df = df[~mentions_its & ~mentions_it_s]
def resolution_func(x):
    """Split a ';'-separated string into a list of its parts.

    Args:
        x: A cell value. Only values whose type is exactly ``str`` are split.

    Returns:
        ``x.split(';')`` for a string, otherwise ``None`` (e.g. for NaN/None
        cells).
    """
    if type(x) != str:
        return None
    return x.split(';')

# Turn each PRONOUNS cell into a list: drop its first and last character,
# remove every single and double quote, split on commas and keep the
# non-empty pieces (pieces are not whitespace-stripped here).
# NOTE(review): presumably the cells look like "['he', 'they']" - confirm.
df['PRONOUNS'] = df['PRONOUNS'].apply(
    lambda cell: [
        piece
        for piece in cell[1:-1].replace('\'', '').replace('"', '').split(',')
        if piece
    ]
)
df['RESOLUTION'] = df['RESOLUTION'].apply(resolution_func)

# Discard rows that ended up with no pronouns at all.
has_pronouns = df['PRONOUNS'].map(len) != 0
df = df[has_pronouns]
df.head()



219

In [254]:
def get_grammar_tree(text):
    """Tokenize, POS-tag and noun-phrase-chunk ``text`` into a flat term list.

    The text is tokenized with ``word_tokenize``, tagged with
    ``nltk.tag.pos_tag`` and chunked by an ``nltk.RegexpParser`` built from
    the ``NP`` grammar below. The parse result is then flattened.

    Args:
        text: The sentence to analyse.

    Returns:
        A list of ``(term, tag)`` tuples in sentence order. A token that was
        not chunked is returned as its own ``(word, tag)``; a chunk is
        returned as its words joined by single spaces, paired with the tag of
        the chunk's last word.
    """
    grammar = r"""
        NP:
            {<DT>?<JJ>?<NN.*>*<NN.*>} 

    """
    chunker = nltk.RegexpParser(grammar)
    tagged_tokens = nltk.tag.pos_tag(word_tokenize(text))
    chunked = chunker.parse(tagged_tokens)

    def get_terms(tree):
        # Plain tuples are unchunked tokens; anything else is a chunk subtree.
        terms = []
        for node in tree:
            if type(node) == tuple:
                terms.append((node[0], node[1]))
            else:
                leaves = node.leaves()
                phrase = ' '.join([word for word, _tag in leaves])
                terms.append((phrase, leaves[-1][1]))
        return terms

    return get_terms(chunked)

In [255]:
def get_pronoun_position(tree, pronouns):
    """Locate each pronoun in ``tree``, scanning left to right.

    Pronouns are matched in the order given: the search for each pronoun
    starts just after the position where the previous one was found, so
    repeated pronouns map to successive occurrences.

    Args:
        tree: Sequence of ``(term, tag)`` tuples.
        pronouns: Pronoun strings to look up; compared to ``term`` with ``==``
            (exact, case-sensitive match).

    Returns:
        A list with one index per pronoun, ``-1`` for a pronoun that was not
        found at or after the current search position. An empty ``pronouns``
        yields ``[]``.
    """

    def _get_position(tree, p, starting_index=0):
        # Index of the first term equal to ``p`` at or after ``starting_index``.
        for offset, term in enumerate(tree[starting_index:]):
            if term[0] == p:
                return offset + starting_index
        return -1

    positions = []
    cursor = 0
    for pronoun in pronouns:
        found = _get_position(tree, pronoun, starting_index=cursor)
        positions.append(found)
        # Only advance on a hit. A miss must not reset the cursor to the start
        # of the sentence, otherwise later pronouns could be matched to
        # occurrences that precede already-located ones.
        if found != -1:
            cursor = found + 1
    return positions


# -------- FEATURE FUNCTIONS --------

def get_plural_nouns(tree, nouns):
    """Return the indices from ``nouns`` whose term is treated as plural.

    A term counts as plural when its tag ends with ``'S'``, or when its tag is
    exactly ``"NNP"`` and its text ends with ``'s'``.

    Args:
        tree: Sequence of ``(term, tag)`` tuples.
        nouns: Indices into ``tree``.

    Returns:
        The matching indices, in their original order.
    """
    plural = []
    for idx in nouns:
        term = tree[idx]
        tag = term[1]
        if tag.endswith('S') or (tag == "NNP" and term[0].endswith('s')):
            plural.append(idx)
    return plural
def get_singular_nouns(tree, nouns):
    """Return the indices from ``nouns`` whose term is not treated as plural.

    A term is plural when its tag ends with ``'S'``, or when its tag is exactly
    ``"NNP"`` and its text ends with ``'s'``; every other term is kept.

    Args:
        tree: Sequence of ``(term, tag)`` tuples.
        nouns: Indices into ``tree``.

    Returns:
        The non-plural indices, in their original order.
    """
    singular = []
    for idx in nouns:
        term = tree[idx]
        tag = term[1]
        if tag.endswith('S'):
            continue
        if tag == "NNP" and term[0].endswith('s'):
            continue
        singular.append(idx)
    return singular

def noun_count_before_pronoun(tree, pos, nouns):
    """Count the leading entries of ``nouns`` that are smaller than ``pos``.

    Counting stops at the first index that is ``>= pos``; later entries are
    not examined.

    Args:
        tree: Unused.
        pos: Index of the pronoun.
        nouns: Noun indices, iterated in the given order.

    Returns:
        The number of entries seen before the first one that is ``>= pos``.
    """
    preceding = 0
    for noun_index in nouns:
        if noun_index >= pos:
            return preceding
        preceding += 1
    return preceding

def noun_count_after_pronoun(tree, pos, nouns):
    """Count the entries of ``nouns`` that are not smaller than ``pos``.

    Args:
        tree: Unused.
        pos: Index of the pronoun.
        nouns: Noun indices.

    Returns:
        The number of entries ``n`` for which ``n < pos`` is false.
    """
    return sum(1 for noun_index in nouns if not noun_index < pos)

def single_noun_before_pronoun(tree, pos, nouns):
    """Tell whether exactly one leading entry of ``nouns`` is below ``pos``.

    Entries are scanned in order and scanning stops at the first one that is
    ``>= pos``.

    Args:
        tree: Unused.
        pos: Index of the pronoun.
        nouns: Noun indices, iterated in the given order.

    Returns:
        ``True`` when exactly one entry was seen before the scan stopped.
    """
    seen = 0
    for noun_index in nouns:
        if noun_index >= pos:
            return seen == 1
        seen += 1
    return seen == 1

def only_one_same_quantity(tree, pos, nouns):
    """Check for exactly one earlier noun matching the pronoun's number.

    Only noun indices smaller than ``pos`` are considered. If the term at
    ``pos`` (lower-cased) is "their", "them" or "they", the plural nouns from
    ``get_plural_nouns`` are counted; otherwise the nouns from
    ``get_singular_nouns`` are counted.

    Args:
        tree: Sequence of ``(term, tag)`` tuples.
        pos: Index of the pronoun in ``tree``.
        nouns: Indices of the nouns in ``tree``.

    Returns:
        ``True`` when exactly one candidate noun was found, else ``False``.
    """
    preceding = [n for n in nouns if n < pos]
    if tree[pos][0].lower() in ("their", "them", "they"):
        candidates = get_plural_nouns(tree, preceding)
    else:
        candidates = get_singular_nouns(tree, preceding)
    return len(candidates) == 1

def is_there_initial_noun(nouns):
    """Tell whether the first noun index is 0.

    Args:
        nouns: Noun indices.

    Returns:
        ``True`` when ``nouns`` is non-empty and its first entry equals 0,
        otherwise ``False``.
    """
    return bool(len(nouns) > 0 and nouns[0] == 0)
    
def single_noun(nouns):
    """Return ``True`` when ``nouns`` holds exactly one entry."""
    noun_total = len(nouns)
    return noun_total == 1

def pronoun_counts(pronoun_positions):
    """Return how many pronoun positions were supplied."""
    total = len(pronoun_positions)
    return total

def initial_pronoun(pos):
    """Return ``True`` when the pronoun position ``pos`` equals 0."""
    at_start = pos == 0
    return at_start
           
# -------- END OF FEATURE FUNCTIONS --------

def get_feature_vector(tree, pronoun_position, nouns, pronoun_positions):
    """Build the feature list for one pronoun occurrence.

    Every position-dependent feature is computed from the ``pronoun_position``
    argument, so the function does not depend on any variable of the calling
    scope.

    Args:
        tree: Sequence of ``(term, tag)`` tuples for the sentence.
        pronoun_position: Index in ``tree`` of the pronoun being described.
        nouns: Indices of the nouns in ``tree``.
        pronoun_positions: Positions of all pronouns of the sentence.

    Returns:
        A list of eight values, in this order: single_noun_before_pronoun,
        noun_count_before_pronoun, noun_count_after_pronoun,
        only_one_same_quantity, is_there_initial_noun, single_noun,
        pronoun_counts, initial_pronoun.
    """
    return [
            single_noun_before_pronoun(tree, pronoun_position, nouns),
            noun_count_before_pronoun(tree, pronoun_position, nouns),
            noun_count_after_pronoun(tree, pronoun_position, nouns),
            only_one_same_quantity(tree, pronoun_position, nouns),
            is_there_initial_noun(nouns),
            single_noun(nouns),
            pronoun_counts(pronoun_positions),
            initial_pronoun(pronoun_position)
          ]



In [260]:

# Build one sample per located pronoun: X collects the feature vectors and y
# the labels (0 when the row's AMBIGUITY value is 'no', 1 for anything else).
X = []
y = []

for i, d in df.iterrows():
    tree = get_grammar_tree(d['USER STORIES'].strip())
    pronouns = [raw.strip() for raw in d['PRONOUNS']]
    pronoun_positions = get_pronoun_position(tree, pronouns)
    # Indices of the terms whose tag starts with 'NN'.
    nouns = [idx for idx, (_term, tag) in enumerate(tree) if tag.startswith('NN')]
    # NOTE(review): the loop variable is deliberately left as `p`;
    # get_feature_vector appears to read a module-level `p` - verify before
    # renaming it.
    for p in pronoun_positions:
        label = 0 if d['AMBIGUITY'] == 'no' else 1
        X.append(get_feature_vector(tree, p, nouns, pronoun_positions))
        y.append(label)


[[False, 3, 3, False, False, False, 2, False],
 [False, 4, 2, False, False, False, 2, False],
 [False, 9, 0, False, False, False, 1, False],
 [False, 3, 0, False, False, False, 1, False],
 [False, 4, 0, False, False, False, 1, False],
 [False, 2, 7, True, False, False, 1, False],
 [False, 4, 1, False, False, False, 1, False],
 [False, 2, 4, False, False, False, 1, False],
 [False, 2, 2, False, False, False, 1, False],
 [False, 6, 3, False, False, False, 1, False],
 [False, 4, 0, False, False, False, 1, False],
 [False, 4, 0, False, False, False, 1, False],
 [False, 4, 2, False, False, False, 1, False],
 [False, 5, 1, False, False, False, 1, False],
 [False, 2, 2, True, False, False, 1, False],
 [False, 4, 0, False, False, False, 1, False],
 [False, 3, 1, False, False, False, 1, False],
 [False, 3, 0, False, False, False, 1, False],
 [False, 5, 1, False, False, False, 1, False],
 [False, 5, 1, False, False, False, 1, False],
 [False, 5, 1, False, False, False, 1, False],
 [False, 6, 1, 

In [261]:
# Hold out 30% of the samples as the test set; the split is shuffled with a
# fixed random_state.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    shuffle=True,
    random_state=42,
)

In [262]:
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score

# Seed shared by every estimator that draws random numbers while fitting, so
# the reported scores are the same on each Restart & Run All.
RANDOM_STATE = 42

# Display names, index-aligned with `classifiers` below.
names = ["Nearest Neighbors", "Linear SVM", "RBF SVM", "Gaussian Process",
         "Decision Tree", "Random Forest", "Neural Net", "AdaBoost",
         "Naive Bayes", "QDA", "Logistic Regression"]

classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="linear", C=0.025),
    SVC(gamma=2, C=1),
    GaussianProcessClassifier(1.0 * RBF(1.0)),
    DecisionTreeClassifier(max_depth=5, random_state=RANDOM_STATE),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1,
                           random_state=RANDOM_STATE),
    MLPClassifier(alpha=1, max_iter=1000, random_state=RANDOM_STATE),
    AdaBoostClassifier(random_state=RANDOM_STATE),
    GaussianNB(),
    QuadraticDiscriminantAnalysis(),
    LogisticRegression()]

# The two lists are consumed with zip(), which silently truncates to the
# shorter one; fail loudly if they ever drift apart.
assert len(names) == len(classifiers), "names and classifiers must stay aligned"

# Fit every classifier on the training split, report accuracy / recall / F1
# on the test split, and collect the F1 scores in f1_scores.
f1_scores = []
for name, clf in zip(names, classifiers):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    print("%s:\t\t Accuracy: %f, Recall: %f F1 score: %f" % (name, accuracy, recall, f1))
    f1_scores.append(f1)


Nearest Neighbors:		 Accuracy: 0.458333, Recall: 0.257143 F1 score: 0.315789
Linear SVM:		 Accuracy: 0.513889, Recall: 0.000000 F1 score: 0.000000
RBF SVM:		 Accuracy: 0.500000, Recall: 0.228571 F1 score: 0.307692
Gaussian Process:		 Accuracy: 0.527778, Recall: 0.171429 F1 score: 0.260870
Decision Tree:		 Accuracy: 0.513889, Recall: 0.200000 F1 score: 0.285714
Random Forest:		 Accuracy: 0.569444, Recall: 0.371429 F1 score: 0.456140
Neural Net:		 Accuracy: 0.500000, Recall: 0.142857 F1 score: 0.217391
AdaBoost:		 Accuracy: 0.500000, Recall: 0.200000 F1 score: 0.280000
Naive Bayes:		 Accuracy: 0.486111, Recall: 0.028571 F1 score: 0.051282
QDA:		 Accuracy: 0.513889, Recall: 0.000000 F1 score: 0.000000
Logistic Regression:		 Accuracy: 0.486111, Recall: 0.057143 F1 score: 0.097561


  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])
