In [18]:
from nltk.tokenize import word_tokenize
import nltk
import re
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
"asd,asd,asd".split('q',1)

['asd,asd,asd']

In [146]:
df = pd.read_excel("dataset.xlsx")
df['PRONOUNS'] = df['PRONOUNS'].str.replace('it\'s', 'it')
df['USER STORIES'] = df['USER STORIES'].str.replace('it\'s', 'it is')
df['USER STORIES'] =df['USER STORIES'].apply(lambda x: x.split(',',1)[-1])  
def resolution_func(x):
    if type(x) == str:
        return x.split(';')
    return None

df['PRONOUNS'] = df['PRONOUNS'].apply(lambda x:[a.strip()[1:-1] for a in x[1:-1].split(',') if a != ''])
df['RESOLUTION'] = df['RESOLUTION'].apply(resolution_func)
df = df[df['PRONOUNS'].map(len) != 0]
#df.head()


In [157]:
def get_grammar_tree(text):
    grammar = r"""
        NP:
            {<DT>?<PRP.*>?<JJ>?<NN.*>*<IN>?<NN.*>}
        NP: 
            {<NP><IN><NP>}
        NP: 
            {<NP><IN><DT><VBG><NP>}
        NP: 
            {<NP><CC><NP>}
    """
    chunker = nltk.RegexpParser(grammar)
    postoks = nltk.tag.pos_tag(word_tokenize(text))
    tree = chunker.parse(postoks)
    
    def get_terms(tree):
        ret = []
        for i,t in enumerate(tree):
            if type(t) == tuple:
                ret.append((t[0], t[1]))
            else:
                ret.append((' '.join([w for w,tt in t.leaves()]), t.leaves()[-1][1]))
        return ret

    return get_terms(tree)

In [158]:
def get_pronoun_position(tree, pronouns):
    
    def _get_position(tree, p,  starting_index = 0):
        for i,t in enumerate(tree[starting_index:]):

            if t[0] == p:
                return i + starting_index
        return -1
    
    if len(pronouns) == 1:
        return [_get_position(tree, pronouns[0])]
    else:
        index = 0
        poss=[]
        for pronoun in pronouns:
            index = _get_position(tree, pronoun, starting_index = index)
            poss.append(index)
            index = index + 1
        return poss

# -------- FEATURE FUNCTIONS --------

def get_plural_nouns(tree, nouns):
    return [n for n in nouns if tree[n][1].endswith('S') or (tree[n][1] == "NNP" and tree[n][0].endswith('s'))]

def get_singular_nouns(tree, nouns):       
    return [n for n in nouns if not (tree[n][1].endswith('S') or (tree[n][1] == "NNP" and tree[n][0].endswith('s')))]

def noun_count_before_pronoun(tree, pos, nouns):
    count = 0
    for n in nouns:
        if n >= pos:
            break
        count += 1
    return count

def noun_count_after_pronoun(tree, pos, nouns):
    count = 0
    for n in nouns:
        if n < pos:
            continue
        count += 1
    return count

def single_noun_before_pronoun(tree, pos, nouns):
    count = 0
    for n in nouns:
        if n >= pos:
            break
        count += 1
    return count == 1

def only_one_same_quantity(tree, pos, nouns):
    nouns = [n for n in nouns if n < pos]
    if tree[pos][0].lower() == "their" or tree[pos][0].lower() == "them" or tree[pos][0].lower() == "they":
        plural_nouns = get_plural_nouns(tree,nouns)
        if len(plural_nouns) == 1:
            return True
    else:
        singular_nouns = get_singular_nouns(tree,nouns)
        if len(singular_nouns) == 1:
            return True
    return False

def is_there_initial_noun(nouns):
    if len(nouns) > 0 and nouns[0] == 0:
        return True
    else: return False
    
def single_noun(nouns):
    return len(nouns) == 1

def pronoun_counts(pronoun_positions):
    return len(pronoun_positions)

def initial_pronoun(pos):
    return pos == 0

def special_noun(nouns, tree):
    for n in nouns:
        if tree[n][0].isupper():
            return True
    return False

def special_single_noun(nouns, tree):
    if len([tree[n][0].isupper() for n in nouns]) == 1:
        return True
    return False

def single_little_special_noun(nouns, tree):
    if len([len([t for t in tree[n][0] if t.isupper()]) > 2 for n in nouns]) == 1:
            return True
    return False

def single_noun_with_the(nouns, tree):
    if len(["the" in tree[n][0].lower() for n in nouns]) == 1:
            return True
    return False

def single_noun_with_a_an(nouns, tree):
    if np.sum(["a" in tree[n][0].lower().split() for n in nouns]) == 1:
        return True
    elif np.sum(["an" in tree[n][0].lower().split() for n in nouns]) == 1:
        return True
    elif np.sum(["my"==tree[n-1][0].lower() for n in nouns]) == 1:
        return True
    return False

# -------- FEATURE FUNCTIONS --------

def get_feature_vector(tree, pronoun_position, nouns, pronoun_positions):
    return [
            single_noun_before_pronoun(tree,p,nouns),
            noun_count_before_pronoun(tree,p,nouns),
            noun_count_after_pronoun(tree,p,nouns),
            only_one_same_quantity(tree,p,nouns),
            is_there_initial_noun(nouns),
            single_noun(nouns),
            pronoun_counts(pronoun_positions),
            initial_pronoun(pronoun_position),
            special_noun(nouns, tree),
            special_single_noun(nouns, tree),
            single_little_special_noun(nouns, tree),
            single_noun_with_the(nouns, tree),
            single_noun_with_a_an(nouns, tree),
          ]


In [159]:
train_df, test_df = train_test_split(df, test_size=0.3, shuffle=False)

X_train=[]
y_train=[]
print("size of train df %d" % len(train_df))
for i,d in train_df.iterrows():
    tree = get_grammar_tree(d['USER STORIES'].strip())
    pronouns = [p.strip() for p in d['PRONOUNS'] ]
    pronoun_positions = get_pronoun_position(tree, pronouns)
    nouns = [i for i,(t,p) in enumerate(tree) if p.startswith('NN')]
    for p in pronoun_positions:
        if d['AMBIGUITY'] == 'no':
            X_train.append(get_feature_vector(tree, p, nouns, pronoun_positions))
            y_train.append(0)
        else: 
            X_train.append(get_feature_vector(tree, p, nouns, pronoun_positions))
            y_train.append(1)


size of train df 173


In [160]:
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score

names = ["Nearest Neighbors", "Linear SVM", "RBF SVM", "Gaussian Process",
         "Decision Tree", "Random Forest", "Neural Net", "AdaBoost",
         "Naive Bayes",
         #"QDA", 
         "Logistic Regression"
        ]

classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="linear", C=0.025),
    SVC(gamma=2, C=1),
    GaussianProcessClassifier(1.0 * RBF(1.0)),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    MLPClassifier(alpha=1, max_iter=1000),
    AdaBoostClassifier(),
    GaussianNB(),
    #QuadraticDiscriminantAnalysis(),
    LogisticRegression()
]

f1_scores = []
print("size of test df: %d" % len(test_df))

for name,clf in zip(names,classifiers):
    clf.fit(X_train, y_train)
    y_test = []
    y_pred = []
    for i,d in test_df.iterrows():
        tree = get_grammar_tree(d['USER STORIES'].strip())
        pronouns = [p.strip() for p in d['PRONOUNS'] ]
        pronoun_positions = get_pronoun_position(tree, pronouns)
        nouns = [i for i,(t,p) in enumerate(tree) if p.startswith('NN')]
        X = []
        y = []
        
        for p in pronoun_positions:
            if d['AMBIGUITY'] == 'no':
                X.append(get_feature_vector(tree, p, nouns, pronoun_positions))
                y.append(0)
            else: 
                X.append(get_feature_vector(tree, p, nouns, pronoun_positions))
                y.append(1)
        y_pred_temp = clf.predict(X)
        
        if 1 in y_pred_temp:
            y_pred.append(1)
        else:
            y_pred.append(0)
            
        if 1 in y:
            y_test.append(1)
        else:
            y_test.append(0)
        
        
    print("%s:\t\t Accuracy: %.2f, Recall: %.2f, F1_score: %.2f, precision: %.2f" % (name, accuracy_score(y_test, y_pred),recall_score(y_test, y_pred),f1_score(y_test, y_pred), precision_score(y_test,y_pred)))
    f1_scores.append(f1_score(y_test, y_pred))
print(len(y_test))

size of test df: 75
Nearest Neighbors:		 Accuracy: 0.45, Recall: 0.20, F1_score: 0.23, precision: 0.26
Linear SVM:		 Accuracy: 0.63, Recall: 0.20, F1_score: 0.30, precision: 0.60
RBF SVM:		 Accuracy: 0.43, Recall: 0.17, F1_score: 0.19, precision: 0.22
Gaussian Process:		 Accuracy: 0.52, Recall: 0.27, F1_score: 0.31, precision: 0.36
Decision Tree:		 Accuracy: 0.39, Recall: 0.10, F1_score: 0.12, precision: 0.14
Random Forest:		 Accuracy: 0.53, Recall: 0.13, F1_score: 0.19, precision: 0.31
Neural Net:		 Accuracy: 0.47, Recall: 0.20, F1_score: 0.23, precision: 0.27
AdaBoost:		 Accuracy: 0.44, Recall: 0.23, F1_score: 0.25, precision: 0.27
Naive Bayes:		 Accuracy: 0.39, Recall: 0.63, F1_score: 0.45, precision: 0.35
Logistic Regression:		 Accuracy: 0.53, Recall: 0.30, F1_score: 0.34, precision: 0.39
75


In [161]:
np.asarray(y_pred)

array([0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       1, 1, 0, 1, 1, 0, 0, 1, 0])

In [162]:
from textblob import TextBlob
import nltk
nltk.download('brown')
text = df.loc[3]['USER STORIES']
text= "As a Platform administrator, I want to be able to translate the data types hierarchies of the Viewer while in embed mode, So that my users can understand the interface in their native language."
text = "As a Developer, I want a jQuery plugin for Core Data Packages, so that I can use it to apply to form control that uses a core dataset for autocompletion."
text = "As a site visitor, I want to see a list of all upcoming Other Courses and can page through them if necessary, so that I can choose the best course for me."
text = "As a trainer, I want to update one of my existing courses or events, so that it reflects accurate information."
text = "As a Staff member, I want to create each condo unit, so that I can associate it to the child parcel and address after the condominiums are recorded in the system."
text = "As a moderator, I want to edit an item in the list of items to be estimated, so that I can make it better reflect the team's understanding of the item."
text = "As a Consumer, I want to know that the data I am downloading is good and can be relied on, so that that I don't have to check it myself or run into annoying bugs later on."
blob = TextBlob(text)

print(text)
print(blob.noun_phrases)

nouns = [t for i,(t,p) in enumerate(get_grammar_tree(text)) if p.startswith('NN')]
print(nouns)
get_grammar_tree(text)

As a Consumer, I want to know that the data I am downloading is good and can be relied on, so that that I don't have to check it myself or run into annoying bugs later on.
[]
['a Consumer', 'the data', 'bugs']


[nltk_data] Downloading package brown to /Users/emreeren/nltk_data...
[nltk_data]   Package brown is already up-to-date!


[('As', 'IN'),
 ('a Consumer', 'NNP'),
 (',', ','),
 ('I', 'PRP'),
 ('want', 'VBP'),
 ('to', 'TO'),
 ('know', 'VB'),
 ('that', 'IN'),
 ('the data', 'NN'),
 ('I', 'PRP'),
 ('am', 'VBP'),
 ('downloading', 'VBG'),
 ('is', 'VBZ'),
 ('good', 'JJ'),
 ('and', 'CC'),
 ('can', 'MD'),
 ('be', 'VB'),
 ('relied', 'VBN'),
 ('on', 'IN'),
 (',', ','),
 ('so', 'IN'),
 ('that', 'IN'),
 ('that', 'DT'),
 ('I', 'PRP'),
 ('do', 'VBP'),
 ("n't", 'RB'),
 ('have', 'VB'),
 ('to', 'TO'),
 ('check', 'VB'),
 ('it', 'PRP'),
 ('myself', 'PRP'),
 ('or', 'CC'),
 ('run', 'VB'),
 ('into', 'IN'),
 ('annoying', 'VBG'),
 ('bugs', 'NNS'),
 ('later', 'RB'),
 ('on', 'IN'),
 ('.', '.')]

In [21]:
df['USER STORIES']

0       I want the publish button in FABS to deactiva...
1       I want the DUNS validations to accept records...
2       I want the header information box to show upd...
3       I want to successfully Conduct a Plan Review ...
4       I want to Review Plans, so that I can review ...
                             ...                        
244     I want to know what the intellectual value of...
245     I want to have a mechanism to obtain a listin...
246     I want to have files adequately described, so...
247     I want to assess the probability/weight of a ...
248     I want to recommend different projects to vol...
Name: USER STORIES, Length: 248, dtype: object

In [47]:
"a".isupper()

False

In [68]:
"the" in "adatheasd"

True