In [114]:
import pandas as pd
import re
import numpy as np
import utilities as util
import importlib
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score,train_test_split, StratifiedKFold
from sklearn.metrics import f1_score,precision_score,recall_score,confusion_matrix,classification_report
from textblob import TextBlob
import matplotlib.pyplot as plt

from nltk.util import ngrams
from nltk.corpus import wordnet as wn
from nltk import word_tokenize
from nltk.collocations import *


In [115]:
importlib.reload(util)

<module 'utilities' from 'C:\\Users\\teddy\\Documents\\Research Project\\SreYantra_Data\\BugZ\\TKnegsampleGeneration\\utilities.py'>

In [116]:
nouns = ["NN","NNS","NNP","NNPS"]
ads = ["JJ","JJR","JJS"]

In [None]:
data = pd.read_csv("Teddy_Data/AllData.csv", low_memory = False)

In [None]:
reqs = data["summary"]

In [94]:
nlp_df = pd.DataFrame(columns = ["req orig", "req bigram", "req trigram"])

In [95]:
for index, value in reqs.items():
    ## generate the bigram
    tokens = util.generate_tokens(value)
    bigram = util.generate_collated_ngrams(tokens,2)
    trigram = util.generate_collated_ngrams(tokens,3)
    result = {"req orig": value,
             "req bigram": bigram,
             "req trigram": trigram}
    nlp_df = nlp_df.append(result, ignore_index = True)

In [136]:
text = "The constitutional constrains the bargain. An equal diagram guts a scroll. A cloth pops beneath the club! The captain camps. Each tiny man graduates behind the stiff sympathy. The ego dines throughout a mimic."
#col = util.generate_collated_ngrams(tokens,3)

tokens = util.generate_tokens(text)
tagged = nltk.pos_tag(tokens)
output = list(ngrams(tagged, 2))


In [137]:
output

[(('the', 'DT'), ('constitutional', 'JJ')),
 (('constitutional', 'JJ'), ('constrains', 'VBZ')),
 (('constrains', 'VBZ'), ('the', 'DT')),
 (('the', 'DT'), ('bargain', 'NN')),
 (('bargain', 'NN'), ('an', 'DT')),
 (('an', 'DT'), ('equal', 'JJ')),
 (('equal', 'JJ'), ('diagram', 'NN')),
 (('diagram', 'NN'), ('guts', 'VBZ')),
 (('guts', 'VBZ'), ('a', 'DT')),
 (('a', 'DT'), ('scroll', 'NN')),
 (('scroll', 'NN'), ('a', 'DT')),
 (('a', 'DT'), ('cloth', 'NN')),
 (('cloth', 'NN'), ('pops', 'VBZ')),
 (('pops', 'VBZ'), ('beneath', 'IN')),
 (('beneath', 'IN'), ('the', 'DT')),
 (('the', 'DT'), ('club', 'NN')),
 (('club', 'NN'), ('the', 'DT')),
 (('the', 'DT'), ('captain', 'NN')),
 (('captain', 'NN'), ('camps', 'NNS')),
 (('camps', 'NNS'), ('each', 'DT')),
 (('each', 'DT'), ('tiny', 'JJ')),
 (('tiny', 'JJ'), ('man', 'NN')),
 (('man', 'NN'), ('graduates', 'VBZ')),
 (('graduates', 'VBZ'), ('behind', 'IN')),
 (('behind', 'IN'), ('the', 'DT')),
 (('the', 'DT'), ('stiff', 'JJ')),
 (('stiff', 'JJ'), ('sympa

In [138]:
for bigram in output:
    if ~(((bigram[0][1] in nouns) or (bigram[0][1] in ads)) and (bigram[1][1] in nouns)):
        output.remove(bigram)

In [139]:
output

[(('constitutional', 'JJ'), ('constrains', 'VBZ')),
 (('the', 'DT'), ('bargain', 'NN')),
 (('an', 'DT'), ('equal', 'JJ')),
 (('diagram', 'NN'), ('guts', 'VBZ')),
 (('a', 'DT'), ('scroll', 'NN')),
 (('a', 'DT'), ('cloth', 'NN')),
 (('pops', 'VBZ'), ('beneath', 'IN')),
 (('the', 'DT'), ('club', 'NN')),
 (('the', 'DT'), ('captain', 'NN')),
 (('camps', 'NNS'), ('each', 'DT')),
 (('tiny', 'JJ'), ('man', 'NN')),
 (('graduates', 'VBZ'), ('behind', 'IN')),
 (('the', 'DT'), ('stiff', 'JJ')),
 (('sympathy', 'NN'), ('the', 'DT')),
 (('ego', 'NN'), ('dines', 'NNS')),
 (('throughout', 'IN'), ('a', 'DT'))]

In [121]:
filtered_list = filter(find_bigrams, output)

In [122]:
print('The filtered letters are:') 
for s in filtered_list: 
    print(s)

The filtered letters are:
