In [1]:
import nltk
import pandas as pd
import re
from collections import Counter
import time
import numpy as np
import PyDictionary
import warnings
warnings.filterwarnings("ignore")

### Reading the AmazonLawnAndGardenReviews csv file downloaded from UCI machine learning datasets repository

In [2]:
df=pd.read_csv('AmazonLawnAndGardenReviews.csv',encoding='iso-8859-1')
df.reviewText[0]

'Good USA company that stands behind their products. I have had to warranty two hoses and they send replacements right out to you. I had one burst after awhile, you could see it buldge for weeks before it went so no suprises. The other one was winter related as I am bad and leave them out most of the time. Highly reccomend. Note the hundred footer is heavy and like wresting an anaconda when its time to put away, but it does have a far reach.'

### Objective :
####              1) Classifying the Reviews based sentiment analysis on words of each Review.
####              2) Detecting the Root Cause of frequent malfunctioning products and assigning the frequency of occurances.

### Analysis :
####   1) Classifying the Review :
####      I have generated a Word Vector for each word. The Word Vector specifies the closeness and relativeness of word with      all possible sentiments which can be found from Reviews.
####   2) Detecting the Root Cause :
####      I have created a individual Dictionaries of all Possible verbs which can specifies the Physical state, action and mental   state of a each incident or sentence. 
####      By considering the negativity  of sentence the all mal functioning and less sentiment reviews will be seperated, From that the action associated with the review will be extracted and from that the Nouns of things will specifies the frequent malfunction products and action verbs specifies the reason behind the mal Function :-)


### Functions for Preprocessing of Text

The Required Preprocessing steps before doing the Analysis on the sentiment of Reviews

In [3]:
def Remove_URLs(x):
    x = x.split(' ')
    x = [i for i in x if not len(re.findall(r'[\w\.-]+@[\w\.-]+',i))]
    x = ' '.join(x)
    return x
def tokenizing(x):
    return nltk.tokenize.word_tokenize(x)
def stopwords(x):
    stop_words=nltk.corpus.stopwords.words('english')
    x=[i for i in x if i not in stop_words]
    return x
def Lemmatization(x):
    lemmatizer=nltk.stem.WordNetLemmatizer()
    x = [ lemmatizer.lemmatize(i) for i in x]
    return x
def Remove_numbers(x):
    x = [re.sub('[^A-Z,a-z]+','',i) for i in x]
    x = ' '.join(x).lower()
    return x

   

In [4]:
x = df.reviewText
Remove_numbers(tokenizing(x[0]))

'good usa company that stands behind their products  i have had to warranty two hoses and they send replacements right out to you  i had one burst after awhile , you could see it buldge for weeks before it went so no suprises  the other one was winter related as i am bad and leave them out most of the time  highly reccomend  note the hundred footer is heavy and like wresting an anaconda when its time to put away , but it does have a far reach '

In [6]:
def preprocess(x):

    x = Remove_URLs(x)
    x = tokenizing(x)
    x = stopwords(x)
    x = Lemmatization(x)
    x = Remove_numbers(x)
    return x

Text before preprocessing

In [7]:
x=list(df.reviewText)
df.reviewText[0]

'Good USA company that stands behind their products. I have had to warranty two hoses and they send replacements right out to you. I had one burst after awhile, you could see it buldge for weeks before it went so no suprises. The other one was winter related as I am bad and leave them out most of the time. Highly reccomend. Note the hundred footer is heavy and like wresting an anaconda when its time to put away, but it does have a far reach.'

Example Preprcessed Text

In [8]:
preprocess(df.reviewText[0])

'good usa company stand behind product  i warranty two hose send replacement right  i one burst awhile , could see buldge week went suprises  the one winter related i bad leave time  highly reccomend  note hundred footer heavy like wresting anaconda time put away , far reach '

Preprocessing the whole Text of Reviews

In [9]:
t1 = time.time()
df['Preprocess_Review'] =  df.reviewText.apply(lambda x: preprocess(x) if isinstance(x,str) else ' ')
time.time() - t1

46.58170413970947

Extracting feature words from the reviews Text

In [11]:
t1 = time.time()
Word_dict = set()
for i in df.Preprocess_Review:
    Word_dict |= set(nltk.tokenize.word_tokenize(i))
time.time()-t1

27.530222415924072

### Tagging the text with the Parts of speech to differentiate the Verbs and to create features to words

In [10]:
Word_tags = pd.DataFrame(nltk.pos_tag(list(Word_dict)))

The parts of speech abbrevations to consider

In [11]:
Word_tags[1].unique()

array(['JJ', 'NN', 'VBN', 'NNS', 'VBD', 'RB', 'VBZ', 'VBG', 'RBR', 'VBP',
       'VB', 'JJR', 'RBS', 'IN', 'RP', 'JJS', 'NNP', 'FW', 'WP', 'DT',
       'PRP', 'CC', 'EX', 'PRP$', 'WDT', 'CD', 'TO', 'WRB', 'MD', 'PDT',
       ',', 'WP$'], dtype=object)


    CC | Coordinating conjunction |
    CD | Cardinal number |
    DT | Determiner |
    EX | Existential there |
    FW | Foreign word |
    IN | Preposition or subordinating conjunction |
    JJ | Adjective |
    JJR | Adjective, comparative |
    JJS | Adjective, superlative |
    LS | List item marker |
    MD | Modal |
    NN | Noun, singular or mass |
    NNS | Noun, plural |
    NNP | Proper noun, singular |
    NNPS | Proper noun, plural |
    PDT | Predeterminer |
    POS | Possessive ending |
    PRP | Personal pronoun |
    PRP$ | Possessive pronoun |
    RB | Adverb |
    RBR | Adverb, comparative |
    RBS | Adverb, superlative |
    RP | Particle |
    SYM | Symbol |
    TO | to |
    UH | Interjection |
    VB | Verb, base form |
    VBD | Verb, past tense |
    VBG | Verb, gerund or present participle |
    VBN | Verb, past participle |
    VBP | Verb, non-3rd person singular present |
    VBZ | Verb, 3rd person singular present |
    WDT | Wh-determiner |
    WP | Wh-pronoun |
    WP$ | Possessive wh-pronoun |
    WRB | Wh-adverb |


Extracting the words which gives meaning out of sentence (verbs)

In [12]:
Words_verbs = Word_tags[Word_tags[1].isin(['VB','VBP','VBN','VBZ','RB'])]
[i for i in nltk.pos_tag(nltk.tokenize.word_tokenize(df.Preprocess_Review[0])) if i[1] in ['VB','VBP','VBZ','RB','VBN']]
df.Preprocess_Review[0]

'good usa company stand behind product  i warranty two hose send replacement right  i one burst awhile , could see buldge week went suprises  the one winter related i bad leave time  highly reccomend  note hundred footer heavy like wresting anaconda time put away , far reach '

Creating seperate column of preprocess Verbs of each review Sentence

In [13]:
t1 = time.time()
def func(x):
    tokens = nltk.pos_tag(nltk.tokenize.word_tokenize(x))
    verbs = [i for i in tokens if i[1] in ['VB','VBP','VBZ','RB','VBN']]
    return verbs
df['Verbs'] = df.Preprocess_Review.apply(lambda x : func(x))
time.time()-t1

372.6656494140625

Word categories to differentiate Nouns and Pronouns

In [14]:
nltk.corpus.brown.categories()

['adventure',
 'belles_lettres',
 'editorial',
 'fiction',
 'government',
 'hobbies',
 'humor',
 'learned',
 'lore',
 'mystery',
 'news',
 'religion',
 'reviews',
 'romance',
 'science_fiction']

In [15]:
nltk.corpus.brown.words(categories='adventure')

['Dan', 'Morgan', 'told', 'himself', 'he', 'would', ...]

In [7]:
x[0]

'Good USA company that stands behind their products. I have had to warranty two hoses and they send replacements right out to you. I had one burst after awhile, you could see it buldge for weeks before it went so no suprises. The other one was winter related as I am bad and leave them out most of the time. Highly reccomend. Note the hundred footer is heavy and like wresting an anaconda when its time to put away, but it does have a far reach.'

## Types of Verbs
the main categories of physical verbs, mental verbs, and state of being verbs, there are several other types of verbs. In fact, there are more than ten different types of verbs that are grouped by function.


### List of all Verb Types

### Action Verbs
Action verbs express specific actions, and are used any time you want to show action or discuss someone doing something.

### Transitive Verbs
Transitive verbs are action verbs that always express doable activities. These verbs always have direct objects, meaning someone or something receives the action of the verb.

### Intransitive Verbs
Intransitive verbs are action verbs that always express doable activities. No direct object follows an intransitive verb.

### Auxiliary Verbs
Auxiliary verbs are also known as helping verbs, and are used together with a main verb to show the verb’s tense or to form a question or negative.

### Stative Verbs
Stative verbs can be recognized because they express a state rather than an action. They typically relate to thoughts, emotions, relationships, senses, states of being, and measurements.

### Modal Verbs
Modal verbs are auxiliary verbs that are used to express abilities, possibilities, permissions, and obligations.

### Phrasal Verbs
Phrasal verbs aren’t single words; instead, they are combinations of words that are used together to take on a different meaning to that of the original verb.

### Irregular Verbs
Irregular verbs are those that don’t take on the regular spelling patterns of past simple and past participle verbs.

### Regular Expressions in Text Synthesizing

In [8]:
w = x[0]+'98.4   34   4343  32.33 343.0'
re.findall('\d+\.\d+',w)

['98.4', '32.33', '343.0']

In [9]:
re.findall(r'[o][u][t]',x[0])

['out', 'out']

In [10]:
re.findall(r'\d+.\d+',w)

['98.4', '4343', '32.33', '343.0']

In [11]:
[w for w in nltk.tokenize.word_tokenize(x[0]) if re.search(r'ge$',w)]

['buldge']

In [12]:
w = wordlist = nltk.tokenize.word_tokenize(x[0])
list(map(lambda x: x.lower(),wordlist))[:5]

['good', 'usa', 'company', 'that', 'stands']

In [13]:
x[0]

'Good USA company that stands behind their products. I have had to warranty two hoses and they send replacements right out to you. I had one burst after awhile, you could see it buldge for weeks before it went so no suprises. The other one was winter related as I am bad and leave them out most of the time. Highly reccomend. Note the hundred footer is heavy and like wresting an anaconda when its time to put away, but it does have a far reach.'

In [14]:
[w for w in nltk.tokenize.word_tokenize(x[0]) if re.search(r'\w{5,7}$', w)][:5]

['company', 'stands', 'behind', 'their', 'products']

In [15]:
[w for w in nltk.tokenize.word_tokenize(x[0]) if re.search(r'^[a-zA-F]{5}$', w)][:5]

['their', 'hoses', 'right', 'burst', 'after']

In [16]:
re.findall(r'[o][u]',x[0])[:5]

['ou', 'ou', 'ou', 'ou', 'ou']

In [17]:
wordlist = nltk.tokenize.word_tokenize(x[3])
[w for w in wordlist if re.search('^[a-z]{3,}$', w)][:5]

['probably', 'should', 'have', 'bought', 'something']

In [18]:
re.search(r'[i][t]+',x[0])

<_sre.SRE_Match object; span=(174, 176), match='it'>

In [19]:
[w for w in nltk.tokenize.word_tokenize(x[0]) if re.search('^[ghi][mno][jlk][def]$', w)]

[]

In [20]:
import nltk
a=nltk.corpus.brown.tagged_words(categories='news')[:15]

In [21]:
fd = nltk.FreqDist(nltk.corpus.brown.words(categories='news'))
fd.most_common(5)

[('the', 5580), (',', 5188), ('.', 4030), ('of', 2849), ('and', 2146)]

In [31]:
fd = nltk.ConditionalFreqDist(nltk.corpus.brown.words(categories='news'))
fd.most_common(5)

ValueError: too many values to unpack (expected 2)

In [14]:
from nltk.corpus import brown
brown_tagged_sents = brown.tagged_sents(categories='news')
brown_sents = brown.sents(categories='news')

In [15]:
unigram_tagger = nltk.UnigramTagger(brown_tagged_sents)
unigram_tagger.tag(brown_sents[2007])[:10]

[('Various', 'JJ'),
 ('of', 'IN'),
 ('the', 'AT'),
 ('apartments', 'NNS'),
 ('are', 'BER'),
 ('of', 'IN'),
 ('the', 'AT'),
 ('terrace', 'NN'),
 ('type', 'NN'),
 (',', ',')]

In [16]:
train_sents = brown_tagged_sents[:4000]
test_sents = brown_tagged_sents[4000:]
unigram_tagger = nltk.UnigramTagger(train_sents)
unigram_tagger.evaluate(test_sents)

0.8111044507717668

In [21]:

d =nltk.ConditionalFreqDist(nltk.pos_tag(nltk.tokenize.word_tokenize(x[0])))
cfd = nltk.ConditionalFreqDist(nltk.corpus.brown.tagged_words(categories='news')[:15])
pd.DataFrame(cfd.tabulate()).head()

                 AT    IN    JJ JJ-TL    NN NN-TL   NP$ NP-TL    NR   VBD 
    Atlanta's     0     0     0     0     0     0     1     0     0     0 
       County     0     0     0     0     0     1     0     0     0     0 
       Friday     0     0     0     0     0     0     0     0     1     0 
       Fulton     0     0     0     0     0     0     0     1     0     0 
        Grand     0     0     0     1     0     0     0     0     0     0 
         Jury     0     0     0     0     0     1     0     0     0     0 
          The     1     0     0     0     0     0     0     0     0     0 
           an     1     0     0     0     0     0     0     0     0     0 
     election     0     0     0     0     1     0     0     0     0     0 
investigation     0     0     0     0     1     0     0     0     0     0 
           of     0     1     0     0     0     0     0     0     0     0 
      primary     0     0     0     0     1     0     0     0     0     0 
     produced     0     0

In [36]:
 nltk.corpus.brown.tagged_sents(categories='news')

[[('The', 'AT'), ('Fulton', 'NP-TL'), ('County', 'NN-TL'), ('Grand', 'JJ-TL'), ('Jury', 'NN-TL'), ('said', 'VBD'), ('Friday', 'NR'), ('an', 'AT'), ('investigation', 'NN'), ('of', 'IN'), ("Atlanta's", 'NP$'), ('recent', 'JJ'), ('primary', 'NN'), ('election', 'NN'), ('produced', 'VBD'), ('``', '``'), ('no', 'AT'), ('evidence', 'NN'), ("''", "''"), ('that', 'CS'), ('any', 'DTI'), ('irregularities', 'NNS'), ('took', 'VBD'), ('place', 'NN'), ('.', '.')], [('The', 'AT'), ('jury', 'NN'), ('further', 'RBR'), ('said', 'VBD'), ('in', 'IN'), ('term-end', 'NN'), ('presentments', 'NNS'), ('that', 'CS'), ('the', 'AT'), ('City', 'NN-TL'), ('Executive', 'JJ-TL'), ('Committee', 'NN-TL'), (',', ','), ('which', 'WDT'), ('had', 'HVD'), ('over-all', 'JJ'), ('charge', 'NN'), ('of', 'IN'), ('the', 'AT'), ('election', 'NN'), (',', ','), ('``', '``'), ('deserves', 'VBZ'), ('the', 'AT'), ('praise', 'NN'), ('and', 'CC'), ('thanks', 'NNS'), ('of', 'IN'), ('the', 'AT'), ('City', 'NN-TL'), ('of', 'IN-TL'), ('Atlant

## Retrieving the Synonyms of a Word from OXFORD thesaurus dictionaries with the use of Rest API 

In [5]:
import requests
import json

app_id = '96612f3b'
app_key = 'dc9b1d7ab5ec44eca4ea60507126bdae'

language = 'en'
word_id = 'eat'

url = 'https://od-api.oxforddictionaries.com:443/api/v1/entries/' + language + '/' + word_id.lower() + '/synonyms'

r = requests.get(url, headers = {'app_id': app_id, 'app_key': app_key})

#print("code {}\n".format(r.status_code))
#print("text \n" + r.text)
#print("json \n" + json.dumps(r.json()))

### Oxford Thesaurus Dictionary can define the word in different ways 
####  Strong Synonyms : which can define very close Meaning of asked word, All Synonyms can give all relative possibilties of a asked word. We can use these closeness words, all synonyms,all antonyms  and antonyms as a features of a asked word in reviewing the sentence.

<img src="thesaurus.png",width=900,height=500>

In [6]:
# retrieving the Synonyms and Antonyms of a asked word 'eat'
da = r.json()['results'][0]['lexicalEntries'][0]['entries'][0]['senses'][0]['subsenses'][0]
try:
    if da['registers'][0] == 'informal':
        #print(da['synonyms'])
        print('informal')
        for i in (da['synonyms']):
            print(i['text'])
except:
    for i in (da['synonyms']):
            print(i['text'])
    

swallow
chew
munch
chomp
champ


## Retrieving the Synonyms from the PyDictionary in Python

In [7]:
import PyDictionary
Dictionary =  PyDictionary.PyDictionary()

In [8]:
Negative_Features = ['word','Apathy','Atrocious','Bemoan','Cold-hearted','Deplorable','Despicable','Detrimental','Dreadful','Ghastly','Grotesque','Gruesome','Hard-hearted','Insidious','Insipid','Malicious','Monstrous','Noxious','Offensive','Oppressive','Pessimistic','Prejudice','Repulsive','Ruthless','Sinister','Vice','Vicious','Vile','Villainous','Vindictive']

In [12]:

Dictionary.synonym('Apathy eat')

Error: A Term must be only a single word


In [18]:
word = 'Apathy'
word = word.split(' ')[0]

In [19]:
word

'Apathy'

In [10]:
word = 'Apathy'
temp = Dictionary.synonym(word)
while temp is None:
    temp = Dictionary.synonym(word)

In [None]:
word = 'Apathy'
temp_df = pd.DataFrame(columns= [word,'word'])
syns = Dictionary.synonym(word)
while syns is None:
    syns = Dictionary.synonym(word)
    
temp_list1 = syns
temp_list2 = []
words = []
score = []
words.extend(syns)
score.extend(np.repeat(1,len(syns)))
print('stage -1')
    
for i in range(2):
    print('loop',i)
    
    for j in temp_list1:
        word_in = j.split(' ')[0]
        syns = Dictionary.synonym(word_in)
        while syns is None:
            syns = Dictionary.synonym(word_in)
        temp_list2.extend(syns)
        
    words.extend(temp_list2)
    score.extend(np.repeat(1-0.005*i,len(temp_list2)))
    temp_list1 = temp_list2

    
    
#     syns = Dictionary.synonym(word)
#     while temp is None:
#         syns = Dictionary.synonym(word)
    
    

stage -1
loop 0
indifference has no Synonyms in the API
coolness has no Synonyms in the API
loop 1
inactivity has no Synonyms in the API
disinterest has no Synonyms in the API
apathy has no Synonyms in the API
apathy has no Synonyms in the API
inertness has no Synonyms in the API
sloth has no Synonyms in the API
negligence has no Synonyms in the API
negligence has no Synonyms in the API
carelessness has no Synonyms in the API
apathy has no Synonyms in the API
apathy has no Synonyms in the API
apathy has no Synonyms in the API
disinterest has no Synonyms in the API
indifference has no Synonyms in the API
indifference has no Synonyms in the API
indifference has no Synonyms in the API
indifference has no Synonyms in the API
resignation has no Synonyms in the API
resignation has no Synonyms in the API
insipidness has no Synonyms in the API
dreariness has no Synonyms in the API
dreariness has no Synonyms in the API
dreariness has no Synonyms in the API
dreariness has no Synonyms in the API


loll has no Synonyms in the API
loll has no Synonyms in the API
loaf has no Synonyms in the API
loaf has no Synonyms in the API
hooky has no Synonyms in the API
cut has no Synonyms in the API
AWOL has no Synonyms in the API
lethargy has no Synonyms in the API
torpor has no Synonyms in the API
lethargy has no Synonyms in the API
lethargy has no Synonyms in the API
lethargy has no Synonyms in the API
inertia has no Synonyms in the API
inertia has no Synonyms in the API
disinterest has no Synonyms in the API
disinterest has no Synonyms in the API
disinterest has no Synonyms in the API
disinterest has no Synonyms in the API
inertness has no Synonyms in the API
slackness has no Synonyms in the API
remissness has no Synonyms in the API
heedlessness has no Synonyms in the API
heedlessness has no Synonyms in the API
sluggishness has no Synonyms in the API
sluggishness has no Synonyms in the API
sluggishness has no Synonyms in the API
negligence has no Synonyms in the API
disdain has no Synonym

fallback has no Synonyms in the API
fallback has no Synonyms in the API
flight has no Synonyms in the API
escape has no Synonyms in the API
escape has no Synonyms in the API
escape has no Synonyms in the API
escape has no Synonyms in the API
escape has no Synonyms in the API
escape has no Synonyms in the API
escape has no Synonyms in the API
escape has no Synonyms in the API
retirement has no Synonyms in the API
retirement has no Synonyms in the API
departure has no Synonyms in the API
disengagement has no Synonyms in the API
disregard has no Synonyms in the API
disregard has no Synonyms in the API
disregard has no Synonyms in the API
composure has no Synonyms in the API
indifference has no Synonyms in the API
dullness has no Synonyms in the API
dullness has no Synonyms in the API
dullness has no Synonyms in the API
disdain has no Synonyms in the API
disdain has no Synonyms in the API
inertness has no Synonyms in the API
inattention has no Synonyms in the API
disinterest has no Synonym

equanimity has no Synonyms in the API
equanimity has no Synonyms in the API
equanimity has no Synonyms in the API
equanimity has no Synonyms in the API
equanimity has no Synonyms in the API
self-assurance has no Synonyms in the API
self-assurance has no Synonyms in the API
love has no Synonyms in the API
truce has no Synonyms in the API
truce has no Synonyms in the API
truce has no Synonyms in the API
truce has no Synonyms in the API
grit has no Synonyms in the API
grit has no Synonyms in the API
grit has no Synonyms in the API
diligence has no Synonyms in the API
diligence has no Synonyms in the API
hush has no Synonyms in the API
serenity has no Synonyms in the API
serenity has no Synonyms in the API
serenity has no Synonyms in the API
serenity has no Synonyms in the API
serenity has no Synonyms in the API
inactivity has no Synonyms in the API
inactivity has no Synonyms in the API
inactivity has no Synonyms in the API
patience has no Synonyms in the API
peace has no Synonyms in the A

thinking has no Synonyms in the API
thinking has no Synonyms in the API
slackness has no Synonyms in the API
oversight has no Synonyms in the API
oversight has no Synonyms in the API
heedlessness has no Synonyms in the API
disregard has no Synonyms in the API
disregard has no Synonyms in the API
disregard has no Synonyms in the API
failure has no Synonyms in the API
failure has no Synonyms in the API
failure has no Synonyms in the API
oversight has no Synonyms in the API
derision has no Synonyms in the API
aversion has no Synonyms in the API
nonchalance has no Synonyms in the API
nonchalance has no Synonyms in the API
nonchalance has no Synonyms in the API
nonchalance has no Synonyms in the API
nonchalance has no Synonyms in the API
inattention has no Synonyms in the API
inattention has no Synonyms in the API
inattention has no Synonyms in the API
negligence has no Synonyms in the API
negligence has no Synonyms in the API
negligence has no Synonyms in the API
negligence has no Synonyms

laxness has no Synonyms in the API
laxness has no Synonyms in the API
laxness has no Synonyms in the API
trance has no Synonyms in the API
tiredness has no Synonyms in the API
sluggishness has no Synonyms in the API
sluggishness has no Synonyms in the API
sluggishness has no Synonyms in the API
sluggishness has no Synonyms in the API
sluggishness has no Synonyms in the API
sluggishness has no Synonyms in the API
sluggishness has no Synonyms in the API
lethargy has no Synonyms in the API
torpor has no Synonyms in the API
dormancy has no Synonyms in the API
dormancy has no Synonyms in the API
dormancy has no Synonyms in the API
lethargy has no Synonyms in the API
lethargy has no Synonyms in the API
inattention has no Synonyms in the API
inattention has no Synonyms in the API
indifference has no Synonyms in the API
indifference has no Synonyms in the API
indifference has no Synonyms in the API
listlessness has no Synonyms in the API
listlessness has no Synonyms in the API
listlessness has

disruption has no Synonyms in the API
drop has no Synonyms in the API
drop has no Synonyms in the API
slump has no Synonyms in the API
slump has no Synonyms in the API
recession has no Synonyms in the API
failure has no Synonyms in the API
offense has no Synonyms in the API
offense has no Synonyms in the API
offense has no Synonyms in the API
failing has no Synonyms in the API
failing has no Synonyms in the API
failing has no Synonyms in the API
failing has no Synonyms in the API
failing has no Synonyms in the API
failing has no Synonyms in the API
failing has no Synonyms in the API
failing has no Synonyms in the API
failing has no Synonyms in the API
miscue has no Synonyms in the API
violation has no Synonyms in the API
disrespect has no Synonyms in the API
carelessness has no Synonyms in the API
carelessness has no Synonyms in the API
indifference has no Synonyms in the API
oversight has no Synonyms in the API
omission has no Synonyms in the API
aberration has no Synonyms in the API


nonchalance has no Synonyms in the API
nonchalance has no Synonyms in the API
nonchalance has no Synonyms in the API
inattention has no Synonyms in the API
neglect has no Synonyms in the API
laxity has no Synonyms in the API
laxity has no Synonyms in the API
omission has no Synonyms in the API
omission has no Synonyms in the API
remoteness has no Synonyms in the API
remoteness has no Synonyms in the API
contemplation has no Synonyms in the API
thinking has no Synonyms in the API
thinking has no Synonyms in the API
thinking has no Synonyms in the API
heedlessness has no Synonyms in the API
heedlessness has no Synonyms in the API
laxness has no Synonyms in the API
apathy has no Synonyms in the API
apathy has no Synonyms in the API
apathy has no Synonyms in the API
apathy has no Synonyms in the API
misstep has no Synonyms in the API
collapse has no Synonyms in the API
lapse has no Synonyms in the API
neglect has no Synonyms in the API
mistake has no Synonyms in the API
carelessness has no

indifference has no Synonyms in the API
indifference has no Synonyms in the API
indifference has no Synonyms in the API
indifference has no Synonyms in the API
indifference has no Synonyms in the API
listlessness has no Synonyms in the API
listlessness has no Synonyms in the API
lethargy has no Synonyms in the API
indifference has no Synonyms in the API
indifference has no Synonyms in the API
passiveness has no Synonyms in the API
coolness has no Synonyms in the API
coolness has no Synonyms in the API
coolness has no Synonyms in the API
coolness has no Synonyms in the API
coolness has no Synonyms in the API
torpidity has no Synonyms in the API
lethargy has no Synonyms in the API
slouch has no Synonyms in the API
sluggishness has no Synonyms in the API
sluggishness has no Synonyms in the API
idleness has no Synonyms in the API
disregard has no Synonyms in the API
failure has no Synonyms in the API
failure has no Synonyms in the API
oversight has no Synonyms in the API
neglect has no Syn

lethargy has no Synonyms in the API
inattention has no Synonyms in the API
inattention has no Synonyms in the API
listlessness has no Synonyms in the API
listlessness has no Synonyms in the API
listlessness has no Synonyms in the API
listlessness has no Synonyms in the API
unconcern has no Synonyms in the API
sloth has no Synonyms in the API
sloth has no Synonyms in the API
sloth has no Synonyms in the API
sloth has no Synonyms in the API
lethargy has no Synonyms in the API
apathy has no Synonyms in the API
apathy has no Synonyms in the API
inactivity has no Synonyms in the API
casualness has no Synonyms in the API
indifference has no Synonyms in the API
indifference has no Synonyms in the API
indifference has no Synonyms in the API
indifference has no Synonyms in the API
lethargy has no Synonyms in the API
passiveness has no Synonyms in the API
disdain has no Synonyms in the API
disdain has no Synonyms in the API
disdain has no Synonyms in the API
disdain has no Synonyms in the API
di

clearing has no Synonyms in the API
draining has no Synonyms in the API
draining has no Synonyms in the API
emptying has no Synonyms in the API
retirement has no Synonyms in the API
evacuation has no Synonyms in the API
withdrawal has no Synonyms in the API
fallback has no Synonyms in the API
departure has no Synonyms in the API
departure has no Synonyms in the API
exit has no Synonyms in the API
exit has no Synonyms in the API
disengagement has no Synonyms in the API
evacuation has no Synonyms in the API
evacuation has no Synonyms in the API
withdrawal has no Synonyms in the API
withdrawal has no Synonyms in the API
retirement has no Synonyms in the API
retirement has no Synonyms in the API
retirement has no Synonyms in the API
retirement has no Synonyms in the API
retirement has no Synonyms in the API
retirement has no Synonyms in the API
retirement has no Synonyms in the API
retirement has no Synonyms in the API
pullout has no Synonyms in the API
withdrawal has no Synonyms in the AP

arrogance has no Synonyms in the API
arrogance has no Synonyms in the API
inattention has no Synonyms in the API
inattention has no Synonyms in the API
inattention has no Synonyms in the API
inattention has no Synonyms in the API
negligence has no Synonyms in the API
negligence has no Synonyms in the API
inadvertence has no Synonyms in the API
inadvertence has no Synonyms in the API
inadvertence has no Synonyms in the API
remissness has no Synonyms in the API
lethargy has no Synonyms in the API
passiveness has no Synonyms in the API
coolness has no Synonyms in the API
lethargy has no Synonyms in the API
lethargy has no Synonyms in the API
lethargy has no Synonyms in the API
lethargy has no Synonyms in the API
lethargy has no Synonyms in the API
lethargy has no Synonyms in the API
inattention has no Synonyms in the API
inattention has no Synonyms in the API
negligence has no Synonyms in the API
negligence has no Synonyms in the API
carelessness has no Synonyms in the API
carelessness ha

In [None]:
words

In [None]:
len(score)

In [154]:
df2 = pd.DataFrame()
df2['word'] = ['apathy','bongu','chastav']
df2['attro']= [0.3,2,2]

In [155]:
df2

Unnamed: 0,word,attro
0,apathy,0.3
1,bongu,2.0
2,chastav,2.0


In [156]:
df1

Unnamed: 0,Apathy,word
0,5,bongu


In [157]:
pd.merge(df2,df1,on=['word'],how='outer')

Unnamed: 0,word,attro,Apathy
0,apathy,0.3,
1,bongu,2.0,5.0
2,chastav,2.0,


### Differentiate the Verbs and Nouns with POS tags 
#### 1) Form the Verbs sentence and 
#### 2) Form the Nouns sentence to point out
#### 3) most similar problems data extraction ( positive ) noun on what and verb (cause)

In [16]:
t1=time.time()
l=[]
for i in x:
    if isinstance(i,str):
        l.append(preprocess(i))
    else:
        l.append(' ')
print(time.time()-t1)

145.19116234779358


## Building responsive system on Natural Language
#### 1) Understand the verbs and nouns of given sentence 
### 2) either work done or not ( verbs ,Nouns ,plural,gender ,things)
#### 3) understand the Grammetical Tense(present,past or future) feed all to Neural Network
#### 4) Builda LSTM networks for all different variations of Grammetical sentences with trained information of WEB
#### 5) Build another neural network to differentiate the sentence variation 
#### 6) Final neural netword to replace the nouns and plurals 

In [83]:
nltk.corpus.wordnet.synset('run.n.01').definition()

'a score in baseball made by a runner touching all four bases safely'

In [20]:
stopwords(nltk.tokenize.word_tokenize(x[1]))

['This',
 'high',
 'quality',
 '8',
 'ply',
 'hose',
 '.',
 'I',
 'good',
 'luck',
 'Gilmour',
 'hoses',
 'past',
 '.',
 'A',
 'good',
 'choice',
 'hoses',
 '.']

In [88]:
preproc=' '.join(df.Preprocess)

In [89]:
preproc = nltk.tokenize.word_tokenize(preproc)

In [90]:
count =  Counter(preproc)

In [91]:
count.most_common(10)

[('i', 58620),
 ('nt', 13434),
 ('the', 12029),
 ('s', 10310),
 ('one', 9558),
 ('it', 9518),
 ('use', 7844),
 ('like', 6951),
 ('get', 6835),
 ('this', 6362)]

In [56]:
count.most_common(10)

[('i', 58620),
 ('nt', 13434),
 ('the', 12029),
 ('s', 10310),
 ('one', 9558),
 ('it', 9518),
 ('use', 7844),
 ('like', 6951),
 ('get', 6835),
 ('this', 6362)]

In [54]:
stop_words=nltk.corpus.stopwords.words('english')

In [93]:
's' in stop_words

True

In [None]:
x=[i for i in df.reviewText if isinstance(i,str)]
x=' '.join(x)
tokens=nltk.tokenize.word_tokenize(x)
counts=Counter(tokens)
counts.most_common(100)

In [None]:
from __future__ import division
import nltk
from nltk.corpus import wordnet as wn
from nltk.corpus import brown
import math
import numpy as np
import sys

# Parameters to the algorithm. Currently set to values that was reported
# in the paper to produce "best" results.
ALPHA = 0.2
BETA = 0.45
ETA = 0.4
PHI = 0.2
DELTA = 0.85

brown_freqs = dict()
N = 0

######################### word similarity ##########################

def get_best_synset_pair(word_1, word_2):
    """ 
    Choose the pair with highest path similarity among all pairs. 
    Mimics pattern-seeking behavior of humans.
    """
    max_sim = -1.0
    synsets_1 = wn.synsets(word_1)
    synsets_2 = wn.synsets(word_2)
    if len(synsets_1) == 0 or len(synsets_2) == 0:
        return None, None
    else:
        max_sim = -1.0
        best_pair = None, None
        for synset_1 in synsets_1:
            for synset_2 in synsets_2:
               sim = wn.path_similarity(synset_1, synset_2)
               if sim > max_sim:
                   max_sim = sim
                   best_pair = synset_1, synset_2
        return best_pair

def length_dist(synset_1, synset_2):
    """
    Return a measure of the length of the shortest path in the semantic 
    ontology (Wordnet in our case as well as the paper's) between two 
    synsets.
    """
    l_dist = sys.maxint
    if synset_1 is None or synset_2 is None: 
        return 0.0
    if synset_1 == synset_2:
        # if synset_1 and synset_2 are the same synset return 0
        l_dist = 0.0
    else:
        wset_1 = set([str(x.name()) for x in synset_1.lemmas()])        
        wset_2 = set([str(x.name()) for x in synset_2.lemmas()])
        if len(wset_1.intersection(wset_2)) > 0:
            # if synset_1 != synset_2 but there is word overlap, return 1.0
            l_dist = 1.0
        else:
            # just compute the shortest path between the two
            l_dist = synset_1.shortest_path_distance(synset_2)
            if l_dist is None:
                l_dist = 0.0
    # normalize path length to the range [0,1]
    return math.exp(-ALPHA * l_dist)

def hierarchy_dist(synset_1, synset_2):
    """
    Return a measure of depth in the ontology to model the fact that 
    nodes closer to the root are broader and have less semantic similarity
    than nodes further away from the root.
    """
    h_dist = sys.maxint
    if synset_1 is None or synset_2 is None: 
        return h_dist
    if synset_1 == synset_2:
        # return the depth of one of synset_1 or synset_2
        h_dist = max([x[1] for x in synset_1.hypernym_distances()])
    else:
        # find the max depth of least common subsumer
        hypernyms_1 = {x[0]:x[1] for x in synset_1.hypernym_distances()}
        hypernyms_2 = {x[0]:x[1] for x in synset_2.hypernym_distances()}
        lcs_candidates = set(hypernyms_1.keys()).intersection(
            set(hypernyms_2.keys()))
        if len(lcs_candidates) > 0:
            lcs_dists = []
            for lcs_candidate in lcs_candidates:
                lcs_d1 = 0
                if hypernyms_1.has_key(lcs_candidate):
                    lcs_d1 = hypernyms_1[lcs_candidate]
                lcs_d2 = 0
                if hypernyms_2.has_key(lcs_candidate):
                    lcs_d2 = hypernyms_2[lcs_candidate]
                lcs_dists.append(max([lcs_d1, lcs_d2]))
            h_dist = max(lcs_dists)
        else:
            h_dist = 0
    return ((math.exp(BETA * h_dist) - math.exp(-BETA * h_dist)) / 
        (math.exp(BETA * h_dist) + math.exp(-BETA * h_dist)))
    
def word_similarity(word_1, word_2):
    synset_pair = get_best_synset_pair(word_1, word_2)
    return (length_dist(synset_pair[0], synset_pair[1]) * 
        hierarchy_dist(synset_pair[0], synset_pair[1]))

######################### sentence similarity ##########################

def most_similar_word(word, word_set):
    """
    Find the word in the joint word set that is most similar to the word
    passed in. We use the algorithm above to compute word similarity between
    the word and each word in the joint word set, and return the most similar
    word and the actual similarity value.
    """
    max_sim = -1.0
    sim_word = ""
    for ref_word in word_set:
      sim = word_similarity(word, ref_word)
      if sim > max_sim:
          max_sim = sim
          sim_word = ref_word
    return sim_word, max_sim
    
def info_content(lookup_word):
    """
    Uses the Brown corpus available in NLTK to calculate a Laplace
    smoothed frequency distribution of words, then uses this information
    to compute the information content of the lookup_word.
    """
    global N
    if N == 0:
        # poor man's lazy evaluation
        for sent in brown.sents():
            for word in sent:
                word = word.lower()
                if not brown_freqs.has_key(word):
                    brown_freqs[word] = 0
                brown_freqs[word] = brown_freqs[word] + 1
                N = N + 1
    lookup_word = lookup_word.lower()
    n = 0 if not brown_freqs.has_key(lookup_word) else brown_freqs[lookup_word]
    return 1.0 - (math.log(n + 1) / math.log(N + 1))
    
def semantic_vector(words, joint_words, info_content_norm):
    """
    Computes the semantic vector of a sentence. The sentence is passed in as
    a collection of words. The size of the semantic vector is the same as the
    size of the joint word set. The elements are 1 if a word in the sentence
    already exists in the joint word set, or the similarity of the word to the
    most similar word in the joint word set if it doesn't. Both values are 
    further normalized by the word's (and similar word's) information content
    if info_content_norm is True.
    """
    sent_set = set(words)
    semvec = np.zeros(len(joint_words))
    i = 0
    for joint_word in joint_words:
        if joint_word in sent_set:
            # if word in union exists in the sentence, s(i) = 1 (unnormalized)
            semvec[i] = 1.0
            if info_content_norm:
                semvec[i] = semvec[i] * math.pow(info_content(joint_word), 2)
        else:
            # find the most similar word in the joint set and set the sim value
            sim_word, max_sim = most_similar_word(joint_word, sent_set)
            semvec[i] = max_sim if max_sim > PHI else 0.0
            if info_content_norm:
                semvec[i] = semvec[i] * info_content(joint_word) * info_content(sim_word)
        i = i + 1
    return semvec                
            
def semantic_similarity(sentence_1, sentence_2, info_content_norm):
    """
    Computes the semantic similarity between two sentences as the cosine
    similarity between the semantic vectors computed for each sentence.
    """
    words_1 = nltk.word_tokenize(sentence_1)
    words_2 = nltk.word_tokenize(sentence_2)
    joint_words = set(words_1).union(set(words_2))
    vec_1 = semantic_vector(words_1, joint_words, info_content_norm)
    vec_2 = semantic_vector(words_2, joint_words, info_content_norm)
    return np.dot(vec_1, vec_2.T) / (np.linalg.norm(vec_1) * np.linalg.norm(vec_2))

######################### word order similarity ##########################

def word_order_vector(words, joint_words, windex):
    """
    Computes the word order vector for a sentence. The sentence is passed
    in as a collection of words. The size of the word order vector is the
    same as the size of the joint word set. The elements of the word order
    vector are the position mapping (from the windex dictionary) of the 
    word in the joint set if the word exists in the sentence. If the word
    does not exist in the sentence, then the value of the element is the 
    position of the most similar word in the sentence as long as the similarity
    is above the threshold ETA.
    """
    wovec = np.zeros(len(joint_words))
    i = 0
    wordset = set(words)
    for joint_word in joint_words:
        if joint_word in wordset:
            # word in joint_words found in sentence, just populate the index
            wovec[i] = windex[joint_word]
        else:
            # word not in joint_words, find most similar word and populate
            # word_vector with the thresholded similarity
            sim_word, max_sim = most_similar_word(joint_word, wordset)
            if max_sim > ETA:
                wovec[i] = windex[sim_word]
            else:
                wovec[i] = 0
        i = i + 1
    return wovec

def word_order_similarity(sentence_1, sentence_2):
    """
    Computes the word-order similarity between two sentences as the normalized
    difference of word order between the two sentences.
    """
    words_1 = nltk.word_tokenize(sentence_1)
    words_2 = nltk.word_tokenize(sentence_2)
    joint_words = list(set(words_1).union(set(words_2)))
    windex = {x[1]: x[0] for x in enumerate(joint_words)}
    r1 = word_order_vector(words_1, joint_words, windex)
    r2 = word_order_vector(words_2, joint_words, windex)
    return 1.0 - (np.linalg.norm(r1 - r2) / np.linalg.norm(r1 + r2))

######################### overall similarity ##########################

def similarity(sentence_1, sentence_2, info_content_norm):
    """
    Calculate the semantic similarity between two sentences. The last 
    parameter is True or False depending on whether information content
    normalization is desired or not.
    """
    return DELTA * semantic_similarity(sentence_1, sentence_2, info_content_norm) + \
        (1.0 - DELTA) * word_order_similarity(sentence_1, sentence_2)
        
######################### main / test ##########################

# the results of the algorithm are largely dependent on the results of 
# the word similarities, so we should test this first...
word_pairs = [
  ["asylum", "fruit", 0.21],
  ["autograph", "shore", 0.29],
  ["autograph", "signature", 0.55],
  ["automobile", "car", 0.64],
  ["bird", "woodland", 0.33],
  ["boy", "rooster", 0.53],
  ["boy", "lad", 0.66],
  ["boy", "sage", 0.51],
  ["cemetery", "graveyard", 0.73],
  ["coast", "forest", 0.36],
  ["coast", "shore", 0.76],
  ["cock", "rooster", 1.00],
  ["cord", "smile", 0.33],
  ["cord", "string", 0.68],
  ["cushion", "pillow", 0.66],
  ["forest", "graveyard", 0.55],
  ["forest", "woodland", 0.70],
  ["furnace", "stove", 0.72],
  ["glass", "tumbler", 0.65],
  ["grin", "smile", 0.49],
  ["gem", "jewel", 0.83],
  ["hill", "woodland", 0.59],
  ["hill", "mound", 0.74],
  ["implement", "tool", 0.75],
  ["journey", "voyage", 0.52],
  ["magician", "oracle", 0.44],
  ["magician", "wizard", 0.65],
  ["midday", "noon", 1.0],
  ["oracle", "sage", 0.43],
  ["serf", "slave", 0.39]
]
for word_pair in word_pairs:
    print "%s\t%s\t%.2f\t%.2f" % (word_pair[0], word_pair[1], word_pair[2], 
                                  word_similarity(word_pair[0], word_pair[1]))

sentence_pairs = [
    ["I like that bachelor.", "I like that unmarried man.", 0.561],
    ["John is very nice.", "Is John very nice?", 0.977],
    ["Red alcoholic drink.", "A bottle of wine.", 0.585],
    ["Red alcoholic drink.", "Fresh orange juice.", 0.611],
    ["Red alcoholic drink.", "An English dictionary.", 0.0],
    ["Red alcoholic drink.", "Fresh apple juice.", 0.420],
    ["A glass of cider.", "A full cup of apple juice.", 0.678],
    ["It is a dog.", "That must be your dog.", 0.739],
    ["It is a dog.", "It is a log.", 0.623],
    ["It is a dog.", "It is a pig.", 0.790],
    ["Dogs are animals.", "They are common pets.", 0.738],
    ["Canis familiaris are animals.", "Dogs are common pets.", 0.362],
    ["I have a pen.", "Where do you live?", 0.0],
    ["I have a pen.", "Where is ink?", 0.129],
    ["I have a hammer.", "Take some nails.", 0.508],
    ["I have a hammer.", "Take some apples.", 0.121]
]
for sent_pair in sentence_pairs:
    print "%s\t%s\t%.3f\t%.3f\t%.3f" % (sent_pair[0], sent_pair[1], sent_pair[2], 
        similarity(sent_pair[0], sent_pair[1], False),
        similarity(sent_pair[0], sent_pair[1], True))
