In [1]:
import numpy as np
import pandas as pd
import os
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
import spacy
from spacy.language import Language

#ignore warnings
import warnings
warnings.filterwarnings("ignore")

In [2]:
#load datasets
english = pd.read_csv("english.csv")

In [3]:
nlp_eng = spacy.load("en_core_web_sm")

# Getting the dependency parse

In [4]:
def find_root_of_sentence(text):
    doc = nlp_eng(text)
    root_token = None
    for token in doc:
        if (token.dep_ == "ROOT"):
            root_token = token
    return root_token

In [5]:
all_root_tokens = english["lemma_eng_review"].apply(lambda x: find_root_of_sentence(x)) 
all_roots = all_root_tokens.apply(lambda x: x.text if len(x.text) >= 2 else None) #text means add str type instead token type

In [6]:
roots_df = all_roots.to_frame()
roots_df.rename(columns={"lemma_eng_review":"Root"}, inplace=True)
roots = roots_df["Root"].value_counts().sort_values(ascending=False).head(30)
roots = pd.DataFrame(roots).rename_axis("Root")
roots.columns = roots.columns.str.replace('Root', 'Count')
roots.reset_index(inplace=True)
roots

Unnamed: 0,Root,Count
0,be,1377
1,service,330
2,thank,326
3,have,318
4,buy,301
5,recommend,243
6,advice,135
7,go,111
8,staff,102
9,like,80


In [7]:
import plotly.graph_objects as go
import plotly.io as pio

In [17]:
fig = go.Figure([go.Bar(x=roots['Root'], y=roots['Count'])])
fig.update_layout(title=go.layout.Title(text="Top 30 Roots in the reviews text after lemmatization"))
pio.renderers.default = 'iframe'
fig.show()

In [19]:
def find_other_verbs(doc, root_token):
    other_verbs = []
    for token in doc:
        ancestors = list(token.ancestors)
        if (token.pos_ == "VERB" and len(ancestors) == 1 and ancestors[0] == root_token):
            other_verbs.append(token)
    return other_verbs

In [20]:
sentences = english["lemma_eng_review"].to_list()
verbs = []
for sentence in sentences:
    doc = nlp_eng(sentence)
    root_token = find_root_of_sentence(doc)
    verbs.append(find_other_verbs(doc, root_token))

In [21]:
other_verbs = [item.text for sublist in verbs for item in sublist] # run only one time
other_verbs_df = pd.DataFrame(other_verbs, columns=["Verbs"])
verbs = other_verbs_df["Verbs"].sort_values().value_counts().head(30)
verbs = pd.DataFrame(verbs).reset_index()
verbs.columns = verbs.columns.str.replace('Verbs', 'Count')
verbs.columns = verbs.columns.str.replace('index', 'Verbs')
verbs = verbs[verbs["Verbs"].str.len() >= 2]
verbs

Unnamed: 0,Verbs,Count
0,be,718
1,have,397
2,thank,309
3,buy,261
4,recommend,218
5,go,195
6,take,152
7,get,149
8,look,125
9,make,122


In [22]:
fig = go.Figure([go.Bar(x=verbs['Verbs'], y=verbs['Count'])])
fig.update_layout(title=go.layout.Title(text="Top 30 Verbs in the reviews text after lemmatization"))
pio.renderers.default = 'iframe'
fig.show()

In [23]:
def get_parse(sentence):
    doc = nlp_eng(sentence)
    adverb_list = []
    adjective_list = []
    noun_list = []
    conjunction_list = []
    for token in doc:
        word = token.text
        pos = token.pos_
        if pos == "ADV":
            adv_word = word
            adverb_list.append(adv_word)
        elif pos == "ADJ":
            adj_word = word
            adjective_list.append(adj_word)
        elif pos == "NOUN":
            noun = word
            noun_list.append(noun)
        elif pos == "CONJ" or pos == "CCONJ":  # coordinating conjunction
            conjunction = word
            conjunction_list.append(conjunction)
    return adverb_list, adjective_list, noun_list, conjunction_list

In [24]:
sentences = english["lemma_eng_review"].to_list()
adv_list = []
adj_list = []
noun_list = []
conj_list = []
for sentence in sentences:
    adverbs, adjectives, nouns, conjunctions = get_parse(sentence)
    adv_list.append(adverbs)
    adj_list.append(adjectives)
    noun_list.append(nouns)
    conj_list.append(conjunctions)

In [25]:
#creata a list from nested lists for adverb and adjectives
adjectives =[item for sublist in adj_list for item in sublist] 
adverbs = [item for sublist in adv_list for item in sublist]
nouns = [item for sublist in noun_list for item in sublist]
conjunctions = [item for sublist in conj_list for item in sublist]

In [26]:
import collections
adjective_freq = dict(collections.Counter(adjectives))
adverb_freq = dict(collections.Counter(adverbs))
nouns_freq = dict(collections.Counter(nouns))
conjunctions_freq = dict(collections.Counter(conjunctions))

In [27]:
sorted_adjectives = {k: v for k, v in sorted(adjective_freq.items(), key=lambda item: item[1], reverse=True)}
sorted_adverbs = {k: v for k, v in sorted(adverb_freq.items(), key=lambda item: item[1], reverse=True)}
sorted_nouns = {k: v for k, v in sorted(nouns_freq.items(), key=lambda item: item[1], reverse=True)}
sorted_conjunctions = {k: v for k, v in sorted(conjunctions_freq.items(), key=lambda item: item[1], reverse=True)}

In [28]:
#get first 30 items from dictionary to visualize most frequents
sorted_adjectives = list(sorted_adjectives.items())
adjectives = dict(sorted_adjectives[:30])

sorted_adverbs = list(sorted_adverbs.items())
adverbs = dict(sorted_adverbs[:30])

sorted_nouns = list(sorted_nouns.items())
nouns = dict(sorted_nouns[:30])

sorted_conjunctions = list(sorted_conjunctions.items())
conjunctions = dict(sorted_conjunctions[:7])

In [29]:
adverbs_df = pd.DataFrame({"Adverb":adverbs.keys(), "Frequency":adverbs.values()})
adjective_df = pd.DataFrame({"Adjective":adjectives.keys(), "Frequency":adjectives.values()})
nouns_df = pd.DataFrame({"Noun":nouns.keys(), "Frequency":nouns.values()})
conjunctions_df = pd.DataFrame({"Conjunction":conjunctions.keys(), "Frequency":conjunctions.values()})

In [30]:
fig = go.Figure([go.Bar(x=adverbs_df['Adverb'], y=adverbs_df['Frequency'])])
fig.update_layout(title=go.layout.Title(text="Top 30 Adverbs in the reviews text after lemmatization"))
pio.renderers.default = 'iframe'
fig.show()

In [31]:
fig = go.Figure([go.Bar(x=adjective_df['Adjective'], y=adjective_df['Frequency'])])
fig.update_layout(title=go.layout.Title(text="Top 30 Adjective in the reviews text after lemmatization"))
pio.renderers.default = 'iframe'
fig.show()

In [32]:
fig = go.Figure([go.Bar(x=nouns_df['Noun'], y=nouns_df['Frequency'])])
fig.update_layout(title=go.layout.Title(text="Top 30 Nouns in the reviews text after lemmatization"))
pio.renderers.default = 'iframe'
fig.show()

In [33]:
fig = go.Figure([go.Bar(x=conjunctions_df['Conjunction'], y=conjunctions_df['Frequency'])])
fig.update_layout(title=go.layout.Title(text="Top 7 Conjunctions in the reviews text after lemmatization"))
pio.renderers.default = 'iframe'
fig.show()

In [34]:
# the tokens which "service" depend on
def dependency(text):
    doc = nlp_eng(text)
    for token in doc:
        if token.text == "service":
            print(token.text)
            ancestors = [t.text for t in token.ancestors]
            print(ancestors)

In [35]:
english["lemma_eng_review"].apply(lambda x: dependency(x))


service
['get']
service
['appointment', 'get']
service
['on', 'appointment', 'get']
service
['employee']
service
['look']
service
['desk', 'at', 'lady', 'be', 'find']
service
[]
service
[]
service
['staff']
service
[]
service
['have', 'have']
service
[]
service
['advisor', 'be', 'know']
service
['advisor', 'distinguish', 'detail', 'know']
service
['anything', 'about', 'worry', 'have', 'keep', 'know']
service
['be', 'stepsince', 'inform', 'vehicle', 'pick', 'call', 'selfevidentfrom', 'fact', 'worry', 'have', 'keep', 'know']
service
['advisor', 'up', 'pick', 'get', 'know']
service
['with', 'satisfied', 'be', 'have', 'include']
service
['be', 'have', 'come', 'have', 'include']
service
['tüv', 'to', 'relate', 'handle', 'be', 'have', 'come', 'have', 'include']
service
['center', 'at', 'experience', 'after', 'thank', 'find']
service
['thank']
service
['be']
service
['advisor', 'pick', 'service', 'be']
service
['thank']
service
['scheduling', 'by', 'reach', 'easy', 'be', 'thank']
service
['be

0       None
1       None
2       None
3       None
4       None
        ... 
6514    None
6515    None
6516    None
6517    None
6518    None
Name: lemma_eng_review, Length: 6519, dtype: object

In [36]:
# the tokens which depens on "service"
def dependency_on(text):
    doc=nlp_eng(text)
    for token in doc:
        if token.text == "service":
            print(token.text)
            children = [t.text for t in token.children]
            print(children)

In [37]:
english["lemma_eng_review"].apply(lambda x: dependency_on(x))


service
['excellent', 'and', 'experience']
service
[]
service
['orient']
service
['good']
service
['amazing', 'kind', 'but', 'expensive']
service
[]
service
['good', 'people']
service
['good']
service
['good']
service
['great']
service
['the', 'and', 'quality']
service
['good']
service
[]
service
[]
service
['transparency', 'customer']
service
['my', 'customer']
service
[]
service
['the', 'at', 'in']
service
['the']
service
[]
service
['audi']
service
['excellent', 'by']
service
['good', 'from', 'break', 'and', 'be', 'pick']
service
[]
service
['competent']
service
['the', 'competent', 'courteous', 'at']
service
['great', 'be']
service
[]
service
[]
service
[]
service
['delivery', 'and']
service
['friendly', 'from', 'after', 'buy', '😀']
service
[]
service
['this', 'good']
service
['and', 'customer']
service
['the']
service
[]
service
['and', 'oil']
service
[]
service
['reliable', 'friendly', 'good', 'and', 'service']
service
[]
service
['great', 'and', 'staff']
service
['top', 'friendl

0       None
1       None
2       None
3       None
4       None
        ... 
6514    None
6515    None
6516    None
6517    None
6518    None
Name: lemma_eng_review, Length: 6519, dtype: object