<h3>1) Import Data</h3>

In [1]:
import glob
import string
from pathlib import Path

import pandas as pd

# Folder that holds the review CSVs; every "*.csv" directly inside it is loaded.
path = r'data'

# glob yields files in an arbitrary, OS-dependent order. Sorting
# case-insensitively keeps the row order of the combined frame reproducible.
all_files = sorted(glob.glob(path + "/*.csv"), key=str.lower)
if not all_files:
    # Fail with a clear message instead of pd.concat's "No objects to concatenate".
    raise FileNotFoundError(f"No CSV files found in {path!r}")

li = []

for filename in all_files:
    print(filename)
    source_frame = pd.read_csv(filename, index_col=0, header=0)
    # Path.stem drops the directory and the final extension with either path
    # separator; splitting on "\\" only worked for Windows-style paths.
    file_id = Path(filename).stem
    # Tag every row with the file it came from.
    source_frame["source"] = file_id
    li.append(source_frame)

# Stack all files into one frame with a fresh 0..n-1 index.
df = pd.concat(li, axis=0, ignore_index=True)

data\amsterdamGVB.csv
data\amsterdamOVinfo.csv
data\barcelonaBus.csv
data\barcelonaTMB.csv
data\copenhagenRejseplanen.csv
data\dublinBus.csv
data\dublinNextBus.csv
data\dublinRTI.csv
data\lisbonCarris.csv
data\LondonMapWay.csv
data\LondonUrbanThings.csv
data\madridBus.csv
data\madridJSVM.csv


In [2]:
# Size of the combined frame as (rows, columns).
df.shape

(7468, 6)

<h3>2) Remove Punctuation</h3>

In [3]:
# The deletion table is the same for every review, so build it once rather
# than once per row.
punctuation_table = str.maketrans("", "", string.punctuation)

# Apply the table to the whole column through the .str accessor. Missing
# values pass through as NaN instead of raising AttributeError.
reviews = df["review"].str.translate(punctuation_table)
df["review"] = reviews

In [4]:
# Preview the first rows to check the review column after punctuation removal.
df.head()

Unnamed: 0,name,rating,date,upvotes,review,source
0,A Google user,3,"September 30, 2018",20,Confusing for tourists with tourist tickets th...,amsterdamGVB
1,Ozgur Kalkan,3,"November 19, 2018",4,Its a good app more like an essential one but ...,amsterdamGVB
2,gabs igab,5,"November 7, 2019",2,Excellent app I have used it on our visit to A...,amsterdamGVB
3,Damian,5,"March 1, 2020",1,It works very well for traveling I use it ever...,amsterdamGVB
4,Pel Pan,1,"March 28, 2019",1,very bad accuracy and often not the shortest way,amsterdamGVB


In [5]:
# Punctuation removal only rewrites the review column, so the (rows, columns)
# size should match the size right after loading.
df.shape

(7468, 6)

<h3>3) Add review length and remove short reviews</h3>

In [6]:
# Character count of every review, in row order.
review_length=[len(text) for text in df["review"]]

In [7]:
# Column assignment aligns a Series on index labels. A bare pd.Series(list)
# gets a fresh 0..n-1 index, which misassigns values (and inserts NaN) whenever
# df's index is not contiguous, e.g. when this cell is re-run after the
# short-review filter. Reusing df.index keeps the assignment positional and
# raises if the lengths do not match.
df["review_length"]=pd.Series(review_length, index=df.index)

In [8]:
# Minimum number of characters a review must have to be kept.
MIN_REVIEW_LENGTH = 20

# Keep only reviews that reach the minimum length.
df=df[df.review_length>=MIN_REVIEW_LENGTH]

In [9]:
# Rows left after the length filter; review_length adds one column.
df.shape

(3580, 7)

<h3>4) Filter by good (5*) and bad (1*) experiences</h3>

In [10]:
# Subset of reviews rated exactly five stars (the "good" experiences).
df_five=df.loc[df["rating"].eq(5)]

In [11]:
# Number of five-star reviews; the denominator for the per-review word rates.
five_length=len(df_five)
five_length

1534

In [12]:
# Subset of reviews rated exactly one star (the "bad" experiences).
df_one=df.loc[df["rating"].eq(1)]

In [13]:
# Number of one-star reviews; the denominator for the per-review word rates.
one_length=len(df_one)
one_length

737

<h3>5) Extract frequently used words and calculate mentions per comment</h3>

In [14]:
# Join all five-star reviews into one lower-cased string, split it on
# whitespace, and count how often each token occurs overall.
five_star_text = ' '.join(df_five['review']).lower()
five_star_tokens = five_star_text.split()
counts_five = pd.Series(five_star_tokens).value_counts()

In [15]:
# Map each word seen in five-star reviews to [good_rate, bad_rate], where
# good_rate is total occurrences divided by the number of five-star reviews.
# bad_rate starts at 0 and is filled in from the one-star counts.
word_freq={
    word: [counts_five[word]/five_length, 0]
    for word in counts_five.index
}

In [16]:
# Join all one-star reviews into one lower-cased string, split it on
# whitespace, and count how often each token occurs overall.
one_star_text = ' '.join(df_one['review']).lower()
one_star_tokens = one_star_text.split()
counts_one = pd.Series(one_star_tokens).value_counts()

In [17]:
# Fill in the second slot (bad_rate) for every word seen in one-star reviews.
# Words missing from the five-star set get a good_rate of 0.
for word in counts_one.index:
    bad_rate = counts_one[word]/one_length
    # setdefault returns the existing [good, bad] pair, or inserts a zeroed one.
    word_freq.setdefault(word, [0, 0])[1] = bad_rate

<h3>6) Insert data into DataFrame and calculate delta</h3>

In [18]:
# One row per word: its rate in five-star ("good") and one-star ("bad") reviews.
mentions = pd.DataFrame.from_dict(
    word_freq,
    orient='index',
    columns=['good', 'bad'],
)

In [19]:
# Positive delta: the word is used more in good reviews; negative: more in bad.
mentions['delta'] = mentions['good'] - mentions['bad']

In [20]:
# Order words from most "bad-leaning" (lowest delta) to most "good-leaning".
sorted_mentions = mentions.sort_values('delta', ascending=True)

In [21]:
# The 50 words with the lowest delta, i.e. the most skewed towards one-star reviews.
sorted_mentions.head(50)

Unnamed: 0,good,bad,delta
the,0.473272,0.947083,-0.47381
to,0.370926,0.618725,-0.247799
a,0.210561,0.446404,-0.235844
it,0.307692,0.533243,-0.225551
not,0.040417,0.234735,-0.194318
i,0.278357,0.470828,-0.19247
this,0.180574,0.299864,-0.119291
is,0.202086,0.320217,-0.118131
of,0.110169,0.225237,-0.115068
at,0.036506,0.150611,-0.114105


In [22]:
# The 50 words with the highest delta, i.e. the most skewed towards five-star reviews.
sorted_mentions.tail(50)

Unnamed: 0,good,bad,delta
found,0.01369,0.005427,0.008262
travelling,0.009778,0.001357,0.008422
super,0.009778,0.001357,0.008422
een,0.008475,0.0,0.008475
what,0.04824,0.039349,0.008891
application,0.013038,0.004071,0.008967
tfi,0.009778,0.0,0.009778
job,0.013038,0.002714,0.010324
life,0.011734,0.001357,0.010377
around,0.021512,0.010855,0.010658
