<h3>1) Import Data</h3>

In [1]:
import glob
import string
from pathlib import Path

import pandas as pd

# Folder holding one CSV export per app; each file's stem becomes its "source" label.
path = r'data'
# glob returns matches in arbitrary order, so sort them (case-insensitively) to make
# the row order of the combined frame reproducible across runs and machines.
all_files = sorted(glob.glob(path + "/*.csv"), key=str.lower)

if not all_files:
    # Fail early with a readable message rather than letting pd.concat complain
    # about an empty list.
    raise FileNotFoundError(f"No CSV files found in {path!r}")

li = []

for filename in all_files:
    print(filename)
    # The first CSV column is the per-file row index; the first row is the header.
    file_df = pd.read_csv(filename, index_col=0, header=0)
    # Path.stem strips the directory and the extension on every platform, instead of
    # relying on a Windows-style backslash being present in the path.
    file_id = Path(filename).stem
    file_df["source"] = file_id
    li.append(file_df)

# Stack all files and renumber rows 0..n-1 so the per-file indexes do not collide.
df = pd.concat(li, axis=0, ignore_index=True)

data\amsterdamGVB.csv
data\amsterdamOVinfo.csv
data\barcelonaBus.csv
data\barcelonaTMB.csv
data\copenhagenRejseplanen.csv
data\dublinBus.csv
data\dublinNextBus.csv
data\dublinRTI.csv
data\lisbonCarris.csv
data\LondonMapWay.csv
data\LondonUrbanThings.csv
data\madridBus.csv
data\madridJSVM.csv


In [2]:
# (rows, columns) of the combined frame built from all loaded CSV files.
df.shape

(7468, 6)

<h3>2) Remove Punctuation</h3>

In [3]:
# Build the deletion table once: every character in string.punctuation is removed.
punctuation_table = str.maketrans("", "", string.punctuation)
# Missing reviews become "" so they are handled as empty text instead of raising
# on a float NaN; the .str accessor then strips punctuation from every row.
reviews = (
    df["review"]
    .fillna("")
    .astype(str)
    .str.translate(punctuation_table)
    .tolist()
)
df["review"] = reviews

In [4]:
# Preview the first rows to check the review column after punctuation removal.
df.head()

Unnamed: 0,name,rating,date,upvotes,review,source
0,A Google user,3,"September 30, 2018",20,Confusing for tourists with tourist tickets th...,amsterdamGVB
1,Ozgur Kalkan,3,"November 19, 2018",4,Its a good app more like an essential one but ...,amsterdamGVB
2,gabs igab,5,"November 7, 2019",2,Excellent app I have used it on our visit to A...,amsterdamGVB
3,Damian,5,"March 1, 2020",1,It works very well for traveling I use it ever...,amsterdamGVB
4,Pel Pan,1,"March 28, 2019",1,very bad accuracy and often not the shortest way,amsterdamGVB


In [5]:
# Shape check: the punctuation step only rewrites the review column, so the
# row and column counts should match the shape reported after loading.
df.shape

(7468, 6)

<h3>3) Add review length and remove short reviews</h3>

In [6]:
# Word count of each review, where a word is any whitespace-separated token.
review_length = [len(text.split()) for text in df.review]

In [7]:
# Assign the list directly so values are matched to rows by position. Wrapping it
# in pd.Series would align on index labels and silently yield NaN for any row
# whose label is not in 0..len-1.
df["review_length"] = review_length

In [8]:
# Keep only reviews that contain at least 10 words.
df = df.loc[df["review_length"].ge(10)]

In [9]:
# Rows left after dropping reviews under 10 words; review_length adds one column.
df.shape

(1863, 7)

<h3>4) Filter by good (5*) and bad (1*) experiences</h3>

In [10]:
# Subset of reviews rated exactly 5 stars.
df_five = df.loc[df["rating"].eq(5)]

In [11]:
# Total number of words across all 5-star reviews; used later to normalise counts.
five_length = df_five["review_length"].sum()
five_length

14656

In [12]:
# Subset of reviews rated exactly 1 star.
df_one = df.loc[df["rating"].eq(1)]

In [13]:
# Total number of words across all 1-star reviews; used later to normalise counts.
one_length = df_one["review_length"].sum()
one_length

15965

<h3>5) Extract frequently used words and calculate mentions per 1,000 words</h3>

In [14]:
# Pool all 5-star reviews into one lower-cased text, split it on whitespace,
# and count how often each word occurs.
five_star_words = " ".join(df_five["review"]).lower().split()
counts_five = pd.Series(five_star_words).value_counts()

In [15]:
# Map each word to [good, bad] occurrences per 1,000 words. The "good" slot comes
# from the 5-star counts; the "bad" slot starts at 0 and is filled in later.
word_freq = {
    token: [counts_five[token] / five_length * 1000, 0]
    for token in counts_five.index
}

In [16]:
# Pool all 1-star reviews into one lower-cased text, split it on whitespace,
# and count how often each word occurs.
one_star_words = " ".join(df_one["review"]).lower().split()
counts_one = pd.Series(one_star_words).value_counts()

In [17]:
# Fill the "bad" slot with each word's 1-star rate per 1,000 words. Words never
# seen in 5-star reviews get a fresh [0, 0] entry first, so their "good" rate is 0.
for word in counts_one.index:
    word_freq.setdefault(word, [0, 0])[1] = counts_one[word] / one_length * 1000

<h3>6) Insert data into DataFrame and calculate delta</h3>

In [18]:
# One row per word, with its 5-star ("good") and 1-star ("bad") rates as columns.
mentions = pd.DataFrame.from_dict(
    word_freq,
    orient='index',
    columns=['good', 'bad'],
)

In [19]:
# Positive delta: word is relatively more common in 5-star reviews; negative: in 1-star.
mentions["delta"] = mentions["good"].sub(mentions["bad"])

In [20]:
# Order words from most 1-star-skewed (lowest delta) to most 5-star-skewed.
sorted_mentions = mentions.sort_values("delta", ascending=True)

In [21]:
# The 50 words with the lowest delta, i.e. those skewed towards 1-star reviews.
sorted_mentions.head(50)

Unnamed: 0,good,bad,delta
not,3.548035,8.518635,-4.9706
was,1.569323,5.073598,-3.504275
at,3.479803,6.51425,-3.034446
doesnt,0.750546,3.319762,-2.569216
no,3.002183,5.512058,-2.509874
useless,0.0,2.31757,-2.31757
just,2.046943,4.259317,-2.212374
the,39.710699,41.778891,-2.068193
that,4.844432,6.890072,-2.04564
ads,0.409389,2.380207,-1.970818


In [22]:
# The 50 words with the highest delta, i.e. those skewed towards 5-star reviews.
sorted_mentions.tail(50)

Unnamed: 0,good,bad,delta
stops,3.548035,2.505481,1.042554
stop,4.503275,3.445036,1.058239
around,1.569323,0.501096,1.068227
with,4.980895,3.883495,1.0974
nice,1.228166,0.125274,1.102892
always,2.046943,0.939555,1.107388
travel,1.296397,0.187911,1.108486
recommend,1.364629,0.250548,1.114081
one,4.093886,2.94394,1.149947
handy,1.296397,0.062637,1.23376
