In [58]:
from pymongo import MongoClient
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn import metrics

import nltk
nltk.download('stopwords')
from nltk.tokenize import word_tokenize


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\FF0x\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [59]:
client = MongoClient(port=27017)
db = client.trip_advisor
collection = db.reviews_new
df = pd.DataFrame(list(collection.find()))

In [60]:
df.head()

Unnamed: 0,_id,poi_name,title,date,review_rating,text,date_of_visit,reviewer
0,"{'id': '822488814', 'poi_location_id': '380729...",To Tsai Thessaloniki,"Delicious tea, for tea lovers","December 19, 2021",50,"If you are a tea lover , this is the place to ...",December 2021,"{'name': 'Stefanos B', 'handle': 'stefanosb114..."
1,"{'id': '821022146', 'poi_location_id': '107988...",Koukos,Tasty croissants,"December 4, 2021",50,"The place has incredible decoration, very uniq...",October 2021,"{'name': 'Jimmys', 'handle': 'Poliorkitis96', ..."
2,"{'id': '831588926', 'poi_location_id': '151171...",Judah Club,WOW!,"March 18, 2022",50,Store and cafe - 5/5. Food and food presentati...,March 2022,"{'name': 'Reza S', 'handle': '392rezas', 'cont..."
3,"{'id': '793282537', 'poi_location_id': '380729...",To Tsai Thessaloniki,Awesome place!,"June 20, 2021",50,"The staff was amazing, the tea sublime and the...",June 2021,"{'name': 'Dee', 'handle': 'deesvs', 'location'..."
4,"{'id': '802555010', 'poi_location_id': '107988...",Koukos,Great pastry and sandwich shop,"August 8, 2021",50,Went to this shop out of curiosity and to try ...,August 2021,"{'name': 'Stefan Laurentiu', 'handle': 'stefan..."


In [61]:
# Create subset of users with age group info
age_subset = []

for item in collection.find():
    reviewer = item['reviewer']
    try:
        age_group = reviewer['age']
        age_subset.append(item)
    except:
        None

len(age_subset)

331

In [62]:
# Create subset of users with gender info
gender_subset = []

for item in collection.find():
    reviewer = item['reviewer']
    try:
        gender_group = reviewer['sex']
        gender_subset.append(reviewer)
        
    except:
        None

len(gender_subset)

419

In [63]:
ratings = ['1','2','3','4','5']

ratings_count = []
for rating in ratings:
    myquery = { "review_rating": { "$regex": f"^{rating}" } }
    count = collection.count_documents(myquery)
    ratings_count.append(count)
    print(rating, ': ', count)

print(ratings_count)

# negative: ratings below 4
# neutral: ratings equal to 4
# positive: ratings equal to 5

neutral_range = {"low": 40, "high": 50}
df["Sentiment"] = "neutral"
df["Sentiment"].loc[df["review_rating"].astype(int) < neutral_range["low"]] = "negative"
df["Sentiment"].loc[df["review_rating"].astype(int) >= neutral_range["high"]] = "positive"
df


1 :  43
2 :  24
3 :  74
4 :  271
5 :  840
[43, 24, 74, 271, 840]


Unnamed: 0,_id,poi_name,title,date,review_rating,text,date_of_visit,reviewer,Sentiment
0,"{'id': '822488814', 'poi_location_id': '380729...",To Tsai Thessaloniki,"Delicious tea, for tea lovers","December 19, 2021",50,"If you are a tea lover , this is the place to ...",December 2021,"{'name': 'Stefanos B', 'handle': 'stefanosb114...",positive
1,"{'id': '821022146', 'poi_location_id': '107988...",Koukos,Tasty croissants,"December 4, 2021",50,"The place has incredible decoration, very uniq...",October 2021,"{'name': 'Jimmys', 'handle': 'Poliorkitis96', ...",positive
2,"{'id': '831588926', 'poi_location_id': '151171...",Judah Club,WOW!,"March 18, 2022",50,Store and cafe - 5/5. Food and food presentati...,March 2022,"{'name': 'Reza S', 'handle': '392rezas', 'cont...",positive
3,"{'id': '793282537', 'poi_location_id': '380729...",To Tsai Thessaloniki,Awesome place!,"June 20, 2021",50,"The staff was amazing, the tea sublime and the...",June 2021,"{'name': 'Dee', 'handle': 'deesvs', 'location'...",positive
4,"{'id': '802555010', 'poi_location_id': '107988...",Koukos,Great pastry and sandwich shop,"August 8, 2021",50,Went to this shop out of curiosity and to try ...,August 2021,"{'name': 'Stefan Laurentiu', 'handle': 'stefan...",positive
...,...,...,...,...,...,...,...,...,...
1247,"{'id': '535123128', 'poi_location_id': '381560...",Iktinou Au Trottoir,a must for narrow street lover,"October 22, 2017",40,"hiden in the streets of thessaloniki, this caf...",August 2017,"{'name': 'Joseph G', 'handle': 'drjosephgerges...",neutral
1248,"{'id': '412225575', 'poi_location_id': '381560...",Iktinou Au Trottoir,Slow service,"August 29, 2016",20,Tables not cleaned after previous guests. Unat...,August 2016,"{'name': 'Orc_L', 'handle': 'Orc_L', 'location...",negative
1249,"{'id': '230095138', 'poi_location_id': '381560...",Iktinou Au Trottoir,Lovely,"September 21, 2014",50,Leisurely place for drinks... don't miss it. E...,,"{'name': 'LGedik', 'handle': 'LGedik', 'locati...",positive
1250,"{'id': '184435157', 'poi_location_id': '381560...",Iktinou Au Trottoir,Nice place to chill out,"November 11, 2013",50,We found this cafe-bar by chance and were happ...,November 2013,"{'name': 'Paula S', 'handle': 'psaldanaf', 'co...",positive


In [64]:
# Preprocessing text data ( Tokenization, removing stop words,lower case conversion) 

#for item in collection.find():
 #   text = item['text']
  #  text_tokens = word_tokenize(text)
   # tokens_without_sw = [word for word in text_tokens if not word in stopwords.words()]   

    #filtered_sentence = (" ").join(tokens_without_sw).lower()
    #df["filtered_text"] = filtered_sentence
    #print(filtered_sentence)



In [65]:
df

Unnamed: 0,_id,poi_name,title,date,review_rating,text,date_of_visit,reviewer,Sentiment
0,"{'id': '822488814', 'poi_location_id': '380729...",To Tsai Thessaloniki,"Delicious tea, for tea lovers","December 19, 2021",50,"If you are a tea lover , this is the place to ...",December 2021,"{'name': 'Stefanos B', 'handle': 'stefanosb114...",positive
1,"{'id': '821022146', 'poi_location_id': '107988...",Koukos,Tasty croissants,"December 4, 2021",50,"The place has incredible decoration, very uniq...",October 2021,"{'name': 'Jimmys', 'handle': 'Poliorkitis96', ...",positive
2,"{'id': '831588926', 'poi_location_id': '151171...",Judah Club,WOW!,"March 18, 2022",50,Store and cafe - 5/5. Food and food presentati...,March 2022,"{'name': 'Reza S', 'handle': '392rezas', 'cont...",positive
3,"{'id': '793282537', 'poi_location_id': '380729...",To Tsai Thessaloniki,Awesome place!,"June 20, 2021",50,"The staff was amazing, the tea sublime and the...",June 2021,"{'name': 'Dee', 'handle': 'deesvs', 'location'...",positive
4,"{'id': '802555010', 'poi_location_id': '107988...",Koukos,Great pastry and sandwich shop,"August 8, 2021",50,Went to this shop out of curiosity and to try ...,August 2021,"{'name': 'Stefan Laurentiu', 'handle': 'stefan...",positive
...,...,...,...,...,...,...,...,...,...
1247,"{'id': '535123128', 'poi_location_id': '381560...",Iktinou Au Trottoir,a must for narrow street lover,"October 22, 2017",40,"hiden in the streets of thessaloniki, this caf...",August 2017,"{'name': 'Joseph G', 'handle': 'drjosephgerges...",neutral
1248,"{'id': '412225575', 'poi_location_id': '381560...",Iktinou Au Trottoir,Slow service,"August 29, 2016",20,Tables not cleaned after previous guests. Unat...,August 2016,"{'name': 'Orc_L', 'handle': 'Orc_L', 'location...",negative
1249,"{'id': '230095138', 'poi_location_id': '381560...",Iktinou Au Trottoir,Lovely,"September 21, 2014",50,Leisurely place for drinks... don't miss it. E...,,"{'name': 'LGedik', 'handle': 'LGedik', 'locati...",positive
1250,"{'id': '184435157', 'poi_location_id': '381560...",Iktinou Au Trottoir,Nice place to chill out,"November 11, 2013",50,We found this cafe-bar by chance and were happ...,November 2013,"{'name': 'Paula S', 'handle': 'psaldanaf', 'co...",positive


In [66]:
dataf = pd.DataFrame(gender_subset)


print(dataf.isnull().sum())

name                0
handle              0
location            0
sex                 0
contributions      26
cities_visited     26
age                98
helpful_votes      50
photo             149
distribution       20
dtype: int64


In [67]:
updated_df = dataf.dropna(subset=['age'])
updated_df = updated_df.reset_index(drop= True)
updated_df['contributions']=updated_df['contributions'].fillna(updated_df['contributions'].mean())
updated_df['cities_visited']=updated_df['cities_visited'].fillna(updated_df['cities_visited'].mean())
updated_df['helpful_votes']=updated_df['helpful_votes'].fillna(updated_df['helpful_votes'].mean())
updated_df['photo']=updated_df['photo'].fillna(updated_df['photo'].mean())
updated_df.dropna(subset=['distribution'], inplace=True)
updated_df = updated_df.reset_index(drop= True)
#updated_df.info()

print(updated_df.isnull().sum())

name              0
handle            0
location          0
sex               0
contributions     0
cities_visited    0
age               0
helpful_votes     0
photo             0
distribution      0
dtype: int64


In [68]:
# Deal with Distributions 

from sklearn.feature_extraction import DictVectorizer
vec = DictVectorizer()

#updated_df['distribution'].head()
vec_df = vec.fit_transform(updated_df['distribution']).toarray()

# creating a list of column names
column_values = vec.get_feature_names()

vec_df = pd.DataFrame(data = vec_df,columns= column_values)

updated_df = updated_df.join(vec_df).drop(columns=['distribution'])

In [69]:
updated_df

Unnamed: 0,name,handle,location,sex,contributions,cities_visited,age,helpful_votes,photo,Average,Excellent,Poor,Terrible,Very Good
0,ClaireMWrangsjo,ClaireMWrangsjo,"Stockholm, Sweden",woman,5.000000,141.000000,35-49,1.000000,1.000000,0.0,3.0,0.0,0.0,1.0
1,Ken,KenS990,"Stockholm, Sweden",man,61.000000,24.000000,35-49,42.000000,179.000000,5.0,35.0,1.0,0.0,20.0
2,MyIsla,MyIsla,"Thessaloniki, Greece",woman,87.000000,114.000000,35-49,56.000000,114.000000,11.0,52.0,4.0,2.0,18.0
3,Dekker51,Dekker51,"THESSALONIKI, GREECE",man,88.000000,131.000000,25-34,59.000000,18.000000,13.0,22.0,2.0,0.0,51.0
4,enterjim,enterjim,"Thessaloniki, Greece",man,24.000000,52.000000,35-49,5.000000,5.000000,0.0,22.0,1.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
304,daw75,daw75,"Berlin, Germany",man,63.000000,111.000000,35-49,39.000000,11.000000,8.0,21.0,0.0,0.0,34.0
305,Carol A S,carolas936,"Marietta, Georgia",woman,157.057432,124.533784,50-64,81.926316,123.130233,285.0,334.0,20.0,1.0,781.0
306,Panagiotis Hatziioannou,P_Hatz98,"Thessaloniki, Greece",man,9.000000,2.000000,18-24,1.000000,2.000000,2.0,4.0,0.0,0.0,3.0
307,Evita S,evitaS_12,"Thessaloniki, Greece",woman,905.000000,204.000000,35-49,81.926316,123.130233,107.0,544.0,38.0,28.0,189.0


In [70]:
# We will use one-hot-encoding to convert categorical data to numerical data
categorical_data_cols = ['age']

onehot_encoder = OneHotEncoder()

onehot_encoder_df = pd.DataFrame(onehot_encoder.fit_transform(updated_df[categorical_data_cols]).toarray())

updated_df = updated_df.join(onehot_encoder_df).drop(columns=categorical_data_cols)
updated_df



Unnamed: 0,name,handle,location,sex,contributions,cities_visited,helpful_votes,photo,Average,Excellent,Poor,Terrible,Very Good,0,1,2,3,4
0,ClaireMWrangsjo,ClaireMWrangsjo,"Stockholm, Sweden",woman,5.000000,141.000000,1.000000,1.000000,0.0,3.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1,Ken,KenS990,"Stockholm, Sweden",man,61.000000,24.000000,42.000000,179.000000,5.0,35.0,1.0,0.0,20.0,0.0,0.0,1.0,0.0,0.0
2,MyIsla,MyIsla,"Thessaloniki, Greece",woman,87.000000,114.000000,56.000000,114.000000,11.0,52.0,4.0,2.0,18.0,0.0,0.0,1.0,0.0,0.0
3,Dekker51,Dekker51,"THESSALONIKI, GREECE",man,88.000000,131.000000,59.000000,18.000000,13.0,22.0,2.0,0.0,51.0,0.0,1.0,0.0,0.0,0.0
4,enterjim,enterjim,"Thessaloniki, Greece",man,24.000000,52.000000,5.000000,5.000000,0.0,22.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
304,daw75,daw75,"Berlin, Germany",man,63.000000,111.000000,39.000000,11.000000,8.0,21.0,0.0,0.0,34.0,0.0,0.0,1.0,0.0,0.0
305,Carol A S,carolas936,"Marietta, Georgia",woman,157.057432,124.533784,81.926316,123.130233,285.0,334.0,20.0,1.0,781.0,0.0,0.0,0.0,1.0,0.0
306,Panagiotis Hatziioannou,P_Hatz98,"Thessaloniki, Greece",man,9.000000,2.000000,1.000000,2.000000,2.0,4.0,0.0,0.0,3.0,1.0,0.0,0.0,0.0,0.0
307,Evita S,evitaS_12,"Thessaloniki, Greece",woman,905.000000,204.000000,81.926316,123.130233,107.0,544.0,38.0,28.0,189.0,0.0,0.0,1.0,0.0,0.0


In [71]:
X = updated_df.drop(['sex','name','handle','location'], axis = 1)
y = updated_df['sex']

In [72]:
X

Unnamed: 0,contributions,cities_visited,helpful_votes,photo,Average,Excellent,Poor,Terrible,Very Good,0,1,2,3,4
0,5.000000,141.000000,1.000000,1.000000,0.0,3.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1,61.000000,24.000000,42.000000,179.000000,5.0,35.0,1.0,0.0,20.0,0.0,0.0,1.0,0.0,0.0
2,87.000000,114.000000,56.000000,114.000000,11.0,52.0,4.0,2.0,18.0,0.0,0.0,1.0,0.0,0.0
3,88.000000,131.000000,59.000000,18.000000,13.0,22.0,2.0,0.0,51.0,0.0,1.0,0.0,0.0,0.0
4,24.000000,52.000000,5.000000,5.000000,0.0,22.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
304,63.000000,111.000000,39.000000,11.000000,8.0,21.0,0.0,0.0,34.0,0.0,0.0,1.0,0.0,0.0
305,157.057432,124.533784,81.926316,123.130233,285.0,334.0,20.0,1.0,781.0,0.0,0.0,0.0,1.0,0.0
306,9.000000,2.000000,1.000000,2.000000,2.0,4.0,0.0,0.0,3.0,1.0,0.0,0.0,0.0,0.0
307,905.000000,204.000000,81.926316,123.130233,107.0,544.0,38.0,28.0,189.0,0.0,0.0,1.0,0.0,0.0


In [73]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

In [74]:
# Pipeline Estimator
#
pipeline = make_pipeline(LogisticRegression(random_state=1))
#
# Fit the model
#
pipeline.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Pipeline(steps=[('logisticregression', LogisticRegression(random_state=1))])

In [75]:
y_predicted = pipeline.predict(X_test)

In [76]:
Accuracy_score = metrics.accuracy_score(y_test, y_predicted)
print(Accuracy_score)

0.46774193548387094
