In [17]:
from pymongo import MongoClient
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import torch
from torch import nn
from torch import optim
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader, Dataset

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\FF0x\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
client = MongoClient(port=27017)
db = client.trip_advisor
collection = db.reviews
df = pd.DataFrame(list(collection.find()))

In [3]:
df.head()

Unnamed: 0,_id,poi_name,title,date,review_rating,text,date_of_visit,reviewer
0,"{'id': '822488814', 'poi_location_id': '380729...",To Tsai Thessaloniki,"Delicious tea, for tea lovers","December 19, 2021",50,"If you are a tea lover , this is the place to ...",December 2021,"{'name': 'Stefanos B', 'handle': 'stefanosb114..."
1,"{'id': '793282537', 'poi_location_id': '380729...",To Tsai Thessaloniki,Awesome place!,"June 20, 2021",50,"The staff was amazing, the tea sublime and the...",June 2021,"{'name': 'Dee', 'handle': 'deesvs', 'location'..."
2,"{'id': '757693667', 'poi_location_id': '380729...",To Tsai Thessaloniki,Detox,"June 19, 2020",50,"To Tsai is a wonderful place, calm, quiet, rel...",June 2020,"{'name': 'Milica Komadinic', 'handle': 'Milica..."
3,"{'id': '736608512', 'poi_location_id': '380729...",To Tsai Thessaloniki,Very good experience,"January 3, 2020",50,It is a non smoking shop. The Professionel is ...,January 2020,"{'name': 'Theodosios S', 'handle': 'theodosios..."
4,"{'id': '687226738', 'poi_location_id': '380729...",To Tsai Thessaloniki,Best break,"July 6, 2019",50,"Any kind of tea, hot or cold, all delicious. H...",July 2019,"{'name': 'ClaireMWrangsjo', 'handle': 'ClaireM..."


In [4]:
# Create subset of users with age group info
age_subset = []

for item in collection.find():
    reviewer = item['reviewer']
    try:
        age_group = reviewer['age']
        age_subset.append(item)
    except:
        None

len(age_subset)

170

In [5]:
# Create subset of users with gender info
gender_subset = []

for item in collection.find():
    reviewer = item['reviewer']
    try:
        gender_group = reviewer['sex']
        gender_subset.append(reviewer)
        
    except:
        None

len(gender_subset)

213

In [6]:

ratings = ['1','2','3','4','5']

ratings_count = []
for rating in ratings:
    myquery = { "review_rating": { "$regex": f"^{rating}" } }
    count = collection.count_documents(myquery)
    ratings_count.append(count)
    print(rating, ': ', count)

print(ratings_count)

# negative: ratings below 4
# neutral: ratings equal to 4
# positive: ratings equal to 5

neutral_range = {"low": 40, "high": 50}
df["Sentiment"] = "neutral"
df["Sentiment"].loc[df["review_rating"].astype(int) < neutral_range["low"]] = "negative"
df["Sentiment"].loc[df["review_rating"].astype(int) >= neutral_range["high"]] = "positive"
df


1 :  14
2 :  5
3 :  28
4 :  121
5 :  524
[14, 5, 28, 121, 524]


Unnamed: 0,_id,poi_name,title,date,review_rating,text,date_of_visit,reviewer,Sentiment
0,"{'id': '822488814', 'poi_location_id': '380729...",To Tsai Thessaloniki,"Delicious tea, for tea lovers","December 19, 2021",50,"If you are a tea lover , this is the place to ...",December 2021,"{'name': 'Stefanos B', 'handle': 'stefanosb114...",positive
1,"{'id': '793282537', 'poi_location_id': '380729...",To Tsai Thessaloniki,Awesome place!,"June 20, 2021",50,"The staff was amazing, the tea sublime and the...",June 2021,"{'name': 'Dee', 'handle': 'deesvs', 'location'...",positive
2,"{'id': '757693667', 'poi_location_id': '380729...",To Tsai Thessaloniki,Detox,"June 19, 2020",50,"To Tsai is a wonderful place, calm, quiet, rel...",June 2020,"{'name': 'Milica Komadinic', 'handle': 'Milica...",positive
3,"{'id': '736608512', 'poi_location_id': '380729...",To Tsai Thessaloniki,Very good experience,"January 3, 2020",50,It is a non smoking shop. The Professionel is ...,January 2020,"{'name': 'Theodosios S', 'handle': 'theodosios...",positive
4,"{'id': '687226738', 'poi_location_id': '380729...",To Tsai Thessaloniki,Best break,"July 6, 2019",50,"Any kind of tea, hot or cold, all delicious. H...",July 2019,"{'name': 'ClaireMWrangsjo', 'handle': 'ClaireM...",positive
...,...,...,...,...,...,...,...,...,...
684,"{'id': '783857174', 'poi_location_id': '189808...",Crema Roastery_Premium Breakfast,Καφεδάκι και γλυκάκι!,"March 2, 2021",50,Ωραία επιλογή να πάρεις το καφεδάκι σου και φυ...,March 2021,"{'name': 'Maria Zaf', 'handle': 'marizafe', 'l...",positive
685,"{'id': '773912541', 'poi_location_id': '189808...",Crema Roastery_Premium Breakfast,Καταπληκτικό,"October 11, 2020",50,Εξαιρετικός καφές και πεντανόστιμα φρέσκα γλυκ...,October 2020,"{'name': 'Anthimos R', 'handle': 'anthimosr', ...",positive
686,"{'id': '749923509', 'poi_location_id': '189808...",Crema Roastery_Premium Breakfast,το απολυτο μερος για ενα διαλλειμα,"March 8, 2020",50,εξαιρετικος καφες και φοβερο πρωινο...ενα μερο...,November 2019,"{'name': 'tasos m', 'handle': 'tasosm387', 'lo...",positive
687,"{'id': '749237897', 'poi_location_id': '189808...",Crema Roastery_Premium Breakfast,"Ό,τι καλύτερο στην Θεσσαλονίκη!!!!","March 5, 2020",50,Πολύ ωραίος χώρος!! Νόστιμα κ φρέσκα όλα! Άψογ...,March 2020,"{'name': 'Samira S', 'handle': '82samiras', 'l...",positive


In [19]:
# Preprocessing text data ( Tokenization, removing stop words,lower case conversion) 

for item in collection.find():
    text = item['text']
    text_tokens = word_tokenize(text)
    tokens_without_sw = [word for word in text_tokens if not word in stopwords.words()]   

    filtered_sentence = (" ").join(tokens_without_sw).lower()

    print(filtered_sentence)



if tea lover , place visit thessaloniki . great variety , nice atmosphere , perfect service .
the staff amazing , tea sublime ambience great ! i got yerba mate came back next day iced vanilla matcha latte . both teas good i n't feel like anything except drinking enjoying experience
to tsai wonderful place , calm , quiet , relaxing music , friendly staff , great variety teas - warm recommendations !
it smoking shop . the professionel polite . in general 's good place visit friends family alone .
any kind tea , hot cold , delicious . heat survival iced tea ( i recommend ask regular strong , black , lemon honey ) . friendly staff . go !
this best the-places i ’ ! if like place visit . buy really nice japanese bancha jasmine . or relax great cup .
no smoking aloud ! friendly environment ! friendly employees ! many tea choices ! you buy tea home ! also teapots .... amazing ! ! ! nice decor ! music bit loud conversation without yelling ! ! ! it ’ gem city center ! convenient break seeing sho

In [20]:
dataf = pd.DataFrame(gender_subset)

print(dataf.isnull().sum())

name               0
handle             0
location           0
age               50
sex                0
contributions     10
cities_visited    10
helpful_votes     19
photo             69
distribution      10
dtype: int64


In [21]:
dataf.dropna(subset=['age'], inplace= True)
dataf

Unnamed: 0,name,handle,location,age,sex,contributions,cities_visited,helpful_votes,photo,distribution
0,ClaireMWrangsjo,ClaireMWrangsjo,"Stockholm, Sweden",35-49,woman,5.0,141.0,1.0,1.0,"{'Excellent': 3, 'Very Good': 1, 'Average': 0,..."
1,Ken,KenS990,"Stockholm, Sweden",35-49,man,61.0,24.0,41.0,179.0,"{'Excellent': 35, 'Very Good': 20, 'Average': ..."
2,enterjim,enterjim,"Thessaloniki, Greece",35-49,man,24.0,52.0,5.0,5.0,"{'Excellent': 22, 'Very Good': 1, 'Average': 0..."
3,lefki8,lefki8,"Thessaloniki, Greece",25-34,woman,5.0,28.0,2.0,,"{'Excellent': 5, 'Very Good': 1, 'Average': 0,..."
6,Bonviveuse,Bonviveuse,"London, United Kingdom",35-49,woman,42.0,23.0,17.0,,"{'Excellent': 25, 'Very Good': 7, 'Average': 5..."
...,...,...,...,...,...,...,...,...,...,...
207,vivi s,vivis320,"Thessaloniki, Greece",35-49,woman,75.0,284.0,96.0,,"{'Excellent': 48, 'Very Good': 23, 'Average': ..."
208,jmal127,jmal127,"Thessaloniki Region, Greece",35-49,man,140.0,97.0,117.0,15.0,"{'Excellent': 96, 'Very Good': 33, 'Average': ..."
209,REDMAN,REDMAN5314,"Egaleo, Greece",35-49,man,,,,,
211,tasos m,tasosm387,"Thessaloniki, Greece",25-34,man,71.0,23.0,22.0,,"{'Excellent': 55, 'Very Good': 6, 'Average': 9..."


In [26]:
# We will use one-hot-encoding to convert categorical data to numerical data
categorical_data_cols = ['age']

onehot_encoder = OneHotEncoder()

onehot_encoder_df = pd.DataFrame(onehot_encoder.fit_transform(dataf[categorical_data_cols]).toarray())

dataf = dataf.join(onehot_encoder_df).drop(columns=categorical_data_cols)

dataf



Unnamed: 0,name,handle,location,sex,contributions,cities_visited,helpful_votes,photo,distribution,0,1,2,3,4
0,ClaireMWrangsjo,ClaireMWrangsjo,"Stockholm, Sweden",woman,5.0,141.0,1.0,1.0,"{'Excellent': 3, 'Very Good': 1, 'Average': 0,...",0.0,0.0,1.0,0.0,0.0
1,Ken,KenS990,"Stockholm, Sweden",man,61.0,24.0,41.0,179.0,"{'Excellent': 35, 'Very Good': 20, 'Average': ...",0.0,0.0,1.0,0.0,0.0
2,enterjim,enterjim,"Thessaloniki, Greece",man,24.0,52.0,5.0,5.0,"{'Excellent': 22, 'Very Good': 1, 'Average': 0...",0.0,0.0,1.0,0.0,0.0
3,lefki8,lefki8,"Thessaloniki, Greece",woman,5.0,28.0,2.0,,"{'Excellent': 5, 'Very Good': 1, 'Average': 0,...",0.0,1.0,0.0,0.0,0.0
6,Bonviveuse,Bonviveuse,"London, United Kingdom",woman,42.0,23.0,17.0,,"{'Excellent': 25, 'Very Good': 7, 'Average': 5...",0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
187,Vasiliki63,Vasiliki63,"Thessaloniki, Greece",woman,47.0,257.0,31.0,12.0,"{'Excellent': 8, 'Very Good': 31, 'Average': 7...",,,,,
188,Maria13Stella,Maria13Stella,"Athens, Greece",woman,43.0,35.0,21.0,7.0,"{'Excellent': 24, 'Very Good': 14, 'Average': ...",,,,,
189,fotsam,fotsam,Thessaloniki,man,103.0,48.0,52.0,1.0,"{'Excellent': 20, 'Very Good': 61, 'Average': ...",,,,,
190,Haris Grigoriou,harisgrigoriou,"Thessaloniki, Greece",man,21.0,25.0,11.0,15.0,"{'Excellent': 11, 'Very Good': 8, 'Average': 1...",,,,,


In [27]:
from sklearn.feature_extraction import DictVectorizer
vec = DictVectorizer()

dataf['distribution']
#print("Feature Matrix: "); print(vec.fit_transform(dataf['distribution']).toarray())

#print("Feature Name: "); print(vec.get_feature_names())

0      {'Excellent': 3, 'Very Good': 1, 'Average': 0,...
1      {'Excellent': 35, 'Very Good': 20, 'Average': ...
2      {'Excellent': 22, 'Very Good': 1, 'Average': 0...
3      {'Excellent': 5, 'Very Good': 1, 'Average': 0,...
6      {'Excellent': 25, 'Very Good': 7, 'Average': 5...
                             ...                        
187    {'Excellent': 8, 'Very Good': 31, 'Average': 7...
188    {'Excellent': 24, 'Very Good': 14, 'Average': ...
189    {'Excellent': 20, 'Very Good': 61, 'Average': ...
190    {'Excellent': 11, 'Very Good': 8, 'Average': 1...
191    {'Excellent': 19, 'Very Good': 31, 'Average': ...
Name: distribution, Length: 145, dtype: object

In [28]:
X = dataf.drop(['sex','name','handle','location'], axis = 1)
y = dataf['sex']

In [29]:
X

Unnamed: 0,contributions,cities_visited,helpful_votes,photo,distribution,0,1,2,3,4
0,5.0,141.0,1.0,1.0,"{'Excellent': 3, 'Very Good': 1, 'Average': 0,...",0.0,0.0,1.0,0.0,0.0
1,61.0,24.0,41.0,179.0,"{'Excellent': 35, 'Very Good': 20, 'Average': ...",0.0,0.0,1.0,0.0,0.0
2,24.0,52.0,5.0,5.0,"{'Excellent': 22, 'Very Good': 1, 'Average': 0...",0.0,0.0,1.0,0.0,0.0
3,5.0,28.0,2.0,,"{'Excellent': 5, 'Very Good': 1, 'Average': 0,...",0.0,1.0,0.0,0.0,0.0
6,42.0,23.0,17.0,,"{'Excellent': 25, 'Very Good': 7, 'Average': 5...",0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
187,47.0,257.0,31.0,12.0,"{'Excellent': 8, 'Very Good': 31, 'Average': 7...",,,,,
188,43.0,35.0,21.0,7.0,"{'Excellent': 24, 'Very Good': 14, 'Average': ...",,,,,
189,103.0,48.0,52.0,1.0,"{'Excellent': 20, 'Very Good': 61, 'Average': ...",,,,,
190,21.0,25.0,11.0,15.0,"{'Excellent': 11, 'Very Good': 8, 'Average': 1...",,,,,
