In [None]:
import json
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer
import spacy
import dataset
import analyze

In [None]:
# Input/output paths used throughout the notebook.
# The two commented-out lines point at a different pair of export files
# (presumably an alternative dataset -- uncomment them and comment the
# active pair below to switch).
# likes_path = "easytour-likes-2022-03MAR-21.json"
# schede_path = "easytour-schede-2022-03MAR-21.json"
likes_path = "kuriu-likes-apr.json"
schede_path = "kuriu-schede-apr.json"
# CSV that is read back with pd.read_csv(data_path) further down.
data_path = "data.csv"
# Only referenced by the commented-out analyze.analyze_dataset(...) call at the end.
data_analyzed_path = "data_analyzed.csv"

In [None]:
# Parse the raw "schede" export. The context manager closes the handle as
# soon as parsing is done (a bare open() leaves it open for the whole kernel
# session), and the explicit UTF-8 encoding avoids depending on the platform
# default when the export contains accented characters.
with open(schede_path, encoding="utf-8") as f_schede:
    data_schede = json.load(f_schede)
print("Fields: {}".format(data_schede[0].keys()))
print("Number of samples: {}".format(len(data_schede)))

In [None]:
# The record shown is the one at index 8, so the label says so
# (it previously claimed to be the first sample).
print("Sample at index 8: {}".format(data_schede[8]))

In [None]:
# Parse the raw "likes" export. As with the schede file, use a context
# manager so the handle is closed deterministically, and read it as UTF-8
# rather than with the platform-default encoding.
with open(likes_path, encoding="utf-8") as f_likes:
    data_likes = json.load(f_likes)
print("Number of samples: {}".format(len(data_likes)))
print("Fields: {}".format(data_likes[0].keys()))
print("First sample: {}".format(data_likes[0]))

In [None]:
# Load the same two exports as DataFrames for the tabular steps below.
# In the cells visible here, the json.load() results above are only used for
# the quick field/sample printouts.
df_schede = pd.read_json(schede_path)
df_likes = pd.read_json(likes_path)

<h3>Data Analysis</h3>

In [None]:
df_schede.info()

In [None]:
df_schede.describe()

In [None]:
# Keep only the columns used downstream. The explicit .copy() makes these
# independent frames: df_schede_fields is modified in place later on
# (fill_null_fields), and writing into a plain column selection raises
# SettingWithCopyWarning or, with copy-on-write enabled, may not stick at all.
df_schede_fields = df_schede[
    [
        "id",
        "userId",
        "title",
        "description",
        "duration",
        "datePublishing",
        "creationDate",
        "counterUseful",
        "categories",
        "viewCounter",
    ]
].copy()
df_likes_ids = df_likes[["id", "userId"]].copy()

In [None]:
df_schede_fields

<h3>Preprocessing</h3>

<h5>Null or empty values</h5>

In [None]:
#Check if there are null fields
df_schede_fields.isnull().sum()

In [None]:
def fill_null_fields(df):
    """Replace missing values in ``df`` in place with sensible defaults.

    * ``description``    -> empty string
    * ``duration``       -> 0
    * ``datePublishing`` -> the row's own ``creationDate``
    * ``viewCounter``    -> 0

    The frame is modified in place and nothing is returned.

    Columns are re-assigned instead of calling ``fillna(..., inplace=True)``
    on ``df[col]``: the in-place form operates on an intermediate object and
    is not guaranteed to update ``df``. The ``datePublishing`` fallback reads
    ``creationDate`` from ``df`` itself rather than from a notebook-level
    global, so the function works for any frame passed in.
    """
    # empty description -> remove or use '' ?  (currently: use '')
    df["description"] = df["description"].fillna('')
    # if duration is null -> set to 0
    df["duration"] = df["duration"].fillna(0)
    # if datePublishing is null -> set it equal to creationDate (index-aligned)
    df["datePublishing"] = df["datePublishing"].fillna(df["creationDate"])
    # if viewCounter is null -> set to 0
    df["viewCounter"] = df["viewCounter"].fillna(0)

In [None]:
fill_null_fields(df_schede_fields)

<h5>Categories into one hot encoding</h5>

In [None]:
def lower_list(l):
    """Return a lower-cased copy of the list ``l``.

    Anything that is not exactly a ``list`` (e.g. ``None`` or NaN for a
    missing value) yields an empty list.
    """
    if type(l) is list:
        return [item.lower() for item in l]
    return []

def encoding_categories(df):
    """Return ``df`` joined with one indicator (0/1) column per category.

    The ``categories`` column (a list of labels per row) is lower-cased with
    ``lower_list`` and binarized with ``MultiLabelBinarizer``; the resulting
    columns are named after the lower-cased labels and share ``df``'s index.

    ``df`` itself is left untouched: the normalized labels are kept in a local
    Series instead of being written to (and popped from) a scratch column on
    the caller's frame.
    """
    normalized = df["categories"].apply(lower_list)
    mlb = MultiLabelBinarizer()
    indicators = pd.DataFrame(
        mlb.fit_transform(normalized),
        index=df.index,
        columns=mlb.classes_,
    )
    return df.join(indicators)

In [None]:
df_categories = encoding_categories(df_schede_fields)

<h5>Text fields</h5>

In [None]:
import string
from bs4 import BeautifulSoup
import nltk
import unidecode
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer

In [None]:
# Fetch the NLTK resources used below: the stop-word corpus and WordNet.
nltk.download('stopwords')
# Italian stop-word list; remove_stopwords() reads this module-level name.
stopwords = nltk.corpus.stopwords.words('italian')
nltk.download('wordnet')
# Bare expression: displays the punctuation characters that
# remove_punctuation() replaces with spaces.
string.punctuation

In [None]:
def remove_html(text: str) -> str:
    """Strip HTML markup from ``text`` and return the remaining text.

    The markup is parsed with BeautifulSoup's ``html.parser`` and the text
    pieces are joined with a single space as separator.
    """
    return BeautifulSoup(text, "html.parser").get_text(separator=" ")

In [None]:
def remove_accented_chars(text: str) -> str:
    """Return ``text`` transliterated by ``unidecode.unidecode``."""
    return unidecode.unidecode(text)

In [None]:
def remove_stopwords(text: str) -> list:
    """Split ``text`` on whitespace and drop the stop words.

    Returns the remaining tokens as a list (not a string). The comparison
    against the module-level ``stopwords`` list is exact, so it is
    case-sensitive.
    """
    return [token for token in text.split() if token not in stopwords]

In [None]:
# Removing punctuation
def remove_punctuation(text: str) -> str:
    """Replace every character found in ``string.punctuation`` with a space.

    Using a space (rather than deleting the character) keeps words that were
    only separated by punctuation, e.g. ``"a.b"``, from being glued together.
    """
    return ''.join(' ' if char in string.punctuation else char for char in text)

In [None]:
# Removing numbers
def remove_numbers(text: str) -> str:
    """Replace every ASCII digit (0-9) in ``text`` with a space."""
    return ''.join(' ' if char in '0123456789' else char for char in text)

In [None]:
wordnet_lemmatizer = WordNetLemmatizer()


def lemmatizer(text: list) -> list:
    """Lemmatize each token of ``text`` with NLTK's ``WordNetLemmatizer``.

    ``text`` is iterated element by element, so it should be a list of tokens
    (such as the output of ``remove_stopwords``); a plain string would be
    processed one character at a time.

    NOTE(review): WordNet is an English resource, so this step probably has
    little effect on Italian tokens -- confirm it is still wanted.
    """
    return [wordnet_lemmatizer.lemmatize(token) for token in text]

In [None]:
# Stemming
italian_stemmer = SnowballStemmer('italian')


def stemming(text: list) -> list:
    """Stem each token of ``text`` with the Italian Snowball stemmer.

    Tokens made up only of digits are dropped. ``text`` is iterated element
    by element, so it should be a list of tokens rather than a plain string.
    """
    return [italian_stemmer.stem(token) for token in text if not token.isdigit()]

In [None]:
def preprocessing(text: str, html=1, accent=1, punct=1, numb=1, stop=1, lemma=1, stem=1):
    """Run the selected text-cleaning steps on ``text``, in a fixed order.

    Each flag enables its step when equal to ``1``:

    * ``html``   -- accepted for compatibility but currently ignored
                    (the ``remove_html`` call is disabled).
    * ``accent`` -- ``remove_accented_chars``
    * ``punct``  -- ``remove_punctuation``
    * ``numb``   -- ``remove_numbers``
    * ``stop``   -- ``remove_stopwords`` (turns the string into a token list)
    * ``lemma``  -- ``lemmatizer``
    * ``stem``   -- ``stemming``

    Returns a list of tokens when any of ``stop``/``lemma``/``stem`` is
    enabled, otherwise the cleaned string.
    """
    # HTML stripping is disabled; to restore it:
    # if html==1:
    #     text = remove_html(text)
    if accent == 1:
        text = remove_accented_chars(text)
    if punct == 1:
        text = remove_punctuation(text)
    if numb == 1:
        text = remove_numbers(text)
    if stop == 1:
        text = remove_stopwords(text)
    # lemmatizer()/stemming() work token by token. If stop-word removal was
    # skipped, ``text`` is still a string and would be processed character by
    # character, so tokenize it first.
    if (lemma == 1 or stem == 1) and isinstance(text, str):
        text = text.split()
    if lemma == 1:
        text = lemmatizer(text)
    if stem == 1:
        text = stemming(text)
    return text

In [None]:
# spaCy pipeline shared by the part-of-speech filters below.
nlp = spacy.load("it_core_news_lg")


def keep_just_nouns(field):
    """Return the lemmas of the tokens in ``field`` tagged as ``NOUN``.

    ``field`` is lower-cased before being run through the ``nlp`` pipeline.
    """
    noun_lemmas = []
    for token in nlp(field.lower()):
        if token.pos_ == "NOUN":
            noun_lemmas.append(token.lemma_)
    return noun_lemmas

def remove_verbs_adj(field):
    """Return the lemmas of ``field`` minus verbs, adjectives and adverbs.

    ``field`` is lower-cased before being run through the ``nlp`` pipeline;
    tokens tagged ``VERB``, ``ADJ`` or ``ADV`` are skipped.
    """
    excluded_pos = ("VERB", "ADJ", "ADV")
    kept_lemmas = []
    for token in nlp(field.lower()):
        if token.pos_ in excluded_pos:
            continue
        kept_lemmas.append(token.lemma_)
    return kept_lemmas

In [None]:
def clean_text(df):
    """Clean the ``title`` and ``description`` columns of ``df`` in place.

    Each of the two columns goes through the same three steps:

    1. ``analyze.demojize`` on the raw string;
    2. ``remove_verbs_adj`` -- lemmatize and drop verbs, adjectives, adverbs;
    3. ``dataset.clean_text_aslist`` on the resulting token list
       (per the original notes: removes hashtags, punctuation, stop words
       and numbers).

    The columns are overwritten with the results; nothing is returned.

    The steps are applied column-wise with ``Series.apply`` instead of
    row-wise ``DataFrame.apply(axis=1)``: it only touches the column that is
    needed and always yields a Series, including for a frame with no rows.
    """
    for column in ("title", "description"):
        demojized = df[column].apply(analyze.demojize)
        lemmas = demojized.apply(remove_verbs_adj)
        df[column] = lemmas.apply(dataset.clean_text_aslist)

In [None]:
# One-row sample frame used to try out clean_text().
# Built directly from a list of records: DataFrame.append returned a new frame
# (so its result was being discarded), needed ignore_index=True for a dict,
# and no longer exists in pandas 2.x.
prova_df = pd.DataFrame(
    [
        {
            "title": "provare ciao \n\nops",
            "description": "provare alalalvnj \\\n acciaio cane, come stai? ole.",
        }
    ],
    columns=["title", "description"],
)

In [31]:
clean_text(prova_df)

In [None]:
prova_df

<h5>Dates</h5>

In [32]:
def preprocess_dates(df, now=None):
    """Add ``duration_min`` and ``days`` columns to ``df`` in place.

    * ``duration_min`` -- ``duration`` divided by 60000 (ms -> minutes).
    * ``days``         -- whole days elapsed between ``creationDate`` and the
                          reference instant (floored).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``duration`` and ``creationDate`` columns.
    now : optional
        Reference instant (anything ``pd.Timestamp`` accepts). Naive values
        are interpreted as UTC. Defaults to the current UTC time; pass a fixed
        value to make the ``days`` column reproducible across runs.

    Returns nothing; ``df`` is modified in place.
    """
    # Convert duration from ms to minutes (vectorized, no row-wise apply).
    df["duration_min"] = df["duration"] / 60000

    # Resolve the reference instant as a tz-aware UTC timestamp.
    # pd.Timestamp.now(tz="UTC") is used instead of
    # pd.to_datetime("now").tz_localize("UTC"): on recent pandas versions the
    # latter takes the *local* wall-clock time and merely labels it as UTC.
    if now is None:
        reference = pd.Timestamp.now(tz="UTC")
    else:
        reference = pd.Timestamp(now)
        if reference.tzinfo is None:
            reference = reference.tz_localize("UTC")
        else:
            reference = reference.tz_convert("UTC")

    # Compute age of entry in days; utc=True normalizes any offset to UTC.
    created = pd.to_datetime(df["creationDate"], utc=True)
    df["days"] = (reference - created) // pd.Timedelta(days=1)

In [33]:
preprocess_dates(df_categories)

  result, tz_parsed = tslib.array_to_datetime(


In [34]:
df_categories

Unnamed: 0,id,userId,title,description,duration,datePublishing,creationDate,counterUseful,categories,viewCounter,aria aperta,cultura,food,sport,tempo libero,viaggi,duration_min,days
0,62355f98272ae5672fbc3ac7,632,"[spiaggia, phuket, thailand]","[phuket, paradiso, amant, sole, mare, sole, co...",0,2022-03-19T04:44:09.503Z,2022-03-19T04:44:08.000+00:00,0,[Viaggi],7,0,0,0,0,0,1,0.0,31
1,620aa536ff8ae67f37cafb06,442,[tigr],"[marcio, maltrattamento, animal, andar, thaila...",10800000,2022-02-14T20:05:09.586Z,2022-02-14T18:53:42.000+00:00,0,[Viaggi],419,0,0,0,0,0,1,180.0,63
2,622c3b8ac992ca28f33c7f21,571,"[escursion, tramonto, pidurangala, rock]","[cuor, sri, lanka, zona, interess, \n\r\n, sig...",0,2022-03-12T06:19:55.882Z,2022-03-12T06:19:54.000+00:00,1,[Aria aperta],24,1,0,0,0,0,0,0.0,38
3,620a90c8ff8ae67f37caf97f,530,"[koh, samui]","[tour, bangkok, arcipelago, koh, samui, nott, ...",3600000,2022-02-16T15:15:51.387Z,2022-02-14T17:26:32.000+00:00,0,[Viaggi],453,0,0,0,0,0,1,60.0,63
4,622c398cc992ca28f33c7edd,571,"[visita, sito, dambulla, sri, lanka]","[sri, lanka, poter, mare, passeggiata, piantag...",0,2022-03-12T06:11:25.746Z,2022-03-12T06:11:24.000+00:00,1,[Viaggi],12,0,0,0,0,0,1,0.0,38
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4642,620c32db25398565dadb5867,570,"[san, pedro, san, rafael, san, antonio, colombia]","[san, pedro, de, lo, milagro, nord, medellín, ...",194400000,2022-03-02T16:28:47.197Z,2022-02-15T23:10:19.000+00:00,0,[Viaggi],39,0,0,0,0,0,1,3240.0,62
4643,6213b3460f41f86438b07d9c,429,"[machu, picchu, meraviglia]","[alba, realtà, viaggio, rumor, cascata, capell...",21600000,2022-02-21T15:44:06.580Z,2022-02-21T15:44:06.000+00:00,0,[Viaggi],41,0,0,0,0,0,1,360.0,56
4644,6207b9cdb1d0940f122580b7,570,"[color, guatapé, colombia]","[guatapé, esplosion, color, maniera, novemila,...",72000000,2022-03-02T16:27:30.221Z,2022-02-12T13:44:45.000+00:00,0,[Viaggi],52,0,0,0,0,0,1,1200.0,65
4645,61f6a3e3306802033034d576,536,"[avventura, ecuador, quilotoa, cotopaxi]","[viaggio, bordo, pulman, bussola, viaggiator, ...",172800000,2022-01-30T14:43:17.547Z,2022-01-30T14:42:43.000+00:00,1,[Viaggi],43,0,0,0,0,0,1,2880.0,78


In [35]:
#df_categories.to_csv(data_path, index=False)

In [36]:
# Reload the dataset from data_path.
# NOTE(review): the to_csv call that would write this file is commented out in
# the previous cell, so this cell relies on a file left by an earlier run.
# The list-valued columns come back as plain strings after the CSV round trip
# (see the quoted "['...']" values in the output below).
data = pd.read_csv(data_path)  

In [37]:
data

Unnamed: 0,id,userId,title,description,duration,datePublishing,creationDate,counterUseful,categories,viewCounter,aria aperta,cultura,food,sport,tempo libero,viaggi,duration_min,days
0,62355f98272ae5672fbc3ac7,632,"['spiaggia', 'phuket', 'thailand']","['phuket', 'paradiso', 'amant', 'sole', 'mare'...",0,2022-03-19T04:44:09.503Z,2022-03-19T04:44:08.000+00:00,0,['Viaggi'],7,0,0,0,0,0,1,0.0,31
1,620aa536ff8ae67f37cafb06,442,['tigr'],"['marcio', 'maltrattamento', 'animal', 'andar'...",10800000,2022-02-14T20:05:09.586Z,2022-02-14T18:53:42.000+00:00,0,['Viaggi'],419,0,0,0,0,0,1,180.0,63
2,622c3b8ac992ca28f33c7f21,571,"['escursion', 'tramonto', 'pidurangala', 'rock']","['cuor', 'sri', 'lanka', 'zona', 'interess', '...",0,2022-03-12T06:19:55.882Z,2022-03-12T06:19:54.000+00:00,1,['Aria aperta'],24,1,0,0,0,0,0,0.0,38
3,620a90c8ff8ae67f37caf97f,530,"['koh', 'samui']","['tour', 'bangkok', 'arcipelago', 'koh', 'samu...",3600000,2022-02-16T15:15:51.387Z,2022-02-14T17:26:32.000+00:00,0,['Viaggi'],453,0,0,0,0,0,1,60.0,63
4,622c398cc992ca28f33c7edd,571,"['visita', 'sito', 'dambulla', 'sri', 'lanka']","['sri', 'lanka', 'poter', 'mare', 'passeggiata...",0,2022-03-12T06:11:25.746Z,2022-03-12T06:11:24.000+00:00,1,['Viaggi'],12,0,0,0,0,0,1,0.0,38
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4642,620c32db25398565dadb5867,570,"['san', 'pedro', 'san', 'rafael', 'san', 'anto...","['san', 'pedro', 'de', 'lo', 'milagro', 'nord'...",194400000,2022-03-02T16:28:47.197Z,2022-02-15T23:10:19.000+00:00,0,['Viaggi'],39,0,0,0,0,0,1,3240.0,62
4643,6213b3460f41f86438b07d9c,429,"['machu', 'picchu', 'meraviglia']","['alba', 'realtà', 'viaggio', 'rumor', 'cascat...",21600000,2022-02-21T15:44:06.580Z,2022-02-21T15:44:06.000+00:00,0,['Viaggi'],41,0,0,0,0,0,1,360.0,56
4644,6207b9cdb1d0940f122580b7,570,"['color', 'guatapé', 'colombia']","['guatapé', 'esplosion', 'color', 'maniera', '...",72000000,2022-03-02T16:27:30.221Z,2022-02-12T13:44:45.000+00:00,0,['Viaggi'],52,0,0,0,0,0,1,1200.0,65
4645,61f6a3e3306802033034d576,536,"['avventura', 'ecuador', 'quilotoa', 'cotopaxi']","['viaggio', 'bordo', 'pulman', 'bussola', 'via...",172800000,2022-01-30T14:43:17.547Z,2022-01-30T14:42:43.000+00:00,1,['Viaggi'],43,0,0,0,0,0,1,2880.0,78


<h3>Preprocessing with the helper functions of the analyze module</h3>

In [40]:
#analyze.analyze_dataset(schede_path, data_analyzed_path, [])