# Preguntas

- ¿Cuál es el número de tarea? The doc says "Proyecto 1".

In [None]:
# Empty scratch cell: the string literal below contains no code.
"""

"""

- Read *everything* before sending over; "argumente brevemente sobre ello"

Proyecto 1 - Luis Eduardo Robles Jimenez

# Minería de Texto para Turismo

## Utils

In [None]:
import os
import numpy as np
import pandas as pd
from datetime import datetime
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from sklearn import preprocessing
from nltk.probability import FreqDist
from nltk.tokenize import word_tokenize
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
class TouristSpot:
    """Reviews of one tourist spot, loaded from a CSV file, plus text-mining analyses over them.

    The methods read the following columns of the CSV: "Título de la opinión", "Opinión",
    "Escala", "Género", "Edad", "Nacional ó Internacional", "Fecha" and "Idioma".
    """

    def __init__(self, path, vocabSize = None):
        """Load the reviews stored at `path` and clean their text columns.

        path: location of the CSV file; its base name without extension becomes `spotName`.
        vocabSize: when not None, only the `vocabSize` most frequent tokens of the opinions are
            kept and every other token is replaced by '<unk>'.
        """
        self.reviews = pd.read_csv(path)
        # basename/splitext work for any path separator and any extension length.
        self.spotName = os.path.splitext(os.path.basename(path))[0]
        self.vocabSize = vocabSize
        self.nDashes = 30  # Width of the dashed banner printed around the spot name.
        self._cleanUp()

    @staticmethod
    def _stripQuotes(text):
        """Return `text` without one leading and one trailing double quote.

        Non-string values (e.g. missing cells) and empty strings are returned unchanged.
        """
        if not isinstance(text, str):
            return text
        if text.startswith('"'): text = text[1:]
        if text.endswith('"'): text = text[:-1]
        return text

    @staticmethod
    def _stopWords():
        """Return the Spanish and English stop words of NLTK, or None if the corpus is not installed."""
        # Imported here because only the vectorizing methods need the corpus reader.
        from nltk.corpus import stopwords
        try:
            return stopwords.words('spanish') + stopwords.words('english')
        except LookupError:
            import warnings
            warnings.warn("NLTK 'stopwords' corpus not found (run nltk.download('stopwords')); "
                          "no stop words will be removed.")
            return None

    def _cleanUp(self):
        """Strip the wrapping double quotes of the text columns and optionally limit the vocabulary."""
        for col in ("Título de la opinión", "Opinión"):
            self.reviews[col] = self.reviews[col].map(self._stripQuotes)

        if self.vocabSize is not None:
            corpus = ' '.join(self.reviews.loc[:, "Opinión"])
            mostCommon = FreqDist(word_tokenize(corpus)).most_common(self.vocabSize)
            self.vocabulary = [word for word, _ in mostCommon]
            # Set built once: membership is tested for every token of every review.
            known = set(self.vocabulary)

            def useTopWords(text):
                return ' '.join(word if word in known else '<unk>' for word in word_tokenize(text))

            self.reviews["Opinión"] = self.reviews["Opinión"].apply(useTopWords)

    def topKwordcloud(self, k = 50):
        """Show a word cloud with the `k` terms that chi2 scores highest against the 'Escala' column.

        k: number of terms to select; capped at the number of terms found by the vectorizer.
        """
        # A list passed as stop_words is taken as the literal words to drop, so the actual
        # Spanish/English word lists are supplied instead of the language names.
        vectorizer = TfidfVectorizer(min_df = 1, stop_words = self._stopWords())
        bow = vectorizer.fit_transform(self.reviews["Opinión"])
        words = vectorizer.get_feature_names_out()

        feats = SelectKBest(chi2, k = min(k, len(words)))
        feats.fit(bow, self.reviews['Escala'])
        best = feats.get_support(indices = True)
        words = ' '.join(words[best])

        wordcloud = WordCloud(width=800, height=800, background_color='white', min_font_size=3).generate(words)

        print(f"{'-' * self.nDashes} {self.spotName} {'-' * self.nDashes}")

        plt.figure(figsize=(8,8))
        plt.imshow(wordcloud)
        plt.axis("off")
        plt.tight_layout(pad=0)
        plt.show()

    def LSA(self):
        """Print, for several subgroups of reviewers, the 10 heaviest terms of 3 SVD components.

        The subgroups are built from the 'Género', 'Nacional ó Internacional' and 'Edad' columns.
        """
        print(f'{"-" * self.nDashes} {self.spotName} {"-" * self.nDashes}')

        reviews = self.reviews
        gender = reviews.loc[:, 'Género']
        origin = reviews.loc[:, 'Nacional ó Internacional']
        age = reviews.loc[:, 'Edad']
        subgroups = {
            'Hombres': reviews.loc[gender == 'Masculino'],
            'Mujeres': reviews.loc[gender == 'Femenino'],
            'Turistas Nacionales': reviews.loc[origin == 'Nacional'],
            'Turistas Internacionales': reviews.loc[origin == 'Internacional'],
            # NOTE(review): the label says (10, 30) but the lower bound applied is 0 — confirm which is intended.
            'Jovenes (10, 30)': reviews.loc[np.logical_and(age > 0, age < 30)],
            'Mayores (60, 100)': reviews.loc[np.logical_and(age > 60, age < 100)],
        }

        vectorizer = TfidfVectorizer(min_df = 1, stop_words = self._stopWords())
        for sName, s in subgroups.items():
            print(f'Subgroup: {sName}')
            if s.empty:
                # Nothing to vectorize; fitting on an empty subgroup would raise.
                print('(no reviews)')
                print()
                continue

            bow = vectorizer.fit_transform(s.loc[:, 'Opinión'])
            bow = preprocessing.normalize(bow, norm = 'l2')
            dictionary = vectorizer.get_feature_names_out()
            svd = TruncatedSVD(n_components = 3)
            svd.fit(bow)

            for comp in svd.components_:
                # Index the untouched vocabulary for every component: reordering `dictionary`
                # in place would make later components print the wrong terms.
                top = np.argsort(np.abs(comp))[::-1][:10]
                for term in dictionary[top]: print(f'{term}', end = " ")
                print()
            print()

    def temporalAnalysis(self):
        """Plot, per month, the percentage of good (> 3), neutral (== 3) and bad (< 3) 'Escala' values.

        Side effect: adds a 'newDate' column to `self.reviews` holding 'Fecha' without its first
        '/'-separated field.
        """
        self.reviews['newDate'] = ['/'.join(date.split('/')[1:]) for date in self.reviews.loc[:, 'Fecha']]

        dates, bad, neutral, good = [], [], [], []
        for groupName, reviewsGroup in self.reviews.groupby('newDate'):
            scale = reviewsGroup.loc[:, 'Escala']
            dates.append(groupName)
            bad.append(np.sum(scale < 3))
            neutral.append(np.sum(scale == 3))
            good.append(np.sum(scale > 3))

        # Turn the counts into percentages of the reviews of each month.
        bad, neutral, good = np.array(bad), np.array(neutral), np.array(good)
        grades = bad + neutral + good
        bad = bad / grades * 100
        neutral = neutral / grades * 100
        good = good / grades * 100

        # Newest month first; a 5-character key is parsed with a two-digit year.
        order = np.argsort([datetime.strptime(date, '%m/%y' if len(date) == 5 else '%m/%Y') for date in dates])[::-1]
        dates = np.array([dates[i] for i in order])
        bad, neutral, good = bad[order], neutral[order], good[order]

        _, stackedChart = plt.subplots(figsize = (15, 15))
        stackedChart.barh(dates, good, label='Good reviews', color = '#00ff00')
        stackedChart.barh(dates, neutral, left = good, label='Neutral reviews', color = '#ffff00')
        stackedChart.barh(dates, bad, left = good + neutral, label='Bad reviews', color = '#ff0000')
        stackedChart.legend()
        stackedChart.set_title(f'Reviews of {self.spotName} by month')
        plt.show()

    def describe(self):
        """Show six summary plots: grades, opinion length, ages, visitor origin, months and languages."""
        fig = plt.figure(figsize = (15, 10))
        fig.subplots_adjust(hspace = 0.5)

        print(self.nDashes * '-', self.spotName, self.nDashes * '-')

        grades = fig.add_subplot(321)
        avg = np.round(np.average(self.reviews.loc[:, 'Escala']), decimals = 2)
        sd = np.round(np.std(self.reviews.loc[:, 'Escala']), decimals = 2)
        grades.hist(self.reviews.loc[:, 'Escala'])
        grades.set_title(f"Grades (1 worst; 5 best); Average = {avg}; SD = {sd}")

        length = fig.add_subplot(322)
        lenOpinions = [len(word_tokenize(o)) for o in self.reviews.loc[:, 'Opinión']]
        avg = np.round(np.average(lenOpinions), decimals = 2)
        sd = np.round(np.std(lenOpinions), decimals = 2)
        length.hist(lenOpinions)
        length.set_title(f"Length of opinions (words); Average = {avg}; SD = {sd}")

        ages = fig.add_subplot(323)
        ages.hist(self.reviews.loc[:, 'Edad'], bins = 10)
        ages.set_title(f'Ages distribution')

        visitors = fig.add_subplot(324)
        visitors.hist(self.reviews.loc[:, 'Nacional ó Internacional'])
        visitors.set_title('Local or foreigner')

        monthNames = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
        popularity = fig.add_subplot(325)
        # The month is the second '/'-separated field of 'Fecha'.
        months = [int(d.split('/')[1]) for d in self.reviews.loc[:, 'Fecha']]
        m, c = np.unique(months, return_counts = True)
        popularity.bar(m, c)
        popularity.set_title('Popularity by month')
        # Always place the 12 labels on ticks 1..12: tying the tick count to the months present
        # in the data mislabels the bars (or fails) when some month has no reviews.
        popularity.set_xticks(np.arange(1, len(monthNames) + 1), labels = monthNames)

        languages = fig.add_subplot(326)
        m, c = np.unique(self.reviews.loc[:, 'Idioma'], return_counts = True)
        languages.barh(m, c)
        languages.set_title('Language')

        plt.show()

## Actividades

### 1. Preprocesamiento y stats

In [None]:
path = "../../data/guanajuato/"
# Load one TouristSpot per CSV file. os.listdir also returns non-CSV entries (checkpoint folders,
# hidden files) in arbitrary order, so the entries are filtered and sorted.
# Pass e.g. vocabSize = 10000 instead of None to restrict every spot to its most frequent tokens.
spots = [
    TouristSpot(os.path.join(path, d), vocabSize = None)
    for d in sorted(os.listdir(path))
    if d.lower().endswith('.csv')
]

In [None]:
# Descriptive statistics of every tourist spot. The unconditional `break` that made the call
# unreachable (and the redundant trailing `continue`) were removed.
for spot in spots:
    spot.describe()

In [None]:
# Word cloud of the highest-scoring terms of every tourist spot. The unconditional `break` that
# made the call unreachable (and the redundant trailing `continue`) were removed.
for spot in spots:
    spot.topKwordcloud()

In [None]:
# Latent semantic analysis per subgroup for every tourist spot. The unconditional `break` that
# made the call unreachable (and the redundant trailing `continue`) were removed.
for spot in spots:
    spot.LSA()

In [None]:
# Monthly breakdown of good/neutral/bad reviews for every tourist spot. The unconditional `break`
# that made the call unreachable (and the redundant trailing `continue`) were removed.
for spot in spots:
    spot.temporalAnalysis()