In [60]:
import pandas as pd
from scipy import spatial
from pprint import pprint

from gensim.parsing.preprocessing import remove_stopwords, strip_punctuation, preprocess_string, strip_short, stem_text
from gensim import corpora
from gensim.models import LsiModel
from gensim.models.coherencemodel import CoherenceModel
from gensim.matutils import corpus2dense

import csv

## Data Preprocessing

In [2]:
# read in dataframe from csv
data = pd.read_csv('results_scrapping.csv')
data.head()

Unnamed: 0.1,Unnamed: 0,Link,Place,Content
0,0,https://www.roughguides.com/usa/hawaii/waikiki/,Waikiki,"Built on a reclaimed swamp, two miles east of ..."
1,1,https://www.roughguides.com/usa/florida/florid...,The Florida Keys,"Folklore, films and widespread hearsay have gi..."
2,2,https://www.roughguides.com/usa/rockies/yellow...,Yellowstone National Park,America’s oldest and easily its most famous na...
3,3,https://www.roughguides.com/usa/hawaii/big-isl...,The Big Island,Although the Big Island of Hawaii could hold a...
4,4,https://www.roughguides.com/usa/great-plains/,The Great Plains Travel Guide,The rolling hills and vast grasslands of the G...


In [21]:
# preprocess given text
def preprocess(text):
    # clean text based on given filters
    CUSTOM_FILTERS = [lambda x: x.lower(),
                                remove_stopwords,
                                strip_punctuation,
                                strip_short,
                                stem_text]
    text = preprocess_string(text, CUSTOM_FILTERS)

    return text

# apply function to all reviews
data['content_prep'] = data['Content'].apply(lambda x: preprocess(x))
data.head()

Unnamed: 0.1,Unnamed: 0,Link,Place,Content,content_prep
0,0,https://www.roughguides.com/usa/hawaii/waikiki/,Waikiki,"Built on a reclaimed swamp, two miles east of ...","[built, reclaim, swamp, mile, east, downtown, ..."
1,1,https://www.roughguides.com/usa/florida/florid...,The Florida Keys,"Folklore, films and widespread hearsay have gi...","[folklor, film, widespread, hearsai, given, fl..."
2,2,https://www.roughguides.com/usa/rockies/yellow...,Yellowstone National Park,America’s oldest and easily its most famous na...,"[america’, oldest, easili, famou, nation, park..."
3,3,https://www.roughguides.com/usa/hawaii/big-isl...,The Big Island,Although the Big Island of Hawaii could hold a...,"[big, island, hawaii, hold, island, room, spar..."
4,4,https://www.roughguides.com/usa/great-plains/,The Great Plains Travel Guide,The rolling hills and vast grasslands of the G...,"[roll, hill, vast, grassland, great, plain, ho..."


# Latent Semantic Analysis (LSA)
For LSA the following basic steps are required. These steps are combined in the Gensim LsiModel (yes for some reason they named it LSI) that will be used here.

## 1.  TF-IDF Vectorization
Goal is to create a document-term matrix that contains the tf-idf values for words within each document. A high tf-idf score represents a word that appears often in a document but not very often in the corpus. This means that this word is likely usefully for dokument classification. Words that appear often in a document but also often in the corpus will get a low tf-idf score.

## 2. Singular Value Decomposition (SVD) for dimensionality reduction
the resulting document-term matrix is a huge matrix with a lot of noisy and redundant information. Therefore, we want to reduce the dimensions to only a few latent topics that capture the relationships among the words and documents.

In [22]:
# create dictionary and corpus
corpus = data['content_prep']
dictionary = corpora.Dictionary(corpus)
print(dictionary)

Dictionary(14885 unique tokens: ['about', 'ala', 'apart', 'avenu', 'beach']...)


In [34]:
# convert corpus to bag of words
bow = [dictionary.doc2bow(text) for text in corpus]
len(bow)

100

In [29]:
# find the coherence score with a different number of topics
for i in range(2,11):
    lsi = LsiModel(bow, num_topics=i, id2word=dictionary)
    coherence_model = CoherenceModel(model=lsi, texts=data['content_prep'], dictionary=dictionary, coherence='c_v')
    coherence_score = coherence_model.get_coherence()
    print('Coherence score with {} clusters: {}'.format(i, coherence_score))

Coherence score with 2 clusters: 0.4690299139512954
Coherence score with 3 clusters: 0.44827506063390704
Coherence score with 4 clusters: 0.3930312950222422
Coherence score with 5 clusters: 0.4207090045529037
Coherence score with 6 clusters: 0.4023455201488108
Coherence score with 7 clusters: 0.40298301925638425
Coherence score with 8 clusters: 0.4208852430100263
Coherence score with 9 clusters: 0.42935133681589255
Coherence score with 10 clusters: 0.4011205178470176


In [35]:
# build LSA model
lsa_model = LsiModel(bow, num_topics=10, id2word=dictionary)

In [42]:
dt_matrix = corpus2dense(lsa_model[bow], len(lsa_model.projection.s)).T / lsa_model.projection.s
dt_matrix

array([[ 6.85583084e-03, -4.44561346e-03,  6.62322456e-04,
        -3.36384971e-03,  4.09102947e-03, -2.21694236e-03,
         5.97758298e-03, -1.43858689e-03,  5.00846257e-04,
        -4.26279577e-03],
       [ 1.32057374e-02, -1.09655128e-02, -3.87641643e-04,
        -3.80003996e-03, -3.66397872e-03,  2.87211657e-03,
         5.56619858e-03, -1.36887986e-03,  3.55517222e-03,
        -4.76297179e-04],
       [ 4.87228397e-03, -5.28524156e-03, -4.31232051e-04,
         1.94964913e-05, -7.20955303e-04, -5.27816411e-04,
        -3.58718397e-03,  1.23166708e-02, -7.65088026e-04,
        -4.15053751e-03],
       [ 1.61470002e-02, -2.75650379e-02,  3.90867968e-03,
        -1.04197203e-02,  7.67055898e-03,  9.53642470e-03,
         3.58598066e-03,  2.09118076e-02,  7.54452127e-03,
        -8.53500202e-03],
       [ 1.23653396e-02, -8.73618192e-03,  4.67034550e-03,
        -5.91394071e-03,  1.48424297e-03,  4.58869925e-04,
        -3.44766002e-03,  2.75864439e-04, -5.27063288e-03,
        -2.

## Inspect Topics
The matrix plots a score for each document for each topic.
Todo
- Find corresponding topics for each number
  - might be difficult since we don't even know if there is a word for each topic
  - maybe find words that define each topic from tf-idf matrix
- figure out how many topics we want

In [61]:
pprint(lsa_model.print_topics())

[(0,
  '0.319*"citi" + 0.161*"town" + 0.160*"place" + 0.136*"centuri" + 0.133*"it’" '
  '+ 0.123*"dai" + 0.122*"build" + 0.117*"area" + 0.116*"centr" + '
  '0.111*"best"'),
 (1,
  '0.408*"citi" + 0.230*"mexico" + -0.213*"town" + -0.194*"beach" + '
  '-0.119*"villag" + 0.116*"pyramid" + 0.104*"build" + -0.103*"forest" + '
  '-0.103*"south" + -0.101*"rout"'),
 (2,
  '0.262*"mexico" + -0.233*"barcelona" + -0.220*"berlin" + -0.186*"museum" + '
  '-0.178*"art" + -0.167*"bar" + 0.140*"site" + 0.119*"mexican" + '
  '0.109*"pyramid" + -0.108*"catalan"'),
 (3,
  '-0.433*"berlin" + -0.195*"wall" + 0.191*"barcelona" + 0.191*"madrid" + '
  '-0.161*"berlin’" + -0.131*"build" + 0.130*"del" + 0.122*"plaza" + '
  '-0.117*"east" + -0.116*"centuri"'),
 (4,
  '-0.289*"madrid" + -0.220*"plaza" + 0.205*"barcelona" + -0.201*"berlin" + '
  '-0.173*"san" + -0.167*"gai" + 0.123*"art" + 0.109*"rambla" + 0.106*"port" + '
  '0.102*"artist"'),
 (5,
  '-0.391*"rio" + -0.170*"manau" + 0.162*"madrid" + -0.157*"amazon

## Predicting closest document to input based on document topic matrix

In [57]:
# get place of the closest document for each word
# transform svd matrix to spacial KDtree
tree = spatial.KDTree(dt_matrix)

# transform a list of words with the fitted model to get their vector-representation
input = [['beach'], ['city', 'town']]
# Todo: Run Preprocessing over input


# transform words with dict to bow
input_bow = [dictionary.doc2bow(words) for words in input]

input_topics = lsa_model.__getitem__(input_bow)
# reformat to keep just values in tuples
input_vecs = []
for line in input_topics:
    input_vecs.append([y[1] for y in line])

# get closest document vector for each word vector
for i, input_vec in enumerate(input_vecs):
    query = tree.query(input_vec)
    print(f'"{input[i]}" > "{data.Place[query[1]]}" Distance: {query[0]}')

[[0.08513637402782603, -0.1938736940285142, 0.025124262233920985, 0.06964420506619662, 0.04328281449187952, -0.134957452290811, 0.19046964646337594, -0.08270576574776886, 0.29901669594271174, -0.10604927113675221], [0.1609905702640511, -0.21344504204366083, 0.06920731795042735, -0.008188338162242702, 0.09299417257444216, 0.07025324586781093, 0.007476826044950392, 0.011529645436400188, 0.03761246773030037, 0.002420183212382463]]
"['beach']" > "Rio Grande do Norte" Distance: 0.35994981168132173
"['city', 'town']" > "The Côte d’Azur Travel Guide" Distance: 0.19184166189438917
