In [20]:
#from task2a_preprocessing import Preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline
import pandas as pd
from scipy import spatial

In [19]:
# read in dataframe from csv
df_raw = pd.read_csv('results_scrapping.csv')
df_raw.head()

Unnamed: 0.1,Unnamed: 0,Link,Place,Content
0,0,https://www.roughguides.com/usa/hawaii/waikiki/,Waikiki,"Built on a reclaimed swamp, two miles east of ..."
1,1,https://www.roughguides.com/usa/florida/florid...,The Florida Keys,"Folklore, films and widespread hearsay have gi..."
2,2,https://www.roughguides.com/usa/rockies/yellow...,Yellowstone National Park,America’s oldest and easily its most famous na...
3,3,https://www.roughguides.com/usa/hawaii/big-isl...,The Big Island,Although the Big Island of Hawaii could hold a...
4,4,https://www.roughguides.com/usa/great-plains/,The Great Plains Travel Guide,The rolling hills and vast grasslands of the G...


## TF-IDF Vectorization
Goal is to create a document-term matrix that contains the tf-idf values for words within each document. A high tf-idf score represents a word that appears often in a document but not very often in the corpus. This means that this word is likely usefully for dokument classification. Words that appear often in a document but also often in the corpus will get a low tf-idf score.

In [3]:
# generate tf-idf matrix vectorizer
vectorizer = TfidfVectorizer(stop_words='english', use_idf=True, smooth_idf=True)

## Singular Value Decomposition (SVD) for dimensionality reduction
the resulting document-term matrix is a huge matrix with a lot of noisy and redundant information. Therefore, we want to reduce the dimensions to only a few latent topics that capture the relationships among the words and documents.

In [23]:
# generate svd model
# n_components represents the number of topics
svd_model = TruncatedSVD(n_components=20, algorithm='randomized', n_iter=10)

## Build Pipeline with tf-idf vectorization and Singular Value Decomposition

In [24]:
# build pipeline with tf-idf vectorizer and svd model
svd_transformer = Pipeline([('tfidf', vectorizer), ('svd', svd_model)])

In [25]:
svd_matrix = svd_transformer.fit_transform(df_raw['Content'])
svd_matrix

array([[ 0.1425481 , -0.00268226, -0.01535606, ...,  0.18465316,
         0.19008581, -0.03654833],
       [ 0.15875586,  0.00406477, -0.02183962, ...,  0.07710791,
         0.26868453, -0.11639705],
       [ 0.11244423,  0.1678391 , -0.01219773, ..., -0.10680956,
        -0.16764022, -0.07652924],
       ...,
       [ 0.22185615, -0.13958042, -0.05212646, ..., -0.16888817,
         0.01502187,  0.09777968],
       [ 0.25458364, -0.028505  ,  0.12570334, ..., -0.01210235,
         0.00225291, -0.00474641],
       [ 0.32172507, -0.21364246,  0.88344184, ..., -0.00973623,
         0.03469409,  0.01779815]])

In [26]:
# get place of the closest document for each word
# transform svd matrix to spacial KDtree
tree = spatial.KDTree(svd_matrix)

# transform a list of words with the fitted model to get their vector-representation
words = ['beach', 'mountains', 'city', 'town', 'house', 'nature', 'snow']
word_matrix = svd_transformer.transform(words)

# get closest document vector for each word vector
for i, word_vector in enumerate(word_matrix):
    query = tree.query(word_vector)
    print(f'"{words[i]}" > "{df_raw.Place[query[1]]}" Distance: {query[0]}')

"beach" > "Long Beach" Distance: 0.34039009391311387
"mountains" > "Piura" Distance: 0.21862914270697587
"city" > "Milan" Distance: 0.3190168265899463
"town" > "Piura" Distance: 0.3163740084996661
"house" > "Piura" Distance: 0.23433169321008537
"nature" > "Piura" Distance: 0.23864299618492868
"snow" > "Piura" Distance: 0.22513452212186089


## Topic extraction
The matrix plots a score for each document for each topic.
Todo
- Find corresponding topics for each number
  - might be difficult since we don't even know if there is a word for each topic
  - maybe find words that define each topic from tf-idf matrix
- figure out how many topics we want