In [8]:
#from task2a_preprocessing import Preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline
import pandas as pd
from scipy import spatial
import csv

In [2]:
# load the scraping results from csv and preview the first rows
df_raw = pd.read_csv('results_scrapping.csv')
df_raw.head(5)

Unnamed: 0.1,Unnamed: 0,Link,Place,Content
0,0,https://www.roughguides.com/usa/hawaii/waikiki/,Waikiki,"Built on a reclaimed swamp, two miles east of ..."
1,1,https://www.roughguides.com/usa/florida/florid...,The Florida Keys,"Folklore, films and widespread hearsay have gi..."
2,2,https://www.roughguides.com/usa/rockies/yellow...,Yellowstone National Park,America’s oldest and easily its most famous na...
3,3,https://www.roughguides.com/usa/hawaii/big-isl...,The Big Island,Although the Big Island of Hawaii could hold a...
4,4,https://www.roughguides.com/usa/great-plains/,The Great Plains Travel Guide,The rolling hills and vast grasslands of the G...


## TF-IDF Vectorization
The goal is to create a document-term matrix that contains the tf-idf values for the words within each document. A high tf-idf score marks a word that appears often in a document but rarely in the rest of the corpus, which means the word is likely useful for document classification. Words that appear often in a document but also often across the corpus get a low tf-idf score.

In [3]:
# tf-idf vectorizer: removes English stop words and applies smoothed idf weighting
vectorizer = TfidfVectorizer(
    smooth_idf=True,
    use_idf=True,
    stop_words='english',
)

## Singular Value Decomposition (SVD) for dimensionality reduction
The resulting document-term matrix is huge and contains a lot of noisy and redundant information. Therefore, we reduce its dimensions to only a few latent topics that capture the relationships among the words and documents.

In [5]:
# generate svd model
# n_components represents the number of topics
# random_state pins the randomized solver so that repeated runs of the notebook
# produce the same decomposition (and therefore the same nearest-document results)
svd_model = TruncatedSVD(n_components=10, algorithm='randomized', n_iter=10, random_state=42)

## Build Pipeline with tf-idf vectorization and Singular Value Decomposition

In [6]:
# chain both steps so that raw text runs through tf-idf and then SVD in a single call
svd_transformer = Pipeline(steps=[
    ('tfidf', vectorizer),
    ('svd', svd_model),
])

In [10]:
# fit the pipeline on the 'Content' column and get the document-topic matrix
# (one row per document, one column per SVD component)
svd_matrix = svd_transformer.fit_transform(X=df_raw['Content'])
print(svd_matrix)

[[ 1.42548103e-01 -2.73923173e-03 -1.55397103e-02  2.89191881e-02
  -1.28343592e-01  1.95758911e-01  8.62144929e-02  1.61990284e-01
  -5.62088851e-02 -1.49549286e-02]
 [ 1.58755866e-01  3.98083665e-03 -2.20742536e-02  2.64980414e-02
  -4.83308511e-02  1.00971235e-01  1.00023794e-01  4.02466257e-02
  -5.93972910e-02 -7.61715044e-02]
 [ 1.12444229e-01  1.67850801e-01 -1.20776275e-02  4.68400320e-03
  -5.49281652e-02  9.83073698e-02 -4.89249097e-02 -1.00291069e-02
  -2.89635594e-01 -7.44584669e-03]
 [ 1.74237365e-01  1.42443228e-01  1.24398631e-02  6.23385396e-02
  -1.57408263e-01  2.74652871e-01  1.02704900e-01 -1.80995600e-04
  -2.44024880e-01  1.37190977e-01]
 [ 1.36579132e-01 -3.17314216e-03 -2.44510316e-02  1.92970354e-02
  -3.81655462e-02  1.45148866e-02 -3.59898274e-02 -1.04024758e-01
  -6.46275067e-02 -1.20481083e-01]
 [ 2.05467574e-01  1.36601093e-01 -9.97606402e-03  3.49019418e-02
   3.61706064e-03  9.74331311e-02  9.44979306e-03 -1.59560165e-02
  -7.12500596e-02 -1.56499377e-01

In [26]:
# get place of the closest document for each word
# build a spatial KD-tree over the document vectors for nearest-neighbour lookups
tree = spatial.KDTree(svd_matrix)

# transform a list of words with the fitted model to get their vector-representation
words = ['beach', 'mountains', 'city', 'town', 'house', 'nature', 'snow']
word_matrix = svd_transformer.transform(words)

# query all word vectors in one call; the KD-tree answers with *positional* row
# numbers of svd_matrix, so the place is looked up with .iloc instead of by
# index label (label lookup only works while df_raw keeps its default index)
# NOTE(review): distances are Euclidean on unnormalised vectors; several words
# resolving to the same place suggests vector length may dominate -- consider
# L2-normalising documents and queries (cosine similarity) and confirm.
distances, positions = tree.query(word_matrix)
for word, distance, position in zip(words, distances, positions):
    place = df_raw['Place'].iloc[position]
    print(f'"{word}" > "{place}" Distance: {distance}')

"beach" > "Long Beach" Distance: 0.34039009391311387
"mountains" > "Piura" Distance: 0.21862914270697587
"city" > "Milan" Distance: 0.3190168265899463
"town" > "Piura" Distance: 0.3163740084996661
"house" > "Piura" Distance: 0.23433169321008537
"nature" > "Piura" Distance: 0.23864299618492868
"snow" > "Piura" Distance: 0.22513452212186089


## Topic extraction
The matrix holds one score per document for each topic.
Todo
- Find corresponding topics for each number
  - might be difficult since we don't even know if there is a word for each topic
  - maybe find words that define each topic from tf-idf matrix
- figure out how many topics we want

In [11]:
# save the document-topic matrix as csv (one line per row of svd_matrix)
with open("lsa_example_matrix.csv","w+", newline='') as matrix_file:
    csv.writer(matrix_file, delimiter=',').writerows(svd_matrix)