In [None]:
import pandas as pd
import re
import os
from scipy import spatial
from pprint import pprint
from datetime import date

from gensim import corpora
from gensim.models import LsiModel
from gensim.utils import simple_preprocess
from gensim.models.coherencemodel import CoherenceModel
from gensim.matutils import corpus2dense

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

import task2a_prep_functions as p

# Date stamp in DDMMYYYY form; used further down to name the result folders
# of the models trained on this day.
today = date.today()
d1 = f'{today:%d%m%Y}'

## Data Preprocessing

In [None]:
# Load the scraped results from CSV into a dataframe and preview the first rows.
# Later cells read the 'Content' (text) and 'Place' (label) columns of this frame.
data = pd.read_csv('results_scrapping.csv')
data.head()

In [None]:
# preprocess given text
def preprocess(data):
    """Clean, tokenize, lemmatize and stop-word-filter the ``Content`` column.

    Two columns are added to ``data`` in place and the same dataframe is returned:

    * ``content_processed`` -- ``Content`` without the characters ``, . ! ?``, lowercased.
    * ``content_prep``      -- one token list per document after lemmatization and
      stop-word removal.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``Content`` column. Missing values are treated as empty text.

    Returns
    -------
    pandas.DataFrame
        The input dataframe (mutated) with the two columns added.
    """
    # Remove punctuation and convert to lowercase.
    # The pattern is a raw string: '\.' inside a normal literal is an invalid escape sequence.
    # Missing content (NaN) would make re.sub raise, so it is replaced by an empty string first.
    punctuation = re.compile(r'[,\.!?]')
    content = data['Content'].fillna('').astype(str)
    data['content_processed'] = content.map(lambda text: punctuation.sub('', text).lower())

    # A set gives constant-time membership tests while filtering every token.
    stop_words = set(stopwords.words('english'))
    stop_words.update(['from', 'subject', 're', 'edu', 'use', 'de', 'km', 'one', 'two'])

    # Tokenize the processed content. simple_preprocess lowercases and splits into
    # word tokens (which discards punctuation); deacc=True additionally strips accents.
    data_list = data['content_processed'].values.tolist()
    data_words = [simple_preprocess(str(sentence), deacc=True) for sentence in data_list]

    # Lemmatization of processed content.
    # NOTE(review): lemmatize_to_list is a project helper not visible here; it is called
    # with a one-element list and its first result is used -- confirm its contract.
    data_lemmatize = p.lemmatize_to_list([data_words])[0]

    # Remove stop words of processed content.
    # NOTE(review): str(doc) is tokenized again here; if lemmatize_to_list yields token
    # lists, this tokenizes the list's repr -- confirm that this is intended.
    data_words = [
        [word for word in simple_preprocess(str(doc)) if word not in stop_words]
        for doc in data_lemmatize
    ]
    data['content_prep'] = data_words

    return data


# Run the preprocessing: adds the 'content_processed' and 'content_prep' columns
# to the dataframe, then preview the result.
data = preprocess(data)
data.head()

# Latent Semantic Analysis (LSA)
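LSA requires the following basic steps. The SVD step is implemented by Gensim's `LsiModel`, which is used here (Gensim calls it LSI, Latent Semantic Indexing, which is another name for LSA). Note that `LsiModel` does not apply TF-IDF weighting by itself: it decomposes whatever corpus it is given, and the cells below pass it the raw bag-of-words counts.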

## 1.  TF-IDF Vectorization
The goal is to create a document-term matrix that contains the tf-idf values for the words within each document. A high tf-idf score represents a word that appears often in a document but not very often in the corpus. This means that the word is likely useful for document classification. Words that appear often in a document but also often in the corpus get a low tf-idf score.

## 2. Singular Value Decomposition (SVD) for dimensionality reduction
The resulting document-term matrix is a huge matrix with a lot of noisy and redundant information. Therefore, we want to reduce its dimensions to only a few latent topics that capture the relationships among the words and documents.

In [None]:
# create dictionary and corpus
# corpus: the preprocessed documents, i.e. the 'content_prep' column
corpus = data['content_prep']
# dictionary: gensim mapping between the tokens found in the corpus and integer ids
dictionary = corpora.Dictionary(corpus)
print(dictionary)

In [None]:
# encode every tokenized document as a bag-of-words vector via the dictionary
bow = list(map(dictionary.doc2bow, corpus))
len(bow)

In [None]:
# compare the c_v coherence of LSA models trained with 2 up to 10 topics
for n_topics in range(2, 11):
    lsi = LsiModel(bow, id2word=dictionary, num_topics=n_topics)
    coherence_score = CoherenceModel(
        model=lsi,
        texts=data['content_prep'],
        dictionary=dictionary,
        coherence='c_v',
    ).get_coherence()
    print(f'Coherence score with {n_topics} clusters: {coherence_score}')

In [None]:
# build LSA model on the bag-of-words corpus
# NOTE(review): num_topics=10 is hard-coded; presumably chosen from the coherence
# scores printed by the previous cell -- confirm.
lsa_model = LsiModel(bow, num_topics=10, id2word=dictionary)

In [None]:
# Document-topic matrix: transform every document with the model, densify the result
# (one column per document), transpose it to one row per document and divide each
# topic column by its singular value.
singular_values = lsa_model.projection.s
doc_topic_weights = corpus2dense(lsa_model[bow], len(singular_values)).T
dt_matrix = doc_topic_weights / singular_values
dt_matrix

## Inspect Topics
The matrix holds a score for each document and each topic.
TODO:
- Find corresponding topics for each number
  - might be difficult since we don't even know if there is a word for each topic
  - maybe find words that define each topic from tf-idf matrix
- figure out how many topics we want

In [None]:
# pretty-print the topics of the fitted model as returned by print_topics()
pprint(lsa_model.print_topics())

## Predicting closest document to input based on document topic matrix

In [None]:
# Get the place of the closest document for each group of query words.
# Index the document-topic matrix in a spatial KD-tree for nearest-neighbour lookups.
tree = spatial.KDTree(dt_matrix)

# Word groups to look up; each inner list is treated as one query document.
# (Named input_words so that the builtin input() is not shadowed.)
input_words = [['sun', 'beach'], ['city', 'town'], ['mountain', 'hiking']]
# Todo: Run Preprocessing over input

# transform words with dict to bow
input_bow = [dictionary.doc2bow(words) for words in input_words]

# Project the queries exactly like the documents in dt_matrix: dense vectors with one
# entry per topic, divided by the singular values. Using the raw model output instead
# would compare differently scaled vectors, and rebuilding vectors from the sparse
# (topic, value) pairs would misalign or shorten them whenever a value is missing
# (e.g. when none of the query words is in the dictionary).
singular_values = lsa_model.projection.s
input_vecs = corpus2dense(lsa_model[input_bow], len(singular_values)).T / singular_values

# get closest document vector for each query vector
for words, input_vec in zip(input_words, input_vecs):
    distance, doc_pos = tree.query(input_vec)
    # doc_pos is a row position in dt_matrix, hence the positional lookup
    print(f'"{words}" > "{data.Place.iloc[doc_pos]}" Distance: {distance}')

## Script to train multiple LSA Models with different configurations and show differences in an Excel File

In [None]:
# Train one LSA model per topic count and store its coherence score, document-topic
# matrix, dictionary and example predictions under ../results/<model_name>/
for num_topics in range(5, 35, 5):

    model_name = f'lsa_{d1}_{num_topics}topics'
    dest_path = f'../results/{model_name}/'
    # makedirs also creates a missing '../results' parent and accepts an existing folder
    os.makedirs(dest_path, exist_ok=True)
    file_prefix = f'{dest_path}lsa_vis_prepared_{num_topics}'

    # Train the model once, so that the stored coherence score belongs to the very
    # model whose matrix and predictions are written below.
    lsa_model = LsiModel(bow, num_topics=num_topics, id2word=dictionary)
    coherence_model = CoherenceModel(model=lsa_model, texts=data['content_prep'], dictionary=dictionary, coherence='c_v')
    coherence_score = coherence_model.get_coherence()
    # save Coherence Score to lsa model results directory
    with open(f'{file_prefix}_scores.txt', 'w') as f:
        f.write(f'Coherence Score: {coherence_score}\n')

    # document-topic matrix: one row per document, topic columns divided by the singular values
    singular_values = lsa_model.projection.s
    dt_matrix = corpus2dense(lsa_model[bow], len(singular_values)).T / singular_values
    df_matrix = pd.DataFrame(dt_matrix)
    df_matrix.to_csv(f'{file_prefix}_matrix.csv', index=False, header=False)

    # save dictionary to model results
    # NOTE(review): the file name uses the prefix 'lda_' while all other files use 'lsa_';
    # it is kept unchanged because the follow-up Excel notebook may load it by this name.
    dictionary.save(f'{dest_path}lda_vis_prepared_{num_topics}_dictionary')

    pprint(lsa_model.print_topics())

    # get place of the closest document for each group of query words
    # transform the document-topic matrix to a spatial KD-tree
    tree = spatial.KDTree(dt_matrix)

    # word groups to look up; each inner list is treated as one query document
    # (named input_words so that the builtin input() is not shadowed)
    input_words = [['sun', 'beach'], ['city', 'town'], ['mountain', 'hiking']]
    # Todo: Run Preprocessing over input

    # transform words with dict to bow
    input_bow = [dictionary.doc2bow(words) for words in input_words]

    # Project the queries exactly like the documents in dt_matrix (dense, one entry per
    # topic, divided by the singular values). The raw sparse model output is scaled
    # differently and loses entries that are missing, which misaligns the vectors.
    input_vecs = corpus2dense(lsa_model[input_bow], len(singular_values)).T / singular_values

    # get closest document vector for each query vector
    with open(f'{file_prefix}_prediction.txt', 'w') as f:
        for words, input_vec in zip(input_words, input_vecs):
            distance, doc_pos = tree.query(input_vec)
            # doc_pos is a row position in dt_matrix, hence the positional lookup
            f.write(f'{words} > "{data.Place.iloc[doc_pos]}" - Distance: {distance}\n')

##### After running the cell above, run "task2b_topicmodel_lda_lsa_excel-result.ipynb" with algorithm = lsa to merge the model results into an Excel file.