# LDA Modelling for main topics

In [3]:
import pandas as pd
import numpy as np

In [4]:
# Load 'clean_dataset.csv' into a pandas DataFrame used by the cells below
df = pd.read_csv('clean_dataset.csv')

**Get frequencies**

In [2]:
def get_frequencies(bag_words):
    '''
    Count how many times every word occurs across all the descriptions
    Input:
        bag_words: iterable of descriptions, each one an iterable of words
    Output:
        counts: dictionary mapping each word to its total number of
                occurrences, in order of first appearance
    '''
    counts = {}
    # Walk every word of every description as one flat stream
    all_tokens = (token for tokens in bag_words for token in tokens)
    for token in all_tokens:
        if token in counts:
            counts[token] += 1
        else:
            counts[token] = 1
    return counts

In [None]:
# Count word occurrences over the 'Descripción del Anuncio' column and keep
# the resulting word -> count dictionary for the following cells.
# NOTE(review): get_frequencies iterates the elements of every cell, so it
# presumably expects each cell to already be a list of words; a raw string
# would be counted character by character — confirm the column format.
freq_dict = get_frequencies(df['Descripción del Anuncio'].values)

In [None]:
def plot_freq(freqs=None, top_n=10):
    '''
    Select the most frequent words as two parallel series ready to plot
    Input:
        freqs: dictionary mapping word -> count; when None the module-level
               freq_dict is used
        top_n: how many of the most frequent words to keep (default 10)
    Output:
        (words, counts): two tuples ordered from the most to the least
        frequent word; both are empty when there is nothing to rank
    Note:
        No figure is drawn here; the returned tuples are the x / y series
        for a chart.
    '''
    if freqs is None:
        freqs = freq_dict
    # Sort by count, highest first (sorted is stable, so ties keep dict order)
    ranked = sorted(freqs.items(), key=lambda item: item[1], reverse=True)
    top = ranked[:top_n]
    if not top:
        # zip(*[]) yields nothing, so unpacking it into two names would raise
        return (), ()
    # Unzip the (word, count) pairs into two parallel tuples
    words, counts = zip(*top)
    return words, counts

**Vectorize the vocabulary to train LDA**

In [None]:
def vectorize_features(freq_dict, bag_words):
    '''
    Build a binary presence vector for every description
    Input:
        freq_dict: dictionary whose keys are the vocabulary words
        bag_words: sequence of descriptions, each one a list of words
    Output:
        freq_vector: list with one list per description; position i holds 1
                     when the i-th word of the sorted vocabulary appears in
                     that description and 0 otherwise
    '''
    # Fix the column order once so every vector lines up with the others
    vocabulary = sorted(freq_dict)
    freq_vector = []
    for description in bag_words:
        # A set makes each membership test constant-time instead of a scan
        # of the whole description. Strings are left untouched so that `in`
        # keeps meaning substring search for them, as it did before.
        if isinstance(description, str):
            present = description
        else:
            present = set(description)
        freq_vector.append([1 if word in present else 0 for word in vocabulary])
    return freq_vector

In [None]:
# Build one binary presence vector per description over the sorted vocabulary.
# NOTE(review): `df_new` is not defined in the cells shown before this one
# (only `df` is) — confirm it exists or whether `df` was intended.
freq_vector = vectorize_features(freq_dict, df_new['Descripción del Anuncio'].values)

In [None]:
# load the library for LDA
from sklearn.decomposition import LatentDirichletAllocation as LDA

def create_lda():
    '''
    Fit a 5-topic LDA model on the vectorized descriptions
    Input:
        None; reads the module-level freq_vector (one row per description,
        one column per vocabulary word)
    '''
    # Define the model (fixed random_state so repeated runs match)
    lda = LDA(n_components=5, n_jobs=-1, random_state=10)
    # Fit on the document-word matrix: fit() expects a 2-D array-like of
    # shape (n_samples, n_features), not the word -> count dictionary
    lda.fit(freq_vector)