## Import Libraries

In [2]:
import pandas as pd
import numpy as np
import re
import glob
from pathlib import Path

In [3]:
from sklearn.feature_extraction.text import CountVectorizer
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
from nltk import word_tokenize

In [5]:
from sklearn.decomposition import LatentDirichletAllocation

In [6]:
from sklearn.model_selection import GridSearchCV

In [7]:
# word_tokenize relies on the Punkt sentence-tokenizer data. Older NLTK
# releases load it from the 'punkt' package, recent ones from 'punkt_tab',
# so both are fetched to keep the notebook runnable on either.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [8]:
# Colab-only step: mount Google Drive at /content/drive so that the CSV folders
# under /content/drive/MyDrive used later in the notebook are readable.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Functions

In [9]:
def cleaner(text):
    '''Normalize a raw tweet into a single-space-joined string of tokens.

        Parameters
        ----------
        text : str
                Raw tweet text. Any non-string value (for example the NaN that
                pandas produces for an empty CSV cell) is treated as empty.

        Returns
        -------
        text : str
                The tweet without retweet marker, mentions, hashtags, URLs,
                bracketed placeholders, the literal "href" and any character
                outside the whitelist below; tokens are joined by one space and
                one-character punctuation tokens are dropped.
    '''
    # Missing values carry no text to clean
    if not isinstance(text, str):
        return ''
    # Retweets
    text = re.sub(r'^RT ', '', text)
    # Mentions (handles may contain underscores)
    text = re.sub(r'@[A-Za-z0-9_]+', '', text)
    # Hashtags (\w also covers accented letters, so no fragment is left behind)
    text = re.sub(r'#\w+', '', text)
    # URLs
    text = re.sub(r'http\S+', '', text)
    # Attachments: bracketed placeholders such as "[photo]". The brackets must
    # be escaped, otherwise the pattern is a character class that deletes every
    # '.', '*' and '?' in the text.
    text = re.sub(r'\[.*?\]', '', text)
    # remove href
    text = re.sub('href', '', text)
    # Special: keep only letters, digits, whitespace, basic punctuation and
    # Italian accented vowels (this also strips emoji and other symbols)
    text = re.sub(r"[^A-Za-z0-9\s.,;?!':àòèéìù]", '', text)
    # Line breaks become spaces so the words around them are not glued together
    text = text.replace('\n', ' ')
    # Drop one-character tokens that are neither a letter nor a digit
    # (stand-alone punctuation); everything else is kept
    tokens = [
        w for w in word_tokenize(text)
        if len(w) != 1 or w.isalpha() or w.isnumeric()
    ]

    return ' '.join(tokens)

In [10]:
def return_top_words(model, feature_names, n_top_words):
    '''Build a dictionary mapping each topic index to its highest-weighted words.

        Parameters
        ----------

        model : object with a ``components_`` attribute
                Fitted topic model (e.g. sklearn's LatentDirichletAllocation);
                only ``components_`` is read, one row of word weights per topic

        feature_names : sequence of str
                Vocabulary of the vectorizer, indexable by the column positions
                of ``components_``

        n_top_words : int
                How many of the heaviest words to report for each topic

        Returns
        -------
        topics : dict
                Topic index -> top words joined by ', ' (heaviest first).
                Keys 0, 1 and 2 are always present and hold an empty string
                when the model has no topic at that index
    '''
    # Pre-seed three slots so a model with fewer topics still exposes keys 0-2
    top_words_by_topic = dict.fromkeys(range(3), '')
    for idx, weights in enumerate(model.components_):
        # Column positions ordered from heaviest to lightest, cut to the top n
        best_first = weights.argsort()[::-1][:n_top_words]
        top_words_by_topic[idx] = ', '.join(feature_names[pos] for pos in best_first)

    return top_words_by_topic

I followed this <a href="https://www.machinelearningplus.com/nlp/topic-modeling-python-sklearn-examples/">tutorial</a> to choose the best number of topics with a grid search.

In [46]:
it_stopwords = stopwords.words('italian')

def top_3(input_path, random_state=None):
  '''Fit one LDA topic model per monthly CSV file found in a folder.

  Every ``*.csv`` file directly inside ``input_path`` is read, its ``text``
  column is cleaned with ``cleaner`` and turned into token counts, and a grid
  search picks an LDA model with 2 or 3 topics. Files are processed in sorted
  name order so that positions in the returned lists are reproducible.

  Parameters
  ----------
  input_path : str
      Folder with files named ``<city>_<year>_<month>.csv`` (the city part
      may itself contain underscores).
  random_state : int or None, optional
      Seed passed to LatentDirichletAllocation. The default (None) keeps the
      unseeded behaviour; pass an int to get reproducible topics.

  Returns
  -------
  tuple
      ``(topics_df, lda_models, tfs, tf_vectorizers)``.
      ``topics_df`` has columns 'date', 'topic 1', 'topic 2', 'topic 3', one
      row per file. The three lists hold one entry per file, in the same
      order as the rows; ``lda_models`` and ``tfs`` contain None for files
      where vectorizing or fitting failed, and the topic columns of such rows
      contain 'only stop words'.
  '''
  import warnings

  data = []
  indexes = []
  lda_models = []
  tfs = []
  tf_vectorizers = []
  n_top_words = 5

  # glob returns files in arbitrary order; sort so list positions are stable
  for file in sorted(glob.glob(input_path + "/*.csv")):
    #Import dataframe and extract information from the file name
    df = pd.read_csv(file)
    info = Path(file).stem.split('_')
    year = info[-2]
    month = info[-1]

    # Clean text column and transform into list (empty cells become '')
    x_train = df['text'].fillna('').apply(cleaner).tolist()

    # City and region names are useless as topic words. Build a fresh list per
    # file instead of growing the shared it_stopwords list, and lower-case the
    # city parts because CountVectorizer lower-cases tokens before filtering.
    stop_words = it_stopwords + [part.lower() for part in info[:-2]] + ['sicilia', 'canicattì']

    # Build a matrix of token counts
    tf_vectorizer = CountVectorizer(stop_words=stop_words, max_df=1.0, min_df=1, max_features=1000, ngram_range=(1,3))
    tf_vectorizers.append(tf_vectorizer)
    try:
      tf = tf_vectorizer.fit_transform(x_train)

      #Apply GridSearchCV to find best model with optimal number of topics
      search = GridSearchCV(LatentDirichletAllocation(random_state=random_state),
                            param_grid={'n_components': [2, 3]})
      search.fit(tf)
    except ValueError as exc:
      # Raised e.g. when the vocabulary is empty (only stop words) or there
      # are too few documents to cross-validate. Keep the placeholder row but
      # report the real reason instead of hiding it.
      warnings.warn(f'{file}: no topic model fitted ({exc})')
      tf = None
      lda = None
      dizio = {0: 'only stop words', 1: 'only stop words', 2: 'only stop words'}
    else:
      lda = search.best_estimator_
      # get_feature_names() no longer exists in recent scikit-learn releases;
      # prefer its replacement and fall back on older installations
      get_names = getattr(tf_vectorizer, 'get_feature_names_out', None) or tf_vectorizer.get_feature_names
      dizio = return_top_words(lda, get_names(), n_top_words)

    # One entry per file, so positions match across all returned lists
    tfs.append(tf)
    lda_models.append(lda)

    data.append([f'{year}/{month}', dizio[0], dizio[1], dizio[2]])
    # NOTE(review): only the last digit of the year is used, so e.g. 2020 gets
    # a smaller index than 2019 - confirm this is what the visualization expects
    indexes.append((int(year[-1])*12) + int(month)-1)

  return (pd.DataFrame(data, index=indexes, columns=['date', 'topic 1', 'topic 2', 'topic 3']),lda_models, tfs, tf_vectorizers)


## Visualization with the pyLDAvis library

In [None]:
# Cities to analyse; each name is also the Drive sub-folder that is searched
# for that city's CSV files.
cities = [
        'canicattì'
          ]

# Run the topic extraction for every listed city.
# NOTE: topic_info and df are overwritten on each iteration, so after the loop
# they hold the result of the last city only (a single city is listed here).
for city in cities:
  path = f'/content/drive/MyDrive/NLP/topic_notebooks/{city}'
  topic_info = top_3(path)
  # First element of the returned tuple is the per-month topics DataFrame
  df = topic_info[0]


In [65]:
# Take the LDA model, the token-count matrix and the vectorizer stored at
# position 21 of the three lists returned by top_3.
# NOTE(review): the variable names suggest position 21 is the February 2020
# file; that depends on the order in which top_3 processed the files and on
# every earlier file having been fitted successfully - verify before relying on it.
lda_canicattì_2020_02 = topic_info[1][21]
tf_canicattì_2020_02 = topic_info[2][21]
tf_vectorizer_canicattì_2020_02 = topic_info[3][21]

In [66]:
import pyLDAvis
import pyLDAvis.sklearn

In [67]:
# Switch pyLDAvis to notebook mode so prepared visualizations render inline
pyLDAvis.enable_notebook()

In [68]:
# Build the interactive topic visualization from the selected LDA model, its
# token-count matrix and the vectorizer that produced the matrix.
# NOTE(review): newer pyLDAvis releases appear to ship this module as
# pyLDAvis.lda_model - confirm against the installed version.
pyLDAvis.sklearn.prepare(lda_canicattì_2020_02,tf_canicattì_2020_02,tf_vectorizer_canicattì_2020_02)

  by='saliency', ascending=False).head(R).drop('saliency', 1)
