# Setup

In [None]:
import os
from pathlib import Path
import sys

# --- REPO ROOT ON sys.path (so `from src.*` works locally) ---
# The repo root is taken to be two levels above the kernel's working directory.
# It is prepended only when absent, so re-running this cell adds no duplicates.
_cwd_resolved = Path(os.getcwd()).resolve()
_REPO_ROOT = str(_cwd_resolved.parents[1])
if not any(entry == _REPO_ROOT for entry in sys.path):
    sys.path.insert(0, _REPO_ROOT)


# --- ENVIRONMENT SWITCH ---
# True  -> local machine with Google Drive Desktop mounted
# False -> Google Colab cloud runtime
RUNNING_LOCALLY = True

if not RUNNING_LOCALLY:
    # Colab: mount Drive first, then point at the project folder inside it
    from google.colab import drive
    drive.mount('/content/drive')
    BASE_PATH = Path('/content/drive/My Drive/Colab Projects/AI Public Trust')
else:
    # Standard macOS path for Google Drive Desktop
    BASE_PATH = Path('/Volumes/GoogleDrive/My Drive/Colab Projects/AI Public Trust')

# Project folders used across notebooks, all derived from BASE_PATH
twits_folder = BASE_PATH.joinpath('Raw Data/Twits/')
test_folder = BASE_PATH.joinpath('Raw Data/')
datasets_folder = BASE_PATH.joinpath('Data Sets')
cleanedds_folder = BASE_PATH.joinpath('Data Sets/Cleaned Data')
networks_folder = BASE_PATH.joinpath('Data Sets/Networks/')
literature_folder = BASE_PATH.joinpath('Literature/')
topic_models_folder = BASE_PATH.joinpath('Models/Topic Modeling/')


In [1]:
from datetime import datetime
from datetime import timedelta
import json
import time
import datetime
import os
import tqdm
import pickle
import numpy as np
import pandas as pd
import random
import networkx as nx
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
# Ensure that necessary NLTK resources are downloaded.
# 'punkt_tab' is the tokenizer data newer NLTK releases load for word_tokenize;
# 'punkt' is kept for older releases.
# NOTE(review): on an NLTK index without 'punkt_tab' the download is expected to
# just report the missing package rather than raise - confirm on the pinned version.
for _nltk_resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(_nltk_resource)

# from google.colab import drive
# drive.mount('/content/drive')

# twits_folder = '/content/drive/MyDrive/AI Public Trust/Raw Data/Twits/'
# test_folder = '/content/drive/MyDrive/AI Public Trust/Raw Data/'
# print("Current Directory:", twits_folder)
# datasets_folder = '/content/drive/MyDrive/AI Public Trust/Data Sets/'
# cleanedds_folder = '/content/drive/MyDrive/AI Public Trust/Data Sets/Cleaned Data/'
# networks_folder = '/content/drive/MyDrive/AI Public Trust/Data Sets/Networks/'
# literature_folder = '/content/drive/MyDrive/AI Public Trust/Literature/'
# topic_models_folder = '/content/drive/MyDrive/AI Public Trust/Models/Topic Modeling/'

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Mounted at /content/drive
Current Directory: /content/drive/My Drive/Colab Projects/AI Public Trust/Raw Data/Twits/


In [2]:
%%time
# Load the pickled corpus dictionary.
# os.path.join is used because topic_models_folder is a pathlib.Path, and
# `Path + str` raises TypeError.
# NOTE: pickle.load can execute code from the file - only load trusted files.
corpus_path = os.path.join(topic_models_folder, "test_sentences_corpus.pkl")
with open(corpus_path, 'rb') as f:
    corpus_dict = pickle.load(f)

# Create Corpus: collect the 'text' field of every entry, in dict order
test_twit_corpus = [corpus_dict[twid]['text'] for twid in tqdm.tqdm(corpus_dict)]

print(len(test_twit_corpus))
random.sample(test_twit_corpus, 2)

100%|██████████| 630/630 [00:00<00:00, 744760.86it/s]

630
CPU times: user 16.6 ms, sys: 1.21 ms, total: 17.8 ms
Wall time: 1.08 s





['machinelearning steps daysofcode g ai analytics artificialintelligence bigdata cloud coding data datascience github iot javascript linux ml mlops nlp nodejs opensource python sql tscottclendaniel womenwhocode',
 'chinese researchers created cyborg pigeon watch end data datascience programming datascientist ai rob']

# LDA SKLEARN

- https://machinelearninggeek.com/latent-dirichlet-allocation-using-scikit-learn/
- https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.LatentDirichletAllocation.html
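- https://scikit-learn.org/stable/modules/generated/sklearn.discriminant_analysis.LinearDiscriminantAnalysis.html (note: Linear Discriminant Analysis is a different "LDA" from the Latent Dirichlet Allocation topic model used below)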

In [3]:
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from joblib import dump, load

In [4]:
%%time
# Initialize regex tokenizer: a token is any run of word characters (\w+)
tokenizer = RegexpTokenizer(r'\w+')

# Vectorize documents using TF-IDF on unigrams, tokenized by the regex tokenizer
tfidf = TfidfVectorizer(lowercase=True,
                        #stop_words='english',
                        ngram_range = (1,1),
                        tokenizer = tokenizer.tokenize)

# Fit the vectorizer on the corpus and transform it in one pass
train_data = tfidf.fit_transform(test_twit_corpus)

# Save the fitted vectorizer.
# os.path.join is used because topic_models_folder is a pathlib.Path, and
# `Path + str` raises TypeError.
dump(tfidf, os.path.join(topic_models_folder, 'test_tfidf.joblib'))
#tfidf_2 = load(topic_models_folder+'test_tfidf.joblib')



CPU times: user 67.1 ms, sys: 1.89 ms, total: 69 ms
Wall time: 379 ms


['/content/drive/My Drive/Colab Projects/AI Public Trust/Models/Topic Modeling/test_tfidf.joblib']

In [5]:
%%time
# Define the number of topics or components
num_components=5

# Create LDA object. random_state is fixed so a rerun reproduces the same
# topics; without it every fit starts from a different random initialisation.
model=LatentDirichletAllocation(n_components=num_components,
                                random_state=42,
                                verbose=2)

# Fit the LDA model on the TF-IDF matrix and get the document-topic matrix
lda_matrix = model.fit_transform(train_data)

# Save the document-topic matrix.
# os.path.join is used because topic_models_folder is a pathlib.Path, and
# `Path + str` raises TypeError.
dump(lda_matrix, os.path.join(topic_models_folder, 'test_lda_matrix.joblib'))
# Reload with: load(os.path.join(topic_models_folder, 'test_lda_matrix.joblib'))

# Get Components (one row of term weights per topic)
lda_components=model.components_

iteration: 1 of max_iter: 10
iteration: 2 of max_iter: 10
iteration: 3 of max_iter: 10
iteration: 4 of max_iter: 10
iteration: 5 of max_iter: 10
iteration: 6 of max_iter: 10
iteration: 7 of max_iter: 10
iteration: 8 of max_iter: 10
iteration: 9 of max_iter: 10
iteration: 10 of max_iter: 10
CPU times: user 1.13 s, sys: 0 ns, total: 1.13 s
Wall time: 1.44 s


In [6]:
# Show the seven highest-weighted terms of each fitted topic
terms = tfidf.get_feature_names_out()

for index, component in enumerate(lda_components):
    # Pair each term with its weight and rank by weight, heaviest first
    ranked_pairs = sorted(zip(terms, component), key=lambda pair: pair[1], reverse=True)
    top_terms_list = [term for term, _weight in ranked_pairs[:7]]
    print("Topic "+str(index)+": ",top_terms_list)

Topic 0:  ['ai', 'deeplearning', 'v', 'fint', 'theapplication', 'machinelearning', 'datascience']
Topic 1:  ['conventions', 'say', 'alley', 'exhibitors', 'sections', 'generated', 'artist']
Topic 2:  ['ai', 'python', 'iot', 'bigdata', 'cloud', 'g', 'theyre']
Topic 3:  ['ai', 'machinelearning', 'datascience', 'bot', 'amp', 'btc', 'artificialintelligence']
Topic 4:  ['ai', 'art', 'calling', 'free', 'watch', 'chinese', 'cyborg']


In [7]:
%%time
def do_sklearn_lda(n_comp=5, random_state=42):
    """Fit a scikit-learn LDA model on ``train_data`` and print its top terms.

    Relies on notebook globals: ``train_data`` (the matrix that is fitted),
    ``tfidf`` (the vectorizer that supplies the feature names) and
    ``topic_models_folder`` (where the document-topic matrix is saved as
    ``<n_comp>_test_lda_matrix.joblib``).

    Args:
        n_comp: Number of topics, passed as ``n_components``.
        random_state: Seed passed to ``LatentDirichletAllocation`` so reruns
            give the same topics. Pass ``None`` for an unseeded fit.

    Returns:
        None. The seven highest-weighted terms of each topic are printed.
    """
    # Create LDA object
    model = LatentDirichletAllocation(n_components=n_comp,
                                      random_state=random_state,
                                      verbose=2)

    # Fit the LDA model and get the document-topic matrix
    lda_matrix = model.fit_transform(train_data)

    # Save the matrix. os.path.join is used because topic_models_folder is a
    # pathlib.Path, and `Path + str` raises TypeError.
    matrix_path = os.path.join(topic_models_folder,
                               str(n_comp) + '_test_lda_matrix.joblib')
    dump(lda_matrix, matrix_path)

    # Print the topics with their terms
    terms = tfidf.get_feature_names_out()
    print('SKLearn LDA with '+ str(n_comp)+' components:')
    for index, component in enumerate(model.components_):
        ranked = sorted(zip(terms, component), key=lambda t: t[1], reverse=True)
        top_terms_list = [term for term, _ in ranked[:7]]
        print("Topic "+str(index)+": ",top_terms_list)
    print('-----------------------------------')

CPU times: user 6 µs, sys: 0 ns, total: 6 µs
Wall time: 10.7 µs


In [8]:
%%time
# Fit and report one LDA model per topic count: 5, 10, 15 and 20
for n in range(5, 25, 5):
    do_sklearn_lda(n_comp=n)

iteration: 1 of max_iter: 10
iteration: 2 of max_iter: 10
iteration: 3 of max_iter: 10
iteration: 4 of max_iter: 10
iteration: 5 of max_iter: 10
iteration: 6 of max_iter: 10
iteration: 7 of max_iter: 10
iteration: 8 of max_iter: 10
iteration: 9 of max_iter: 10
iteration: 10 of max_iter: 10
SKLearn LDA with 5 components:
Topic 0:  ['ai', 'neuralink', 'like', 'much', 'rbxs', 'coders', 'taught']
Topic 1:  ['generated', 'art', 'conventions', 'alley', 'exhibitors', 'sections', 'say']
Topic 2:  ['ai', 'art', 'certification', 'challenge', 'developer', 'optimise', 'smart']
Topic 3:  ['ai', 'machinelearning', 'bigdata', 'datascience', 'analytics', 'data', 'python']
Topic 4:  ['ai', 'foodspot', 'amp', 'artificialintelligence', 'machinelearning', 'bot', 'ml']
-----------------------------------
iteration: 1 of max_iter: 10
iteration: 2 of max_iter: 10
iteration: 3 of max_iter: 10
iteration: 4 of max_iter: 10
iteration: 5 of max_iter: 10
iteration: 6 of max_iter: 10
iteration: 7 of max_iter: 10
it

In [None]:
# Coherence
# https://stackoverflow.com/questions/60613532/how-do-i-calculate-the-coherence-score-of-an-sklearn-lda-model

# Gensim LDA Topic Model

In [9]:
# Remove rare and common tokens.
from gensim.corpora import Dictionary
import gensim.corpora as corpora
import gensim
from pprint import pprint
from gensim.models import LdaModel
from gensim.models.coherencemodel import CoherenceModel

In [10]:
# Tokenize every document into a list of word tokens for gensim
gensim_corpora = [word_tokenize(twit) for twit in test_twit_corpus]
random.sample(gensim_corpora, 2)

[['sometimes',
  'ai',
  'really',
  'bad',
  'hands',
  'sometimes',
  'almost',
  'gets',
  'right',
  'p'],
 ['overall',
  'awareness',
  'benefits',
  'using',
  'ai',
  'business',
  'artificialintelligence',
  'ml',
  'machinelearning',
  'cc']]

In [11]:
%%time
# Get Representation: build the gensim Dictionary from the tokenized documents.
# os.path.join is used for both files because topic_models_folder is a
# pathlib.Path, and `Path + str` raises TypeError.
id2word = corpora.Dictionary(gensim_corpora)
id2word.save(os.path.join(topic_models_folder, "test_id2word.pkl"))

# Term Document Frequency (BOW)
bow_test_corpus = [id2word.doc2bow(text) for text in gensim_corpora]

with open(os.path.join(topic_models_folder, "bow_test_corpus.pkl"), "wb") as fp:   #Pickling
    pickle.dump(bow_test_corpus, fp)

CPU times: user 30.9 ms, sys: 3.8 ms, total: 34.7 ms
Wall time: 685 ms


In [12]:
%%time
# Try Loading: read the saved BOW corpus and dictionary back from disk.
# os.path.join is used because topic_models_folder is a pathlib.Path, and
# `Path + str` raises TypeError.
# NOTE: pickle.load can execute code from the file - only load trusted files.
with open(os.path.join(topic_models_folder, "bow_test_corpus.pkl"), "rb") as fp:   # Unpickling
    bow_test_corpus = pickle.load(fp)

id2word = corpora.Dictionary.load(os.path.join(topic_models_folder, "test_id2word.pkl"))

print(len(bow_test_corpus))

630
CPU times: user 2.21 ms, sys: 4 ms, total: 6.22 ms
Wall time: 11.9 ms


In [13]:
def create_topics(num_topics = 5,corpus=bow_test_corpus, id2word=id2word, random_state=42):
    """Train a gensim LDA model, save it, and print its topics and coherence.

    The model is saved to ``lda_model_<num_topics>`` inside the notebook-level
    ``topic_models_folder``.

    Note: the ``corpus`` and ``id2word`` defaults are bound when this ``def``
    runs, so re-run this cell after rebuilding those globals.

    Args:
        num_topics: Number of topics to fit.
        corpus: Bag-of-words corpus used for training and for the coherence score.
        id2word: gensim Dictionary that maps token ids to words.
        random_state: Seed passed to ``LdaMulticore``; pass ``None`` for an
            unseeded fit. NOTE(review): multi-worker training may still vary
            between runs - confirm if exact reproducibility is required.

    Returns:
        None. The topics and the u_mass coherence value are printed.
    """
    # Build LDA model
    lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                           id2word=id2word,
                                           alpha=0.1,
                                           eta=0.1,
                                           num_topics=num_topics,
                                           random_state=random_state)

    # os.path.join is used because topic_models_folder is a pathlib.Path, and
    # `Path + str` raises TypeError.
    model_path = os.path.join(topic_models_folder, "lda_model_"+ str(num_topics))
    lda_model.save(model_path)

    # Print the keywords of the topics (print_topics() is called with its defaults)
    pprint(lda_model.print_topics())

    # u_mass coherence is computed from the same BOW corpus
    cm = CoherenceModel(model=lda_model, corpus=corpus, coherence='u_mass')
    coherence = cm.get_coherence()
    print(coherence)

In [14]:
%%time
# Train and report a gensim LDA model for each topic count, framed by rulers
separator = '---------------------------------------------'
for k in (5, 10):
    print(separator)
    create_topics(num_topics=k, corpus=bow_test_corpus, id2word=id2word)
    print(separator)



---------------------------------------------




[(0,
  '0.061*"ai" + 0.013*"art" + 0.011*"datascience" + 0.009*"data" + 0.008*"amp" '
  '+ 0.007*"free" + 0.007*"python" + 0.007*"machinelearning" + '
  '0.006*"programming" + 0.005*"ml"'),
 (1,
  '0.046*"ai" + 0.009*"art" + 0.008*"amp" + 0.007*"python" + 0.005*"iot" + '
  '0.005*"technology" + 0.005*"daysofcode" + 0.005*"foodspot" + '
  '0.004*"javascript" + 0.004*"data"'),
 (2,
  '0.068*"ai" + 0.010*"data" + 0.009*"machinelearning" + 0.006*"day" + '
  '0.006*"intelligence" + 0.006*"artificial" + 0.006*"new" + 0.005*"artists" + '
  '0.005*"art" + 0.005*"learn"'),
 (3,
  '0.058*"ai" + 0.035*"art" + 0.021*"generated" + 0.021*"artist" + 0.019*"say" '
  '+ 0.019*"exhibitors" + 0.019*"alley" + 0.019*"conventions" + '
  '0.019*"sections" + 0.005*"make"'),
 (4,
  '0.057*"ai" + 0.013*"machinelearning" + 0.011*"art" + 0.009*"datascience" + '
  '0.009*"bigdata" + 0.009*"like" + 0.007*"amp" + '
  '0.007*"artificialintelligence" + 0.006*"python" + 0.005*"much"')]
-11.82437792895503
--------------