# Notebook for topic modeling 

# 0. Imports

In [1]:
# If you're in a managed environment, you may not need these installs.
# Uncomment if needed.
## ! conda activate qss20
## ! conda install pyLDAvis gensim bertopic sentence-transformers umap-learn hdbscan wordcloud

import pandas as pd
import re
import numpy as np
import warnings

## nltk imports
#!pip install nltk # can install on terminal or by uncommenting this line
#import nltk; nltk.download('punkt'); nltk.download('stopwords')
from nltk.tokenize import word_tokenize, wordpunct_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

## sklearn imports
from sklearn.feature_extraction.text import CountVectorizer
from gensim import corpora
import gensim

import pyLDAvis
warnings.filterwarnings("ignore", category=DeprecationWarning)
import pyLDAvis.gensim_models as gensimvis
pyLDAvis.enable_notebook()

## print mult things
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

## random
import random
import string; punctlist = [char for char in string.punctuation] # list of english punctuation marks

# 0. Load data

In [3]:
ab = pd.read_csv("../../public_data/airbnb_text.csv")
ab.head()

Unnamed: 0,id,name,name_upper,neighbourhood_group,price
0,2539,Clean & quiet apt home by the park,CLEAN & QUIET APT HOME BY THE PARK,Brooklyn,149
1,2595,Skylit Midtown Castle,SKYLIT MIDTOWN CASTLE,Manhattan,225
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,THE VILLAGE OF HARLEM....NEW YORK !,Manhattan,150
3,3831,Cozy Entire Floor of Brownstone,COZY ENTIRE FLOOR OF BROWNSTONE,Brooklyn,89
4,5022,Entire Apt: Spacious Studio/Loft by central park,ENTIRE APT: SPACIOUS STUDIO/LOFT BY CENTRAL PARK,Manhattan,80


# 1. Preprocess documents

In this case, each name/name_upper, or listing title, we're treating as a document

## 1.1 Load stopwords list and augment with our own custom ones

In [4]:
list_stopwords = stopwords.words("english")

custom_words_toadd = ['apartment', 'new york', 'nyc',
                      'bronx', 'brooklyn',
                     'manhattan', 'queens', 
                      'staten island']

list_stopwords_new = list_stopwords + custom_words_toadd
list_stopwords_new

['a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 "he'd",
 "he'll",
 'her',
 'here',
 'hers',
 'herself',
 "he's",
 'him',
 'himself',
 'his',
 'how',
 'i',
 "i'd",
 'if',
 "i'll",
 "i'm",
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it'd",
 "it'll",
 "it's",
 'its',
 'itself',
 "i've",
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'on

## 1.2 Remove stopwords from lowercase version of corpus


In [5]:
## convert to lowercase and a list
corpus_lower = ab.name.str.lower().to_list()
corpus_lower[0:5]

## use wordpunct tokenize and filter out with one
example_listing = corpus_lower[3]
nostop_listing = [word for word in wordpunct_tokenize(example_listing) 
                          if word not in list_stopwords_new]
nostop_listing

['clean & quiet apt home by the park',
 'skylit midtown castle',
 'the village of harlem....new york !',
 'cozy entire floor of brownstone',
 'entire apt: spacious studio/loft by central park']

['cozy', 'entire', 'floor', 'brownstone']

## 1.3 stem and remove non-alpha

Other contexts we may want to leave digits in

In [6]:
## initialize stemmer
porter = PorterStemmer()

## apply to one by iterating
## over the tokens in the list
example_listing_preprocess = [porter.stem(token) 
                            for token in nostop_listing 
                            if token.isalpha() and 
                            len(token) > 2]

example_listing_preprocess

['cozi', 'entir', 'floor', 'brownston']

In [7]:
example_listing
example_listing_preprocess

'cozy entire floor of brownstone'

['cozi', 'entir', 'floor', 'brownston']

## 1.4 Activity 1

The above example performed preprocessing on a single Airbnb listing. We want to generalize this preprocessing across all listings.

- Embed step two (remove stopwords) and step three (stem) into one or two functions that take in a raw string (eg the raw text of an Airbnb review) and return a preprocessed string 
- Apply the function iteratively to preprocess all the texts in `corpus_lower`. Output could either be a list where each list element is a string of a list (e.g., `cozy brownstone apt`), or a list of lists where each element is a tokenized string (e.g., `['cozy', 'brownstone', 'apt'])`

Output is flexible: it could be a list of lists containing tokenized/stemmed text or a list of strings.

In [8]:
# Reusable text preprocessor
def preprocess_text(s, stopwords=list_stopwords_new, stemmer=porter):
    if not isinstance(s, str) or not s.strip():
        return ""
    # lowercase + tokenize
    toks = wordpunct_tokenize(s.lower())
    # keep words: alphabetic, length > 2
    toks = [t for t in toks if t.isalpha() and len(t) > 2]
    # remove stopwords
    toks = [t for t in toks if t not in stopwords]
    # stem
    toks = [stemmer.stem(t) for t in toks]
    # return space-joined string (what CountVectorizer expects)
    return " ".join(toks)

# Apply on our sampled dataframe used in section 2
ab['name_clean'] = ab['name'].apply(preprocess_text)
ab[['name', 'name_clean']].head()


Unnamed: 0,name,name_clean
0,Clean & quiet apt home by the park,clean quiet apt home park
1,Skylit Midtown Castle,skylit midtown castl
2,THE VILLAGE OF HARLEM....NEW YORK !,villag harlem new york
3,Cozy Entire Floor of Brownstone,cozi entir floor brownston
4,Entire Apt: Spacious Studio/Loft by central park,entir apt spaciou studio loft central park


# 2. Create a document-term matrix and do some basic diagnostics (more manual approach)

Here we'll create a DTM first using the raw documents; in the activity, you'll create one using the preprocessed docs
that you created in activity 1

## 2.1 Define the dtm function and select data to transform into a document-term matrix

In [9]:
## function provided
def create_dtm(list_of_strings, metadata):
    """ 
    Function to create dense document-term matrix (DTM) from a list of strings and provided metadata. 
    A sparse DTM is a list of term_index/doc_index tuples: if a given term occurs in a given doc at least once, 
        then this count is listed as a tuple; if not, that term/doc pair is omitted. 
    In a dense DTM, each row is one text (e.g., an Airbnb listing), each column is a term, and 
        each cell indicates the frequency of that word in that text. 
    
    Parameters:
        list_of_strings (Series): each row contains a preprocessed string (need not be tokenized)
        metadata (DataFrame): contains document-level covariates
    
    Returns:
        Dense DTM with metadata on left and then one column per word in lexicon
    """
    
    # initialize a sklearn tokenizer; this helps us tokenize the preprocessed string input
    vectorizer = CountVectorizer(lowercase = True) 
    dtm_sparse = vectorizer.fit_transform(list_of_strings)
    print('Sparse matrix form:\n', dtm_sparse[:3]) # take a look at sparse representation
    print()
    
    # switch the dataframe from the sparse representation to the normal dense representation (so we can treat it as regular dataframe)
    dtm_dense_named = pd.DataFrame(dtm_sparse.todense(), columns=vectorizer.get_feature_names_out ())
    print('Dense matrix form:\n', dtm_dense_named.head()) # take a look at dense representation
    dtm_dense_named_withid = pd.concat([metadata.reset_index(), dtm_dense_named], axis = 1) # add back document-level covariates

    return(dtm_dense_named_withid)

In [10]:

## filter out na's
## for shorter runtime, random sampling of 1000
## get metadata for those
## and also renaming price col since it's likely to be corpus word
ab_small = ab.loc[~ab.name.isnull(),
           ['id', 'neighbourhood_group', 'price', 'name']].copy().rename(columns = {'price':
            'price_rawdata'}).sample(n = 1000, random_state = 422)

ab_small['name_lower'] = ab_small['name'].str.lower()
ab_small.head()

Unnamed: 0,id,neighbourhood_group,price_rawdata,name,name_lower
23821,19227560,Queens,100,Super Cozy!,super cozy!
22905,18560625,Brooklyn,30,Beautiful Private Bedroom by Prospect Park,beautiful private bedroom by prospect park
20426,16289576,Manhattan,80,Best Location on the Upper West Side! - Part II,best location on the upper west side! - part ii
2018,893413,Manhattan,2500,Architecturally Stunning Former Synagogue!,architecturally stunning former synagogue!
18790,14882137,Queens,50,"Large, beautiful room near Bushwick","large, beautiful room near bushwick"


## 2.2 Execute the dtm function to create the document-term matrix

In [11]:
## example application on raw lowercase texts; 
dtm_nopre = create_dtm(list_of_strings= ab_small.name_lower,
                      metadata = ab_small[['id', 'neighbourhood_group', 'price_rawdata']])

Sparse matrix form:
   (0, 841)	1
  (0, 281)	1
  (1, 152)	1
  (1, 693)	1
  (1, 157)	1
  (1, 205)	1
  (1, 698)	1
  (1, 653)	1
  (2, 165)	1
  (2, 537)	1
  (2, 637)	1
  (2, 856)	1
  (2, 902)	1
  (2, 939)	1
  (2, 774)	1
  (2, 657)	1
  (2, 471)	1

Dense matrix form:
    001  10  10m  10min  10mins  1100  12mins  14  15  15min  ...  yoga  york  \
0    0   0    0      0       0     0       0   0   0      0  ...     0     0   
1    0   0    0      0       0     0       0   0   0      0  ...     0     0   
2    0   0    0      0       0     0       0   0   0      0  ...     0     0   
3    0   0    0      0       0     0       0   0   0      0  ...     0     0   
4    0   0    0      0       0     0       0   0   0      0  ...     0     0   

   you  your  yu  zen  ღღღsteps  法拉盛中心私人房間獨立衛浴  溫馨大套房  獨一無二的紐約閣樓  
0    0     0   0    0         0              0      0          0  
1    0     0   0    0         0              0      0          0  
2    0     0   0    0         0              0      0  

In [12]:
## show first set of rows/cols
dtm_nopre.head()

## show arbitrary later cols in resulting data
dtm_nopre.shape
dtm_nopre.iloc[0:5, 480:500]

Unnamed: 0,index,id,neighbourhood_group,price_rawdata,001,10,10m,10min,10mins,1100,...,yoga,york,you,your,yu,zen,ღღღsteps,法拉盛中心私人房間獨立衛浴,溫馨大套房,獨一無二的紐約閣樓
0,23821,19227560,Queens,100,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,22905,18560625,Brooklyn,30,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,20426,16289576,Manhattan,80,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2018,893413,Manhattan,2500,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,18790,14882137,Queens,50,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


(1000, 974)

Unnamed: 0,inclusive,incredible,incredibly,indoor,inn,inq,insane,int,interior,international,interns,invincible,inviting,inwood,island,it,italy,its,jefferson,jewel
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


## 2.3 Use that matrix/column sums to get basic summary stats of top words

In [13]:
## summing each col
top_terms = dtm_nopre[dtm_nopre.columns[4:]].sum(axis = 0)

## sorting from most frequent to least frequent
top_terms.sort_values(ascending = False)

in           367
room         244
private      163
bedroom      152
apartment    130
            ... 
gay            1
gente          1
geodesic       1
george         1
獨一無二的紐約閣樓      1
Length: 970, dtype: int64

## 2.4 Activity 2: repeat the above but using the preprocessed text data

- Stick with the same random sample of 1000 `ab_small`
- Apply the preprocessing steps from activity 1 to create a new column in `ab_small` with the preprocessed text (if you got stuck on that, try just removing stopwords)
- Use the `create_dtm` function to create a document-term matrix from the preprocessed data
- Use colsums to summarize

In [15]:
# Create DTM from cleaned texts (dense matrix with metadata on the left)
dtm_pre = create_dtm(ab['name_clean'], metadata=ab_small[['id','neighbourhood_group','price_rawdata']])

# Peek at shape and a slice
display(dtm_pre.shape)
dtm_pre.iloc[:5, :15]

# Top terms by corpus frequency (sum each column excluding metadata)
term_cols = [c for c in dtm_pre.columns if c not in ['id','neighbourhood_group','price_rawdata']]
top_terms_pre = dtm_pre[term_cols].sum(axis=0).sort_values(ascending=False)
top_terms_pre.head(20)

Sparse matrix form:
   (0, 964)	1
  (0, 3695)	1
  (0, 197)	1
  (0, 2194)	1
  (0, 3380)	1
  (1, 4125)	1
  (1, 2953)	1
  (1, 827)	1
  (2, 4886)	1
  (2, 2093)	1
  (2, 3170)	1
  (2, 5132)	1

Dense matrix form:
    aaa  abc  abcd  abducen  abigail  abl  aboard  abod  absolut  abund  ...  \
0    0    0     0        0        0    0       0     0        0      0  ...   
1    0    0     0        0        0    0       0     0        0      0  ...   
2    0    0     0        0        0    0       0     0        0      0  ...   
3    0    0     0        0        0    0       0     0        0      0  ...   
4    0    0     0        0        0    0       0     0        0      0  ...   

   웨스트빌리지에  위치한  조용한  ｱｸｾｽ抜群  ｷｯﾁﾝ付き  ｷﾚｲなお部屋  ﾆｭｰﾖｰｸの中心  ﾌﾞﾙｯｸﾘﾝの大人気ｴﾘｱ  \
0        0    0    0       0       0        0          0               0   
1        0    0    0       0       0        0          0               0   
2        0    0    0       0       0        0          0               0   
3        0    

(48895, 5427)

Unnamed: 0,index,id,neighbourhood_group,price_rawdata,aaa,abc,abcd,abducen,abigail,abl,aboard,abod,absolut,abund,ac
0,23821.0,19227560.0,Queens,100.0,0,0,0,0,0,0,0,0,0,0,0
1,22905.0,18560625.0,Brooklyn,30.0,0,0,0,0,0,0,0,0,0,0,0
2,20426.0,16289576.0,Manhattan,80.0,0,0,0,0,0,0,0,0,0,0,0
3,2018.0,893413.0,Manhattan,2500.0,0,0,0,0,0,0,0,0,0,0,0
4,18790.0,14882137.0,Queens,50.0,0,0,0,0,0,0,0,0,0,0,0


index           24361187.0
room               10418.0
bedroom             8484.0
privat              7422.0
cozi                5101.0
apt                 4744.0
studio              4122.0
spaciou             3796.0
park                3359.0
east                3107.0
sunni               2945.0
williamsburg        2743.0
beauti              2630.0
near                2373.0
villag              2347.0
bed                 2342.0
loft                2147.0
larg                2084.0
heart               2071.0
luxuri              1963.0
dtype: float64

# 3. Use gensim to more automatically preprocess/estimate a topic model

## 3.1 Creating the objects to feed the LDA modeling function

Different outputs described below: 
- Tokenized and preprocessed text 
- Dictionary 
- Corpus 

In [16]:

## Step 1: re-tokenize and store in list
## here, i'm doing with the raw random sample of text
## in activity, you should do with the preprocessed texts
text_raw_tokens = [wordpunct_tokenize(one_text) for one_text in 
                  ab_small.name_lower]


## Step 2: use gensim create dictionary - gets all unique words across documents
text_raw_dict = corpora.Dictionary(text_raw_tokens)
raw_len = len(text_raw_dict) # get length for comparison below

### explore first few keys and values
### see that key is just an arbitrary counter; value is the word itself
{k: text_raw_dict[k] for k in list(text_raw_dict)[:5]}


## Step 3: filter out very rare and very common words
## here, i'm using the threshold that a word needs to appear in at least
## 5% of docs but not more than 95%
## this is an integer count of docs so i round
lower_bound = round(ab_small.shape[0]*0.05)
upper_bound = round(ab_small.shape[0]*0.95)

### apply filtering to dictionary
text_raw_dict.filter_extremes(no_below = lower_bound,
                             no_above = upper_bound)
print(f'Filtering out very rare and very common words reduced the \
length of dictionary from {str(raw_len)} to {str(len(text_raw_dict))}.')
{k: text_raw_dict[k] for k in list(text_raw_dict)[:5]} # show first five entries after filtering


## Step 4: apply dictionary to TOKENIZED texts
## this creates a mapping between each word 
## in a specific listing and the key in the dictionary.
## for words that remain in the filtered dictionary,
## output is a list where len(list) == n documents
## and each element in the list is a list of tuples
## containing the mappings
corpus_fromdict = [text_raw_dict.doc2bow(one_text) 
                   for one_text in text_raw_tokens]

### can apply doc2bow(one_text, return_missing = True) to print words
### eliminated from the listing bc they're not in filtered dictionary.
### but feeding that one with missing values to
### the lda function can cause errors
corpus_fromdict_showmiss = [text_raw_dict.doc2bow(one_text, return_missing = True)
                            for one_text in text_raw_tokens]
print('Sample of documents represented in dictionary format (with omitted words noted):')
corpus_fromdict_showmiss[:10]

{0: '!', 1: 'cozy', 2: 'super', 3: 'beautiful', 4: 'bedroom'}

Filtering out very rare and very common words reduced the length of dictionary from 1047 to 31.


{0: '!', 1: 'cozy', 2: 'beautiful', 3: 'bedroom', 4: 'park'}

Sample of documents represented in dictionary format (with omitted words noted):


[([(0, 1), (1, 1)], {'super': 1}),
 ([(2, 1), (3, 1), (4, 1), (5, 1)], {'by': 1, 'prospect': 1}),
 ([(0, 1), (6, 1), (7, 1)],
  {'best': 1,
   'ii': 1,
   'location': 1,
   'on': 1,
   'part': 1,
   'side': 1,
   'upper': 1,
   'west': 1}),
 ([(0, 1)],
  {'architecturally': 1, 'former': 1, 'stunning': 1, 'synagogue': 1}),
 ([(2, 1), (8, 1), (9, 1), (10, 1), (11, 1)], {'bushwick': 1}),
 ([(4, 1), (8, 1), (9, 1), (12, 1), (13, 2)],
  {'bath': 1, 'bed': 1, 'by': 1, 'central': 1, 'college': 1, 'hunter': 1}),
 ([(9, 1), (11, 1), (14, 1), (15, 1)], {'bohemian': 1, 'brownstone': 1}),
 ([(16, 1)],
  {'fidi': 1, 'huge': 1, 'loft': 1, 'views': 1, 'w': 1, 'water': 1}),
 ([], {'hillside': 1, 'hotel': 1}),
 ([(5, 1), (9, 1), (11, 1), (14, 1), (15, 1)], {'airy': 1})]

##  3.2 Estimating the model

In [17]:
## Step 5: we're finally ready to estimate the model!
## full documentation here - https://radimrehurek.com/gensim/models/ldamodel.html
## here, we're feeding the lda function:
## (1) the corpus we created from the dictionary,
## (2) a parameter we decide on for the number of topics (k),
## (3) the dictionary itself,
## (4) parameter for number of passes through training data (more means slower), and
## (5) parameter that returns, for each word remaining in dict, the topic probabilities.
## see documentation for many other arguments you can vary
ldamod = gensim.models.ldamodel.LdaModel(corpus_fromdict, 
                                         num_topics = 5, 
                                         id2word=text_raw_dict, 
                                         passes=6, 
                                         alpha = 'auto',
                                         per_word_topics = True,
                                         random_state=0)

print(type(ldamod))

<class 'gensim.models.ldamodel.LdaModel'>


## 3.3  Seeing what topics the estimated model discovers

In [18]:
## Post-model 1: explore corpus-wide summary of topics
### getting the topics and top words; can retrieve diff top words
topics = ldamod.print_topics(num_words = 10)
for topic in topics:
    print(topic)

(0, '0.238*"," + 0.092*"apartment" + 0.080*"near" + 0.075*"beautiful" + 0.061*"park" + 0.052*"!" + 0.050*"room" + 0.047*"sunny" + 0.041*"-" + 0.035*"brooklyn"')
(1, '0.234*"in" + 0.095*"the" + 0.089*"brooklyn" + 0.083*"williamsburg" + 0.070*"of" + 0.063*"room" + 0.050*"-" + 0.046*"apt" + 0.044*"." + 0.044*"apartment"')
(2, '0.166*"bedroom" + 0.122*"to" + 0.116*"1" + 0.109*"-" + 0.065*"2" + 0.041*"large" + 0.039*"," + 0.038*"in" + 0.035*"spacious" + 0.032*"east"')
(3, '0.157*"studio" + 0.147*"cozy" + 0.136*"in" + 0.092*"and" + 0.078*"manhattan" + 0.065*"room" + 0.043*"." + 0.038*"apartment" + 0.037*"park" + 0.036*"private"')
(4, '0.156*"private" + 0.147*"room" + 0.136*"in" + 0.115*"!" + 0.090*"/" + 0.051*"bedroom" + 0.048*"with" + 0.043*"spacious" + 0.037*"sunny" + 0.030*"2"')


In [19]:
    ## Post-model 2: explore topics associated with each document
### for each item in our original dictionary, get list of topic probabilities
l = [ldamod.get_document_topics(item) for item in corpus_fromdict]
### print result
text_raw_tokens[0:5]
l[0:5]

[['super', 'cozy', '!'],
 ['beautiful', 'private', 'bedroom', 'by', 'prospect', 'park'],
 ['best',
  'location',
  'on',
  'the',
  'upper',
  'west',
  'side',
  '!',
  '-',
  'part',
  'ii'],
 ['architecturally', 'stunning', 'former', 'synagogue', '!'],
 ['large', ',', 'beautiful', 'room', 'near', 'bushwick']]

[[(0, 0.055665575),
  (1, 0.06323104),
  (2, 0.052744307),
  (3, 0.40368736),
  (4, 0.4246717)],
 [(0, 0.4809366),
  (1, 0.03687382),
  (2, 0.031092707),
  (3, 0.028580178),
  (4, 0.42251673)],
 [(0, 0.041016307),
  (1, 0.5992156),
  (2, 0.03906125),
  (3, 0.036074802),
  (4, 0.28463206)],
 [(0, 0.08659294),
  (1, 0.09856516),
  (2, 0.08218045),
  (3, 0.07640578),
  (4, 0.65625566)],
 [(0, 0.88974),
  (1, 0.030692171),
  (2, 0.025476173),
  (3, 0.023628667),
  (4, 0.030463042)]]

### Visualizing 

In [20]:
lda_display = gensimvis.prepare(ldamod, corpus_fromdict, text_raw_dict)
pyLDAvis.display(lda_display)

## 3.4 Activity 3

- Preprocess the texts if you haven't already
- Run the topic model with preprocessed texts
- Play around with other parameters like `n_topics` to find a configuration that produces useful topics

If you get stuck on the preprocessing part, you can use below function and example code for applying it. Then continue as above (start with tokenizing).

In [21]:
# If gensim isn't imported yet, ensure it's available in this kernel
try:
    import gensim
    from gensim import corpora
    from gensim.models.ldamodel import LdaModel
except Exception as e:
    # In many classroom environments gensim is already installed;
    # if not, uncomment the next two lines in your own environment:
    # !pip install gensim pyLDAvis
    from gensim import corpora
    from gensim.models.ldamodel import LdaModel

# Token lists from our cleaned strings
tokens = [row.split() for row in ab['name_clean'].fillna("")]

# Build dictionary and corpus
text_dict = corpora.Dictionary(tokens)
# Filter extremes a bit (tune thresholds as needed)
text_dict.filter_extremes(no_below=5, no_above=0.5)
corpus = [text_dict.doc2bow(toks) for toks in tokens]

# Train an LDA model (try different num_topics/passes to see stability)
ldamod2 = LdaModel(
    corpus=corpus,
    num_topics=10,
    id2word=text_dict,
    passes=5,
    random_state=422,
    alpha="auto",
    eta="auto"
)

# Show top words per topic
for tid, terms in ldamod2.show_topics(num_topics=10, num_words=10, formatted=False):
    print(f"Topic {tid:02d}:", ", ".join([w for w,_ in terms]))

# Optional: inspect per-document topic mixture for first few docs
doc_topics = [ldamod2.get_document_topics(corpus[i]) for i in range(5)]
doc_topics

Topic 00: williamsburg, loft, sonder, duplex, soho, artist, rooftop, bright, downtown, stylish
Topic 01: bed, villag, east, kitchen, queen, west, full, king, bath, hell
Topic 02: east, side, upper, west, love, backyard, street, lower, perfect, access
Topic 03: park, home, central, renov, away, comfi, step, newli, townhous, chic
Topic 04: room, privat, cozi, spaciou, sunni, new, larg, close, clean, modern
Topic 05: luxuri, heart, min, time, squar, midtown, gorgeou, astoria, walk, build
Topic 06: studio, charm, amaz, garden, brownston, huge, chelsea, place, space, brand
Topic 07: bedroom, one, jfk, height, apt, lga, hill, cute, spaciou, airport
Topic 08: beauti, locat, view, great, train, best, prime, floor, entir, apt
Topic 09: near, bushwick, subway, bright, hous, suit, park, oasi, prospect, columbia


[[(0, 0.032206573),
  (1, 0.023951875),
  (2, 0.017366065),
  (3, 0.3190446),
  (4, 0.42622957),
  (5, 0.03592549),
  (6, 0.03778193),
  (7, 0.042757355),
  (8, 0.036323782),
  (9, 0.02841283)],
 [(0, 0.042872656),
  (1, 0.031792555),
  (2, 0.023140866),
  (3, 0.037196852),
  (4, 0.13543606),
  (5, 0.21255016),
  (6, 0.21440555),
  (7, 0.053099416),
  (8, 0.046352565),
  (9, 0.20315328)],
 [(0, 0.03673934),
  (1, 0.16963318),
  (2, 0.019830672),
  (3, 0.031891588),
  (4, 0.54261845),
  (5, 0.039753504),
  (6, 0.042002603),
  (7, 0.045535594),
  (8, 0.03972141),
  (9, 0.032273628)],
 [(0, 0.03673936),
  (1, 0.027244385),
  (2, 0.01983047),
  (3, 0.031875554),
  (4, 0.2582718),
  (5, 0.039753534),
  (6, 0.18392852),
  (7, 0.045584388),
  (8, 0.32449836),
  (9, 0.032273646)],
 [(0, 0.13136539),
  (1, 0.019143747),
  (2, 0.013904555),
  (3, 0.24487902),
  (4, 0.19242015),
  (5, 0.028653806),
  (6, 0.14931676),
  (7, 0.034833275),
  (8, 0.16264293),
  (9, 0.022840383)]]