In [43]:
import pickle
# gensim
from gensim import corpora, models, similarities, matutils
# sklearn
from sklearn import datasets
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans
# logging for gensim (set to INFO)
from nltk.corpus import stopwords
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [3]:
with open("zillow_df.pkl", 'rb') as picklefile: 
    df = pickle.load(picklefile)

In [4]:
df.head()

Unnamed: 0,address,city,state,zip,price,sqft,bedrooms,bathrooms,days_on_zillow,sale_type,url,description
0,20014 59th Ave SE,Snohomish,WA,98296,1868000,7359,4,6,,House For Sale,http://www.zillow.com/homes/for_sale//homedeta...,Walk-through Video: www.seetheproperty.com/250...
1,13506 70th Dr SE,Snohomish,WA,98296,674950,2957,4,3,3.0,House For Sale,http://www.zillow.com/homes/for_sale//homedeta...,Wonderful Highlands East home that has been up...
2,14313 214th St SE,Snohomish,WA,98296,546000,2237,3,3,6.0,Auction,http://www.zillow.com/homes/for_sale//homedeta...,Investor opportunity! This property is being o...
3,11621 60th Ave SE,Snohomish,WA,98296,489950,2213,4,3,4.0,House For Sale,http://www.zillow.com/homes/for_sale//homedeta...,"Newer 4 br, 2.5 bths 2-story backs to greenbel..."
4,13325 81st Ave SE,Snohomish,WA,98296,720000,3091,4,3,12.0,House For Sale,http://www.zillow.com/homes/for_sale//homedeta...,Beautiful Murray Franklin re-sale in the desir...


In [5]:
desc = df['description']

In [8]:
desc[0]

"Walk-through Video: www.seetheproperty.com/250910    Just a hop over the border to Snohomish County, but still in Northshore School District! A meandering driveway is the grand approach to this superbly-built 5832 sq ft home with 1527 sq ft guest house/apartment, that will make every covered porch-lovers heart skip a beat. Enjoy country living with elegance and charm, in a pastoral setting. A stroll around the shy 5-acre parcel reveals surprises, too, including an outdoor fireplace, sport court, and a barn equipped with chandeliers, perfect for a large gathering or for your mini-farm. The main home has 4 bedrooms + bonus + a finished room with closet (alternate bedroom). The apartment has 3 finished rooms that can be used as bedrooms. There's also a basement level workout room and storage roomExcellent for VRBO, Airbnb or potential retreat/event center.Convenient location is close to Woodinville, Canyon Park and Seattle.Just a few minutes to Costco and Woodinville's Brightwater Park, 

In [9]:
# Create a CountVectorizer for parsing/counting words
count_vectorizer = CountVectorizer(ngram_range=(1, 2),  
                                   stop_words='english', token_pattern="\\b[a-z][a-z]+\\b")
count_vectorizer.fit(desc)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='\\b[a-z][a-z]+\\b',
        tokenizer=None, vocabulary=None)

In [14]:
counts = count_vectorizer.transform(desc).transpose()

In [15]:
counts.shape

(77242, 2639)

In [16]:
# Convert sparse matrix of counts to a gensim corpus
corpus = matutils.Sparse2Corpus(counts)

In [17]:
id2word = dict((v, k) for k, v in count_vectorizer.vocabulary_.items())

In [18]:
len(id2word)

77242

In [19]:
# Create lda model (equivalent to "fit" in sklearn)
lda = models.LdaModel(corpus=corpus, num_topics=10, minimum_probability=.03, id2word=id2word, passes=10)

2018-02-28 11:43:54,861 : INFO : using symmetric alpha at 0.1
2018-02-28 11:43:54,864 : INFO : using symmetric eta at 0.1
2018-02-28 11:43:54,878 : INFO : using serial LDA version on this node
2018-02-28 11:43:58,683 : INFO : running online (multi-pass) LDA training, 10 topics, 10 passes over the supplied corpus of 2639 documents, updating model once every 2000 documents, evaluating perplexity every 2639 documents, iterating 50x with a convergence threshold of 0.001000
2018-02-28 11:43:58,740 : INFO : PROGRESS: pass 0, at document #2000/2639
2018-02-28 11:44:02,405 : INFO : merging changes from 2000 documents into a model of 2639 documents
2018-02-28 11:44:02,815 : INFO : topic #8 (0.100): 0.009*"home" + 0.006*"room" + 0.005*"new" + 0.004*"kitchen" + 0.004*"large" + 0.003*"great" + 0.003*"property" + 0.003*"bath" + 0.003*"master" + 0.003*"living"
2018-02-28 11:44:02,818 : INFO : topic #0 (0.100): 0.005*"room" + 0.005*"home" + 0.004*"property" + 0.003*"large" + 0.003*"located" + 0.003*"

2018-02-28 11:44:17,082 : INFO : topic #9 (0.100): 0.004*"new" + 0.003*"water" + 0.002*"great" + 0.002*"street" + 0.002*"home" + 0.002*"views" + 0.002*"garage" + 0.002*"seattle" + 0.002*"gas" + 0.002*"lot"
2018-02-28 11:44:17,085 : INFO : topic #0 (0.100): 0.004*"home" + 0.003*"lots" + 0.003*"property" + 0.003*"lot" + 0.003*"great" + 0.003*"lake" + 0.002*"located" + 0.002*"private" + 0.002*"acres" + 0.002*"access"
2018-02-28 11:44:17,090 : INFO : topic diff=0.391952, rho=0.481153
2018-02-28 11:44:17,153 : INFO : PROGRESS: pass 3, at document #2000/2639
2018-02-28 11:44:18,984 : INFO : merging changes from 2000 documents into a model of 2639 documents
2018-02-28 11:44:19,324 : INFO : topic #6 (0.100): 0.013*"room" + 0.009*"home" + 0.006*"master" + 0.006*"kitchen" + 0.006*"living" + 0.005*"large" + 0.005*"bath" + 0.005*"floor" + 0.004*"bedroom" + 0.004*"garage"
2018-02-28 11:44:19,326 : INFO : topic #8 (0.100): 0.011*"home" + 0.008*"new" + 0.007*"room" + 0.006*"kitchen" + 0.005*"large" +

2018-02-28 11:44:30,797 : INFO : topic #1 (0.100): 0.010*"room" + 0.007*"kitchen" + 0.006*"home" + 0.006*"master" + 0.005*"bath" + 0.004*"main" + 0.004*"suite" + 0.004*"floor" + 0.003*"open" + 0.003*"great"
2018-02-28 11:44:30,800 : INFO : topic #8 (0.100): 0.012*"home" + 0.010*"new" + 0.007*"room" + 0.006*"kitchen" + 0.006*"large" + 0.004*"living" + 0.004*"great" + 0.004*"bath" + 0.004*"garage" + 0.003*"area"
2018-02-28 11:44:30,804 : INFO : topic diff=0.157845, rho=0.369623
2018-02-28 11:44:30,868 : INFO : PROGRESS: pass 6, at document #2000/2639
2018-02-28 11:44:32,536 : INFO : merging changes from 2000 documents into a model of 2639 documents
2018-02-28 11:44:32,891 : INFO : topic #8 (0.100): 0.011*"home" + 0.009*"new" + 0.007*"room" + 0.006*"kitchen" + 0.006*"large" + 0.004*"living" + 0.004*"great" + 0.004*"bath" + 0.003*"garage" + 0.003*"master"
2018-02-28 11:44:32,894 : INFO : topic #6 (0.100): 0.014*"room" + 0.010*"home" + 0.006*"master" + 0.006*"kitchen" + 0.006*"large" + 0.00

2018-02-28 11:44:44,172 : INFO : topic #8 (0.100): 0.012*"home" + 0.011*"new" + 0.007*"room" + 0.007*"kitchen" + 0.006*"large" + 0.004*"living" + 0.004*"great" + 0.004*"bath" + 0.004*"garage" + 0.003*"area"
2018-02-28 11:44:44,175 : INFO : topic #2 (0.100): 0.009*"home" + 0.005*"large" + 0.004*"great" + 0.003*"new" + 0.003*"room" + 0.003*"neighborhood" + 0.002*"bedrooms" + 0.002*"close" + 0.002*"yard" + 0.002*"love"
2018-02-28 11:44:44,178 : INFO : topic diff=0.115551, rho=0.311294
2018-02-28 11:44:44,243 : INFO : PROGRESS: pass 9, at document #2000/2639
2018-02-28 11:44:45,842 : INFO : merging changes from 2000 documents into a model of 2639 documents
2018-02-28 11:44:46,205 : INFO : topic #0 (0.100): 0.003*"lots" + 0.003*"lake" + 0.002*"home" + 0.002*"great" + 0.002*"property" + 0.002*"located" + 0.002*"private" + 0.002*"views" + 0.002*"access" + 0.001*"lot"
2018-02-28 11:44:46,207 : INFO : topic #5 (0.100): 0.012*"home" + 0.010*"washington" + 0.010*"located" + 0.009*"square" + 0.009

In [20]:
lda.print_topics()

2018-02-28 11:45:16,539 : INFO : topic #0 (0.100): 0.003*"lots" + 0.003*"lake" + 0.002*"home" + 0.002*"great" + 0.002*"property" + 0.002*"located" + 0.002*"private" + 0.001*"lot" + 0.001*"views" + 0.001*"access"
2018-02-28 11:45:16,543 : INFO : topic #1 (0.100): 0.010*"room" + 0.007*"kitchen" + 0.006*"home" + 0.006*"master" + 0.005*"bath" + 0.004*"main" + 0.004*"suite" + 0.004*"floor" + 0.003*"open" + 0.003*"great"
2018-02-28 11:45:16,547 : INFO : topic #2 (0.100): 0.009*"home" + 0.005*"large" + 0.004*"great" + 0.003*"new" + 0.003*"room" + 0.003*"neighborhood" + 0.002*"close" + 0.002*"bedrooms" + 0.002*"yard" + 0.002*"love"
2018-02-28 11:45:16,550 : INFO : topic #3 (0.100): 0.012*"lot" + 0.011*"property" + 0.008*"home" + 0.007*"acres" + 0.006*"water" + 0.006*"build" + 0.005*"acre" + 0.005*"land" + 0.004*"building" + 0.004*"lots"
2018-02-28 11:45:16,554 : INFO : topic #4 (0.100): 0.023*"properties" + 0.020*"auction" + 0.017*"com" + 0.017*"sale" + 0.017*"auction com" + 0.015*"property" +

[(0,
  '0.003*"lots" + 0.003*"lake" + 0.002*"home" + 0.002*"great" + 0.002*"property" + 0.002*"located" + 0.002*"private" + 0.001*"lot" + 0.001*"views" + 0.001*"access"'),
 (1,
  '0.010*"room" + 0.007*"kitchen" + 0.006*"home" + 0.006*"master" + 0.005*"bath" + 0.004*"main" + 0.004*"suite" + 0.004*"floor" + 0.003*"open" + 0.003*"great"'),
 (2,
  '0.009*"home" + 0.005*"large" + 0.004*"great" + 0.003*"new" + 0.003*"room" + 0.003*"neighborhood" + 0.002*"close" + 0.002*"bedrooms" + 0.002*"yard" + 0.002*"love"'),
 (3,
  '0.012*"lot" + 0.011*"property" + 0.008*"home" + 0.007*"acres" + 0.006*"water" + 0.006*"build" + 0.005*"acre" + 0.005*"land" + 0.004*"building" + 0.004*"lots"'),
 (4,
  '0.023*"properties" + 0.020*"auction" + 0.017*"com" + 0.017*"sale" + 0.017*"auction com" + 0.015*"property" + 0.008*"details" + 0.008*"foreclosure" + 0.006*"opportunity" + 0.006*"search"'),
 (5,
  '0.012*"home" + 0.009*"located" + 0.009*"washington" + 0.009*"square" + 0.009*"foot" + 0.008*"square foot" + 0.007*

I want to remove some of the over used or meaningless words like "great", "lots", "home", "love", "room", "ft", "foot", "square foot", "com", "search", "details", "large", "perfect", "main", "area", "space", "floor", "located", "beautiful", "enjoy", "one", "easy", "location", "property", "properties", "throughout", "including", "features", "also", "many", "offer", "available", "like", "currently", "ready", "time", "bonus", "come", "information", "calisto", "less", "selected", "westridge", "square", "found", "see", "additional", "homes", "find", "sale", "sales", "design".

In [125]:
nltk_stopwords = stopwords.words("english")+["great", "lots", "home", "love", "room", "ft",
                                             "foot", "square foot", "com", "search", "details",
                                             "large", "perfect", "main", "area", "space", "floor",
                                             "located", "beautiful", "enjoy", "one", "easy",
                                             "location", "property", "properties", "throughout",
                                             "including", "features","also", "many", "offer",
                                             "available", "like", "currently", "ready", "time",
                                             "bonus", "come", "information", "calisto", "less",
                                             "selected", "westridge", "square", "found", "see",
                                             "additional", "homes", "find","sale", "sales",
                                             "design"]

In [126]:
count_vectorizer = CountVectorizer(ngram_range=(1, 2), 
                                   stop_words=nltk_stopwords,
                                   token_pattern="\\b[a-z][a-z]+\\b")
count_vectorizer.fit(desc)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), preprocessor=None,
        stop_words=['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs',...d', 'westridge', 'square', 'found', 'see', 'additional', 'homes', 'find', 'sale', 'sales', 'design'],
        strip_accents=None, token_pattern='\\b[a-z][a-z]+\\b',
        tokenizer=None, vocabulary=None)

In [127]:
counts = count_vectorizer.transform(desc).transpose()

In [128]:
counts.shape

(76717, 2639)

In [129]:
# Convert sparse matrix of counts to a gensim corpus
corpus = matutils.Sparse2Corpus(counts)

In [130]:
id2word = dict((v, k) for k, v in count_vectorizer.vocabulary_.items())

In [131]:
len(id2word)

76717

In [132]:
# Create lda model (equivalent to "fit" in sklearn)
lda = models.LdaModel(corpus=corpus, num_topics=6, minimum_probability=.03, id2word=id2word, passes=10)

2018-02-28 14:17:40,959 : INFO : using symmetric alpha at 0.16666666666666666
2018-02-28 14:17:40,963 : INFO : using symmetric eta at 0.16666666666666666
2018-02-28 14:17:40,984 : INFO : using serial LDA version on this node
2018-02-28 14:17:43,187 : INFO : running online (multi-pass) LDA training, 6 topics, 10 passes over the supplied corpus of 2639 documents, updating model once every 2000 documents, evaluating perplexity every 2639 documents, iterating 50x with a convergence threshold of 0.001000
2018-02-28 14:17:43,237 : INFO : PROGRESS: pass 0, at document #2000/2639
2018-02-28 14:17:46,591 : INFO : merging changes from 2000 documents into a model of 2639 documents
2018-02-28 14:17:46,998 : INFO : topic #3 (0.167): 0.004*"new" + 0.004*"kitchen" + 0.003*"lot" + 0.003*"master" + 0.003*"open" + 0.003*"views" + 0.002*"living" + 0.002*"bedroom" + 0.002*"garage" + 0.002*"bath"
2018-02-28 14:17:47,000 : INFO : topic #4 (0.167): 0.003*"new" + 0.003*"kitchen" + 0.003*"lot" + 0.002*"lake" +

2018-02-28 14:18:02,014 : INFO : topic #3 (0.167): 0.009*"new" + 0.005*"kitchen" + 0.004*"garage" + 0.004*"master" + 0.003*"bath" + 0.003*"living" + 0.003*"appliances" + 0.003*"lot" + 0.003*"bedroom" + 0.003*"open"
2018-02-28 14:18:02,018 : INFO : topic #1 (0.167): 0.013*"auction" + 0.006*"opportunity" + 0.006*"lot" + 0.005*"foreclosure" + 0.004*"scheduled" + 0.003*"view" + 0.003*"build" + 0.003*"county" + 0.003*"value" + 0.002*"residential"
2018-02-28 14:18:02,025 : INFO : topic diff=0.353960, rho=0.481153
2018-02-28 14:18:02,080 : INFO : PROGRESS: pass 3, at document #2000/2639
2018-02-28 14:18:03,840 : INFO : merging changes from 2000 documents into a model of 2639 documents
2018-02-28 14:18:04,761 : INFO : topic #1 (0.167): 0.012*"auction" + 0.005*"lot" + 0.005*"opportunity" + 0.005*"foreclosure" + 0.003*"scheduled" + 0.003*"view" + 0.003*"build" + 0.003*"county" + 0.002*"acre" + 0.002*"residential"
2018-02-28 14:18:04,764 : INFO : topic #5 (0.167): 0.006*"kitchen" + 0.006*"master"

2018-02-28 14:18:16,500 : INFO : topic #0 (0.167): 0.005*"kitchen" + 0.004*"living" + 0.004*"master" + 0.003*"bath" + 0.003*"garage" + 0.003*"views" + 0.003*"walk" + 0.003*"new" + 0.002*"custom" + 0.002*"car"
2018-02-28 14:18:16,502 : INFO : topic #2 (0.167): 0.005*"family" + 0.005*"bedrooms" + 0.004*"washington" + 0.004*"new" + 0.003*"bathrooms" + 0.003*"bedrooms bathrooms" + 0.003*"living" + 0.003*"single" + 0.003*"single family" + 0.003*"kitchen"
2018-02-28 14:18:16,505 : INFO : topic #3 (0.167): 0.009*"new" + 0.005*"kitchen" + 0.004*"garage" + 0.003*"master" + 0.003*"bath" + 0.003*"appliances" + 0.003*"living" + 0.003*"bedroom" + 0.003*"lot" + 0.002*"close"
2018-02-28 14:18:16,510 : INFO : topic diff=0.171173, rho=0.369623
2018-02-28 14:18:16,564 : INFO : PROGRESS: pass 6, at document #2000/2639
2018-02-28 14:18:18,181 : INFO : merging changes from 2000 documents into a model of 2639 documents
2018-02-28 14:18:18,555 : INFO : topic #5 (0.167): 0.007*"kitchen" + 0.006*"master" + 0.0

2018-02-28 14:18:28,633 : INFO : topic #1 (0.167): 0.013*"auction" + 0.006*"lot" + 0.006*"foreclosure" + 0.006*"opportunity" + 0.004*"scheduled" + 0.003*"view" + 0.003*"build" + 0.003*"county" + 0.003*"residential" + 0.003*"value"
2018-02-28 14:18:28,636 : INFO : topic #4 (0.167): 0.005*"lot" + 0.004*"water" + 0.004*"acres" + 0.003*"land" + 0.003*"view" + 0.003*"build" + 0.002*"lake" + 0.002*"building" + 0.002*"level" + 0.002*"site"
2018-02-28 14:18:28,639 : INFO : topic #5 (0.167): 0.007*"kitchen" + 0.007*"master" + 0.006*"bath" + 0.005*"living" + 0.004*"walk" + 0.004*"dining" + 0.004*"open" + 0.004*"suite" + 0.003*"bedroom" + 0.003*"fireplace"
2018-02-28 14:18:28,642 : INFO : topic #2 (0.167): 0.005*"family" + 0.005*"bedrooms" + 0.004*"washington" + 0.003*"bathrooms" + 0.003*"new" + 0.003*"bedrooms bathrooms" + 0.003*"single" + 0.003*"living" + 0.003*"single family" + 0.003*"kitchen"
2018-02-28 14:18:28,646 : INFO : topic diff=0.130167, rho=0.311294
2018-02-28 14:18:28,702 : INFO : P

In [133]:
lda.print_topics()

2018-02-28 14:18:32,617 : INFO : topic #0 (0.167): 0.005*"kitchen" + 0.004*"living" + 0.004*"master" + 0.003*"bath" + 0.003*"garage" + 0.003*"views" + 0.003*"walk" + 0.003*"custom" + 0.002*"private" + 0.002*"car"
2018-02-28 14:18:32,621 : INFO : topic #1 (0.167): 0.013*"auction" + 0.006*"lot" + 0.006*"foreclosure" + 0.006*"opportunity" + 0.004*"scheduled" + 0.003*"view" + 0.003*"build" + 0.003*"county" + 0.003*"residential" + 0.003*"value"
2018-02-28 14:18:32,624 : INFO : topic #2 (0.167): 0.005*"family" + 0.005*"bedrooms" + 0.004*"washington" + 0.004*"bathrooms" + 0.003*"new" + 0.003*"bedrooms bathrooms" + 0.003*"single" + 0.003*"living" + 0.003*"single family" + 0.003*"kitchen"
2018-02-28 14:18:32,628 : INFO : topic #3 (0.167): 0.010*"new" + 0.005*"kitchen" + 0.004*"garage" + 0.003*"master" + 0.003*"bath" + 0.003*"appliances" + 0.003*"living" + 0.003*"bedroom" + 0.003*"lot" + 0.002*"floors"
2018-02-28 14:18:32,632 : INFO : topic #4 (0.167): 0.005*"lot" + 0.004*"water" + 0.004*"acres"

[(0,
  '0.005*"kitchen" + 0.004*"living" + 0.004*"master" + 0.003*"bath" + 0.003*"garage" + 0.003*"views" + 0.003*"walk" + 0.003*"custom" + 0.002*"private" + 0.002*"car"'),
 (1,
  '0.013*"auction" + 0.006*"lot" + 0.006*"foreclosure" + 0.006*"opportunity" + 0.004*"scheduled" + 0.003*"view" + 0.003*"build" + 0.003*"county" + 0.003*"residential" + 0.003*"value"'),
 (2,
  '0.005*"family" + 0.005*"bedrooms" + 0.004*"washington" + 0.004*"bathrooms" + 0.003*"new" + 0.003*"bedrooms bathrooms" + 0.003*"single" + 0.003*"living" + 0.003*"single family" + 0.003*"kitchen"'),
 (3,
  '0.010*"new" + 0.005*"kitchen" + 0.004*"garage" + 0.003*"master" + 0.003*"bath" + 0.003*"appliances" + 0.003*"living" + 0.003*"bedroom" + 0.003*"lot" + 0.002*"floors"'),
 (4,
  '0.005*"lot" + 0.004*"water" + 0.004*"acres" + 0.003*"land" + 0.003*"view" + 0.003*"build" + 0.002*"lake" + 0.002*"building" + 0.002*"level" + 0.002*"site"'),
 (5,
  '0.007*"kitchen" + 0.007*"master" + 0.006*"bath" + 0.005*"living" + 0.004*"walk" 