In [1]:
from sklearn.externals import joblib
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import NMF, LatentDirichletAllocation

import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')
color = sns.color_palette()

In [2]:
def display_topics(model, feature_names, n_top_words):
    """Print and collect the highest-weighted words of every topic in ``model``.

    Parameters
    ----------
    model : object with a ``components_`` attribute
        ``components_`` is iterated row by row; each row is treated as the
        per-word weights of one topic.
    feature_names : sequence
        Maps a weight's position within a row to the word it belongs to.
    n_top_words : int
        Number of largest-weight words reported for each topic.

    Returns
    -------
    list of str
        One space-joined string of top words per topic, in topic order.
        The topic header and the same string are also printed.
    """
    summaries = []
    for number, weights in enumerate(model.components_):
        # argsort is ascending: flip it, then keep the leading n_top_words positions.
        ranked_positions = weights.argsort()[::-1][:n_top_words]
        summary = " ".join(feature_names[pos] for pos in ranked_positions)
        print("Topic %d:" % (number))
        print(summary)
        summaries.append(summary)
    return summaries

In [3]:
proptype = 'CON'

# Load the pickled train and test objects for the selected property type.
# NOTE(review): pickle.load can run arbitrary code -- only read files from a trusted source.
train_pkl_path = 'data/features/{}_feats_remarks.pkl'.format(proptype)
with open(train_pkl_path, 'rb') as train_fh:
    feats_train = pickle.load(train_fh)

test_pkl_path = 'data/features/TEST_{}_feats_remarks.pkl'.format(proptype)
with open(test_pkl_path, 'rb') as test_fh:
    feats_test = pickle.load(test_fh)

In [4]:
# Columns whose name contains 'lda_' or 'nmf_' are removed from both frames
# (in place), then the resulting shapes are reported.
rmk_features = [
    col for col in feats_train.columns
    if any(tag in col for tag in ('lda_', 'nmf_'))
]
for frame in (feats_train, feats_test):
    frame.drop(columns=rmk_features, inplace=True)

print('feats_train shape:', feats_train.shape)
print('feats_test shape:', feats_test.shape)

feats_train shape: (40936, 2086)
feats_test shape: (3563, 2086)


In [5]:
# merge remarks with all other features
# Only the join key and the free-text column are read from each CSV.
remark_cols = ['MLSNUM', 'REMARKS']
train_csv = 'data/merge_sold_data/MERGE_{}_SOLD.csv'.format(proptype)

if proptype == 'SF':
    rmk_train = pd.read_csv(train_csv, usecols=remark_cols)
    rmk_test = pd.read_csv('data/test_set/MERGE_TEST.csv', usecols=remark_cols)
else:
    # NOTE(review): only the train file forces MLSNUM to str here; the test file
    # is read with inferred dtypes -- confirm this matches the key dtype in feats_test.
    rmk_train = pd.read_csv(train_csv, usecols=remark_cols, dtype={'MLSNUM': str})
    test_csv = 'data/test_set/TEST_{}.csv'.format(proptype)
    rmk_test = pd.read_csv(test_csv, usecols=remark_cols)

# Inner joins on MLSNUM: rows without a match on either side are dropped.
feats_rmk_train = feats_train.merge(rmk_train, on='MLSNUM', how='inner')
feats_rmk_test = feats_test.merge(rmk_test, on='MLSNUM', how='inner')

print('TRAIN: shape(rmk + feats):', feats_rmk_train.shape)
print('TEST: shape(rmk + feats):', feats_rmk_test.shape)

TRAIN: shape(rmk + feats): (40936, 2087)
TEST: shape(rmk + feats): (3563, 2087)


In [6]:
# fill missing remarks
# Assign the filled column back rather than calling fillna(inplace=True) on the
# column selection: the in-place form operates on an intermediate Series, and
# pandas with Copy-on-Write does not propagate that change to the parent frame
# (the accompanying warning would be hidden by warnings.filterwarnings('ignore')).
feats_rmk_train['REMARKS'] = feats_rmk_train['REMARKS'].fillna('')
feats_rmk_test['REMARKS'] = feats_rmk_test['REMARKS'].fillna('')
# Displayed as the cell result: number of REMARKS values still null in (train, test).
sum(feats_rmk_train.REMARKS.isnull()), sum(feats_rmk_test.REMARKS.isnull())

(0, 0)

In [7]:
# Process remarks
# Each REMARKS value becomes one document; train and test are kept as separate lists.
doc_train, doc_test = (
    list(frame['REMARKS']) for frame in (feats_rmk_train, feats_rmk_test)
)
n_docs_train = len(doc_train)
n_docs_test = len(doc_test)
print('TRAIN: Total {} remarks (documents)'.format(n_docs_train))
print('TEST: Total {} remarks (documents)'.format(n_docs_test))

# read the NLTK listed stop_words
# One entry per line of the file, with surrounding whitespace stripped. The
# context manager closes the file handle as soon as the list is built (the
# previous bare open() call left it open for the life of the kernel).
with open('nltk_stop_words.txt') as f:
    stop_words = [word.strip() for word in f]

print('Total {} stop_words'.format(len(stop_words)))

# transform documents to tfidf for NMF
# The vectorizer is fitted on the training documents only; the test documents
# are projected with that fitted vocabulary.
tfidf_vectorizer = TfidfVectorizer(stop_words=stop_words)
tfidf_train = tfidf_vectorizer.fit_transform(doc_train)
tfidf_test = tfidf_vectorizer.transform(doc_test)
tfidf_feature_names = tfidf_vectorizer.get_feature_names()

# transform documents to raw term counts for LDA
# Same train-fit / test-transform split as above.
tf_vectorizer = CountVectorizer(stop_words=stop_words)
tf_train = tf_vectorizer.fit_transform(doc_train)
tf_test = tf_vectorizer.transform(doc_test)
tf_feature_names = tf_vectorizer.get_feature_names()

train_shapes = (tfidf_train.shape, tf_train.shape)
test_shapes = (tfidf_test.shape, tf_test.shape)
print('transformed tfidf_train.shape = {}, tf_train.shape = {}'.format(*train_shapes))
print('transformed tfidf_test.shape = {}, tf_test.shape = {}'.format(*test_shapes))

TRAIN: Total 40936 remarks (documents)
TEST: Total 3563 remarks (documents)
Total 127 stop_words
transformed tfidf_train.shape = (40936, 23031), tf_train.shape = (40936, 23031)
transformed tfidf_test.shape = (3563, 23031), tf_test.shape = (3563, 23031)


In [None]:
# tfidf_vocabulary = tfidf_vectorizer.vocabulary_
# tf_vocabulary = tf_vectorizer.vocabulary_

In [8]:
n_components = 10

# Run NMF
# Fitted on tfidf_train; both splits are then projected with the fitted model.
nmf = NMF(n_components=n_components, init='nndsvd', alpha=.1, l1_ratio=.5, random_state=9001)
nmf.fit(tfidf_train)
nmf_train = nmf.transform(tfidf_train)
nmf_test = nmf.transform(tfidf_test)

# Run LDA
# Fitted on tf_train; both splits are then projected with the fitted model.
lda = LatentDirichletAllocation(
    n_components=n_components,
    learning_method='online',
    learning_offset=50.,
    random_state=9001,
)
lda.fit(tf_train)
lda_train = lda.transform(tf_train)
lda_test = lda.transform(tf_test)

# Print top topics
# display_topics prints each topic and returns the joined top-word strings.
n_top_words = 6
print('=== NMF top topics ===')
nmf_topics = display_topics(nmf, tfidf_feature_names, n_top_words)
print('\n=== LDA top topics ===')
lda_topics = display_topics(lda, tf_feature_names, n_top_words)

=== NMF top topics ===
Topic 0:
master closet suite walk custom bath
Topic 1:
unit great condo location close well
Topic 2:
boston building street high parking square
Topic 3:
new brand flooring painted updated appliances
Topic 4:
pool tennis room courts amenities club
Topic 5:
community baths today homes call appointment
Topic 6:
room level family home living finished
Topic 7:
water fee hot heat condo includes
Topic 8:
house 12 30 sunday 00 open
Topic 9:
floor first second bath full two

=== LDA top topics ===
Topic 0:
views building concierge center boston fitness
Topic 1:
boston views home square city restaurants
Topic 2:
room floor bath living master kitchen
Topic 3:
charles park longwood mall score brookline
Topic 4:
buyer property seller offers condo owner
Topic 5:
new custom appliances stainless high lighting
Topic 6:
floor condo community unit new units
Topic 7:
davis finish gated lofts arboretum loft
Topic 8:
unit kitchen condo new bedroom room
Topic 9:
minutes mass 495 harvar

In [9]:
# Build dataframe for remark topics
# One column per topic component, named nmf_<i> / lda_<i>.
nmf_cols = list(map('nmf_{}'.format, range(nmf_train.shape[1])))
lda_cols = list(map('lda_{}'.format, range(lda_train.shape[1])))

# Reuse the row index of the merged feature frames for the topic frames.
train_index = feats_rmk_train.index
test_index = feats_rmk_test.index

df_nmf_train = pd.DataFrame(data=nmf_train, columns=nmf_cols, index=train_index)
df_lda_train = pd.DataFrame(data=lda_train, columns=lda_cols, index=train_index)
df_nmf_test = pd.DataFrame(data=nmf_test, columns=nmf_cols, index=test_index)
df_lda_test = pd.DataFrame(data=lda_test, columns=lda_cols, index=test_index)

In [10]:
# Merge dataframes
# Column-wise concatenation: original features, then NMF topics, then LDA topics.
train_parts = [feats_rmk_train, df_nmf_train, df_lda_train]
test_parts = [feats_rmk_test, df_nmf_test, df_lda_test]
feats_rmk_transformed_train = pd.concat(train_parts, axis=1)
feats_rmk_transformed_test = pd.concat(test_parts, axis=1)

train_shape = feats_rmk_transformed_train.shape
test_shape = feats_rmk_transformed_test.shape
print('TRAIN: shape(feats + transfromed rmk): ', train_shape)
print('TEST: shape(feats + transfromed rmk): ', test_shape)

TRAIN: shape(feats + transfromed rmk):  (40936, 2107)
TEST: shape(feats + transfromed rmk):  (3563, 2107)


In [11]:
# Save
# Write the combined train and test frames as pickles named after the property type.
train_out_path = 'data/features/TRAIN_{}.pkl'.format(proptype)
test_out_path = 'data/features/TEST_{}.pkl'.format(proptype)
feats_rmk_transformed_train.to_pickle(train_out_path)
feats_rmk_transformed_test.to_pickle(test_out_path)