In [29]:
import pandas as pd
import numpy as np

import requests
from bs4 import BeautifulSoup as bsp
import selenium as sel
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains

from random import randint
import re
import datetime as dt

from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.tag import pos_tag
from nltk.corpus import stopwords
from textblob import TextBlob

import string

import seaborn as sns

import matplotlib
from matplotlib import pyplot as plt

import pickle

from time import sleep, time

from collections import defaultdict

import gensim
from gensim import corpora, models, similarities, matutils

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv2D, MaxPooling2D, Flatten, GlobalAveragePooling2D, InputLayer
from tensorflow.keras.preprocessing import image_dataset_from_directory
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.preprocessing.image import load_img
from tensorflow.keras.utils import to_categorical

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.manifold import MDS
import sklearn.datasets as skdt
from sklearn.metrics.pairwise import manhattan_distances, euclidean_distances
from sklearn.decomposition import PCA

from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import KFold

from sklearn.linear_model import Ridge
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import Lasso
from sklearn.linear_model import LassoCV
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import ElasticNetCV

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import (cross_val_score, train_test_split, 
                                     KFold, GridSearchCV)
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor


%run CHROMEDRIVER_DATA.ipynb

In [None]:
# Search-results URL prefix; RealtorCrawlerLv1 appends the page number to it.
crawl_lv_1='https://www.realtor.com/realestateandhomes-search/New-York_NY/show-recently-sold/pg-'
# Site root; RealtorCrawlerLv2 appends each collected listing path to it.
crawl_lv_2='https://www.realtor.com'

In [None]:
def soft_cluster(tokens):
    """Score one listing's tokens against every LDA topic.

    ``tokens`` is the string form of a token list; it is split back into
    tokens with ``listify``. For each token found in the vocabulary
    ``xtffn``, that token's weight in every row of ``topic_matrix1`` is added
    to the matching topic total.

    Returns:
        dict mapping ``'lda_topic<i>'`` to the accumulated weight, one key per
        row of ``topic_matrix1``.
    """
    # One token -> column lookup per call replaces the `in` + `.index` pair,
    # which scanned the whole vocabulary twice for every token.
    # setdefault keeps the first position, matching list.index semantics.
    column_of = {}
    for position, feature in enumerate(xtffn):
        column_of.setdefault(feature, position)

    # Keys come from the topic matrix itself so they can never disagree with
    # the rows accumulated below (the original used a separate global count).
    cluster_score = {f'lda_topic{topic}': 0 for topic in range(len(topic_matrix1))}

    for token in listify(tokens):
        tok_num = column_of.get(token)
        if tok_num is None:
            continue  # token is not part of the fitted vocabulary
        for topic, weights in enumerate(topic_matrix1):
            cluster_score[f'lda_topic{topic}'] += weights[tok_num]
    return cluster_score


def unicodify(to_uni):
    r"""Spell every character of ``to_uni`` as a literal ``\uXXXX`` escape.

    The code point is written as upper-case hex, zero-padded to at least four
    digits, and the pieces are concatenated in order.
    """
    escaped = []
    for character in to_uni:
        escaped.append(r'\u{:04X}'.format(ord(character)))
    return ''.join(escaped)

def listify(to_listify, uni=False):
    """Recover the single-quoted items from the string form of a list.

    Args:
        to_listify: text such as ``"['a', 'b']"``.
        uni: when truthy, each recovered item is passed through ``unicodify``.

    Returns:
        list of the quoted substrings, in order of appearance.
    """
    quoted = re.findall("'(.+?)'", to_listify)
    if not uni:
        return quoted
    return [unicodify(item) for item in quoted]

def getURL(n):
    """Return the absolute URL for the ``n``-th entry of the global ``links``.

    NOTE(review): ``basic_link`` and ``links`` are not assigned in the visible
    cells; confirm where they come from before relying on this helper.
    """
    relative = links[n]
    return basic_link + relative

def clear_cookie_profile(cookienum):
    """Reset cookie profile ``cookienum`` by overwriting its pickle with ''.

    Always returns True. The with-block closes the file on exit.
    """
    profile_path = f'cookie_folder/cookies{cookienum}.pkl'
    with open(profile_path, 'wb') as profile_file:
        pickle.dump('', profile_file)
    return True

def sel(url, sleeptime=0, use_chrome=False, cookienum=0):
    """Load ``url`` in a real browser and return the page source.

    Args:
        url: page to open.
        sleeptime: number of small scroll steps to perform after loading
            (each step scrolls a further 2-4 pixels; no actual sleeping).
        use_chrome: use Chrome (driver at ``chrome_path``) instead of Safari.
        cookienum: which ``cookie_folder/cookies<n>.pkl`` profile to restore
            before loading and to overwrite afterwards.

    Returns:
        The HTML source of the loaded page.

    NOTE(review): this function shadows the ``import selenium as sel`` alias
    from the import cell. ``chrome_path`` is not assigned in this file's
    visible cells - presumably it comes from ``%run CHROMEDRIVER_DATA.ipynb``.
    """
    cookie_path = f'cookie_folder/cookies{cookienum}.pkl'

    if use_chrome:
        browser = webdriver.Chrome(chrome_path)
    else:
        browser = webdriver.Safari()

    try:
        #Make different cookie profiles - realtor.com tracks you partially based on cookies
        try:
            with open(cookie_path, 'rb') as cookie_file:
                cookies = pickle.load(cookie_file)
            for cookie in cookies:
                browser.add_cookie(cookie)
            print(f'({cookienum})', end='.')
        except Exception:
            # Restoring cookies stays best-effort: a missing/empty profile or
            # a cookie the driver rejects must not abort the crawl.
            # NOTE(review): cookies are added before any page is loaded, which
            # drivers may refuse - confirm profiles are actually restored.
            pass

        browser.get(url)
        scroller = 0
        for _ in range(sleeptime):
            scroller += randint(2, 4)
            browser.execute_script(f'window.scrollTo(0,{scroller})')
        code = browser.page_source

        with open(cookie_path, 'wb') as cookie_file:
            pickle.dump(browser.get_cookies(), cookie_file)
        return code
    finally:
        # Always end the session, even when loading or saving fails, so a
        # failed page does not leave a browser/driver process behind.
        browser.quit()

def req(url):
    """GET ``url`` with a generic browser User-Agent and return the response object."""
    return requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})

def soupIt(code):
    """Parse ``code`` with BeautifulSoup's built-in 'html.parser' and return the soup."""
    soup = bsp(code, 'html.parser')
    return soup

def getAllPages(pages):
    """Fetch result pages 1..``pages`` and return their HTML sources.

    Each URL is ``listing_page`` followed by the page number and is loaded
    through ``sel``.

    Returns:
        list of page sources in page order (the original built this list but
        never returned it, so callers always received None).

    NOTE(review): ``listing_page`` is not assigned in the visible cells.
    """
    return [sel(listing_page + str(page)) for page in range(1, pages + 1)]


def create_house_dict(html):
    """Extract listing attributes from the HTML source of a detail page.

    Every field is located with a regular expression. A field with no match
    is reported as the string ``'N/A'``; otherwise the first match is used.

    Args:
        html: page source as a string.

    Returns:
        dict with keys beds, baths, price, description, address, sqft,
        sale_date, lot_size, year_built, stories, rooms, property_type,
        neighborhood and borough. ``rooms`` keeps its ``'Total Rooms:'``
        prefix because that pattern has no capture group.
    """
    house={}

    house['beds']=re.findall('([0-9.]+)[ ]?bed',html)
    house['baths']=re.findall('([0-9.]+)[ ]?bath',html)
    # Only matches containing a '$' carry a price; skipping the others avoids
    # the IndexError that split('$')[1] raised on e.g. 'Last Sold for 500'.
    house['price']=[
        entry.replace(',','').split('$')[1].strip('"')
        for entry in re.findall('Last Sold for.{0,10}[0-9,.]+',html)
        if '$' in entry
    ]
    house['description']=re.findall('Property Overview(.+?)</p>',html,re.DOTALL)

    address_blocks=re.findall('<span itemprop="streetAddress">.{0,500}?postalCode.{0,50}?</span>',html,re.DOTALL)
    if address_blocks:
        # flags must be passed by keyword: the 4th positional argument of
        # re.sub is `count`, so the old calls silently meant count=16 and
        # stopped stripping tags / collapsing whitespace after 16 hits.
        without_tags=re.sub('<.+?>','',address_blocks[0],flags=re.DOTALL)
        house['address']=[re.sub('[ \n]+',' ',without_tags)]
    else:
        house['address']=['N/A']

    house['sqft']=[entry.replace(',','') for entry in re.findall('<li data-label="property-meta-sqft">\n      <span class="data-value">([0-9,.]+)</span> sq ft',html,re.DOTALL)]
    house['sale_date']=re.findall('Sold on ([A-Z][a-z]{0,15}.{0,2}[0-9]{1,2}.{0,2}[0-9]{4})',html)
    house['lot_size']=re.findall('Lot Size Square Feet:.?([0-9]{0,6})',html)
    house['year_built']=re.findall('Year Built:.?([0-9]{0,4})',html)
    house['stories']=re.findall('Stories:.?([0-9]{0,3})',html)
    house['rooms']=re.findall('Total Rooms:.?[0-9]{0,3}',html)
    house['property_type']=re.findall('<div>Type</div>.+?data-original-title=.+?>(.+?)</div>',html,re.DOTALL)
    house['neighborhood']=re.findall('is located in <.+?>([A-Za-z]+)<.+?>',html,re.DOTALL)
    house['borough']=re.findall('neighborhood in the city of <.+?>([A-Za-z]+?, NY)<.+?>',html,re.DOTALL)

    #Check public records if no other info available
    if len(house['property_type'])==0:
        house['property_type']=re.findall('Property type: ([A-Za-z]+)',html,re.DOTALL)
    if len(house['year_built'])==0:
        house['year_built']=re.findall('Year built: ([0-9]+)',html,re.DOTALL)

    # Collapse each match list to its first element, or 'N/A' when empty.
    for key, matches in house.items():
        house[key]=matches[0] if len(matches)!=0 else 'N/A'
    return house

    
def startTimer():
    """Store the current system time in the global ``start`` and print it."""
    global start
    now = time()
    start = now
    print('Timer started at system time', now)
    
def timeSince():
    """Print the seconds elapsed since the global ``start`` set by ``startTimer``."""
    elapsed = time() - start
    print('Time elapsed:', elapsed)

def botDetected(code):
    """Return True when ``code`` contains the site's bot-wall message, else False."""
    marker = 'As you were browsing, something about your browser made us think you might be a bot.'
    return marker in code

#Used to fix null neighborhood/borough values.
def geocode(address):
    """Look up the neighborhood and borough of ``address`` via ``geocode_raw``.

    The original body read an undefined name (``raw``) and so always raised
    NameError. This version reads the first geocoding result's
    ``address_components`` the same way the location-filling cell does:
    ``'neighborhood'`` in ``types`` gives the neighborhood, ``'sublocality'``
    gives the borough.

    Returns:
        ``(neighborhood, borough)``. The neighborhood falls back to ``"-3"``
        (the sentinel the original code used) and the borough to ``'N/A'``
        when the component is missing or there are no results.

    NOTE(review): the 'N/A' borough fallback is an assumption; the original
    never reached a working state that would show the intended value.
    """
    payload = geocode_raw(address).json()
    results = payload.get('results') or []
    if not results:
        return ("-3", 'N/A')

    components = results[0].get('address_components', [])
    neighborhood = next(
        (comp['long_name'] for comp in components if 'neighborhood' in comp['types']),
        "-3",
    )
    borough = next(
        (comp['long_name'] for comp in components if 'sublocality' in comp['types']),
        'N/A',
    )
    return (neighborhood, borough)

def geocode_raw(address, textify=False):
    """Query the Google Geocoding API for ``address`` in New York City.

    Args:
        address: street address; ``',New York City, NY'`` is appended and any
            ``'#'`` is removed before the lookup.
        textify: when truthy return the response body as text, otherwise
            return the ``requests`` response object.

    The query is passed through ``params`` so requests URL-encodes it;
    the hand-built URL broke on addresses containing ``&`` or similar.
    """
    import os

    endpoint = 'https://maps.googleapis.com/maps/api/geocode/json'
    # SECURITY: prefer the GOOGLE_MAPS_API_KEY environment variable. The
    # literal fallback only keeps existing runs working - it is a credential
    # committed to source and should be rotated and removed.
    api_key = os.environ.get('GOOGLE_MAPS_API_KEY', 'AIzaSyCBJXbbXfVyb8IW44rJ2suo_ltfVo31h3Y')

    query = (address + ',New York City, NY').replace('#', '')
    response = requests.get(endpoint, params={'address': query, 'key': api_key})
    if textify:
        # The original called r.replace('  ','') and discarded the result, so
        # the body has always been returned unmodified; that is kept.
        return response.text
    return response

def get_time():
    """Break the current Unix time into whole years/days/hours/minutes/seconds.

    A year is counted as a flat 365 days, so ``years`` and ``days`` are
    offsets since the epoch rather than calendar values.

    Returns:
        dict with integer values under 'seconds', 'minutes', 'hours', 'days'
        and 'years'.
    """
    total_seconds = int(time())
    total_minutes, seconds = divmod(total_seconds, 60)
    total_hours, minutes = divmod(total_minutes, 60)
    total_days, hours = divmod(total_hours, 24)
    years, days = divmod(total_days, 365)

    return {
        'seconds': seconds,
        'minutes': minutes,
        'hours': hours,
        'days': days,
        'years': years,
    }


# browser = None
# browser = webdriver.Chrome(chrome_path)
# browser.get(crawl_lv_1+'1')
# code = browser.page_source
# #browser.execute_script("window.scrollTo(0,400)")
# #elem = browser.find_element_by_tag_name('body')
# elem = browser.find_element_by_class_name('note')
# #scroll_shim(browser,elem)
# ac = ActionChains(browser)
# sleep(1)
# ac.move_to_element(elem)
# #ac.move_by_offset(400, 650)
# ac.move_by_offset(50, 100).perform()
# ac.click_and_hold().perform()
# print(elem)
# sleep(5)



# Scrape data for the first time.

In [None]:
class RealtorCrawlerLv1:
    """Level-1 crawler: walks the search-result pages and collects listing paths.

    Typical use is to call ``crawl_next()`` then ``distill_page()`` once per
    result page and finally ``output_to_textfile()``.
    """

    def __init__(self, pages_to_scrape=2**32-1,sleep_time=0, chrome=False):
        self.page_num=1                       # next result page to fetch (1-based)
        # Stored and shown by debug(); not enforced by crawl_next().
        self.pages_to_scrape=pages_to_scrape
        self.pages=[]                         # listing paths collected so far
        self.current_page=None                # HTML of the most recent fetch
        self.sleep_time=sleep_time            # scroll steps handed to sel()
        self.use_chrome=chrome
        self.tempfindall=None                 # raw regex hits of the last distill_page()
        self.currently_walled=False           # True while the last fetch hit the bot wall

    def crawl_next(self):
        """Fetch the current result page.

        Returns True and advances ``page_num`` when the page loaded normally.
        Returns False when the bot wall was served; ``page_num`` is then left
        unchanged so the next call retries the same page.
        """
        print(self.page_num,end='...')
        if self.currently_walled:
            print('walled')

        cookienum=randint(0,256)
        self.current_page=sel(crawl_lv_1+str(self.page_num), self.sleep_time, self.use_chrome, cookienum)

        if botDetected(self.current_page):
            self.currently_walled=True
            # Reset the profile that was just flagged (the original cleared an
            # unrelated random profile) and skip its immediate re-fetch, whose
            # result was never inspected or used.
            clear_cookie_profile(cookienum)
            return(False)

        # A clean page lifts the wall; without this reset distill_page()
        # skipped every page after the first detection.
        self.currently_walled=False
        self.page_num+=1
        return(True)

    def distill_page(self):
        """Append every listing-detail path found in ``current_page`` to ``pages``."""
        if not self.currently_walled:
            self.tempfindall=re.findall('(/realestateandhomes-detail/.+?)"',self.current_page)
            self.temp_pages=list(self.tempfindall)
            self.pages+=self.temp_pages

    def drop_duplicate_pages(self):
        """Remove duplicate paths from ``pages``, keeping first-seen order."""
        # Uses self.pages; the original read a global named `crawler`.
        self.pages=list(dict.fromkeys(self.pages))

    def output_to_textfile(self):
        """Write the de-duplicated paths to RealtorCrawlerLv1_links.txt, one per line."""
        self.drop_duplicate_pages()
        with open('RealtorCrawlerLv1_links.txt','w') as f:
            for link in self.pages:
                # RealtorCrawlerLv2 reads this file line by line, so every
                # path needs its own line.
                f.write(link+'\n')

    def debug(self):
        """Print the crawler's scalar state."""
        print(f'Debugging RealtorCrawlerLv1 object:')
        print(f'page_num={self.page_num}')
        print(f'pages_to_scrape={self.pages_to_scrape}')
        print(f'sleep_time={self.sleep_time}')
        print(f'use_chrome={self.use_chrome}')
        print(f'currently_walled={self.currently_walled}')
        print('Variables not shown: current_page')

In [None]:
# crawler=RealtorCrawlerLv1(206, 1, False)

# for i in range(206):
#     crawler.crawl_next()
#     crawler.distill_page()

In [None]:
class RealtorCrawlerLv2:
    """Level-2 crawler: loads each listing path from a text file and scrapes it.

    Typical use is ``crawl_next()`` followed by ``process_current_page()``
    whenever ``currently_walled`` is False.
    """

    def __init__(self, scrape_filepath,sleep_time=10, chrome=False):
        self.page_num=0                  # index into self.links of the next listing
        self.filepath=scrape_filepath
        self.data=[]                     # one dict per scraped listing
        self.current_page=None           # HTML of the most recent fetch
        self.sleep_time=sleep_time       # scroll steps handed to sel()
        self.use_chrome=chrome
        self.tempfindall=None
        self.currently_walled=False      # True while the last fetch hit the bot wall

        with open(self.filepath,'r') as f:
            # Strip the line terminator so it does not end up inside the URL,
            # and ignore blank lines.
            self.links=[line.strip() for line in f if line.strip()]

    def crawl_next(self):
        """Fetch the listing at ``links[page_num]``.

        On a normal page ``page_num`` advances and ``currently_walled`` is
        cleared. When the bot wall is served the cookie profile used is reset,
        ``currently_walled`` is set and ``page_num`` stays put so the same
        listing is retried. Always returns True, as before.
        """
        print(self.page_num,end='...')

        cookienum=randint(0,256)
        self.current_page=sel(crawl_lv_2+self.links[self.page_num], self.sleep_time, self.use_chrome, cookienum)

        if botDetected(self.current_page):
            clear_cookie_profile(cookienum)
            # Callers test this flag before process_current_page(); it was
            # never set, so bot-wall pages were stored as empty listings.
            self.currently_walled=True
        else:
            self.currently_walled=False
            self.page_num+=1
        return(True)

    def process_current_page(self):
        """Parse ``current_page`` with ``create_house_dict`` and append the result to ``data``."""
        self.data.append(create_house_dict(self.current_page))

    def save_data(self):
        """Pickle ``data`` to a file named after the current ``get_time()`` breakdown."""
        ct=get_time()
        save_name=f'listing_dict_y{ct["years"]}d{ct["days"]}h{ct["hours"]}m{ct["minutes"]}s{ct["seconds"]}'
        with open(save_name,'wb') as f:
            pickle.dump(self.data,f)

    def load_data(self,filepath):
        """Restore ``data`` from a pickle and resume after the last saved listing.

        Only load files written by ``save_data``; unpickling untrusted files
        can execute arbitrary code.
        """
        with open(filepath,'rb') as f:
            self.data=pickle.load(f)
        self.page_num=len(self.data)

    def debug(self):
        """Print the crawler's scalar state."""
        print(f'Debugging RealtorCrawlerLv2 object:')
        print(f'page_num={self.page_num}')
        print(f'filepath={self.filepath}')
        print(f'len(links)={len(self.links)}')
        print(f'sleep_time={self.sleep_time}')
        print(f'use_chrome={self.use_chrome}')
        print(f'currently_walled={self.currently_walled}')
        print('Variables not shown: current_page, links, data')

In [None]:
# crawl_lv2=RealtorCrawlerLv2('RealtorCrawlerLv1_links.txt', 1, False)
# crawl_lv2.load_data('listing_dict_y51d359h3m40s35')

In [None]:
# resume_from=8511
# for i in range(resume_from,len(crawl_lv2.links)):
#     crawl_lv2.crawl_next()
#     if not crawl_lv2.currently_walled:  
#         crawl_lv2.process_current_page()



# Clean data.

In [None]:
# listing_frame=pd.DataFrame(crawl_lv2.data)

In [None]:
#Filter out listings with missing data, prepare data for modeling

assert 1==2 # Stop this code from running - we already ran it.

listing_frame_v2=listing_frame.copy()

na_indicate=lambda s: 0 if s=='N/A' else s

# Turn the 'N/A' placeholder into 0 in each required column, then drop those rows.
required_columns=['beds','baths','price','address','sqft','year_built','property_type', 'stories','rooms']
for column in required_columns:
    listing_frame_v2[column]=[na_indicate(item) for item in listing_frame_v2[column]]
    listing_frame_v2=listing_frame_v2[listing_frame_v2[column]!=0]

# Age is measured against the year in which this cell is run.
current_year=int(dt.datetime.now().date().strftime("%Y"))
listing_frame_v2['building_age']=[current_year-int(year) for year in listing_frame_v2['year_built']]
listing_frame_v2.drop(['year_built'],axis=1,inplace=True)

# Text clean-ups: keep the part before the comma in borough, strip the markup
# residue from descriptions, strip the 'Total Rooms:' label from rooms.
cut_borough=lambda s: s.split(',')[0].capitalize() if ',' in s else s
cut_description=lambda s: s.replace('</span> - ','') if '</span> -' in s else s
cut_rooms=lambda s: s.replace('Total Rooms:','') if 'Total Rooms:' in s else s
listing_frame_v2['borough']=[cut_borough(n) for n in listing_frame_v2['borough']]
listing_frame_v2['description']=[cut_description(n) for n in listing_frame_v2['description']]
listing_frame_v2['rooms']=[cut_rooms(n) for n in listing_frame_v2['rooms']]

# 'stories' was already filtered in the loop above, so no second filter is needed.
for column in ['beds','baths','price','sqft','stories','rooms','building_age']:
    listing_frame_v2[column]=listing_frame_v2[column].astype(float)

listing_frame_v2.drop(['lot_size'],axis=1,inplace=True)

with open('listing_frame_v2','wb') as f:
    pickle.dump(listing_frame_v2,f)


# Read the frame straight back as the next stage's input; the with-block closes the file.
with open('listing_frame_v2','rb') as f:
    listing_frame_v3=pickle.load(f)

In [None]:
assert 1==2 # Stop this code from running - we already ran it.

# Reload the cleaned frame from the 'listing_frame_v2' pickle; the with-block closes the file.
with open('listing_frame_v2','rb') as f:
    listing_frame_v3=pickle.load(f)

In [None]:
#Fill in missing location data for listings

assert 1==2 # Stop this code from running - we already ran it.

# Rows whose neighborhood is 'N/A' or 'others', or whose borough is 'N/A'.
needs_geocoding=(
    listing_frame_v3['neighborhood'].isin(['N/A','others'])
    | (listing_frame_v3['borough']=='N/A')
)
indices_to_geocode=sorted(set(listing_frame_v3.index[needs_geocoding]))

geo_codes={}
for idx in indices_to_geocode:
    print(idx,end='.')
    geo_codes[idx]=geocode_raw(listing_frame_v3.loc[idx].address)

with open('geo_codes','wb') as f:
    pickle.dump(geo_codes,f)

for item, response in geo_codes.items():
    # Parse each response once. An empty result list (e.g. nothing found)
    # used to raise IndexError; such rows keep their placeholder and are
    # dropped by the 'N/A' filters below.
    results=response.json().get('results') or []
    if not results:
        continue
    components=results[0]['address_components']

    temp_neighborhood=[comp['long_name'] for comp in components if 'neighborhood' in comp['types']]
    if len(temp_neighborhood)!=0:
        listing_frame_v3.at[item,'neighborhood']=temp_neighborhood[0]

    temp_borough=[comp['long_name'] for comp in components if 'sublocality' in comp['types']]
    if len(temp_borough)!=0:
        listing_frame_v3.at[item,'borough']=temp_borough[0]

listing_frame_v3=listing_frame_v3[listing_frame_v3['borough']!='N/A']
listing_frame_v3=listing_frame_v3[listing_frame_v3['neighborhood']!='N/A']

#Add polarity and subjectivity to dataframe
# Run TextBlob once per description instead of once per score.
sentiments=[TextBlob(sen).sentiment for sen in listing_frame_v3.description]
pol=[sentiment[0] for sentiment in sentiments]
sub=[sentiment[1] for sentiment in sentiments]
listing_frame_v3['pol']=pol
listing_frame_v3['sub']=sub

with open('listing_frame_v3','wb') as f:
    pickle.dump(listing_frame_v3,f)

# Tokenize Sentences

In [None]:
assert 1==2 # Stop this code from running - we already ran it.

# Reload the geocoded frame from the 'listing_frame_v3' pickle; the with-block closes the file.
with open('listing_frame_v3','rb') as f:
    listing_frame_v4=pickle.load(f)

In [None]:
assert 1==2 # Stop this code from running - we already ran it.

# English stop-word list from NLTK; also the default for remove_noise's stop_words argument.
stop_words = stopwords.words('english')

# Token -> replacement table consulted by special_cases(); currently empty.
cases_dict={}

def special_cases(tokens):
    """Apply the ``cases_dict`` replacements to ``tokens``.

    Args:
        tokens: either a single token (``str``) or a list of tokens.

    Returns:
        For a string, its replacement from ``cases_dict`` or the string
        itself. For a list, the same list with matching items replaced in
        place.

    ``remove_noise`` calls this with a single string; the list-only version
    would have raised TypeError on item assignment as soon as ``cases_dict``
    contained a matching character.
    """
    if isinstance(tokens, str):
        return cases_dict.get(tokens, tokens)
    for i, token in enumerate(tokens):
        if token in cases_dict:
            tokens[i]=cases_dict[token]
    return(tokens)


def remove_noise(tweet_tokens, stop_words = stop_words):
    """Clean, lemmatize and filter a sequence of tokens.

    Each token is POS-tagged, stripped of URLs and @-mentions, has digit runs
    replaced by ``number_`` and selected punctuation replaced by spaces, is
    passed through ``special_cases`` and then lemmatized (noun/verb by tag,
    otherwise as an adjective; adjective-tagged tokens are left
    un-lemmatized). Empty tokens, punctuation and stop words are dropped.

    Args:
        tweet_tokens: sequence of token strings.
        stop_words: words to discard (compared lower-cased).

    Returns:
        list of cleaned, lower-cased tokens.
    """
    cleaned_tokens = []
    # One lemmatizer for the whole call instead of a new one per token.
    lemmatizer = WordNetLemmatizer()

    for token, tag in pos_tag(tweet_tokens):
        # Raw strings keep the pattern identical while avoiding the invalid
        # '\(' escape of the plain-string form.
        token = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'
                       r'(?:%[0-9a-fA-F][0-9a-fA-F]))+','', token)
        token = re.sub("(@[A-Za-z0-9_]+)","", token)
        token = re.sub("[0-9]+","number_", token)
        # NOTE(review): "'-:" inside the class is a character range (' to :),
        # so this also blanks ( ) * + / and digits - confirm that is intended.
        token = re.sub("[,.'-:;!]"," ", token)
        token = re.sub("&amp"," ", token)
        token = re.sub("  "," ", re.sub("  "," ", token))
        token=special_cases(token)

        if tag.startswith("NN"):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        elif tag.startswith('JJ'):
            # 'NNP' can never reach this branch: it is caught by "NN" above.
            pos = 'del'
        else:
            pos = 'a'

        if pos!='del':
            token = lemmatizer.lemmatize(token, pos)
        # The former `pos!='adj'` test was always true and has been dropped.
        if len(token) > 0 and token not in string.punctuation and token.lower() not in stop_words:
            cleaned_tokens.append(token.lower())
    return cleaned_tokens

In [None]:
assert 1==2 # Stop this code from running - we already ran it.

# NOTE(review): remove_noise receives the list of whole description strings,
# so each description is tagged and cleaned as one "token"; the per-word split
# only happens later, when denoised_tokens is built. Confirm this is intended.
descriptions=listing_frame_v4.description.tolist()
descriptions_noise=remove_noise(descriptions)

In [None]:
assert 1==2 # Stop this code from running - we already ran it.

def blacklist_word(word):
    """Add ``word`` to word_blacklist.txt (one word per line).

    Empty strings and words already listed are ignored, so re-running the
    cell no longer grows the file. A missing file is treated as empty.
    """
    if not word:
        return
    try:
        with open('word_blacklist.txt') as f:
            word_blacklist=[line.strip('\n') for line in f]
    except FileNotFoundError:
        word_blacklist=[]
    if word in word_blacklist:
        return
    word_blacklist.append(word)
    with open('word_blacklist.txt','w') as f:
        for w in word_blacklist:
            f.write(w+'\n')

# Put a word between the quotes to blacklist it. The call now follows the
# definition; calling it first raised NameError on a fresh kernel.
blacklist_word('')

with open('word_blacklist.txt','r') as f:
    word_blacklist=[line.strip('\n') for line in f]

# Split each cleaned description on spaces, dropping blacklisted words, empty
# strings / bare newlines and anything of two characters or fewer.
denoised_tokens=[[word for word in dnoise.split(' ') if word not in word_blacklist and word not in ['','\n'] and len(word)>2] for dnoise in descriptions_noise]

In [None]:
assert 1==2 # Stop this code from running - we already ran it.

# Persist the token lists, then read them straight back; each with-block closes its file.
with open('denoised_tokens','wb') as f:
    pickle.dump(denoised_tokens,f)

with open('denoised_tokens','rb') as f:
    denoised_tokens=pickle.load(f)

In [None]:
assert 1==2 # Stop this code from running - we already ran it.

# Store each token list as its string form (listify() parses it back later), then save the frame.
listing_frame_v4['denoised']=denoised_tokens
listing_frame_v4['denoised']=listing_frame_v4['denoised'].astype(str)

with open('listing_frame_v4','wb') as f:
    pickle.dump(listing_frame_v4,f)

In [None]:
assert 1==2 # Stop this code from running - we already ran it.

# Reload the tokenized frame from the 'listing_frame_v4' pickle; the with-block closes the file.
with open('listing_frame_v4','rb') as f:
    listing_frame_v5=pickle.load(f)

In [36]:
# Last-minute tweaks and changes
# Maps the scraped/geocoded locality name to the region it is reported under.
neighborhood_translator={
    # The five boroughs map to themselves.
    'Bronx':'Bronx',
    'Brooklyn':'Brooklyn',
    'Manhattan':'Manhattan',
    'Queens':'Queens',
    'Staten Island':'Staten Island',

    # Other locality names seen in the data.
    'Astoria':'Queens',
    'Flushing':'Queens',
    'Plainview':'Long Island',
    'Maspeth':'Queens',
    'Roslyn':'Long Island',
    'Merrick':'Long Island',
    'Glendale':'Queens',
    'Bayside':'Queens',
    'Melville':'Long Island',
    'Manhasset':'Long Island',
    'Woodside':'Queens',
    'Ridgewood':'Queens',
    'Smithtown':'Long Island',
    'Kings':'Brooklyn',
}



In [None]:
# Translate every borough value; names missing from the table count as 'Long Island'.
translated=[
    neighborhood_translator[untranslated] if untranslated in neighborhood_translator else 'Long Island'
    for untranslated in listing_frame_v5['borough']
]

listing_frame_v5['borough']=translated
# Keep only listings that ended up outside 'Long Island'.
listing_frame_v5=listing_frame_v5[listing_frame_v5['borough']!='Long Island']

In [None]:
def property_translator(p_type):
    """Bucket a free-text property type into one of five fixed categories.

    The lower-cased text is checked for 'single', 'condo', 'multi' and
    'commercial' in that order; the first hit decides, otherwise 'Other'.
    """
    lowered = p_type.lower()
    buckets = (
        ('single', 'Single-Family Home'),
        ('condo', 'Condo'),
        ('multi', 'Multi-Family Home'),
        ('commercial', 'Commercial'),
    )
    for keyword, label in buckets:
        if keyword in lowered:
            return label
    return 'Other'

# Replace every raw property type with its property_translator() category.
listing_frame_v5['property_type']=[property_translator(ptype) for ptype in listing_frame_v5['property_type']]

In [None]:
# Save the translated frame; the with-block closes the file.
with open('listing_frame_v5','wb') as f:
    pickle.dump(listing_frame_v5,f)

# Perform topic modeling 

In [None]:
# Load the topic-modeling input from the 'listing_frame_v5' pickle; the with-block closes the file.
with open('listing_frame_v5','rb') as f:
    listing_frame_v6=pickle.load(f)

In [None]:
# Create LDA topic model.

MAXDF=0.5
MINDF=0.001

# stop_words is otherwise only assigned in a cell guarded by `assert 1==2`,
# so build it here to keep this cell runnable on a fresh kernel.
stop_words=stopwords.words('english')

# NOTE(review): `tf` rebinds the `tensorflow as tf` alias from the import cell.
# The name is kept because the next cell reads `tf.transform` / `tf.vocabulary_`.
tf=TfidfVectorizer(stop_words=stop_words, max_df=MAXDF, min_df=MINDF,use_idf=False)
xtf=tf.fit_transform(listing_frame_v6.denoised)
xtfa=xtf.toarray()

# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# and fall back for old versions. A plain list is kept because soft_cluster
# and the pickled 'feature_names' file expect one.
if hasattr(tf,'get_feature_names_out'):
    xtffn=list(tf.get_feature_names_out())
else:
    xtffn=tf.get_feature_names()

In [None]:
cluster_topics=64
LDA_SEED=0  # fixed seed so the fitted topics are reproducible between runs

# NOTE(review): text_list, this first `dictionary` and doc_term_matrix are not
# used by the model fitted below (`dictionary` is reassigned further down);
# they are kept in case something outside the visible cells reads them.
text_list=[listify(text) for text in listing_frame_v6.denoised.tolist()]
dictionary=corpora.Dictionary(text_list)
doc_term_matrix=[dictionary.doc2bow(listify(doc,True)) for doc in listing_frame_v6.denoised]

# Term-by-document matrix in the orientation Sparse2Corpus expects by default.
# The old throw-away pd.DataFrame(...).head() line was removed: its value was
# never displayed, it densified the whole matrix and it called the removed
# get_feature_names() API.
doc_word = tf.transform(listing_frame_v6.denoised).transpose()

corpus = matutils.Sparse2Corpus(doc_word)
dictionary = corpora.Dictionary()
id2word = {index: term for term, index in tf.vocabulary_.items()}

lda1 = models.LdaModel(corpus=corpus, num_topics=cluster_topics, id2word=id2word, passes=16, random_state=LDA_SEED)

topic_matrix1=lda1.get_topics()

with open('topic_matrix','wb') as f:
    pickle.dump(topic_matrix1,f)

with open('feature_names','wb') as f:
    pickle.dump(xtffn,f)

In [None]:
# Reload the fitted topic matrix and its vocabulary; each with-block closes its file.
with open('topic_matrix','rb') as f:
    topic_matrix1=pickle.load(f)

with open('feature_names','rb') as f:
    xtffn=pickle.load(f)

In [None]:
# One topic-score dict per listing, expanded into one column per topic.
clusterlist=list(map(soft_cluster, listing_frame_v6.denoised))
clusterlist_df=pd.DataFrame(clusterlist)

# Copy the scores across positionally, so the two frames' indexes need not match.
for column in clusterlist_df.columns:
    listing_frame_v6[column]=list(clusterlist_df[column])

In [None]:
#listing_frame_v6=listing_frame_v6[listing_frame_v6['property_type']!='Other']

In [None]:
# Save the frame with topic scores, then read it back as the next stage's input.
with open('listing_frame_v6','wb') as f:
    pickle.dump(listing_frame_v6,f)

with open('listing_frame_v6','rb') as f:
    listing_frame_v7=pickle.load(f)

In [None]:
# Display the frame as reloaded from the 'listing_frame_v6' pickle.
listing_frame_v7

In [None]:
#Create dummy columns for categorical variables
# 'neighborhood' is deliberately not expanded (it used to be, in a since-disabled block):
#dummies=pd.get_dummies(listing_frame_v7['neighborhood'])
#for column in dummies:
#    listing_frame_v7[column]=dummies[column]
for categorical in ['property_type','borough']:
    dummies=pd.get_dummies(listing_frame_v7[categorical])
    for column in dummies:
        listing_frame_v7[column]=dummies[column]

# Drop the text/categorical source columns so only model-ready columns remain.
columns_to_drop=['description','address','sale_date','property_type','borough','neighborhood','denoised']
listing_frame_v7.drop(columns_to_drop,axis=1,inplace=True)

In [None]:
# Save the model-ready frame under the name every later cell opens.
# It used to be written as 'listing_frame_final' while the readers open
# 'listing_frame_final.bin', hence the FileNotFoundError in the regression cell.
with open('listing_frame_final.bin','wb') as f:
    pickle.dump(listing_frame_v7,f)

# Okay, now that topic modeling's done, move on to regression.

In [3]:
#Perform train-test split with feature standardization

# Fixed split seed. hash('240blazeit')%2**32 looked constant but str hashes
# are salted per interpreter process, so every kernel restart produced a
# different split.
SPLIT_SEED=240

with open('listing_frame_final.bin','rb') as f:
    listing_frame_v8=pickle.load(f)

#for i in range(32):
#    listing_frame_v8.drop([f'lda_topic{i}'],axis=1,inplace=True)

#to_drop=['rooms','stories']
#listing_frame_v8.drop(to_drop,axis=1,inplace=True)


#listing_frame_v8.reset_index(drop=True, inplace=True)
X = listing_frame_v8.drop(['price'], axis=1)
y = listing_frame_v8['price']

X_train_raw, X_test_raw, y_train, y_test=train_test_split(X,y, test_size=0.2,random_state=SPLIT_SEED)

# Fit the scaler on the training rows only, so test-set statistics do not
# leak into the features the models are trained on.
regularizer=StandardScaler()
X_train = regularizer.fit_transform(X_train_raw)
X_test = regularizer.transform(X_test_raw)
X_reg = regularizer.transform(X)  # all rows, scaled with the training statistics

reg = LinearRegression()
reg.fit(X_train, y_train)

r2train=reg.score(X_train, y_train)
r2test=reg.score(X_test, y_test)

r2train,r2test

FileNotFoundError: [Errno 2] No such file or directory: 'listing_frame_final.bin'

In [None]:
# Fully connected network: three 256-unit hidden layers and one output unit,
# trained on mean absolute error.
# NOTE(review): every layer uses a linear activation, so the stack can only
# represent a linear function of the inputs - confirm that is intended.
MLPC = Sequential()
# input_shape must be a tuple; `(n)` without the trailing comma is just the int n.
MLPC.add(InputLayer(input_shape=(X_train.shape[1],)))
MLPC.add(Dense(256, activation='linear'))
MLPC.add(Dense(256, activation='linear'))
MLPC.add(Dense(256, activation='linear'))
MLPC.add(Dense(1, activation='linear'))
MLPC.compile(
    loss='mae',
    optimizer='adam',
    metrics=['mae'],
)
#MLPC.summary()

In [None]:
# Stop once the validation loss has failed to improve for 3 epochs in a row.
callback = EarlyStopping(monitor='val_loss', patience=3)
MLPC.fit(
    X_train,
    y_train,
    batch_size=32,
    callbacks=[callback],
    verbose=True,
    validation_split=0.2,
    epochs=256,
)

In [None]:
# R^2 of the network on all rows (t), the training split (tr) and the test split (ts).
t=r2_score(y,MLPC.predict(X_reg))
tr=r2_score(y_train,MLPC.predict(X_train))

preds=MLPC.predict(X_test)
ts=r2_score(y_test,preds)

print(t,tr,ts)

In [None]:


# Carve a validation split out of the training data.
X_train2, X_val, y_train2, y_val = train_test_split(X_train, y_train, test_size = 0.25, random_state = 96)

print('Fitting...')
model = RandomForestRegressor(n_estimators=256,max_features=1.0, random_state=420)
model.fit(X_train2,y_train2)
print('Scoring...')

# Report r2 on the training, validation and test data, in that order.
splits_to_score = (
    ('Random forest train r2 = ', X_train2, y_train2),
    ('Random forest validation r2 = ', X_val, y_val),
    ('Random forest test r2 = ', X_test, y_test),
)
for label, features, target in splits_to_score:
    predicted_prices = model.predict(features)
    trf = r2_score(target , predicted_prices)
    print(label, trf)

# Not good. Let's try Ridge and Lasso

In [None]:
#Run Ridge Regression and find best alpha value.
# NOTE(review): alpha is picked by its score on the test split, so max_r2 is
# an optimistic estimate; a validation split or RidgeCV would avoid that.
max_i=0
max_r2=0
max_tr=0

for i in np.arange(0,20000,10):
    reg = Ridge(alpha=i,max_iter=10000).fit(X_train, y_train)

    r2train=reg.score(X_train, y_train)
    r2test=reg.score(X_test, y_test)

    # Best training score seen so far (tracked independently of the chosen alpha).
    max_tr=max(max_tr,r2train)

    if r2test>max_r2:
        max_r2,max_i=r2test,i

(max_tr,max_r2,max_i)

In [None]:
#Run Lasso Regression and find best alpha value.
# NOTE(review): alpha is picked by its score on the test split, so max_r2 is
# an optimistic estimate. The grid also starts at alpha=0, i.e. no penalty.
max_i=0
max_r2=0
max_tr=0

for i in np.arange(0,2000000,1000):
    reg = Lasso(alpha=i, max_iter=10000).fit(X_train, y_train)

    r2train=reg.score(X_train, y_train)
    r2test=reg.score(X_test, y_test)

    # Best training score seen so far (tracked independently of the chosen alpha).
    max_tr=max(max_tr,r2train)

    if r2test>max_r2:
        max_r2,max_i=r2test,i


(max_tr,max_r2,max_i)


In [None]:
#Run Elastic Net Regression and find best alpha value.
# NOTE(review): alpha is picked by its score on the test split, so max_r2 is
# an optimistic estimate. The grid also starts at alpha=0, i.e. no penalty.
max_i=0
max_r2=0
max_tr=0

for i in np.arange(0,5,0.1):
    reg = ElasticNet(alpha=i, max_iter=10000).fit(X_train, y_train)

    r2train=reg.score(X_train, y_train)
    r2test=reg.score(X_test, y_test)

    if r2test>max_r2:
        max_r2,max_i=r2test,i

    # Best training score seen so far (tracked independently of the chosen alpha).
    max_tr=max(max_tr,r2train)

(max_tr,max_r2,max_i)

# Pipeline for real-time data

In [None]:
# Month number (1-12) -> three-letter English abbreviation.
month_translator=dict(enumerate(
    ['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec'],
    start=1,
))


In [None]:
# NOTE(review): RealtorCrawlerPipeline_Lv1 is not defined in any visible cell
# (only RealtorCrawlerLv1 is) - confirm where it comes from, or this cell
# fails with NameError on a fresh kernel.
pipeline_crawler_lv1=RealtorCrawlerPipeline_Lv1(4)

In [None]:
# Keep crawling result pages until crawl_next() reports anything other than True.
lv1_pipeline=pipeline_crawler_lv1.crawl_next()
while lv1_pipeline==True:
    lv1_pipeline=pipeline_crawler_lv1.crawl_next()

In [None]:
pipeline_to_scrape=pipeline_crawler_lv1.pages

# One listing path per line; this is the file the level-2 crawler reads.
with open('new_links_to_scrape.txt','w') as f:
    f.writelines(line+'\n' for line in pipeline_to_scrape)

pipeline_crawler_lv2=RealtorCrawlerLv2('new_links_to_scrape.txt')

In [None]:
# Scrape the queued listings. crawl_next() picks the listing from the
# crawler's own page_num; the loop index i is not passed to it, so
# resume_from only changes how many iterations run.
# NOTE(review): a walled fetch uses up an iteration without advancing
# page_num, so links at the end of the list may never be reached.
resume_from=0
for i in range(resume_from,len(pipeline_crawler_lv2.links)):
    pipeline_crawler_lv2.crawl_next()
    if not pipeline_crawler_lv2.currently_walled:  
        pipeline_crawler_lv2.process_current_page()

In [15]:
# Read the queued links back, one list item per line (line endings included).
with open('new_links_to_scrape.txt') as f:
    lnks=f.readlines()

In [6]:
# Display lff. NOTE(review): lff is only assigned in a cell further down, so
# this cell fails with NameError when the notebook is run top to bottom.
lff

Unnamed: 0,beds,baths,price,sqft,stories,rooms,building_age,pol,sub,lda_topic0,...,Commercial,Condo,Multi-Family Home,Other,Single-Family Home,Bronx,Brooklyn,Manhattan,Queens,Staten Island
3,3.0,3.0,665000.0,1350.0,3.0,6.0,19.0,0.417532,0.530519,0.009693,...,0.0,0,0,0,1,0.0,0,0,0,1
8,1.0,1.0,459000.0,532.0,4.0,2.0,5.0,0.122857,0.495714,0.009472,...,0.0,1,0,0,0,0.0,1,0,0,0
15,5.0,5.0,1999999.0,3114.0,3.0,12.0,122.0,0.262141,0.528006,0.056832,...,0.0,1,0,0,0,0.0,1,0,0,0
18,6.0,5.0,1190000.0,3700.0,3.0,12.0,51.0,0.330556,0.701190,0.006829,...,0.0,0,1,0,0,0.0,1,0,0,0
20,2.0,1.0,849000.0,1200.0,6.0,5.0,104.0,0.303194,0.571307,0.066964,...,0.0,1,0,0,0,0.0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72,1.0,1.0,725000.0,786.0,32.0,3.0,36.0,0.133917,0.482117,0.019385,...,0.0,1,0,0,0,0.0,0,1,0,0
75,2.0,3.0,379000.0,1354.0,3.0,6.0,25.0,0.356746,0.542659,0.011675,...,0.0,0,0,0,1,0.0,0,0,0,1
79,2.0,2.0,1695000.0,2000.0,7.0,4.0,109.0,0.296512,0.593700,0.032381,...,0.0,1,0,0,0,0.0,0,1,0,0
80,2.0,1.0,760750.0,793.0,6.0,4.0,122.0,0.151384,0.348802,0.028637,...,0.0,1,0,0,0,0.0,1,0,0,0


In [32]:
# Load the model-ready frame; the with-block closes the file.
with open('listing_frame_final.bin','rb') as f:
    lff=pickle.load(f)

In [28]:
# Median of the price column of the final frame.
lff['price'].median()

810000.0

In [33]:
# Display the frame loaded from 'listing_frame_final.bin'.
lff

Unnamed: 0,beds,baths,price,sqft,stories,rooms,building_age,pol,sub,lda_topic0,...,Commercial,Condo,Multi-Family Home,Other,Single-Family Home,Bronx,Brooklyn,Manhattan,Queens,Staten Island
3,3.0,3.0,665000.0,1350.0,3.0,6.0,19.0,0.417532,0.530519,0.009693,...,0.0,0,0,0,1,0.0,0,0,0,1
8,1.0,1.0,459000.0,532.0,4.0,2.0,5.0,0.122857,0.495714,0.009472,...,0.0,1,0,0,0,0.0,1,0,0,0
15,5.0,5.0,1999999.0,3114.0,3.0,12.0,122.0,0.262141,0.528006,0.056832,...,0.0,1,0,0,0,0.0,1,0,0,0
18,6.0,5.0,1190000.0,3700.0,3.0,12.0,51.0,0.330556,0.701190,0.006829,...,0.0,0,1,0,0,0.0,1,0,0,0
20,2.0,1.0,849000.0,1200.0,6.0,5.0,104.0,0.303194,0.571307,0.066964,...,0.0,1,0,0,0,0.0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30,4.0,5.0,2032000.0,4803.0,2.0,9.0,2.0,0.239585,0.485160,0.065423,...,0.0,0,0,0,1,0.0,0,0,0,1
32,6.0,8.0,5807000.0,6597.0,6.0,15.0,120.0,0.279174,0.544069,0.077982,...,0.0,1,0,0,0,0.0,0,1,0,0
36,4.0,2.0,627500.0,1020.0,3.0,7.0,61.0,0.252764,0.576599,0.025993,...,0.0,0,0,0,1,0.0,0,0,0,1
37,2.0,2.0,830000.0,970.0,8.0,4.0,14.0,0.114708,0.331836,0.039209,...,0.0,1,0,0,0,0.0,0,1,0,0


In [None]:
# Write lff back to 'listing_frame_final.bin'; the with-block closes the file.
with open('listing_frame_final.bin','wb') as f:
    pickle.dump(lff,f)

In [6]:
# Number of rows in the price column.
# NOTE(review): listing_frame_final is not assigned in any visible cell
# (the loaded frame is called lff) - confirm where it comes from.
len(listing_frame_final['price'])

2375

In [7]:
# Number of rows left after dropping exact duplicate rows.
# NOTE(review): listing_frame_final is not assigned in any visible cell.
len(listing_frame_final.drop_duplicates())

2342

In [8]:
# Load a saved list of scraped listing dicts; the with-block closes the file.
with open('listing_dict_y51d359h3m40s35','rb') as f:
    ldict=pickle.load(f)

In [15]:
# Load the pickled regressor; the with-block closes the file.
# Only unpickle files you produced yourself - unpickling can execute arbitrary code.
with open('RandomForestRegressor.bin','rb') as f:
    RFR=pickle.load(f)

In [21]:
# (mean_absolute_error is already imported in the notebook's import cell.)
from sklearn.metrics import mean_absolute_error
# Predict prices for every row of lff from all columns except 'price'.
# NOTE(review): the models in this notebook are fitted on StandardScaler
# output, while these features are unscaled - confirm how RFR was trained.
preds=RFR.predict(lff.drop(['price'],axis=1))

In [22]:
# Mean absolute error of the loaded regressor's predictions over all rows of lff.
mean_absolute_error(lff['price'],preds)

164608.3623897243

Unnamed: 0,beds,baths,price,sqft,stories,rooms,building_age,pol,sub,lda_topic0,...,Commercial,Condo,Multi-Family Home,Other,Single-Family Home,Bronx,Brooklyn,Manhattan,Queens,Staten Island
3,3.0,3.0,665000.0,1350.0,3.0,6.0,19.0,0.417532,0.530519,0.009693,...,0.0,0,0,0,1,0.0,0,0,0,1
8,1.0,1.0,459000.0,532.0,4.0,2.0,5.0,0.122857,0.495714,0.009472,...,0.0,1,0,0,0,0.0,1,0,0,0
15,5.0,5.0,1999999.0,3114.0,3.0,12.0,122.0,0.262141,0.528006,0.056832,...,0.0,1,0,0,0,0.0,1,0,0,0
18,6.0,5.0,1190000.0,3700.0,3.0,12.0,51.0,0.330556,0.701190,0.006829,...,0.0,0,1,0,0,0.0,1,0,0,0
20,2.0,1.0,849000.0,1200.0,6.0,5.0,104.0,0.303194,0.571307,0.066964,...,0.0,1,0,0,0,0.0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72,1.0,1.0,725000.0,786.0,32.0,3.0,36.0,0.133917,0.482117,0.019385,...,0.0,1,0,0,0,0.0,0,1,0,0
75,2.0,3.0,379000.0,1354.0,3.0,6.0,25.0,0.356746,0.542659,0.011675,...,0.0,0,0,0,1,0.0,0,0,0,1
79,2.0,2.0,1695000.0,2000.0,7.0,4.0,109.0,0.296512,0.593700,0.032381,...,0.0,1,0,0,0,0.0,0,1,0,0
80,2.0,1.0,760750.0,793.0,6.0,4.0,122.0,0.151384,0.348802,0.028637,...,0.0,1,0,0,0,0.0,1,0,0,0
