In [67]:

import pandas as pd
import requests
from bs4 import BeautifulSoup
import numpy as np

# Base search-results URL; going by its path segments it targets New York, NY
# listings filtered by price (1,000,000-20,000,000) and floor area (0-1000 sqft).
url = "https://www.trulia.com/NY/New_York/1000000-20000000_price/0-1000_sqft/"

# Browser-like User-Agent header sent with every request in this notebook.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36',
}

# Fetch the first results page. A timeout keeps the cell from hanging forever,
# and raise_for_status() surfaces HTTP errors instead of parsing an error page.
response = requests.get(url, headers=headers, timeout=30)
response.raise_for_status()

# Name the parser explicitly so results do not depend on which optional parsers
# happen to be installed (and bs4 does not emit its "no parser specified" warning).
soup = BeautifulSoup(response.text, 'html.parser')

# Collect the visible labels of the pagination links; the last one is used
# later as the total page count.
pagination = soup.find_all('li', {'data-testid': 'pagination-page-link'})
pgs = [pg.text for pg in pagination]

In [68]:
# Show the scraped pagination labels; a later cell uses the last entry as the page count.
pgs

['1', '2', '3', '18', '19', '20']

In [69]:

def _card_text(card, testid, default='U'):
    """Return the text of the first ``<div data-testid=testid>`` inside ``card``.

    Falls back to ``default`` (the 'U' = unknown placeholder) when the card has
    no such element.
    """
    node = card.find('div', {'data-testid': testid})
    return node.text if node is not None else default


# data-testid values extracted from every property card, in output column order
# (price, floor space, address, beds, baths).
CARD_FIELDS = [
    'property-price',
    'property-floorSpace',
    'property-address',
    'property-beds',
    'property-baths',
]

# The last pagination label is taken as the total number of result pages.
last_pg = pgs[-1]
pgs_formatted = [f'{i}_p/' for i in range(1, int(last_pg) + 1)]

# NOTE(review): `urls` is never populated in this cell; kept only in case a
# later cell references the name.
urls = []
all_properties = []
for p in pgs_formatted:
    urlf = url + p
    # Timeout + status check: stop on a failed page rather than silently
    # scraping zero cards from an error response.
    response = requests.get(urlf, headers=headers, timeout=30)
    response.raise_for_status()

    # Explicit parser keeps parsing consistent across environments.
    soup = BeautifulSoup(response.text, 'html.parser')

    cards = soup.find_all('div', {'data-testid': 'property-card-details'})

    # One row per card: [price, sqft, address, beds, baths], 'U' when missing.
    properties = [
        [_card_text(card, testid) for testid in CARD_FIELDS]
        for card in cards
    ]

    # extend() appends in place instead of rebuilding the whole list each page.
    all_properties.extend(properties)

all_properties_df = pd.DataFrame(all_properties)

In [70]:
# Label the positional columns; the order matches the [pr, sq, addr, bds, bths]
# rows built while scraping.
all_properties_df.columns = ['PRICE','SF','ADDRESS','BEDS','BATHS']


In [71]:
# Display the raw scraped frame (all values are still strings at this point).
all_properties_df

Unnamed: 0,PRICE,SF,ADDRESS,BEDS,BATHS
0,"$1,495,000",938 sqft,"22 N 6th St #8H, Brooklyn, NY 11249",2bd,2ba
1,"$1,250,000",688 sqft,"22 N 6th St #11B, Brooklyn, NY 11249",1bd,1ba
2,"$1,650,000",714 sqft,"245 W 14th St #5A, New York, NY 10011",1bd,1ba
3,"$1,650,000",982 sqft,"225 W 60th St #7E, New York, NY 10023",2bd,2ba
4,"$1,650,000",720 sqft,"1 Irving Pl #V15B, New York, NY 10003",1bd,1ba
...,...,...,...,...,...
825,"$1,095,000",700 sqft,"20 River Ter #16C, New York, NY 10282",1bd,1ba
826,"$1,705,000",726 sqft,"1 Wall St #2506, New York, NY 10005",1bd,1ba
827,"$1,975,000",833 sqft,"1 Wall St #2412, New York, NY 10005",1bd,1ba
828,"$1,275,000","1,000 sqft","160 E 38th St #23D, New York, NY 10016",2bd,2ba


In [72]:
def str_delete(s: pd.Series,symbols: list[str]) -> pd.Series:
    """Remove every literal occurrence of each string in ``symbols`` from ``s``.

    The series is cast to ``str`` before each removal and matching is literal
    (no regex). With an empty ``symbols`` list the input is returned untouched.
    """
    cleaned = s
    for token in symbols:
        as_text = cleaned.astype(str)
        cleaned = as_text.str.replace(token, '', regex=False)
    return cleaned

def str_replace_unknown(s: pd.Series,symbols: dict[str,str]) -> pd.Series:
    """Replace whole-cell values of ``s`` that equal a key of ``symbols``.

    Parameters
    ----------
    s : pd.Series
        Column to clean.
    symbols : dict[str, str]
        Mapping ``{value_to_find: replacement}``. Entries are applied in dict
        order, so a replacement produced by one entry can be matched by a
        later entry.

    Returns
    -------
    pd.Series
        A series with the replaced values that keeps the index and name of
        ``s`` (previously a bare ``np.ndarray`` was returned, contradicting
        the annotation and dropping the index).
    """
    # Capture the labels up front: np.where returns a plain ndarray.
    index = getattr(s, 'index', None)
    name = getattr(s, 'name', None)

    values = s
    for sym, replacement in symbols.items():
        values = np.where(values == sym, replacement, values)

    return pd.Series(values, index=index, name=name)
    
# Swap the 'U' placeholder (written by the scraper when a field was missing)
# for '0' in every column so the numeric conversions below do not fail on it.
for col in all_properties_df.columns:
    all_properties_df[col] = str_replace_unknown(all_properties_df[col],{'U':'0'})

# Drop any parenthesised text from the SF values. The pattern is a raw string:
# the same regex as before, but without the invalid "\(" / "\)" escape
# sequences that newer Python versions warn about in normal string literals.
all_properties_df['SF'] = all_properties_df['SF'].str.replace(r"\(([^\)]+)\)",'',regex=True)




In [73]:


# PRICE: strip the currency sign, thousands separators and any trailing '+',
# then store the column as floats.
price_text = str_delete(all_properties_df['PRICE'], ['$', ',', '+'])
all_properties_df['PRICE'] = price_text.astype(float)

# SF: strip the ' sqft' unit and thousands separators, then store as floats.
sf_text = str_delete(all_properties_df['SF'].astype(str), [' sqft', ','])
all_properties_df['SF'] = sf_text.astype(float)




In [76]:


# BEDS: remove the 'bd' suffix and the word 'Studio'; a studio is left as an
# empty string, which is then counted as 0 bedrooms before the float cast.
beds_text = str_delete(all_properties_df['BEDS'].astype(str), ['bd', 'Studio'])
all_properties_df['BEDS'] = beds_text

beds_is_blank = all_properties_df['BEDS'] == ''
all_properties_df['BEDS'] = np.where(
    beds_is_blank, '0', all_properties_df['BEDS']
).astype(float)

In [78]:

# BATHS: remove the 'ba' suffix; any value left empty is counted as 0 baths
# before the float cast.
baths_text = str_delete(all_properties_df['BATHS'].astype(str), ['ba'])
all_properties_df['BATHS'] = baths_text

baths_is_blank = all_properties_df['BATHS'] == ''
all_properties_df['BATHS'] = np.where(
    baths_is_blank, '0', all_properties_df['BATHS']
).astype(float)

In [80]:
# Listings whose bedroom count differs from their bathroom count.
all_properties_df.loc[all_properties_df['BEDS'].ne(all_properties_df['BATHS'])]

Unnamed: 0,PRICE,SF,ADDRESS,BEDS,BATHS
7,1000000.0,767.0,"382 Central Park W #8M, New York, NY 10025",0.0,1.0
11,1095000.0,738.0,"145 E 48th St #32D, New York, NY 10017",1.0,2.0
14,1288000.0,960.0,"5828 146th St, Flushing, NY 11355",2.0,1.0
15,1700000.0,886.0,"200-210 E 65th St #14C, New York, NY 10065",1.0,2.0
16,1300000.0,724.0,"258 Avenue S, Brooklyn, NY 11223",3.0,2.0
...,...,...,...,...,...
819,4750000.0,989.0,"25 W 28th St #43C, New York, NY 10001",1.0,2.0
820,2625000.0,984.0,"30 Riverside Blvd #28D, New York, NY 10069",1.0,2.0
821,1330000.0,527.0,"1 Wall St #1210, New York, NY 10005",0.0,1.0
822,1180000.0,704.0,"1 Wall St #715, New York, NY 10005",0.0,1.0
