In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2


import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re

In [2]:
# Base URL of the apartment-rental listings; get_data requests it with a
# `page` query parameter and page_data appends `/<listing id>` to it.
baseurl = 'https://www.smartshanghai.com/housing/apartments-rent'


In [3]:
def get_data(a, b, timeout=30):
    """Download listing index pages and collect their listing containers.

    Requests ``baseurl`` once per page number in ``range(a, b)`` (``a``
    inclusive, ``b`` exclusive), passing the page number as the ``page``
    query parameter.

    Parameters
    ----------
    a, b : int
        First page number to fetch and the page number to stop before.
    timeout : float, optional
        Seconds to wait for the server before giving up on a request.
        ``requests`` has no default timeout, so without this a single
        stalled connection would block the scrape indefinitely.

    Returns
    -------
    list
        One entry per successfully fetched page; each entry is the result
        of ``soup.find_all("div", class_='cont')`` for that page.

    Raises
    ------
    requests.exceptions.RequestException
        Propagated from ``requests.get`` (including timeouts).
    """
    contents = []
    for page in range(a, b):
        response = requests.get(baseurl, params={'page': page}, timeout=timeout)
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, "html.parser")
            contents.append(soup.find_all("div", class_='cont'))
        else:
            # Pages that do not return 200 are skipped; only the status is reported.
            print(response.status_code)
    return contents


def extract_data(contents, existing=None):
    """Parse listing cards into rows and append them to an existing table.

    Parameters
    ----------
    contents : iterable
        One sequence of listing elements per page, as returned by
        ``get_data``. Each element must support ``find('div')`` and
        ``find('div', class_=...)`` and expose ``.text`` / ``.attrs``.
    existing : pandas.DataFrame, optional
        Frame the new rows are appended to. When omitted, the
        notebook-level ``house_data`` frame is used, which is what this
        function always did before the parameter existed.

    Returns
    -------
    pandas.DataFrame
        ``existing`` followed by one row per listing, with a fresh
        ``RangeIndex``. The new rows fill the columns ``Listing_Id``,
        ``Price``, ``Size``, ``N_Bedrooms`` and ``N_Bathrooms`` with
        strings. ``existing`` itself is not modified.

    Raises
    ------
    IndexError
        If a price text has fewer than two whitespace-separated tokens or
        a room-type text contains fewer than three numbers.
    """
    if existing is None:
        # Fall back to the notebook-level frame for backward compatibility.
        existing = house_data

    columns = ['Listing_Id', 'Price', 'Size', 'N_Bedrooms', 'N_Bathrooms']
    rows = []

    for content in contents:
        for card in content:
            listing_id = card.find('div').attrs['data-listingid']

            body = card.find('div', class_='body')

            # The price is the second whitespace-separated token of the text.
            # Removing every comma handles prices with no thousands separator
            # (previously an IndexError) and with more than one separator
            # (previously truncated to the first two groups).
            price_token = body.find('div', class_='price').text.strip().split()[1]
            price = price_token.replace(',', '')

            # First three integers in the room-type text, in order:
            # size, bedroom count, bathroom count.
            info = re.findall(r'\d+', body.find('div', class_='room-type').text.strip())

            rows.append([listing_id, price, info[0], info[1], info[2]])

    df = pd.DataFrame(rows, columns=columns)

    return pd.concat((existing, df), ignore_index=True)
    

def page_data(data):
    """Scrape each listing's detail page and write the fields into ``house_data``.

    For every id in ``data.Listing_Id`` the page ``{baseurl}/{id}`` is
    fetched and parsed, and the extracted values are assigned to the row of
    the notebook-level ``house_data`` frame labelled with that id
    (``house_data.loc[id, column]``). ``house_data`` is therefore modified
    in place and must be indexed by listing id before this is called.

    Listings whose page does not return HTTP 200, or whose page has no
    ``details`` block, are reported with ``print`` and skipped. Previously
    such listings were processed with an undefined or left-over parse from
    the preceding listing, which wrote another listing's data into the row.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``Listing_Id`` column naming the listings to scrape.

    Returns
    -------
    pandas.DataFrame
        The (mutated) notebook-level ``house_data`` frame.

    Raises
    ------
    requests.exceptions.RequestException
        Propagated from ``requests.get`` (including timeouts).
    AttributeError, IndexError
        If an expected element is missing from an otherwise valid page.
    """
    # Column names for the leading entries of the details block, in page order.
    # NOTE(review): 'Longtitue' here and the "Longtitude" column written below
    # are spelled differently; both are kept as-is because they are column
    # names in the saved data - confirm before renaming.
    features = ['Type', 'Available From', 'Agency Commission', 'Rooms', 'Size',
               'Floor', 'Furnished', 'Main Window Facing', 'District', 'Area',
                'Compound', 'Metro Station', 'Longtitue', 'Latitude', 'posting agent', 'description', 'first_post', 'Refresh']

    for list_id in data.Listing_Id:
        # requests has no default timeout; bound the wait so one stalled
        # connection cannot hang the whole loop.
        response = requests.get(f'{baseurl}/{list_id}', timeout=30)
        if response.status_code != 200:
            print(list_id, response.status_code)
            continue
        soup_info = BeautifulSoup(response.content, "html.parser")

        # Each listing's information block.
        try:
            detail = soup_info.find_all('div', class_='details')[0].find_all(name='div')
        except IndexError:
            print(list_id)
            continue

        # From 'Type' to 'Area': every entry except the last three.
        for feature, entry in zip(features, detail[0:-3]):
            house_data.loc[list_id, feature] = entry.text.strip()

        # 'Compound': text before the first '/' of the third-from-last entry.
        house_data.loc[list_id, "Compound"] = detail[-3].text.split('/')[0].strip()

        # Metro station: the text between "walk to" and "on line", if present.
        text = detail[-2].text
        match = re.search('walk to(.+?)on line', text)
        found = match.group(1) if match else ''
        house_data.loc[list_id, "Metro"] = found.strip()

        # Longitude & latitude.
        long = soup_info.find('span', itemprop="longitude").text
        lat = soup_info.find('span', itemprop="latitude").text
        house_data.loc[list_id, "Longtitude"] = long
        house_data.loc[list_id, "Latitude"] = lat

        # Posting agent.
        house_data.loc[list_id, "Agent"] = soup_info.find('p', class_='username').text

        # Description.
        house_data.loc[list_id, "Description"] = soup_info.find('div', class_='description').text.strip()

        # Post and views: comma-separated text; the first and third parts are used.
        post = soup_info.find('div', class_='posted-and-views').text.strip().split(',')
        house_data.loc[list_id, "First_post"] = ' '.join(post[0].split(' ')[1:])
        house_data.loc[list_id, "Refresh"] = ' '.join(post[2].split(' ')[2:])

        # Amenities: one column per amenity name, 1 for positive and 0 for negative.
        amenities = soup_info.find('div', class_='amenities')
        for item in amenities.find_all('li', class_='positive'):
            house_data.loc[list_id, item.text.strip()] = 1
        for item in amenities.find_all('li', class_='negative'):
            house_data.loc[list_id, item.text.strip()] = 0

    return house_data

In [4]:
# Load the previously saved listings table from the working directory.
# extract_data and page_data read this notebook-level `house_data` name.
house_data = pd.read_csv("housing_data_full.csv",low_memory=False)
# Show (rows, columns) of the loaded frame.
house_data.shape

(26304, 42)

In [8]:
# Update run: fetch listing pages 50 through 99, append the parsed rows to
# house_data and discard rows that are exact duplicates.
scraped_pages = get_data(50, 100)
house_data = extract_data(scraped_pages).drop_duplicates()

# Rows with no 'Type' value are the ones handed to page_data, which fills
# 'Type' (among other columns) from each listing's detail page.
to_page = house_data[house_data['Type'].isnull()]

# Index the frame by listing id so page_data can address rows with .loc[id, ...];
# the Listing_Id column itself is kept.
house_data = house_data.assign(extra_index=house_data.Listing_Id).set_index("extra_index")

house_data = page_data(to_page)

In [10]:
# Write the table back to the same CSV file it was loaded from. The index is
# not written (index=False).
house_data.to_csv("housing_data_full.csv", index=False)

In [9]:
# Show (rows, columns) of the current frame.
house_data.shape

(27789, 42)

In [29]:
# Remove the row whose index label is 1533330, modifying house_data in place.
# NOTE(review): the reason this specific listing is dropped is not recorded
# here - confirm and document it, or remove this cell.
house_data.drop(index=[1533330],inplace=True)

In [8]:
# Display a de-duplicated copy of the frame. The result is not assigned, so
# house_data itself is left unchanged by this cell.
house_data.drop_duplicates()

Unnamed: 0_level_0,Listing_Id,Price,Size,N_Bedrooms,N_Bathrooms,Type,Available From,Agency Commission,Rooms,Floor,...,Large Storage Room,Parking,Pets Allowed\n\n \n false,Playground,Pool,Tennis Courts,Wall heating,Water Filter,Pets Allowed\n\n \n true,Good View
extra_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1530087.0,1530087.0,16500.0,90 sqm,2.0,1.0,Apartments,Available Now,35%,"2 Bedrooms, 1 Bathrooms",7.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,
1530045.0,1530045.0,18000.0,150 sqm,2.0,2.0,Apartments,Available Now,35%,"2 Bedrooms, 2 Bathrooms",3.0,...,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,1.0,
1529782.0,1529782.0,6800.0,40 sqm,1.0,1.0,Apartments,Available Now,35%,"1 Bedrooms, 1 Bathrooms",1.0,...,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,1.0,
1531114.0,1531114.0,12000.0,65 sqm,1.0,0.0,Apartments,Available Now,35%,"1 Bedrooms, 0 Bathrooms",2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,
1531113.0,1531113.0,13000.0,100 sqm,1.0,1.0,Apartments,Available Now,35%,"1 Bedrooms, 1 Bathrooms",1.0,...,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,1.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1547770,1547770,17500,120 sqm,3,2,Apartments,Available Now,-,"3 Bedrooms, 2 Bathrooms",15,...,1.0,1.0,,1.0,1.0,1.0,0.0,0.0,1.0,
1547769,1547769,9000,60 sqm,1,1,Apartments,Available Now,35%,"1 Bedrooms, 1 Bathrooms",4,...,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,1.0,
1547768,1547768,22000,145 sqm,2,2,Apartments,Available Now,-,"2 Bedrooms, 2 Bathrooms",3,...,1.0,1.0,,1.0,1.0,1.0,0.0,0.0,1.0,
1547767,1547767,24500,150 sqm,3,2,Apartments,Available Now,-,"3 Bedrooms, 2 Bathrooms",21,...,1.0,0.0,,0.0,0.0,0.0,1.0,0.0,1.0,
