In [35]:
import schedule
import time
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
import numpy as np

In [28]:
baseurl = 'https://www.smartshanghai.com/housing/apartments-rent'

In [26]:
def get_data(n):
    contents = []
    for page in range(n):
        
        params = {'page': page}
        response = requests.get(baseurl,params)
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, "html.parser")
            contents.append(soup.find_all("div", class_ = 'cont'))
            
        else:
            print(response.status_code)
    return contents

In [97]:
new = get_data(1)

In [33]:
def extract_data(contents):   
    
    Listing_Id = []
    District = []
    Price = []
    Size = []
    N_Bedrooms = []
    N_Bathrooms = []

    for content in contents:
        for i in range(len(content)):
            Listing_Id.append(content[i].find('div').attrs['data-listingid'])
    
            apts = content[i].find('div', class_ = 'body')
            price = apts.find('div', class_ = 'price').text.strip().split()[1].split(',')
            Price.append(price[0]+price[1])
    
            info = re.findall('\d+', apts.find('div', class_ = 'room-type').text.strip())        
            Size.append(info[0])
            N_Bedrooms.append(info[1])
            N_Bathrooms.append(info[2])
    
    df = pd.DataFrame(np.column_stack([Listing_Id,Price,Size,N_Bedrooms,N_Bathrooms]), 
                    columns=['Listing_Id','Price','Size','N_Bedrooms', 'N_Bathrooms'])
    
    
    return df

In [146]:
raw = extract_data(new)

In [147]:
raw.shape

(15, 5)

In [149]:
raw = raw[:2]

In [150]:
raw

Unnamed: 0,Listing_Id,Price,Size,N_Bedrooms,N_Bathrooms
0,1597609,36800,150,3,2
1,1597608,8500,65,1,1


In [136]:
raw.columns

Index(['Listing_Id', 'Price', 'Size', 'N_Bedrooms', 'N_Bathrooms'], dtype='object')

In [38]:
def page_data(data):

    features = ['Type', 'Available From', 'Agency Commission', 'Rooms', 'Size',
               'Floor', 'Furnished', 'Main Window Facing', 'District', 'Area',
                'Compound', 'Metro Station', 'Longtitue', 'Latitude', 'posting agent', 'description', 'first_post', 'Refresh']
    

    for list_id in data.Listing_Id:
        response = requests.get(f'{baseurl}/{list_id}')
        if response.status_code == 200:
            soup_info = BeautifulSoup(response.content, "html.parser")

        
        #each list's information
        try:
            detail = soup_info.find_all('div', class_='details')[0].find_all(name='div')
        except IndexError:
            print(list_id)
            
        #from 'Type' to 'Area'
        for indx, j in enumerate(detail[0:-3]):
            house_data.loc[list_id,features[indx]] = j.text.strip()
        
        
        #'Compound'
        house_data.loc[list_id,"Compound"] = detail[-3].text.split('/')[0].strip()
        
        
        # metro station
        text = detail[-2].text 
        try:
            found = re.search('walk to(.+?)on line', text).group(1)
        except AttributeError:
            found = ''
        house_data.loc[list_id,"Metro"] = found.strip()
        
        #long & lat
        long = soup_info.find('span', itemprop="longitude").text
        lat = soup_info.find('span', itemprop="latitude").text
        house_data.loc[list_id,"Longtitude"] = long
        house_data.loc[list_id,"Latitude"] = lat
        
        #posting agent
        house_data.loc[list_id,"Agent"] = soup_info.find('p', class_='username').text
        
        #description
        house_data.loc[list_id,"Description"] = soup_info.find('div', class_='description').text.strip()
        
        #post and views
        post = soup_info.find('div', class_='posted-and-views').text.strip().split(',')
        
        house_data.loc[list_id,"First_post"] = ' '.join(post[0].split(' ')[1:])
        house_data.loc[list_id,"Refresh"] = ' '.join(post[2].split(' ')[2:])
        
        #values.append(value)  # all listings
        
        
        #amenities 
        amenity_pos = soup_info.find('div', class_='amenities').find_all('li', class_='positive')
        amenity_neg = soup_info.find('div', class_='amenities').find_all('li', class_='negative')
        
        amenity_pos = [i.text.strip() for i in amenity_pos]
        amenity_neg = [i.text.strip() for i in amenity_neg]
        
        for indx, amenity in enumerate(amenity_pos):
            house_data.loc[list_id,amenity_pos[indx]] = 1
        
        for indx, amenity in enumerate(amenity_neg):
            house_data.loc[list_id, amenity_neg[indx]] = 0

        
    return house_data

In [110]:
response = requests.get(f'{baseurl}/1597608')

In [111]:
soup_info = BeautifulSoup(response.content, "html.parser")

In [112]:
detail = soup_info.find_all('div', class_='details')[0].find_all(name='div')

In [113]:
Amenities = soup_info.find_all('div', class_='amenities')[0].find_all('li', class_=['positive', 'negative'])

In [114]:
Amenities

[<li class="positive">
                                         Air Filter
 
                                                                             </li>,
 <li class="positive">
                                         Balcony
 
                                                                             </li>,
 <li class="positive">
                                         English Speaking Landlord
 
                                                                             </li>,
 <li class="positive">
                                         Fitness Centers
 
                                                                             </li>,
 <li class="negative">
                                         Floor Heating
 
                                                                             </li>,
 <li class="negative">
                                         Garden
 
                                                                             </li>,
 <li class="negati

In [115]:
amenities = []
for li in Amenities:
    amenity = li.text.strip()
    status = 1 if 'positive' in li['class'] else 0
    amenities.append({'amenity': amenity, 'status': status})

amenities_df = pd.DataFrame(amenities)
print(amenities_df)

                                              amenity  status
0                                          Air Filter       1
1                                             Balcony       1
2                           English Speaking Landlord       1
3                                     Fitness Centers       1
4                                       Floor Heating       0
5                                              Garden       0
6                                   Historic Building       0
7                          Landlord lives in Shanghai       1
8                                  Large Storage Room       0
9                                                Oven       0
10                                            Parking       1
11  Pets Allowed\n\n                              ...       1
12                                         Playground       1
13                                               Pool       0
14                                      Tennis Courts       0
15      

In [155]:
for list_id in raw['Listing_Id']:
    response = requests.get(f'{baseurl}/{list_id}')
    if response.status_code == 200:
        soup_info = BeautifulSoup(response.content, "html.parser")
        amenities = soup_info.find_all('div', class_='amenities')[0].find_all('li', class_=['positive', 'negative'])
        
    amenities_dict = {}
    for amenity_li in amenities:
        amenity = amenity_li.text.strip()
        amenities_dict[amenity] = 1 if 'positive' in amenity_li['class'] else 0
            
            # Add the amenity to the corresponding row in the original DataFrame
    raw.loc[raw['Listing_Id'] == list_id, list(amenities_dict.keys())] = pd.DataFrame([amenities_dict])
    #raw.loc[raw['Listing_Id'] == list_id, amenity_name] = status

In [157]:
amenities_dict

{'Air Filter': 1,
 'Balcony': 1,
 'English Speaking Landlord': 1,
 'Fitness Centers': 1,
 'Floor Heating': 0,
 'Garden': 0,
 'Historic Building': 0,
 'Landlord lives in Shanghai': 1,
 'Large Storage Room': 0,
 'Oven': 0,
 'Parking': 1,
 'Pets Allowed\n\n                                                                                    \n                                                true': 1,
 'Playground': 1,
 'Pool': 0,
 'Tennis Courts': 0,
 'Villa': 0,
 'Wall heating': 0,
 'Water Filter': 0}

                                              amenity    status
0                                          Air Filter  positive
1                                             Balcony  negative
2                           English Speaking Landlord  negative
3                                     Fitness Centers  negative
4                                       Floor Heating  negative
5                                              Garden  negative
6                                   Historic Building  negative
7                          Landlord lives in Shanghai  positive
8                                  Large Storage Room  positive
9                                                Oven  positive
10                                            Parking  negative
11  Pets Allowed\n\n                              ...  negative
12                                         Playground  negative
13                                               Pool  negative
14                                      

In [None]:
amenities = []
for li in soup.find_all('li', class_=['positive', 'negative']):
    amenity = li.text.strip()
    status = 'positive' if 'positive' in li['class'] else 'negative'
    amenities.append({'amenity': amenity, 'status': status})

amenities_df = pd.DataFrame(amenities)

In [62]:
Amenities

[]

In [56]:
detail

[<div>
                                                                                     Apartments
                                         
                                                                             </div>,
 <div>
                                         Available Now                                    </div>,
 <div>
                                         35%
                                     </div>,
 <div>
 <span itemprop="numberOfBedrooms">2</span> Bedrooms, <span itemprop="numberOfBathroomsTotal">2</span> Bathrooms
                                         </div>,
 <div>
                                         100 sqm                                    </div>,
 <div>
                                         10
                                     </div>,
 <div>
                                                                                     Furnished                                                                            </div>,
 <div>
            

In [54]:
detail[0:-3]

[<div>
                                                                                     Apartments
                                         
                                                                             </div>,
 <div>
                                         Available Now                                    </div>,
 <div>
                                         35%
                                     </div>,
 <div>
 <span itemprop="numberOfBedrooms">2</span> Bedrooms, <span itemprop="numberOfBathroomsTotal">2</span> Bathrooms
                                         </div>,
 <div>
                                         100 sqm                                    </div>,
 <div>
                                         10
                                     </div>,
 <div>
                                                                                     Furnished                                                                            </div>,
 <div>
            

In [53]:
for indx, j in enumerate(detail[0:-3]):
    house_data.loc[list_id,features[indx]] = j.text.strip()

NameError: name 'list_id' is not defined

In [44]:
problem = page_data(raw)

In [45]:
problem

Unnamed: 0,Listing_Id,Price,Size,N_Bedrooms,N_Bathrooms,Type,Available From,Agency Commission,Rooms,Floor,...,Parking,Pets Allowed\n\n \n false,Playground,Pool,Tennis Courts,Wall heating,Water Filter,Pets Allowed\n\n \n true,Good View,Villa
4011,1526450.0,17500.0,380 sqm,2.0,1.0,Apartments,Available Now,-,"4 Bedrooms, 6 Bathrooms",2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,
1597545,,,100 sqm,,,Apartments,Available Now,35%,"2 Bedrooms, 2 Bathrooms",10.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,0.0
1597544,,,100 sqm,,,Apartments,Available Now,35%,"1 Bedrooms, 1 Bathrooms",2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,0.0
1597543,,,158 sqm,,,Apartments,Available Now,-,"3 Bedrooms, 2 Bathrooms",17.0,...,0.0,,0.0,0.0,0.0,0.0,0.0,1.0,,0.0
1597542,,,120 sqm,,,Apartments,Available Now,35%,"2 Bedrooms, 2 Bathrooms",3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,0.0
1597541,,,110 sqm,,,Apartments,Available Now,35%,"2 Bedrooms, 1 Bathrooms",8.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,0.0
1597540,,,150 sqm,,,Apartments,Available Now,35%,"3 Bedrooms, 2 Bathrooms",7.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,0.0
1597539,,,90 sqm,,,Apartments,Available Now,35%,"2 Bedrooms, 1 Bathrooms",2.0,...,0.0,,0.0,0.0,0.0,0.0,0.0,1.0,,0.0
1597538,,,85 sqm,,,Apartments,Available Now,35%,"1 Bedrooms, 1 Bathrooms",13.0,...,0.0,,0.0,0.0,0.0,0.0,0.0,1.0,,0.0
1597537,,,45 sqm,,,Apartments,Available Now,35%,"1 Bedrooms, 1 Bathrooms",6.0,...,0.0,,0.0,0.0,0.0,1.0,1.0,1.0,,0.0


In [42]:
house_data = pd.read_csv("housing_data_full.csv").sample(n=1)


In [17]:
house_data.columns

Index(['Listing_Id', 'Price', 'Size', 'N_Bedrooms', 'N_Bathrooms', 'Type',
       'Available From', 'Agency Commission', 'Rooms', 'Floor', 'Furnished',
       'Main Window Facing', 'District', 'Area', 'Compound', 'Metro',
       'Longtitude', 'Latitude', 'Agent', 'Description', 'First_post',
       'Refresh', 'Balcony', 'Landlord lives in Shanghai', 'Oven',
       'Recently renovated', 'Air Filter', 'English Speaking Landlord',
       'Fitness Centers', 'Floor Heating', 'Garden', 'Historic Building',
       'Large Storage Room', 'Parking',
       'Pets Allowed\n\n                                                                                    \n                                                false',
       'Playground', 'Pool', 'Tennis Courts', 'Wall heating', 'Water Filter',
       'Pets Allowed\n\n                                                                                    \n                                                true',
       'Good View', 'Villa'],
      dtype='ob

In [18]:
today = extract_data(get_data(0,1))

In [21]:
page_data(today)

Unnamed: 0,Listing_Id,Price,Size,N_Bedrooms,N_Bathrooms,Type,Available From,Agency Commission,Rooms,Floor,...,Parking,Pets Allowed\n\n \n false,Playground,Pool,Tennis Courts,Wall heating,Water Filter,Pets Allowed\n\n \n true,Good View,Villa
0,1530087.0,16500.0,90 sqm,2.0,1.0,Apartments,Available Now,35%,"2 Bedrooms, 1 Bathrooms",7.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,
1,1530045.0,18000.0,150 sqm,2.0,2.0,Apartments,Available Now,35%,"2 Bedrooms, 2 Bathrooms",3.0,...,0.0,,0.0,0.0,0.0,0.0,0.0,1.0,,
2,1529782.0,6800.0,40 sqm,1.0,1.0,Apartments,Available Now,35%,"1 Bedrooms, 1 Bathrooms",1.0,...,0.0,,0.0,0.0,0.0,0.0,0.0,1.0,,
3,1531114.0,12000.0,65 sqm,1.0,0.0,Apartments,Available Now,35%,"1 Bedrooms, 0 Bathrooms",2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,
4,1531113.0,13000.0,100 sqm,1.0,1.0,Apartments,Available Now,35%,"1 Bedrooms, 1 Bathrooms",1.0,...,0.0,,0.0,0.0,0.0,0.0,0.0,1.0,,
5,1531112.0,20000.0,140 sqm,2.0,2.0,Apartments,Available Now,-,"2 Bedrooms, 2 Bathrooms",15.0,...,1.0,,1.0,0.0,1.0,1.0,0.0,1.0,,
6,1531111.0,16000.0,130 sqm,3.0,2.0,Apartments,Available Now,-,"3 Bedrooms, 2 Bathrooms",2.0,...,0.0,,0.0,0.0,0.0,0.0,0.0,1.0,,
7,1530078.0,10200.0,60 sqm,1.0,1.0,Apartments,Available Now,35%,"1 Bedrooms, 1 Bathrooms",2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,
8,1531110.0,15000.0,110 sqm,2.0,2.0,Apartments,Available Now,-,"2 Bedrooms, 2 Bathrooms",2.0,...,0.0,,0.0,0.0,0.0,1.0,0.0,1.0,,
9,1530548.0,6100.0,35 sqm,1.0,1.0,Apartments,Available Now,35%,"1 Bedrooms, 1 Bathrooms",3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,


In [None]:
house_data = pd.concat((house_data, today), ignore_index=True)
house_data["extra_index"] = house_data.Listing_Id
house_data.set_index("extra_index", inplace=True)
today = page_data(house_data)