In [5]:
# -*- coding: utf-8 -*-
"""
Created on Mon Jan 25, 2023
@author: Kemal
"""

import requests
from bs4 import BeautifulSoup as bs

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from datetime import datetime
import time
import random
import pprint

import os
import subprocess

In [6]:
'''
Globally create alternative user_agents_list and randomly choose one of them in order NOT to get banned. 
'''
# Pool of browser User-Agent strings; parse_request picks one at random for every request.
user_agents_list = [
    'Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.83 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36'
]

# Candidate pause lengths between requests, meant to be passed to time.sleep
# (presumably seconds, since that is the unit time.sleep takes).
sleep_time_list = [0.1, 0.3, 0.5, 0.7, 1, 1.3, 1.5, 1.7]

In [7]:
def parse_request(url, check = False, timeout = 30):
    '''
    Send a GET request to a URL and parse the HTML of the response.

    Parameters:
        url: str
            URL to send the request to

        check: boolean
            If True, print whether the response status code is 200 (default: False)

        timeout: float
            Seconds to wait for the server before requests raises a Timeout
            exception (default: 30). Without a timeout a single stalled
            connection would block the whole scraping loop forever.

    Returns:
        r: BeautifulSoup object
            Parsed HTML body of the response. The body is parsed whatever the
            status code is; pass check=True to see whether the request succeeded.
    '''
    # Randomly choose an agent so that consecutive requests do not all look alike
    agent = {'User-Agent': random.choice(user_agents_list)}

    # Send request to page
    response = requests.get(url, headers=agent, timeout=timeout)

    # Throttling between requests is currently disabled:
    #time.sleep(random.choice(sleep_time_list))

    if check:
        # Report the status of the request
        if response.status_code == 200:
            print('Response status: 200. Response has received succesfully.')
        else:
            print('Check your request again.')

    # Convert the html text to a BeautifulSoup object
    r = bs(response.text, "html.parser")

    return r

In [8]:
# Checkpoint: Check whether the connection is successfully established:
# Fetch a single listing page with check=True so that parse_request prints the status message.
url_1 = "https://www.emlakjet.com/satilik-konut/istanbul/2/"
r = parse_request(url_1, check=True)

Response status: 200. Response has received succesfully.


In [9]:
'''
Build the list of the 50 listing-page URLs that will be scraped.
'''

# Page 1 is the bare base URL; pages 2..50 are the base URL followed by the page number
base_url = "https://www.emlakjet.com/satilik-konut/istanbul/"
page_url_list = [base_url] + [base_url + str(page_no) for page_no in range(2, 51)]

# Print out the list of URLs
#print(page_url_list)

In [10]:
'''
Collect the advert URLs found on every listing page.
'''

# Create empty list to be appended
advert_url_list = []

for page_url in page_url_list:
    r = parse_request(page_url)

    # Advert links are the anchors carrying class '_3qUI9q' on each listing page
    for link in r.find_all('a',{'class':'_3qUI9q'}):
        href = link.get('href')
        # An anchor without an href would make the concatenation below raise TypeError
        if href:
            advert_url_list.append('http://www.emlakjet.com' + href)

# Indexing an empty list would raise IndexError, so only show an example when there is one
if advert_url_list:
    print(f"I have found {len(advert_url_list)} advert, an example of them is given below: \n {advert_url_list[0]}")
else:
    print("I have found 0 advert. Check the listing pages and the '_3qUI9q' class name.")


I have found 1500 advert, an example of them is given below: 
 http://www.emlakjet.com/ilan/safa-gyo-beylikduzu-marinaya-2-dk-kacmaz-lux-dubleks-firsati-12607526/


In [None]:
'''
1) Create 2 empty lists to be appended. Together they act as the keys and values of the final dataframe.
    a) Advert Keys (e.g., brüt m2, oda sayisi, bina yasi)
    b) Advert Values (e.g., 110, 3+1, 16-20)

2) Parse every single advert URL and retrieve the information from the relevant div(s)
    a) Div1 - Price: Some pages do NOT have a price. Instead of interrupting the loop, the price is filled with 'NA'.
    b) Div2 - Location: Location is stored as Şehir - İlçe - Mahalle. It is split and saved as three features;
       missing parts are filled with 'NA'.
    c) Div3 - Other Properties: The texts are appended in page order; step 3 treats them as alternating
       key, value, key, value, ... (assumes the divs really alternate like that - TODO confirm).

3) Split the flat [key, value, key, value, ...] list into two lists: [keys] and [values].
'''

# 1)
props = []
adv_props_key = []
adv_props_val = []

for advert in advert_url_list:
    r = parse_request(advert)

    adv_props = []

    # 2a) find() returns None when the price div is missing; test for that explicitly
    #     instead of hiding every possible error behind a bare except
    adv_props.append('Fiyat (TL)')
    price_div = r.find('div', attrs={'class': '_2TxNQv'})
    if price_div is None:
        print("Couldn't find the price! Going on with the next advert..")
        adv_props.append('NA')
    else:
        adv_props.append(price_div.text.split('TL')[0].replace(',',''))

    # 2b)
    for location in r.find_all('div', attrs={'class': '_3VQ1JB'}):
        location_text = location.find('p')
        if location_text is None:
            # No <p> inside the location div: nothing to split, skip it
            continue

        # city-borough-neighbourhood list, e.g. İstanbul, Beylikdüzü, Sahil Mah.
        cbn_list = [part.strip() for part in location_text.text.split('-')]
        # Pad to three parts so that a shorter location does not raise IndexError
        cbn_list += ['NA'] * (3 - len(cbn_list))

        adv_props.append('Şehir')
        adv_props.append(cbn_list[0])

        adv_props.append('İlçe')
        adv_props.append(cbn_list[1])

        adv_props.append('Mahalle')
        adv_props.append(cbn_list[2])

    # 2c)
    for prop in r.find_all('div',{'class':'_1bVOdb'}):
        adv_props.append(prop.get_text())

    # 3) Even positions hold the keys, odd positions hold the values
    adv_props_key.append(adv_props[0::2])
    adv_props_val.append(adv_props[1::2])


In [314]:
# Checkpoint: confirm that more than two adverts were collected on both the key and the value side
enough_keys = len(adv_props_key) > 2
enough_vals = len(adv_props_val) > 2
if enough_keys and enough_vals:
    print("I have found many props!")

# print(f'Length:',len(adv_props_key))
# print(f'Shape:',np.shape(adv_props_val))
# print(adv_props_val)

I have found many props!


In [273]:
def df_creator(keys,vals):
    '''
    Pair up the property names and values of one advert and return them as a
    one-row pandas dataframe.

    Parameters:
        keys: list
            Property names extracted from the advert -- become the columns

        vals: list
            Property values extracted from the advert -- become the single row

    Returns:
        df: Pandas DataFrame
            One-row table with one column per key
    '''
    # zip stops at the shorter list, so an unpaired trailing key or value is dropped
    advert_record = {key: val for key, val in zip(keys, vals)}

    return pd.DataFrame([advert_record])


In [307]:
'''
Build the dataframe in a single step: one record (dict of key -> value) per advert.

Growing the frame with pd.concat inside a loop copies all previous rows on every
iteration and leaves every row with index label 0. Building it once from the list
of records is linear, gives a clean 0..n-1 index, and also works when no advert
was scraped (the result is then an empty dataframe).
'''
all_adv_df = pd.DataFrame(
    [dict(zip(keys, vals)) for keys, vals in zip(adv_props_key, adv_props_val)]
)


In [None]:
# Checkpoint: Display first few rows to check if everything is OK!
# head() is the last expression of the cell, so the notebook renders it as a table.
all_adv_df.head()

In [311]:
'''
Everything should be OK.
HOWEVER, the rows contain units or some other strings that have to be removed.
'''
def remove_str(df, col, splitter = None, order = 0, new_col = None):
    '''
    Split every string of a column and keep only one of the resulting pieces.

    Parameters:
        df: Pandas DataFrame
            Dataframe to be modified in place

        col: str
            Name of the column to read

        splitter: str
            Separator to split on (default: None, i.e. split on whitespace)

        order: int
            Position of the piece to keep after splitting (default: 0)

        new_col: str
            Name of the column to write the result to (default: None, i.e. overwrite col)

    Returns:
        Pandas Series
            The column that has been written
    '''
    target_col = col if new_col is None else new_col
    df[target_col] = df[col].str.split(splitter).str.get(order)

    return df[target_col]

# Columns whose values carry a unit or another suffix after the number: keep only the
# first whitespace-separated token. Every column is cleaned from ITSELF; previously
# 'Net Metrekare' was overwritten with 'Brüt Metrekare' and 'Aidat' with 'Kira Getirisi'.
for unit_col in ['Brüt Metrekare', 'Net Metrekare', 'Kira Getirisi', 'Aidat',
                 'WC Sayısı', 'WC Metrekare', 'Banyo Metrekare', 'Salon Metrekare']:
    remove_str(all_adv_df, unit_col)

# Some columns require special treatment
# Building age: keep the part before the '-' (e.g. '16-20' -> '16')
all_adv_df['Yaş Nümerik'] = all_adv_df['Binanın Yaşı'].str.split('-').str.get(0)
# Room count (e.g. '3+1'): the part before '+' is the room count, the part after it the living room count
all_adv_df['Normal Oda Sayısı'] = all_adv_df['Oda Sayısı'].str.split('+').str.get(0)
all_adv_df['Salon Sayısı'] = all_adv_df['Oda Sayısı'].str.split('+').str.get(1)


In [310]:
'''
Export the dataframe to a file (.csv or .xlsx) so that it can be reused later
without repeating the time consuming (and unstable) scraping steps.
'''

# The file name carries the date of the export as dd_mm_yy
export_stamp = datetime.now().date().strftime('%d_%m_%y')
all_adv_df.to_excel('emlakjet_istanbul_%s.xlsx' % export_stamp) #emlakjet_istanbul_%s.csv

In [296]:
# Convert some fields to numeric
dict_to_numeric = {
    "Brüt Metrekare": 'float',
    "Binanın Kat Sayısı": 'float',
    "Kira Getirisi": 'float',
    "Banyo Sayısı": 'float',
    "WC Metrekare": 'float',
    "Net Metrekare": 'float',
    #"Aidat":'int',
    "Banyo Metrekare": 'float',
    "WC Sayısı": 'float',
    #"Balkon Sayısı":'float',
    "Salon Metrekare": 'float',
    #"Yaş Nümerik":'int',
    "Normal Oda Sayısı": 'float',
    "Salon Sayısı": 'float',
    'Fiyat (TL)': 'float'
}

# A plain astype(float) raises ValueError on the first non-numeric token, e.g. the 'NA'
# placeholder stored when an advert has no price. pd.to_numeric with errors='coerce'
# turns such tokens into NaN instead; the result is then cast to the requested dtype.
numeric_columns = {
    column: pd.to_numeric(all_adv_df[column], errors='coerce').astype(dtype)
    for column, dtype in dict_to_numeric.items()
}

all_adv_df = all_adv_df.assign(**numeric_columns)

In [297]:
# Total room count = regular rooms + living rooms
regular_rooms = all_adv_df['Normal Oda Sayısı']
living_rooms = all_adv_df['Salon Sayısı']
all_adv_df['Toplam Oda Sayısı'] = regular_rooms + living_rooms