# Importing packages

In [None]:
from bs4 import BeautifulSoup
from bs4 import ResultSet
import numpy as np
import pandas as pd
import requests
import re
import matplotlib.pyplot as plt
from scipy import stats

# Creating necessary variables and functions

In [None]:
# Search-result URLs the data is gathered from, one per market:
# 'ogloszenia_wtórny' is read by the secondary-market (rynek wtórny) scraping cell,
# 'ogloszenia_pierwotny' by the primary-market (rynek pierwotny) scraping cell.
urls = {
    'ogloszenia_wtórny': 'https://ogloszenia.trojmiasto.pl/nieruchomosci-rynek-wtorny/f1i,1_2_3,ri,1_,wi,100_200_230_250_260_220_240_210,xi,1900_.html',
    'ogloszenia_pierwotny': 'https://ogloszenia.trojmiasto.pl/nieruchomosci-rynek-pierwotny/f1i,1_2_3,ri,1_,wi,100_200_230_250_260_220_240_210,xi,1900_.html'
}

In [None]:
def take_data_from_main(response):
  """Parse one search-results page into page-wide collections of tags.

  Args:
    response: object whose ``content`` attribute holds the page HTML
      (the scraping cells pass a ``requests`` response).

  Returns:
    A 7-tuple of BeautifulSoup result collections, in this order:
    titles, districts, areas, numbers of rooms, years of creation,
    prices and title links. Each collection is the result of a query
    over the whole page, not a per-listing lookup.
  """
  page = BeautifulSoup(response.content, 'html.parser')

  def icon_values(field_suffix):
    # Description paragraphs of the details icon for a single attribute.
    return page.select(
      f'li[class = "list__item__details__icons__element details--icons--element--{field_suffix}"] '
      'p[class = "list__item__details__icons__element__desc"]'
    )

  return (
    page.find_all(class_='list__item__content__title__name link'),
    page.select('p[class="list__item__content__subtitle"]'),
    icon_values('powierzchnia'),
    icon_values('l_pokoi'),
    icon_values('rok_budowy'),
    page.select('p[class = "list__item__details__info details--info--price"]'),
    page.select('a[class="list__item__content__title__name link"]'),
  )

# Diving into the page of an individual advertisement to read the attributes
# that are not shown on the main results page
def take_data_from_individual(link):
  """Download a single advertisement page and extract its extra attributes.

  Args:
    link: URL of the advertisement page.

  Returns:
    A tuple ``(type_of_building, additional_list, floor)``:
    the stripped text of the property-type field, the collection of
    ``<b>`` tags found inside the ``oglFieldList__item`` entries, and the
    stripped floor text, or the string ``"nan"`` when the page has no
    floor field.

  Raises:
    IndexError: if the page has no property-type field.
  """
  soup_for_specific_advertisement = BeautifulSoup(requests.get(link).content, 'html.parser')
  type_of_building = soup_for_specific_advertisement.select('div[class="xogField xogField--rodzaj_nieruchomosci"] span[class="xogField__value"]')[0].get_text().strip()
  additional_list = soup_for_specific_advertisement.select('li[class="oglFieldList__item"] b')

  # Not every offer states the floor (not given, or the offer is a house).
  # Check for the missing element explicitly instead of using a bare except,
  # which would also hide unrelated errors and keyboard interrupts.
  floor_matches = soup_for_specific_advertisement.select('a[class="xogField xogField--pietro xogField--withIcon"] span[class="xogField__value xogField__value--big"]')
  if floor_matches:
    floor = floor_matches[0].get_text().strip()
  else:
    floor = "nan"
  return type_of_building, additional_list, floor


# Searching through the content of the page to gather necessary data such as:
- title
- area of the estate
- number of rooms
- district and street where estate is placed
- year of creation

# Gathering data from the secondary market

In [None]:
# Records with advertisement data; each entry is one '='-separated string
data = []

# Searching data for secondary market (rynek wtórny)
url = urls['ogloszenia_wtórny']

# Iterating through each result page and gathering, for every advertisement:
# - title
# - district
# - area
# - number of rooms
# - year of creation
# - price in PLN (for m2)
# - link of individual adv (to dive further)
#
# NOTE(review): the per-attribute lists returned by take_data_from_main are
# page-wide and are paired up by position; this assumes every listing on the
# page exposes every attribute - TODO confirm against the page markup.
for page in range(0, 150):
    print("Page number: " + str(page))
    response = requests.get(url, params={'strona': f"{page}"})
    print(response)
    if not response.ok:
        continue

    # Using previously defined function for taking data from a main page
    title_list, district_list, area_list, number_of_rooms_list, year_of_creation_list, price_list, href_list = take_data_from_main(response)

    print(len(title_list))

    # Iterating through lists to create separate records
    for element in range(len(title_list)):
        # Skip the listing when the page has fewer price tags than titles.
        try:
            price_text = price_list[element].get_text().replace(" ", "").strip()
        except IndexError:
            continue

        # Remove the unit as a suffix. str.rstrip("zł/m2") treats its argument
        # as a set of characters and would also eat trailing '2's of the
        # number itself (e.g. "10722zł/m2" -> "107").
        price = re.sub(r"zł/m2$", "", price_text)

        tytul = title_list[element].get_text().strip()
        district = district_list[element].get_text().strip()
        # Same suffix handling for the area ("72 m2" must give "72", not "7").
        powierzchnia = re.sub(r"\s*m2$", "", area_list[element].get_text().strip())
        number_of_rooms = number_of_rooms_list[element].get_text().strip()
        year_of_creation = year_of_creation_list[element].get_text().strip()
        link = href_list[element]['href']
        type_of_building, additional_list, floor = take_data_from_individual(link)

        # Lower-cased labels of the amenities listed on the advertisement page
        additional_list_str = [item.get_text().strip().lower() for item in additional_list]

        # 1 when the amenity is listed in the offer, 0 otherwise
        garage = int('garaż' in additional_list_str)
        balcony = int('balkon' in additional_list_str)
        internet = int('internet' in additional_list_str)
        parking = int('miejsce parkingowe' in additional_list_str)
        elevator = int('winda' in additional_list_str)
        terrace = int('taras' in additional_list_str)
        basement = int('piwnica' in additional_list_str)
        garden = int('ogródek' in additional_list_str)

        # Creating a record for a specific offer and adding it to the list.
        # '=' is the field separator, so it is removed from every field (not
        # only the title) to keep the number of fields per record constant.
        fields = [
            tytul, district, powierzchnia, number_of_rooms, year_of_creation,
            price, type_of_building, garage, balcony, internet, parking,
            elevator, floor, terrace, basement, garden, "secondary",
        ]
        record = "=".join(str(field).replace("=", "") for field in fields)
        data.append(record)

Page number: 0
<Response [200]>
30
['balkon', 'garaż', 'winda', 'taras', 'piwnica', 'kuchnia']
['winda', 'ogródek', 'woda', 'piwnica', 'miejsce parkingowe']
['gaz', 'woda', 'kanalizacja', 'piwnica', 'miejsce parkingowe', 'internet', 'kuchnia', 'kablówka', 'światłowód', 'rowerownia']
[]
[]
['balkon', 'garaż', 'winda', 'taras', 'woda', 'kanalizacja', 'siła', 'piwnica', 'miejsce parkingowe', 'internet', 'kuchnia', 'aneks kuchenny', 'kablówka', 'podjazd dla niepełnosprawnych', 'światłowód', 'ochrona', 'komórka lokatorska', 'rowerownia', 'osiedle zamknięte']
['balkon', 'piwnica', 'miejsce parkingowe', 'kuchnia']
['taras', 'siła', 'miejsce parkingowe', 'internet']
['winda', 'ogródek', 'miejsce parkingowe', 'internet', 'aneks kuchenny', 'osiedle zamknięte']
['winda', 'taras', 'siła', 'miejsce parkingowe', 'internet', 'aneks kuchenny']
[]
['garaż', 'winda', 'aneks kuchenny']
[]
[]
['balkon', 'gaz', 'woda', 'kanalizacja', 'miejsce parkingowe', 'internet', 'kuchnia', 'kablówka', 'komórka lokator

# Gathering data for the primary market

In [None]:
# Searching data for primary market (rynek pierwotny)
url = urls['ogloszenia_pierwotny']

# Iterating through each result page and gathering, for every advertisement:
# - title
# - district
# - area
# - number of rooms
# - year of creation
# - price in PLN (for m2)
# - link of individual adv (to dive further)
#
# NOTE(review): the per-attribute lists returned by take_data_from_main are
# page-wide and are paired up by position; this assumes every listing on the
# page exposes every attribute - TODO confirm against the page markup.
for page in range(0, 100):
    print("Page number: " + str(page))
    response = requests.get(url, params={'strona': f"{page}"})
    print(response)
    if not response.ok:
        continue

    # Using previously defined function for taking data from a main page
    title_list, district_list, area_list, number_of_rooms_list, year_of_creation_list, price_list, href_list = take_data_from_main(response)

    print(len(title_list))

    # Iterating through lists to create separate records
    for element in range(len(title_list)):
        # Skip the listing when the page has fewer price tags than titles.
        try:
            price_text = price_list[element].get_text().replace(" ", "").strip()
        except IndexError:
            continue

        # Remove the unit as a suffix. str.rstrip("zł/m2") treats its argument
        # as a set of characters and would also eat trailing '2's of the
        # number itself (e.g. "10722zł/m2" -> "107").
        price = re.sub(r"zł/m2$", "", price_text)

        tytul = title_list[element].get_text().strip()
        # The district has to be read for every listing; without this line the
        # record would reuse whatever value `district` held before this loop.
        district = district_list[element].get_text().strip()
        # Same suffix handling for the area ("72 m2" must give "72", not "7").
        powierzchnia = re.sub(r"\s*m2$", "", area_list[element].get_text().strip())
        number_of_rooms = number_of_rooms_list[element].get_text().strip()
        year_of_creation = year_of_creation_list[element].get_text().strip()
        link = href_list[element]['href']
        type_of_building, additional_list, floor = take_data_from_individual(link)

        # Lower-cased labels of the amenities listed on the advertisement page
        additional_list_str = [item.get_text().strip().lower() for item in additional_list]

        # 1 when the amenity is listed in the offer, 0 otherwise
        garage = int('garaż' in additional_list_str)
        balcony = int('balkon' in additional_list_str)
        internet = int('internet' in additional_list_str)
        parking = int('miejsce parkingowe' in additional_list_str)
        elevator = int('winda' in additional_list_str)
        terrace = int('taras' in additional_list_str)
        basement = int('piwnica' in additional_list_str)
        garden = int('ogródek' in additional_list_str)

        # Creating a record for a specific offer and adding it to the list.
        # '=' is the field separator, so it is removed from every field (not
        # only the title) to keep the number of fields per record constant.
        fields = [
            tytul, district, powierzchnia, number_of_rooms, year_of_creation,
            price, type_of_building, garage, balcony, internet, parking,
            elevator, floor, terrace, basement, garden, "primary",
        ]
        record = "=".join(str(field).replace("=", "") for field in fields)
        data.append(record)

Page number: 0
<Response [200]>
30
Page number: 1
<Response [200]>
30
Page number: 2
<Response [200]>
30
Page number: 3
<Response [200]>
30
Page number: 4
<Response [200]>
30
Page number: 5
<Response [200]>
30
Page number: 6
<Response [200]>
30
Page number: 7
<Response [200]>
30
Page number: 8
<Response [200]>
30
Page number: 9
<Response [200]>
30
Page number: 10
<Response [200]>
30
Page number: 11
<Response [200]>
30
Page number: 12
<Response [200]>
30
Page number: 13
<Response [200]>
30
Page number: 14
<Response [200]>
30
Page number: 15
<Response [200]>
30
Page number: 16
<Response [200]>
30
Page number: 17
<Response [200]>
30
Page number: 18
<Response [200]>
30
Page number: 19
<Response [200]>
30
Page number: 20
<Response [200]>
30
Page number: 21
<Response [200]>
30
Page number: 22
<Response [200]>
30
Page number: 23
<Response [200]>
30
Page number: 24
<Response [200]>
30
Page number: 25
<Response [200]>
30
Page number: 26
<Response [200]>
30
Page number: 27
<Response [200]>
30
Pa

In [None]:
# Wrap the scraped '='-separated record strings in a DataFrame with a single column (labelled 0)
df = pd.DataFrame(data)

In [None]:
# Preview the first raw records with the notebook's rich table display
# instead of printing the whole frame as plain text
df.head()

                                                      0
0     3 pok 71m2 loggia 6m2 widok na morze i LAS=Gda...
1     Parter | 2-pokoje | Gotowe do zamieszkania=Gda...
2     Mieszkanie 2-pokojowe Przymorze=Gdańsk Przymor...
3     3- pok.mieszkanie, kamerlane osiedle, nowe bud...
4     Mieszkania=Gdańsk Łostowice, Niepołomicka 20=4...
...                                                 ...
7433  Już jest! nowe i czeka=Gdańsk Łostowice, Wielk...
7434  Mieszkanie - Gdańsk Łostowice=Gdańsk Łostowice...
7435  Blisko morza/81,4 m2/strefa fitness=Gdańsk Brz...
7436  2 pokoje, inwestycyjnie?!=Gdańsk Łostowice, Wi...
7437  Nowe i gotowe=Gdańsk Łostowice, Wielkopolska=4...

[7438 rows x 1 columns]


In [None]:
columns = ["title", "district", "area", "number_of_rooms", "year_of_creation", 'price_for_m2_in_PLN', 'type_of_building', 'garage', 'balcony', 'internet', 'parking', 'elevator', 'floor', 'terrace', 'basement', 'garden','type_of_market']

# Split every raw record into exactly len(columns) fields.
# A plain split on '=' produces an extra column as soon as one field contains
# '=' itself (the saved output shows an 18-column result). The title never
# contains '=', so it is cut off from the left; the 15 fixed-format trailing
# fields are cut off from the right, and whatever remains in between is the
# district. NOTE(review): this assumes a stray '=' can only occur in the
# district text - TODO confirm.
title_and_rest = df[0].str.split('=', n=1, expand=True)
remaining_fields = title_and_rest[1].str.rsplit('=', n=len(columns) - 2, expand=True)
correct_df = pd.concat([title_and_rest[0], remaining_fields], axis=1)
correct_df.columns = columns

# Keep the named columns available on `df` as well, as before
df[columns] = correct_df

In [None]:
# Show the column dtypes of the raw frame
print(df.dtypes)
# Label the split columns with their attribute names and render the table
correct_df.columns = columns
display(correct_df)

0    object
dtype: object


ValueError: Length mismatch: Expected axis has 18 elements, new values have 17 elements

In [None]:
# Columns holding 0/1 amenity flags as strings
flag_columns = ['garage', 'balcony', 'internet', 'parking', 'elevator', 'terrace', 'basement', 'garden']

# Drop trailing whitespace left in the area text. The result is assigned back
# explicitly: calling replace(..., inplace=True) on a selected column is a
# chained assignment and is not guaranteed to modify the frame.
correct_df['area'] = correct_df['area'].replace(r'\s+$', '', regex=True)

# Numeric attributes
correct_df[['area', 'price_for_m2_in_PLN']] = correct_df[['area', 'price_for_m2_in_PLN']].astype(float)
correct_df[['number_of_rooms', 'year_of_creation']] = correct_df[['number_of_rooms', 'year_of_creation']].astype(int)

# "0"/"1" strings -> int -> bool (a direct string -> bool cast would turn "0" into True)
correct_df[flag_columns] = correct_df[flag_columns].astype(int).astype(bool)

# The "nan" placeholder becomes a real missing value; "Parter" (ground floor) becomes "0"
correct_df['floor'] = correct_df['floor'].replace('nan', np.nan).replace("Parter", "0")
print(correct_df.dtypes)

In [None]:
# Show only the offers scraped from the secondary market
is_secondary = correct_df['type_of_market'].eq('secondary')
correct_df[is_secondary]

Unnamed: 0,title,district,area,number_of_rooms,year_of_creation,price_for_m2_in_PLN,type_of_building,garage,balcony,internet,parking,elevator,floor,terrace,basement,garden,type_of_market
0,2 pok oddzielna kuchnia cicha część Starówki,Gdańsk Śródmieście,40.0,2,1952,19975.0,Mieszkanie,False,True,False,False,False,2.0,False,True,False,secondary
1,Piękna kamienica | Cicha okolica | Widok na zi...,"Gdańsk Wrzeszcz, Adama Mickiewicza",96.6,4,1910,9834.0,Mieszkanie,False,False,True,False,False,2.0,False,True,False,secondary
2,2-pokojowe mieszkanie Zaspa,"Gdańsk Zaspa, Powstańców Wielkopolskich 5",53.19,2,2017,17823.0,Mieszkanie,False,False,True,True,True,0.0,False,False,True,secondary
3,Okazja cenowa! inwestycyjne,"Gdańsk Długie Ogrody, Długa Grobla",42.59,2,2017,18429.0,Mieszkanie,False,False,True,True,True,0.0,True,False,False,secondary
4,Na sprzedaż mieszkanie 4-pokojowe Gdańsk Wrzeszcz,"Gdańsk Wrzeszcz, Bohaterów Getta Warszawskiego",74.53,4,1901,10466.0,Mieszkanie,False,False,False,False,False,2.0,False,False,False,secondary
5,Dni otwarte 17.04 duży ogródek || 20 min do morza,"Gdańsk Letnica, Letnicka",56.05,3,2023,14077.0,Mieszkanie,True,False,False,False,True,0.0,False,False,False,secondary
6,Unikalny i funkcjonalny apartament w Sopocie,"Sopot Dolny Sopot, Łokietka 19",14.0,6,2015,25345.0,Mieszkanie,True,True,True,True,True,2.0,True,True,False,secondary
7,"Ekskluzywny Apartament 3-pokoje, blisko plaży","Gdańsk Letnica, Starowiejska",67.5,3,2023,16993.0,Mieszkanie,False,False,False,False,False,6.0,False,False,False,secondary
8,Promocja I Nowo Urządzone| 3 Pokoje|Nowoczesne...,"Gdańsk Kokoszki, Nowatorów",66.7,3,2023,12429.0,Mieszkanie,False,False,False,False,False,4.0,False,False,False,secondary
9,Słoneczne mieszkanie na sprzedaż- Diamentowa,"Gdańsk Orunia, Diamentowa 10",45.0,2,1970,12200.0,Mieszkanie,False,True,True,True,False,1.0,False,False,False,secondary


In [None]:
# Show the offers for which no floor value is available
floor_missing = correct_df['floor'].isna()
correct_df[floor_missing]

Unnamed: 0,title,district,area,number_of_rooms,year_of_creation,price_for_m2_in_PLN,type_of_building,garage,balcony,internet,parking,elevator,floor,terrace,basement,garden,type_of_market
28,"Piękny, nowy dom jednorodzinny z dużym ogrodem","Gdańsk Św. Wojciech, Miłocińska",160.0,6,2009,8987.0,Dom wolnostojący,True,False,False,True,False,,False,False,True,secondary
31,AMprojekt- Perspectum Gdańsk Jasień/Zabornia O...,"Gdańsk Jasień, Stolema",243.71,6,2023,7911.0,Dom bliźniak,True,True,True,True,False,,True,False,True,primary
40,Komfortowy dom w pięknej okolicy - Mondo,"Gdańsk Morena, Jasieńska",216.0,6,2024,11806.0,Dom bliźniak,False,False,False,False,False,,False,False,False,primary
50,INPRO S.A. - OPTIMA - mieszkanie 3-pok. 61.71 ...,"Gdańsk Jasień, Tadeusza Jasińskiego",61.71,3,2025,10476.0,Mieszkanie,True,False,True,True,False,,False,True,False,primary
57,INPRO S.A. - Koncept - Gotowe do odbioru mies...,Gdańsk,48.35,2,2023,11340.0,Mieszkanie,True,False,True,True,False,,False,True,False,primary
63,"Dom 257 m2 w Gdańsku ul. Kartuska, działka 647 m2","Gdańsk, Kartuska 300",257.1,5,2024,906.0,Dom szeregowy,True,False,True,True,False,,False,False,True,primary
64,Dom 184 m2 w Gdańsku ul. Kartuska,"Gdańsk, Kartuska 300",183.74,5,2024,9960.0,Dom szeregowy,True,False,True,True,False,,False,False,True,primary
65,"Dom 184 m2 w Gdańsku ul. Kartuska, działka 353 m2","Gdańsk, Kartuska 300",183.74,5,2024,107.0,Dom szeregowy,True,False,True,True,False,,False,False,True,primary
70,Dom szeregowy - 7A,"Gdynia Redłowo, Kombatantów 18/1",157.35,5,2024,15570.0,Dom szeregowy,True,True,False,True,False,,True,True,False,primary
71,Dom gotowy do wydania 5B ETAP,"Gdańsk Matarnia - Rębiechowo, Ikara",113.66,4,2023,8578.0,Dom szeregowy,True,True,True,True,False,,True,False,True,primary


In [None]:
# Persist the cleaned dataset. The positional index carries no information,
# so it is not written; otherwise it comes back as an extra "Unnamed: 0"
# column when the file is read again.
correct_df.to_csv('estate_data.csv', index=False)

In [None]:
# Reload the saved dataset from the CSV file and render it
df_from_file = pd.read_csv('estate_data.csv')
display(df_from_file)

In [None]:
# Split "district, street" into two columns, reading from the frame that was
# loaded from the CSV file rather than from the in-memory scraping frame.
# Only the first comma is used as the separator, so a street that contains a
# comma cannot produce a third column; entries without a comma get a missing street.
# NOTE: re-running this cell after the split would blank the street column,
# because the district no longer contains a comma - reload the CSV first.
district_parts = df_from_file['district'].str.split(',', n=1)
df_from_file['street'] = district_parts.str[1].str.strip()
df_from_file['district'] = district_parts.str[0].str.strip()

# Districts consisting of a single word (no space); na=False treats a missing
# district as "contains no space" instead of failing on the negation
rows_without_space = df_from_file[~df_from_file['district'].str.contains(' ', regex=False, na=False)]
display(rows_without_space)