# Problem 1: scraping houses prices

## Strategy
Loop over all the pages.
For each page, loop over all the list items: 
```html
<li class="sold-results__normal-hit">
  ...
</li>
```
For each list item, extract the following information:
### Sold date: 
```html
<span class="hcl-label hcl-label--state hcl-label--sold-at">
    Såld 9 oktober 2023
</span>
```
remove the "Såld " prefix

### Address: 
```html
<h2 class="sold-property-listing__heading qa-selling-price-title hcl-card__title">
  Skårby station 350
</h2>
```
### Location of the estate: 
```html
   <div class="sold-property-listing__location">
     <div>
       <span class="property-icon property-icon--result">...</span>
       Kareby,
       Kungälvs kommun
     </div>
   </div>
```
access the div child and remove the span element

### Area of the house & number of rooms:
```html  
<div class="sold-property-listing__subheading sold-property-listing__area">
    143
      <span class="listing-card__attribute--normal-weight">
        + 25&nbsp;m²
      </span>
    &nbsp;
    7&nbsp;rum
</div>
```
OR
```html
<div class="sold-property-listing__subheading sold-property-listing__area">
  123&nbsp;m²
  &nbsp;
  6&nbsp;rum
</div>
```
remove the span element if present, remove the m² and rum suffixes
save the biarea if a "+" is present

### Area of the plot:
```html
<div class="sold-property-listing__land-area">
  2&nbsp;963&nbsp;m² tomt
</div>
```
remove the m² tomt suffix

### Closing price:
```html
<span class="hcl-text hcl-text--medium">
  Slutpris 4&nbsp;395&nbsp;000&nbsp;kr
</span>
```
remove the "Slutpris " prefix and the kr suffix
  
---

In [21]:
import glob
from bs4 import BeautifulSoup
import re

Loop over all the pages.

In [22]:
file_pattern = "kungalv_slutpriser/kungalv_slutpris_page_*.html"

data_dict = []

for file_name in glob.glob(file_pattern):
	with open(file_name, "r") as f:
		content = f.read()
		soup = BeautifulSoup(content, "html.parser")
		print("Processing", file_name, "...")
		for li in soup.find_all("li", class_="sold-results__normal-hit"):
			
			# Sold date
			sold_date_element = li.find("span", class_="hcl-label hcl-label--state hcl-label--sold-at")
			if sold_date_element is not None:
				sold_date = (
					sold_date_element.text
					.replace("Såld ", "")
					.strip()
				)

			# Address
			address_element = li.find("h2", class_="sold-property-listing__heading qa-selling-price-title hcl-card__title")
			if address_element is not None:
				address = (
					address_element.text
					.strip()
				)

			# Location
			location_element = li.find("div", class_="sold-property-listing__location").div
			if location_element is not None:
				location_element.find("span").decompose()
				location = re.sub(
					r'\n\s+',
					" ",
					location_element.text.strip(),
				)

			area_element = li.find("div", class_="sold-property-listing__subheading sold-property-listing__area")
			if area_element is not None:
				if area_element.find("span") is not None:
					biarea = float(
						area_element
						.find("span").text
						.replace("\u00a0", "")
						.replace("m²", "")
						.replace("+ ", "")
						.replace(",", ".")
						.strip()
					)
					area_element.find("span").decompose()
				values = (
					area_element.text
					.replace("\u00a0m²", "")
					.replace("\u00a0rum", "")
					.split("\u00a0")
				)
				values = [
					float(v.replace(",", ".").strip())
					for v in values
					if re.match(
						r'^-?\d+(?:\.\d+)?$',
						v.replace(",", ".").strip()
					) is not None
				]
				boarea, rooms = (values + [None, None])[:2]

			# Plot
			plot_element = li.find("div", class_="sold-property-listing__land-area")
			plot = None
			if plot_element is not None:
				plot = float(
					plot_element.text
					.replace("\u00a0", "")
					.replace("m² tomt", "")
					.replace(",", ".")
					.strip()
				)

			# Price
			price_element = li.find("span", class_="hcl-text hcl-text--medium")
			price = None
			if price_element is not None:
				price = float(
					price_element.text
					.replace("\u00a0", "")
					.replace("Slutpris ", "")
					.replace("kr", "")
					.replace(",", ".")
					.strip()
				)

			data_dict.append({
				"sold_date": sold_date,
				"address": address,
				"location": f'"{location}"',
				"boarea": boarea,
				"biarea": biarea,
				"rooms": rooms,
				"plot": plot,
				"price": price,
			})

with open("housing_data.csv", "w") as f:
	f.write(",".join(data_dict[0].keys()) + "\n")
	for data in data_dict:
		f.write(",".join(str(v) for v in data.values()) + "\n")

Processing kungalv_slutpriser/kungalv_slutpris_page_27.html ...
Processing kungalv_slutpriser/kungalv_slutpris_page_31.html ...
Processing kungalv_slutpriser/kungalv_slutpris_page_11.html ...
Processing kungalv_slutpriser/kungalv_slutpris_page_07.html ...
Processing kungalv_slutpriser/kungalv_slutpris_page_06.html ...
Processing kungalv_slutpriser/kungalv_slutpris_page_10.html ...
Processing kungalv_slutpriser/kungalv_slutpris_page_30.html ...
Processing kungalv_slutpriser/kungalv_slutpris_page_26.html ...
Processing kungalv_slutpriser/kungalv_slutpris_page_40.html ...
Processing kungalv_slutpriser/kungalv_slutpris_page_17.html ...
Processing kungalv_slutpriser/kungalv_slutpris_page_01.html ...
Processing kungalv_slutpriser/kungalv_slutpris_page_21.html ...
Processing kungalv_slutpriser/kungalv_slutpris_page_37.html ...
Processing kungalv_slutpriser/kungalv_slutpris_page_36.html ...
Processing kungalv_slutpriser/kungalv_slutpris_page_20.html ...
Processing kungalv_slutpriser/kungalv_sl