<a href="https://colab.research.google.com/github/KamalM13/hatla2ee-scraper/blob/main/Information_Retrieval_Final_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Scraping used cars listed for sale from all brands on the Egyptian website Hatla2ee
We will make use of 2 basic libraries and BeautifulSoup

*   Pandas
*   Requests
*   BeautifulSoup

# Problem Statement
## Phase 1: Data Scraping and Collection
### Target Website: [Hatla2ee](https://eg.hatla2ee.com/en/)
![picture](https://i.ibb.co/MMdJspN/image.png)

### Data to be scraped is as follows
* Name
* Brand
* Model
* Color
* Manafacture Year
* Price
* Mileage
* Location
* Listing Date

### Mission
This is a two-phase project in which the first phase prepares a dataset for data modeling in Power BI. In this notebook, we scrape all car listings for each brand on the Egyptian website Hatla2ee.

### Methodology
* Download the Hatla2ee search page and convert its content to a BeautifulSoup object
* Derive an algorithm to navigate all search pages according to a car brand
* Parse the required information through multiple processing steps and write the data to a .csv file
* Preview data using Pandas


# Downloading Hatla2ee Page
This section will scrape all car listings for each brand, including all of its subsequent search pages

**The Car brands are as follows:**


In [None]:
# Maps each brand's display name to the numeric id that is passed as the
# `make` query parameter of the Hatla2ee search URL.
car_dict = {
    "Abarth": 255,
    "Acura": 247,
    "Alfa Romeo": 48,
    "Aston Martin": 106,
    "Audi": 23,
    "Baic": 110,
    "Bentley": 104,
    "Bestune": 221,
    "BMW": 8,
    "Borgward": 278,
    "Brilliance": 5,
    "Bugatti": 138,
    "Buick": 70,
    "Byd": 6,
    "Cadillac": 51,
    "Canghe": 128,
    "Chana": 132,
    "Changan": 77,
    "Chery": 94,
    "Chevrolet": 1,
    "Chrysler": 53,
    "Citroën": 9,
    "Cupra": 233,
    "Daewoo": 14,
    "Daihatsu": 27,
    "Datsun": 84,
    "DFSK": 126,
    "Dodge": 54,
    "Domy": 158,
    "Dongfeng": 114,
    "Dorcen": 279,
    "Ds": 159,
    "Emgrand": 72,
    "Exeed": 237,
    "Faw": 65,
    "Ferrari": 90,
    "Fiat": 4,
    "Ford": 45,
    "Forthing": 234,
    "Foton": 102,
    "GAC": 108,
    "Gaz": 96,
    "Geely": 59,
    "Genesis": 231,
    "Gmc": 57,
    "Great Wall": 64,
    "Hafei": 63,
    "Haima": 120,
    "Hanteng": 162,
    "Haval": 146,
    "Hawtai": 136,
    "Honda": 41,
    "Hongqi": 212,
    "Hummer": 50,
    "Hyundai": 12,
    "Ineos": 285,
    "Infiniti": 60,
    "Isuzu": 61,
    "Jac": 62,
    "Jaguar": 28,
    "Jeep": 29,
    "Jetour": 163,
    "Jonway": 68,
    "Kaiyi": 227,
    "Karry": 100,
    "Kenbo": 122,
    "Keyton": 124,
    "KGM (ssangyong)": 287,
    "Kia": 3,
    "Lada": 2,
    "Lamborghini": 116,
    "Lancia": 75,
    "Land Rover": 56,
    "Landwind": 142,
    "Leapmotor": 253,
    "Lexus": 52,
    "Lifan": 69,
    "Lincoln": 46,
    "Lotus": 248,
    "Lynkco": 252,
    "Mahindra": 82,
    "Maserati": 92,
    "Maxus": 228,
    "Mazda": 16,
    "McLaren": 243,
    "Mercedes": 40,
    "Mercury": 76,
    "MG": 71,
    "Mini": 66,
    "Mitsubishi": 13,
    "Nissan": 21,
    "Opel": 22,
    "Perodua": 240,
    "Peugeot": 10,
    "Polestar": 275,
    "Pontiac": 67,
    "Porsche": 73,
    "Proton": 44,
    "Renault": 30,
    "Rolls Royce": 224,
    "Saab": 74,
    "Saipa": 118,
    "Scion": 86,
    "Seat": 47,
    "Senova": 98,
    "Skoda": 43,
    "Skywell": 284,
    "Smart": 130,
    "Sokon": 140,
    "Soueast": 88,
    "Speranza": 7,
    "Ssang Yong": 49,
    "Subaru": 39,
    "Suzuki": 31,
    "Tank": 249,
    "Tata": 80,
    "Tesla": 134,
    "Toyota": 36,
    "Vgv": 281,
    "Volkswagen": 35,
    "Volvo": 33,
    "Zeekr": 273,
    "ZNA": 112,
    "Zotye": 78,
}


# Scraping Section
This section is concerned with scraping all HTML pages for each brand and their subsequent search pages.
Our approach is fairly simple and uses two kinds of requests.
The first request, made once per brand, finds out how many search pages
that brand has; the following requests, one per search page, download each
page and save it as a BeautifulSoup object.

In [173]:

import requests
from bs4 import BeautifulSoup as bs
# Send a browser-like User-Agent so that the requests look like they come from
# a real user's browser instead of from a Python script.
request_headers = {
    'user-agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/91.0.4472.124 Safari/537.36'
    ),
}
# Since hatla2ee supports pagination, we have to implement a function to get the nth page for every brand
def get_brand_pages_number(url):
  """Return the number of pagination links found on a brand's search page.

  Args:
    url: Search URL of one brand.

  Returns:
    The count of ``<a class="paginate">`` elements in the downloaded page,
    or 0 when the page has none.

  Raises:
    Exception: If the server answers with a non-success status code.
  """
  # The timeout keeps a single stalled connection from hanging the whole run.
  response = requests.get(url, headers=request_headers, timeout=30)
  if not response.ok:
    print("Error:", response.status_code)
    raise Exception("Failed to fetch " + url)
  # Name the parser explicitly so the result does not depend on which optional
  # parsers happen to be installed (and to avoid bs4's guessed-parser warning).
  doc = bs(response.text, "html.parser")
  # NOTE(review): this treats the number of "paginate" links as the number of
  # search pages — confirm against the site's pagination markup.
  return len(doc.find_all("a", class_="paginate"))


def get_brand_for_nth_page(brand):
  """Download and parse every search-result page of one brand.

  Args:
    brand: Value placed in the ``make`` query parameter of the search URL
      (converted with ``str``).

  Returns:
    A list with one BeautifulSoup document per search page, in page order.

  Raises:
    Exception: If any page request returns a non-success status code.
  """
  # Example url: https://eg.hatla2ee.com/en/car/search?make=8&city=0&dateMin=0&priceMin=&model=&body=&dateMax=0&priceMax=&page=2
  url_start = 'https://eg.hatla2ee.com/en/car/search?make='
  # The remaining query parameters are left at their defaults; the page number
  # is appended after the trailing "page=".
  url_end = '&city=0&dateMin=0&priceMin=&model=&body=&dateMax=0&priceMax=&page='
  brand_url = url_start + str(brand) + url_end

  # A brand without pagination links still has one page of results to fetch.
  pages = max(get_brand_pages_number(brand_url), 1)

  docs = []
  for page_number in range(1, pages + 1):
    page_url = brand_url + str(page_number)
    # Send the same browser-like headers as the page-count request, and bound
    # the wait so one stalled connection cannot hang the run.
    response = requests.get(page_url, headers=request_headers, timeout=30)
    # Check the status before parsing so an error page is never parsed.
    if not response.ok:
      print('Status Code:', response.status_code)
      raise Exception('Failed to get web page ' + page_url)
    docs.append(bs(response.text, "html.parser"))

  return docs

# Download the pages of every brand: store[k] is the list of parsed pages
# returned for the k-th brand id in car_dict.
store = []
for brand_id in car_dict.values():
  store.append(get_brand_for_nth_page(brand_id))


In [None]:
# Inspect the list of parsed pages collected for the first brand in car_dict.
store[0]

# Scraping the required values
Now that we have all the website pages as BeautifulSoup objects, we can go
through the CSS classes to identify the fields required for our
dataset. Again, the approach is pretty simple: for every target value we initialize a counter and a `find_all` lookup, and we loop over them for every page in our collection of brand pages.

In [174]:
import re
# Compiled once here instead of once per page inside the loop.
LOCATION_HREF = re.compile("/en/car/city")


def _tag_text(container, child_name):
  """Return the stripped text of `container`'s first `child_name` tag.

  Falls back to the container's own text when it has no such child, so a
  missing inner tag does not raise AttributeError.
  """
  child = container.find(child_name)
  return (container if child is None else child).text.strip()


def _parse_page(page):
  """Extract one dict per car listing from a parsed search-result page.

  Returns a list of dicts with the keys Name, Brand, Model,
  Manafacture Year, Color, Mileage, Price, Location and Listing Date.

  Raises IndexError when the page holds fewer meta/price/location/date
  elements than listings.
  """
  # NOTE(review): values are matched to listings by position, which assumes
  # every listing contributes exactly two meta tags (color, then mileage),
  # one price, one location link and one date — verify against the markup.
  meta_tags = page.find_all("span", class_="newCarListUnit_metaTag")
  locations = page.find_all("a", href=LOCATION_HREF)
  dates = page.find_all("div", class_="otherData_Date")
  prices = page.find_all("div", class_="main_price")

  listings = []
  for header in page.find_all("div", class_="newCarListUnit_header"):
    anchor_tag = header.find("a")
    if anchor_tag is None:
      continue
    # Headers without a link are skipped without consuming a position, so the
    # number of listings collected so far is the index into the value lists.
    idx = len(listings)
    name = anchor_tag.text.strip()
    # The title is split as "<brand> <model words...> <year>".
    words = name.split()
    listings.append({
        "Name": name,
        "Brand": words[0] if words else "",
        "Model": ' '.join(words[1:-1]),
        "Manafacture Year": words[-1] if words else "",
        "Color": meta_tags[2 * idx].text.strip(),
        "Mileage": meta_tags[2 * idx + 1].text.strip(),
        "Price": _tag_text(prices[idx], "a"),
        "Location": locations[idx].text.strip(),
        "Listing Date": _tag_text(dates[idx], "span"),
    })
  return listings


# Flatten every page of every brand into a single list of listing dicts.
car_info_list = []
for brand_pages in store:
  for page in brand_pages:
    car_info_list.extend(_parse_page(page))




In [175]:
import pandas as pd
df = pd.DataFrame(car_info_list)

# Convert the scraped text of both numeric columns to integers by keeping only
# the digits (dropping separators, spaces, the unit and the currency) and
# treating a value without any digit, such as "-", as 0.
# The unit must be removed rather than replaced by a digit: turning "Km" into
# "0" would append a zero to every mileage and inflate it tenfold.
for column in ('Mileage', 'Price'):
  digits = df[column].str.replace(r'\D', '', regex=True)
  df[column] = digits.replace('', '0').astype(int)

df.to_csv('hatla2ee_scrap_data.csv')
# Kept as the last expression so that a notebook cell displays the preview.
df.head()