## WebScraper v0.4
### Steps:
#### - Fetching all 'Pride' ad pages
#### - Light pre-processing
#### - Saving as CSV

### Importing libraries

In [2]:
import json
from pathlib import Path

import numpy as np
import pandas as pd
import requests

---
### Extracting 'Pride' car ads

In [16]:
total_pages = 21    # Changeable based on active ads on website !!!
request_timeout = 30   # seconds; keeps a stalled connection from hanging the whole run
car_counter = 0
page_counter = 0

columns = [
    'Brand',
    'Name',
    'Model',
    'Trim',
    'Year',
    'Mileage',
    'Fuel',
    'Transmission',
    'Body status',
    'Price'
]


def _split_title(title):
    """Split an ad title such as 'پراید، 151' into ``(model, name)``.

    The text before the first Persian comma is the model, the text after it is
    the name. A title without a comma is used for both fields, so a row can
    never inherit the name/model of the previously processed ad.
    """
    model, comma, name = title.partition('،')
    if not comma:
        name = model
    return model.strip(), name.strip()


def _parse_mileage(raw):
    """Convert the API mileage text to an int where the text is numeric.

    'صفر کیلومتر' (brand new) becomes 0, 'کارکرده' (used, mileage not given) is
    returned unchanged, and values like '120,000 km' become 120000.
    Raises ValueError for any other non-numeric text.
    """
    if raw == 'صفر کیلومتر':
        return 0
    if raw == 'کارکرده':
        return raw
    return int(raw.replace('km', '').replace(',', ''))


def _ad_to_row(item):
    """Build one cleaned row (a dict keyed by ``columns``) from a single 'ad' item."""
    detail = item.get('detail')
    car_model, car_name = _split_title(detail.get('title'))
    car_price_str = item.get('price').get('price')          # example : 600,000,000 Toman
    return {
        'Brand': detail.get('brand'),
        'Name': car_name,
        'Model': car_model,
        'Trim': detail.get('trim'),
        'Year': int(detail.get('year')),
        'Mileage': _parse_mileage(detail.get('mileage')),
        'Fuel': detail.get('fuel'),
        'Transmission': detail.get('transmission'),
        'Body status': detail.get('body_status'),
        'Price': int(car_price_str.replace(',', '')),
    }


rows = []   # rows are collected here and turned into a DataFrame once, after the loop

for page in range(total_pages):
    page_counter += 1
    print(f'Receiving page No.{page_counter}')
    # The site is rendered dynamically, so the ads are read from its JSON search API
    pageReq = requests.get(
        f'https://bama.ir/cad/api/search?vehicle=pride&pageIndex={page}',
        timeout=request_timeout,
    )
    print('Responded HTTP Code:', pageReq)
    pageReq.raise_for_status()                # stop with a clear HTTP error instead of a JSON parsing error

    pageReqJson = json.loads(pageReq.text)

    # Missing 'data' / 'ads' keys are treated as a page with no items
    container = (pageReqJson.get('data') or {}).get('ads') or []
    for item in container:
        if item.get('type') != 'ad':          # skip non-ad entries returned in the same list
            continue
        rows.append(_ad_to_row(item))
        car_counter += 1

    print('All car ads were fetched from this page.')
    print(f'{car_counter} cars info extracted so far.')
    print('-'*50)

# dtype=object keeps every column as `object`, exactly as the previous row-by-row
# concat produced, so the later dtype-inspection and conversion cells behave the same.
df = pd.DataFrame(rows, columns=columns, dtype=object)

print('\nAll pages related to Pride car brand has successfully been fetched from bama.ir !')
print('Total observed pages:', page_counter)
print('Total car ads fetched:', car_counter)


Receiving page No.1
Responded HTTP Code: <Response [200]>
All car ads were fetched from this page.
13 cars info extracted so far.
--------------------------------------------------
Receiving page No.2
Responded HTTP Code: <Response [200]>
All car ads were fetched from this page.
43 cars info extracted so far.
--------------------------------------------------
Receiving page No.3
Responded HTTP Code: <Response [200]>
All car ads were fetched from this page.
73 cars info extracted so far.
--------------------------------------------------
Receiving page No.4
Responded HTTP Code: <Response [200]>
All car ads were fetched from this page.
103 cars info extracted so far.
--------------------------------------------------
Receiving page No.5
Responded HTTP Code: <Response [200]>
All car ads were fetched from this page.
133 cars info extracted so far.
--------------------------------------------------
Receiving page No.6
Responded HTTP Code: <Response [200]>
All car ads were fetched from this 

In [28]:
# First look at the scraped data: the search results also contain ads of
# other brands, so the Brand column needs checking (unique values and counts).
brand_column = df['Brand']
for report in (df.head(10), df.shape, brand_column.unique(), brand_column.value_counts()):
    print(report)

          Brand       Name              Model      Trim  Year  Mileage  \
0         pride  صندوق دار              پراید   دنده ای  1389   355000   
1         pride  صندوق دار              پراید   دنده ای  1386   325000   
2  mercedesbenz   کلاس EQB                بنز    EQB260  2024        0   
3           kmc       ایگل  پیش فروش کی ام سی  1.5 لیتر  1404        0   
4         pride        151              پراید        GX  1404        0   
5         pride        131              پراید        SE  1398    12000   
6         pride  صندوق دار              پراید   دنده ای  1389  کارکرده   
7         pride        131              پراید        SE  1399    40000   
8         pride        131              پراید        SE  1398    88000   
9         pride        132              پراید        SE  1390   332000   

     Fuel Transmission  Body status      Price  
0  بنزینی      دنده ای  گلگیر تعویض  345000000  
1  بنزینی      دنده ای    گلگیر رنگ  320000000  
2    برقی     اتوماتیک     بدون رنگ   

#### As the cell above shows, the Brand column also contains values other than Pride!
#### To solve this issue, we filter the DataFrame on this column:

In [32]:
# Solving problem:
df = df[df['Brand'] == 'pride']
df.head(10)
df.shape

(583, 10)

---

In [34]:
# Spot-check a single value: individual cells hold plain Python int / str objects.
type(df['Name'].iloc[289])

# ...yet every column is reported with the generic `object` dtype at this point.
# pandas uses `object` for columns that store arbitrary Python objects such as
# strings or mixed values; the numerical columns are converted in a later cell.
df.dtypes

Brand           object
Name            object
Model           object
Trim            object
Year            object
Mileage         object
Fuel            object
Transmission    object
Body status     object
Price           object
dtype: object

#### Checking data:

In [36]:
# Per-column summary of the filtered data, used as a quick sanity check.
df.describe()

Unnamed: 0,Brand,Name,Model,Trim,Year,Mileage,Fuel,Transmission,Body status,Price
count,583,583,583,583,583,583,583,583,583,583
unique,1,7,3,12,32,218,2,1,17,163
top,pride,131,پراید,SE,1404,0,بنزینی,دنده ای,بدون رنگ,0
freq,583,177,577,255,51,56,483,583,242,44


In [40]:
# Keep an independent snapshot of the filtered data before the dtype conversions below.
df_backup = df.copy(deep=True)
df_backup.head(n=5)

Unnamed: 0,Brand,Name,Model,Trim,Year,Mileage,Fuel,Transmission,Body status,Price
0,pride,صندوق دار,پراید,دنده ای,1389,355000,بنزینی,دنده ای,گلگیر تعویض,345000000
1,pride,صندوق دار,پراید,دنده ای,1386,325000,بنزینی,دنده ای,گلگیر رنگ,320000000
4,pride,151,پراید,GX,1404,0,بنزینی,دنده ای,بدون رنگ,680000000
5,pride,131,پراید,SE,1398,12000,بنزینی,دنده ای,گلگیر رنگ,0
6,pride,صندوق دار,پراید,دنده ای,1389,کارکرده,بنزینی,دنده ای,بدون رنگ,0


#### Fixing dtypes (Mileage still needs more work) to get the data ready for saving to a DB:

In [42]:
#String columns : cleaninh possible whitespaces and making sure they will be saved as string!
str_cols = ['Brand', 'Name', 'Model', 'Trim', 'Mileage', 'Fuel', 'Transmission', 'Body status']
for col in str_cols:
    df[col] = df[col].astype(str).str.strip()
print('Converted string columns to <str>')

# Numerical columns: chaging dtype to integer
num_cols = ['Year', 'Price']
for col in num_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')  # coerce invalid to NaN and converts others to integer
print('Converted integer columns to <int>')

print(df.dtypes)

Converted string columns to <str>
Converted integer columns to <int>
Brand           object
Name            object
Model           object
Trim            object
Year             int64
Mileage         object
Fuel            object
Transmission    object
Body status     object
Price            int64
dtype: object


#### Making sure that column data types are okay before saving the first version of dataset:

In [44]:
print('Checking DataFram dtypes:\n')

# For every column, count how many values there are of each Python type.
# A single loop replaces one copy-pasted print per column and automatically
# covers any column added later; a separator line is printed between columns.
for position, column_name in enumerate(df.columns):
    if position:
        print('-'*30)
    print(df[column_name].apply(type).value_counts())

Checking DataFram dtypes:

Brand
<class 'str'>    583
Name: count, dtype: int64
------------------------------
Name
<class 'str'>    583
Name: count, dtype: int64
------------------------------
Model
<class 'str'>    583
Name: count, dtype: int64
------------------------------
Trim
<class 'str'>    583
Name: count, dtype: int64
------------------------------
Year
<class 'int'>    583
Name: count, dtype: int64
------------------------------
Mileage
<class 'str'>    583
Name: count, dtype: int64
------------------------------
Fuel
<class 'str'>    583
Name: count, dtype: int64
------------------------------
Transmission
<class 'str'>    583
Name: count, dtype: int64
------------------------------
Body status
<class 'str'>    583
Name: count, dtype: int64
------------------------------
Price
<class 'int'>    583
Name: count, dtype: int64


#### The Mileage column has an issue: it contains both numerical and text values!
#### We will take care of the 'Mileage' issue in the next pre-processing steps.
#### Let's first save the current dataset as a CSV file:

In [50]:
# Save the first version of the dataset. 'utf-8-sig' adds a BOM so that
# spreadsheet tools detect the encoding of the Persian text.
output_csv = Path('D:/AIjourney/projects/Pride Ads Project/CSV/pride_ads_1.csv')
output_csv.parent.mkdir(parents=True, exist_ok=True)   # to_csv fails if the folder does not exist
df.to_csv(output_csv, index=False, encoding='utf-8-sig')
print('Successfully saved as csv.')

Successfully saved as csv.
