## Data Preprocessing + Label encoding
#### on Pride car ads, scraped and processed through Bama.ir website

#### Importing libraries

In [19]:
import pandas as pd
import numpy as np
import pymysql
import matplotlib.pyplot as plt
# import seaborn as sns

#### Options for loading the dataset
##### 1. Loading data from cleaned csv file
##### 2. Loading data using local MySQL DB

#### Option 1

In [8]:
# Option 1: read the cleaned ads from the CSV export.
# 'utf-8-sig' transparently skips a leading BOM if the file has one.
csv_path = 'D:/AIjourney/projects/Pride Ads Project/CSV/pride_ads_cleaned_1.csv'
df = pd.read_csv(csv_path, encoding='utf-8-sig')

print('Shape of loaded dataset<csv>:', df.shape)
df.head()          # first five rows, as a quick visual sanity check

Shape of loaded dataset(csv): (539, 10)


Unnamed: 0,Brand,Name,Model,Trim,Year,Mileage,Fuel,Transmission,Body status,Price
0,pride,صندوق دار,پراید,ساده,1389,355000.0,بنزینی,دنده ای,گلگیر تعویض,345000000
1,pride,صندوق دار,پراید,ساده,1386,325000.0,بنزینی,دنده ای,گلگیر رنگ,320000000
2,pride,151,پراید,GX,1404,0.0,بنزینی,دنده ای,بدون رنگ,680000000
3,pride,131,پراید,SE,1399,40000.0,بنزینی,دنده ای,گلگیر تعویض,560000000
4,pride,131,پراید,SE,1398,88000.0,بنزینی,دنده ای,دو لکه رنگ,600000000


#### Option 2

In [12]:
# Option 2: read the ads table from the local MySQL database.
connection = pymysql.connect(
    host='localhost',
    user='root',
    password='',      # Placeholder: fill in locally, never commit the real password
    database='iranian_cars_db',
    charset='utf8mb4'
)

query = 'SELECT * FROM pride_cars'
try:
    df = pd.read_sql(query, connection)
finally:
    # Always release the connection, even when the query itself fails.
    connection.close()

print('Shape of loaded dataset<db>:', df.shape)
df.head(5)           # validation

  df = pd.read_sql(query, connection)


Shape of loaded dataset<db>: (539, 12)


Unnamed: 0,id,brand,name,model,trim,year,mileage,fuel,transmission,body_status,price,created_at
0,1,pride,صندوق دار,پراید,ساده,1389,355000,بنزینی,دنده ای,گلگیر تعویض,345000000,2026-01-28 02:53:13
1,2,pride,صندوق دار,پراید,ساده,1386,325000,بنزینی,دنده ای,گلگیر رنگ,320000000,2026-01-28 02:53:13
2,3,pride,151,پراید,GX,1404,0,بنزینی,دنده ای,بدون رنگ,680000000,2026-01-28 02:53:13
3,4,pride,131,پراید,SE,1399,40000,بنزینی,دنده ای,گلگیر تعویض,560000000,2026-01-28 02:53:13
4,5,pride,131,پراید,SE,1398,88000,بنزینی,دنده ای,دو لکه رنگ,600000000,2026-01-28 02:53:13


---
### Feature Engineering begins

In [145]:
# Work on an independent copy so the loaded frame `df` stays untouched
# while features are engineered.
df_eng = df.copy(deep=True)

print(f"Original df shape: {df.shape}")
print(f"Engineering df shape: {df_eng.shape}")

engineering_columns = df_eng.columns.tolist()
print("\nOriginal columns(new df):", engineering_columns)

Original df shape: (539, 12)
Engineering df shape: (539, 12)

Original columns(new df): ['id', 'brand', 'name', 'model', 'trim', 'year', 'mileage', 'fuel', 'transmission', 'body_status', 'price', 'created_at']


#### Dropping non-essential columns

In [147]:
# The two loaders produce different headers: the CSV export uses capitalised
# names such as 'Brand' / 'Body status', while the MySQL table uses snake_case
# and carries the extra bookkeeping columns 'id' and 'created_at'.
# Normalise the headers first so every later cell can rely on snake_case names,
# then drop the bookkeeping columns only if they are present.
df_eng = (
    df_eng
    .rename(columns=lambda col: str(col).strip().lower().replace(' ', '_'))
    .drop(columns=['id', 'created_at'], errors='ignore')
)

df_eng.head(3)

Unnamed: 0,brand,name,model,trim,year,mileage,fuel,transmission,body_status,price
0,pride,صندوق دار,پراید,ساده,1389,355000,بنزینی,دنده ای,گلگیر تعویض,345000000
1,pride,صندوق دار,پراید,ساده,1386,325000,بنزینی,دنده ای,گلگیر رنگ,320000000
2,pride,151,پراید,GX,1404,0,بنزینی,دنده ای,بدون رنگ,680000000


#### Adding new column 'age'
##### This can be more useful when it comes to ML modeling

In [149]:
# Reference year the ads are compared against. It is on the same scale as the
# `year` column (presumably the Solar Hijri calendar - confirm with the scraper).
current_year = 1404

# Use the constant rather than repeating the literal, so updating
# `current_year` actually changes the derived feature.
df_eng['age'] = current_year - df_eng['year']

df_eng.head(3)

Unnamed: 0,brand,name,model,trim,year,mileage,fuel,transmission,body_status,price,age
0,pride,صندوق دار,پراید,ساده,1389,355000,بنزینی,دنده ای,گلگیر تعویض,345000000,15
1,pride,صندوق دار,پراید,ساده,1386,325000,بنزینی,دنده ای,گلگیر رنگ,320000000,18
2,pride,151,پراید,GX,1404,0,بنزینی,دنده ای,بدون رنگ,680000000,0


#### Encoding 'trim' column based on average price

In [151]:
# Rank-encode 'trim': the trim with the lowest mean price gets code 1,
# the most expensive one gets the highest code.
# NOTE(review): the ranks are derived from the target column over the whole
# dataset; if a train/test split happens later this leaks target information.
trim_avg_price = df_eng.groupby('trim')['price'].mean().sort_values()

print("Trim average prices (sorted):")
for trim_label, mean_price in trim_avg_price.items():
    print(f"  {trim_label}: {mean_price:,.0f} Toman")

# The index is already ordered by ascending mean price, so position == rank.
trim_mapping = {
    trim_label: rank
    for rank, trim_label in enumerate(trim_avg_price.index, start=1)
}

print(f"\ntrim column encoding mapping:")
# Dict insertion order is ascending by code, so no extra sort is required.
for trim_label, code in trim_mapping.items():
    print(f"  {code:2} → {trim_label}")

# Replace the labels with their rank codes.
df_eng['trim'] = df_eng['trim'].map(trim_mapping)

print(f"\nUnique Trim values after encoding: {df_eng['trim'].unique()}")
df_eng.head(3)        # inspect the encoded column

Trim average prices (sorted):
  ساده: 276,893,333 Toman
  LE: 301,250,000 Toman
  SL: 348,272,727 Toman
  LX: 350,000,000 Toman
  SX: 365,857,143 Toman
  EX: 393,750,000 Toman
  TL: 398,333,333 Toman
  پلاس: 498,000,000 Toman
  SE: 523,899,582 Toman
  GX: 651,577,419 Toman

trim column encoding mapping:
   1 → ساده
   2 → LE
   3 → SL
   4 → LX
   5 → SX
   6 → EX
   7 → TL
   8 → پلاس
   9 → SE
  10 → GX

Unique Trim values after encoding: [ 1 10  9  8  3  5  2  6  4  7]


Unnamed: 0,brand,name,model,trim,year,mileage,fuel,transmission,body_status,price,age
0,pride,صندوق دار,پراید,1,1389,355000,بنزینی,دنده ای,گلگیر تعویض,345000000,15
1,pride,صندوق دار,پراید,1,1386,325000,بنزینی,دنده ای,گلگیر رنگ,320000000,18
2,pride,151,پراید,10,1404,0,بنزینی,دنده ای,بدون رنگ,680000000,0


#### Encoding 'name' column based on average price

In [153]:

# Rank-encode 'name' by mean price: cheapest body/name variant -> 1.
# NOTE(review): like any encoding derived from the target, this should be
# fitted on training data only if the frame is split later.
name_avg_price = df_eng.groupby('name')['price'].mean().sort_values()

print("Name average prices:")
for variant, mean_price in name_avg_price.items():
    print(f"  {variant}: {mean_price:,.0f} Toman")

# Pair each variant (already sorted by ascending mean price) with 1..N.
name_codes = range(1, len(name_avg_price) + 1)
name_mapping = dict(zip(name_avg_price.index, name_codes))

print(f"\nname column encoding mapping:")
# Insertion order of the dict is ascending by code already.
for variant, code in name_mapping.items():
    print(f"  {code:2} → {variant}")

df_eng['name'] = df_eng['name'].map(name_mapping)
print(f"\nUnique name values after encoding: {df_eng['name'].unique()}")

df_eng.head(3)

Name average prices:
  هاچ بک: 248,500,000 Toman
  صندوق دار: 277,523,567 Toman
  141: 309,500,000 Toman
  132: 357,733,333 Toman
  131: 462,621,302 Toman
  111: 518,909,091 Toman
  151: 614,589,773 Toman

name column encoding mapping:
   1 → هاچ بک
   2 → صندوق دار
   3 → 141
   4 → 132
   5 → 131
   6 → 111
   7 → 151

Unique name values after encoding: [2 7 5 4 3 6 1]


Unnamed: 0,brand,name,model,trim,year,mileage,fuel,transmission,body_status,price,age
0,pride,2,پراید,1,1389,355000,بنزینی,دنده ای,گلگیر تعویض,345000000,15
1,pride,2,پراید,1,1386,325000,بنزینی,دنده ای,گلگیر رنگ,320000000,18
2,pride,7,پراید,10,1404,0,بنزینی,دنده ای,بدون رنگ,680000000,0


#### Encoding 'body_status' column based on quality

In [155]:
# Hand-made ordinal scale for the body condition stated in the ad:
# the higher the code, the less body/paint work the car has had.
body_status_hierarchy = {
    # Lowest quality - major repairs
    'اتاق تعویض': 1,          # Body replaced

    # Lower quality - part replacements
    'درب تعویض': 2,          # Door replaced
    'گلگیر تعویض': 3,        # Fender replaced
    'کاپوت تعویض': 4,        # Hood replaced

    # Medium quality - significant paint work
    'کامل رنگ': 5,           # Full paint job
    'صافکاری بدون رنگ': 6,   # Bodywork without paint
    'دور رنگ': 7,            # All-around paint
    'گلگیر رنگ': 8,          # Fender painted

    # Good quality - minor paint work
    'کاپوت رنگ': 9,          # Hood painted
    'دو درب رنگ': 10,        # Two doors painted
    'یک درب رنگ': 11,        # One door painted
    'چند لکه رنگ': 12,       # Several spots painted
    'دو لکه رنگ': 13,        # Two spots painted
    'یک لکه رنگ': 14,        # One spot painted

    # Highest quality - no paint work at all
    'بدون رنگ': 15           # No paint (original paint in good condition)
}

# Code assigned to any label that is not part of the scale above.
unknown_status_code = 0

# Only encode while the column still holds the raw labels. Without this guard,
# re-running the cell would treat the already-encoded integers as unknown
# labels and silently overwrite the whole column with the fallback code.
if not pd.api.types.is_numeric_dtype(df_eng['body_status']):
    raw_status = df_eng['body_status']

    # Report labels missing from the scale instead of hiding them in the dict.
    unknown_labels = [
        label for label in raw_status.dropna().unique()
        if label not in body_status_hierarchy
    ]
    if unknown_labels:
        print(f"body_status labels not in hierarchy (encoded as {unknown_status_code}): {unknown_labels}")

    # Unmapped (and missing) entries come back as NaN from map(); give them the
    # fallback code and pin the dtype so the column stays integer.
    df_eng['body_status'] = (
        raw_status
        .map(body_status_hierarchy)
        .fillna(unknown_status_code)
        .astype('int64')
    )

print(f"\nUnique body_status values after encoding: {df_eng['body_status'].unique()}")

df_eng.head(3)


Unique body_status values after encoding: [ 3  8 15 13 14  7 12  6  4  5 10 11  9  2  1]


Unnamed: 0,brand,name,model,trim,year,mileage,fuel,transmission,body_status,price,age
0,pride,2,پراید,1,1389,355000,بنزینی,دنده ای,3,345000000,15
1,pride,2,پراید,1,1386,325000,بنزینی,دنده ای,8,320000000,18
2,pride,7,پراید,10,1404,0,بنزینی,دنده ای,15,680000000,0


#### Encoding fuel column

In [157]:

# Fuel type codes (first key: dual-fuel, second key: petrol).
fuel_mapping = {'دوگانه سوز': 2, 'بنزینی': 1}

# Map into a temporary Series first so the column is only overwritten once the
# result has been validated.
fuel_codes = df_eng['fuel'].map(fuel_mapping)

# Any non-missing value that failed to map is either a new category or a sign
# that this cell was already run (the column then holds codes, not labels).
# Failing loudly avoids silently turning those rows into NaN.
unmapped_fuel = df_eng.loc[fuel_codes.isna() & df_eng['fuel'].notna(), 'fuel'].unique()
if len(unmapped_fuel) > 0:
    raise ValueError(
        f"Unmapped 'fuel' values (new category, or column already encoded?): {list(unmapped_fuel)}"
    )

df_eng['fuel'] = fuel_codes
print(f"Fuel encoded: {df_eng['fuel'].unique()}")

df_eng.head(3)

Fuel encoded: [1 2]


Unnamed: 0,brand,name,model,trim,year,mileage,fuel,transmission,body_status,price,age
0,pride,2,پراید,1,1389,355000,1,دنده ای,3,345000000,15
1,pride,2,پراید,1,1386,325000,1,دنده ای,8,320000000,18
2,pride,7,پراید,10,1404,0,1,دنده ای,15,680000000,0


#### Encoding transmission column

In [159]:

# Transmission codes (first key: automatic, second key: manual).
transmission_mapping = {'اتوماتیک': 2, 'دنده ای': 1}

# Validate the mapping result before overwriting the column.
transmission_codes = df_eng['transmission'].map(transmission_mapping)

# A non-missing value without a code means a new category appeared or the cell
# was re-run on an already-encoded column; stop instead of writing NaN.
unmapped_transmission = df_eng.loc[
    transmission_codes.isna() & df_eng['transmission'].notna(), 'transmission'
].unique()
if len(unmapped_transmission) > 0:
    raise ValueError(
        f"Unmapped 'transmission' values (new category, or column already encoded?): {list(unmapped_transmission)}"
    )

df_eng['transmission'] = transmission_codes
print(f"Transmission encoded: {df_eng['transmission'].unique()}")

df_eng.head(3)                 # Pride cars only contain manual transmission

Transmission encoded: [1]


Unnamed: 0,brand,name,model,trim,year,mileage,fuel,transmission,body_status,price,age
0,pride,2,پراید,1,1389,355000,1,1,3,345000000,15
1,pride,2,پراید,1,1386,325000,1,1,8,320000000,18
2,pride,7,پراید,10,1404,0,1,1,15,680000000,0


#### Saving engineered dataset

In [166]:
# index=False: the default RangeIndex carries no information, and writing it
# would add an unnamed first column that shows up as 'Unnamed: 0' when the file
# is read back with pd.read_csv.
df_eng.to_csv(
    'D:/AIjourney/projects/Pride Ads Project/CSV/pride_ads_engineered_1.csv',
    index=False,
    encoding='utf-8-sig',
)
print('Engineered dataset has been successfully saved.')

Engineered dataset has been successfully saved.
