1. Loading Data:
   - Load the datasets.
   - Display the first few rows of each dataset to understand its structure


In [101]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Dataset locations, relative to the notebook's working directory.
# Forward slashes are used on purpose: they resolve on Windows, Linux and
# macOS alike, whereas a backslash-separated path only works on Windows.
cars_file_path = 'Data/pakwheels_used_cars.csv'
weather_file_path = 'Data/weather_classification_data.csv'

# Load both datasets into DataFrames.
used_cars_df = pd.read_csv(cars_file_path)
weather_df = pd.read_csv(weather_file_path)

# Show the first few rows of each dataset to get a feel for its structure.
print("--------------Used Cars Data--------------")
print(used_cars_df.head())

print("--------Weather Classfication data--------")
print(weather_df.head())

--------------Used Cars Data--------------
    ad_ref  assembly       body ad_city                color  engine_cc  \
0  7927285  Imported        Van  Lahore          Pearl White     2000.0   
1  7679303  Imported  Hatchback  Lahore                 Grey      996.0   
2  7915479       NaN      Sedan  Lahore          Super white     1798.0   
3  7918380       NaN      Sedan  Lahore  Crystal Black Pearl     1500.0   
4  7676167  Imported        MPV  Lahore               Silver     3000.0   

  fuel_type    make  mileage    model     registered transmission    year  \
0    Hybrid  Nissan   124000   Serena  Un-Registered    Automatic  1905.0   
1    Petrol  Toyota    30738     Vitz         Punjab    Automatic  1905.0   
2    Petrol  Toyota   183000  Corolla         Punjab    Automatic  1905.0   
3    Petrol   Honda    41000    Civic         Punjab    Automatic  1905.0   
4    Petrol  Toyota   126000  Alphard         Punjab    Automatic  1905.0   

       price  
0  8990000.0  
1  4190000.0 

______________________________________________________________________________________________________________________________________
2. Handling Missing Values:
   - Identify and handle missing values in both datasets. Provide a justification for the methods used to handle missing data.

  ----- PakWheels used cars data -----
In the PakWheels used-cars data there are a lot of missing values in the `assembly` column, so I fill them with `Local`. If we dropped the rows with missing values before filling `assembly` with `Local`, the dataset would become much smaller.

After this, we drop the rows that still contain missing values, since there are not many of them.

-----Weather Classification data-----
This dataset does not have many missing values, so we simply drop the affected rows.


In [102]:
def Fill_Assembly(used_cars_df):
    """Fill missing values in the ``assembly`` column with ``'Local'``.

    The column is reassigned on the frame instead of calling
    ``fillna(..., inplace=True)`` on the selected column. That chained
    in-place call operates on a temporary object under pandas copy-on-write
    and would leave the frame unchanged (pandas 2.x emits a FutureWarning
    about it).

    Parameters
    ----------
    used_cars_df : pandas.DataFrame
        Used-cars data; must contain an ``assembly`` column. The column is
        replaced on this frame, so the caller's object is modified.

    Returns
    -------
    pandas.DataFrame
        The same frame object that was passed in, with no missing values
        left in ``assembly``.
    """
    used_cars_df['assembly'] = used_cars_df['assembly'].fillna('Local')
    return used_cars_df

def Drop_Missing_value(used_cars_df, weather_df):
    """Remove every row that has at least one missing value from both frames.

    Neither input frame is modified; ``dropna`` returns new frames.

    Parameters
    ----------
    used_cars_df : pandas.DataFrame
        Used-cars data.
    weather_df : pandas.DataFrame
        Weather classification data.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(used_cars_without_missing, weather_without_missing)``, in that order.
    """
    return tuple(frame.dropna() for frame in (used_cars_df, weather_df))

# Impute the `assembly` column first, then drop whatever rows still have gaps
# in either dataset.
used_cars_df, weather_df = Drop_Missing_value(Fill_Assembly(used_cars_df), weather_df)

# Report how many rows are left in each dataset after cleaning.
num_rows_car = len(used_cars_df)
print(f"Number of rows in used_cars_df: {num_rows_car}")

num_rows_weather = len(weather_df)
print(f"Number of rows in weather_df: {num_rows_weather}")

Number of rows in used_cars_df: 62807
Number of rows in weather_df: 13200


______________________________________________________________________________________________________________________________________
3. Data Transformation:
   - Convert categorical variables to numerical variables using appropriate encoding techniques (e.g., one-hot encoding, label encoding).
   - Normalize/standardize the numerical features.


------ PakWheels used cars data ------

For the `fuel_type`, `transmission`, and `assembly` columns I use one-hot encoding because these categorical columns have only a few distinct values; for the other categorical columns I simply apply label encoding.

-----Weather Classification data------

In this dataset I apply one-hot encoding to every categorical column because each of them has only a few distinct values.


In [103]:
# One-hot encode the low-cardinality categorical columns in a single call.
# The dummy columns are appended after the untouched columns, in the order
# the source columns are listed here.
used_car_df = pd.get_dummies(
    used_cars_df, columns=['fuel_type', 'transmission', 'assembly']
)

# Label-encode the remaining categorical columns. Every column gets its own
# fitted encoder, stored in `label_encoders`, so the integer codes can be
# mapped back to the original categories later with `inverse_transform`.
# A single shared encoder only remembers the classes of the last column it
# was fitted on.
label_encoded_columns = ['body', 'ad_city', 'color', 'make', 'model', 'registered']
label_encoders = {}
for column_name in label_encoded_columns:
    column_encoder = LabelEncoder()
    used_car_df[column_name] = column_encoder.fit_transform(used_car_df[column_name])
    label_encoders[column_name] = column_encoder

# Keep the `label_encoder` name bound to the encoder fitted on 'registered',
# which is the state the previous shared-encoder version of this cell ended in.
label_encoder = label_encoders['registered']

print("--------------Used Cars Data--------------")
print(used_car_df.head())


--------------Used Cars Data--------------
    ad_ref  body  ad_city  color  engine_cc  make  mileage  model  registered  \
0  7927285    20      153    234     2000.0    35   124000    230         102   
1  7679303     7      153    156      996.0    45    30738    282          80   
2  7915479    16      153    325     1798.0    45   183000     80          80   
3  7918380    16      153     93     1500.0    19    41000     71          80   
4  7676167     9      153    290     3000.0    45   126000     22          80   

     year      price  fuel_type_Diesel  fuel_type_Hybrid  fuel_type_Petrol  \
0  1905.0  8990000.0             False              True             False   
1  1905.0  4190000.0             False             False              True   
2  1905.0  3990000.0             False             False              True   
3  1905.0  6490000.0             False             False              True   
4  1905.0  4750000.0             False             False              True   

 

In [104]:
# Min-max scale every column of the encoded used-cars frame and wrap the
# resulting array back into a DataFrame with the original column labels.
scaler = MinMaxScaler()
used_cars_scaled_values = scaler.fit_transform(used_car_df)
used_cars_normalized = pd.DataFrame(
    used_cars_scaled_values,
    columns=used_car_df.columns,
)
print(used_cars_normalized)

         ad_ref  body   ad_city     color  engine_cc      make   mileage  \
0      0.999357  1.00  0.534965  0.635870   0.127517  0.714286  0.123999   
1      0.964819  0.35  0.534965  0.423913   0.060134  0.918367  0.030737   
2      0.997712  0.80  0.534965  0.883152   0.113960  0.918367  0.182999   
3      0.998116  0.80  0.534965  0.252717   0.093960  0.387755  0.040999   
4      0.964382  0.45  0.534965  0.788043   0.194631  0.918367  0.125999   
...         ...   ...       ...       ...        ...       ...       ...   
62802  0.955566  0.35  0.000000  0.078804   0.046980  0.897959  0.054999   
62803  0.932218  0.80  0.335664  0.875000   0.093960  0.918367  0.029163   
62804  0.966289  0.80  0.720280  0.016304   0.100671  0.387755  0.119499   
62805  0.972333  0.80  0.433566  0.875000   0.093960  0.918367  0.017999   
62806  0.975781  0.80  0.433566  0.891304   0.114094  0.387755  0.029999   

          model  registered  year     price  fuel_type_Diesel  \
0      0.759076    0.9

In [105]:
# Standardize the min-max-scaled used-cars frame column by column and wrap
# the resulting array back into a DataFrame with the same column labels.
scaler = StandardScaler()
used_cars_standardized_values = scaler.fit_transform(used_cars_normalized)
used_cars_standardized = pd.DataFrame(
    used_cars_standardized_values,
    columns=used_cars_normalized.columns,
)
print(used_cars_standardized)

         ad_ref      body   ad_city     color  engine_cc      make   mileage  \
0      0.498619  1.772839  0.201167  0.002891   0.953963 -0.013295  0.388517   
1     -0.489386 -1.041538  0.201167 -0.658642  -0.627128  0.773871 -0.719574   
2      0.451582  0.906877  0.201167  0.774680   0.635855  0.773871  1.089525   
3      0.463140  0.906877  0.201167 -1.192958   0.166567 -1.272761 -0.597647   
4     -0.501880 -0.608557  0.201167  0.477838   2.528756  0.773871  0.412280   
...         ...       ...       ...       ...        ...       ...       ...   
62802 -0.754071 -1.041538 -2.367810 -1.735754  -0.935788  0.695155 -0.431306   
62803 -1.421979  0.906877 -0.755902  0.749237   0.166567  0.773871 -0.738276   
62804 -0.447329  0.906877  1.091074 -1.930822   0.324046 -1.272761  0.335051   
62805 -0.274439  0.906877 -0.285763  0.749237   0.166567  0.773871 -0.870921   
62806 -0.175807  0.906877 -0.285763  0.800124   0.639005 -1.272761 -0.728343   

          model  registered  year     p

In [106]:
# One-hot encode all four categorical weather columns in one call. The dummy
# columns are appended after the numeric columns, in the order listed here.
weathers_df = pd.get_dummies(
    weather_df,
    columns=['Cloud Cover', 'Season', 'Location', 'Weather Type'],
)

print("--------Weather Classfication data--------")
print(weathers_df.head())


--------Weather Classfication data--------
   Temperature  Humidity  Wind Speed  Precipitation (%)  Atmospheric Pressure  \
0         14.0        73         9.5               82.0               1010.82   
1         39.0        96         8.5               71.0               1011.43   
2         30.0        64         7.0               16.0               1018.72   
3         38.0        83         1.5               82.0               1026.25   
4         27.0        74        17.0               66.0                990.67   

   UV Index  Visibility (km)  Cloud Cover_clear  Cloud Cover_cloudy  \
0         2              3.5              False               False   
1         7             10.0              False               False   
2         5              5.5               True               False   
3         7              1.0               True               False   
4         1              2.5              False               False   

   Cloud Cover_overcast  ...  Season_Spring

In [107]:
# Min-max scale every column of the encoded weather frame and wrap the
# resulting array back into a DataFrame with the original column labels.
scaler = MinMaxScaler()
weather_scaled_values = scaler.fit_transform(weathers_df)
weather_normalized = pd.DataFrame(
    weather_scaled_values,
    columns=weathers_df.columns,
)
print(weather_normalized)

       Temperature  Humidity  Wind Speed  Precipitation (%)  \
0         0.291045  0.595506    0.195876           0.752294   
1         0.477612  0.853933    0.175258           0.651376   
2         0.410448  0.494382    0.144330           0.146789   
3         0.470149  0.707865    0.030928           0.752294   
4         0.388060  0.606742    0.350515           0.605505   
...            ...       ...         ...                ...   
13195     0.261194  0.606742    0.298969           0.651376   
13196     0.179104  0.629213    0.072165           0.211009   
13197     0.410448  0.640449    0.113402           0.256881   
13198     0.208955  0.629213    0.206186           0.862385   
13199     0.149254  0.202247    0.000000           0.844037   

       Atmospheric Pressure  UV Index  Visibility (km)  Cloud Cover_clear  \
0                  0.527951  0.142857            0.175                0.0   
1                  0.529480  0.500000            0.500                0.0   
2           

4. Data Splitting:
   - Split the data into training and testing sets using an 80-20 split


In [108]:
# Fixed seed so the shuffled 80/20 split is identical on every run of the
# notebook; without it the train/test membership changes each time.
SPLIT_SEED = 42

# 80% training / 20% testing for each dataset.
train1, test1 = train_test_split(
    used_cars_standardized, test_size=0.2, random_state=SPLIT_SEED
)
train2, test2 = train_test_split(
    weather_normalized, test_size=0.2, random_state=SPLIT_SEED
)

# Report the size of each partition.
num_rows_train_car = train1.shape[0]
print("------Pakwheel Used Data---------\n")
print(f"Number of rows of Training of Pakwheel data : {num_rows_train_car}")
num_rows_test_car = test1.shape[0]
print(f"Number of rows of Testing of Pakwheel data : {num_rows_test_car}\n")
print("------Weather Classification data---------\n")
num_rows_train_weather = train2.shape[0]
print(f"Number of rows of Trainig of Weather data: {num_rows_train_weather}")
num_rows_test_weather = test2.shape[0]
print(f"Number of rows of Testing of Weather data: {num_rows_test_weather}")


------Pakwheel Used Data---------

Number of rows of Training of Pakwheel data : 50245
Number of rows of Testing of Pakwheel data : 12562

------Weather Classification data---------

Number of rows of Trainig of Weather data: 10560
Number of rows of Testing of Weather data: 2640
