# Use case : Regression
dataset 1 : used_cars →
 https://www.kaggle.com/datasets/taeefnajib/used-car-price-prediction-dataset

dataset2 : dataset →



> Importing libraries



In [4]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import model_selection, preprocessing

# Apply seaborn's "whitegrid" style to the figures drawn after this call.
sns.set(style="whitegrid")

ModuleNotFoundError: No module named 'pandas'

> importing datasets



In [None]:
# Read the two raw CSV files (paths are relative to the working directory).
df1, df2 = (pd.read_csv(csv_path) for csv_path in ("used_cars.csv", "dataset.csv"))

> datasets overview

In [None]:
# Preview the first rows of each raw frame.
# A bare expression is only rendered when it is the last line of a cell, so a
# standalone `df1` line is never shown; display() renders both previews.
display(df1.head())
display(df2.head())

In [None]:
# Print the column dtypes of each raw frame.
for raw_frame in (df1, df2):
    print(raw_frame.dtypes)

# The columns the two frames have in common are stored with different dtypes,
# so they need to be converted before the frames can be combined.

> # Regression _ Data Cleaning

In [None]:
# Count the missing values per column in each raw frame.
for raw_frame in (df1, df2):
    print(raw_frame.isna().sum())

In [None]:
# Drop every row that contains at least one missing value.
# The explicit .copy() makes the cleaned frames independent objects, so the
# column assignments done in later cells do not raise SettingWithCopyWarning.
dfC1 = df1.dropna().copy()
dfC2 = df2.dropna().copy()

In [None]:
# Re-check the missing-value counts, this time on the cleaned frames.
for cleaned_frame in (dfC1, dfC2):
    print(cleaned_frame.isnull().sum())
# Every count is now zero: dropna() removed all rows with missing values.

In [None]:
# `milage` and `price` are stored as text that includes non-digit characters
# (e.g. the miles symbol), so they cannot be cast to int directly.
# Keep only the digit characters of each value.
#
# - r'\D' is a raw string: '\D' in a normal string literal is an invalid escape
#   sequence and triggers a warning on recent Python versions.
# - astype(str) makes the cell safe to re-run after the columns have already
#   been converted to integers (the .str accessor fails on numeric columns).
# - assign() returns a new frame instead of writing into a filtered slice.
dfC1 = dfC1.assign(
    milage=dfC1['milage'].astype(str).str.replace(r'\D', '', regex=True),
    price=dfC1['price'].astype(str).str.replace(r'\D', '', regex=True),
)

In [None]:
# Convert the cleaned text columns of dfC1 to their final dtypes.
#
# Whole columns are replaced via assign() rather than `dfC1.loc[:, col] = ...`:
# on pandas >= 2.0 a `.loc[:, col]` assignment writes the values into the
# existing column in place, so the column would keep its object dtype.
dfC1 = dfC1.assign(
    # digit-only strings -> integers
    milage=dfC1['milage'].astype('int64'),
    price=dfC1['price'].astype('int64'),
    # True for every value other than the literal 'None reported'
    accident=dfC1['accident'] != 'None reported',
)

# Preview the result (first rows only).
dfC1.head()

In [None]:
# Same clean-up for the second frame: keep only the digit characters of
# `Mileage` and `New_Price`.
#
# - r'\D' is a raw string ('\D' in a normal literal is an invalid escape).
# - astype(str) keeps the cell re-runnable after the integer conversion.
# NOTE(review): removing every non-digit also removes decimal points, so a
# value such as "19.5" would become "195" - confirm this is acceptable for
# these two columns.
dfC2 = dfC2.assign(
    Mileage=dfC2['Mileage'].astype(str).str.replace(r'\D', '', regex=True),
    New_Price=dfC2['New_Price'].astype(str).str.replace(r'\D', '', regex=True),
)

In [None]:
# Cast the digit-only text columns of dfC2 to integers in a single call.
dfC2 = dfC2.astype({"Mileage": int, "New_Price": int})

In [None]:
# Print the column dtypes of both cleaned frames after the conversions.
for cleaned_frame in (dfC1, dfC2):
    print(cleaned_frame.dtypes)

In [None]:
# The second frame's prices are multiplied by 1000 before the frames are merged.
# NOTE(review): presumably `Price` is expressed in thousands - confirm the unit
# and currency against the dataset documentation.
#
# The scaled column is computed from the untouched raw frame `df2`, not from
# dfC2 itself, so re-running this cell cannot multiply the prices twice.
# assign() aligns the Series on the index, which dropna() preserved.
PRICE_SCALE = 1000
dfC2 = dfC2.assign(Price=df2['Price'] * PRICE_SCALE)

In [None]:
# Align the feature set of dfC1 with dfC2: combine `brand` and `model` into a
# single `Name` column, then remove the two source columns.
dfC1 = (
    dfC1
    .assign(Name=dfC1['brand'] + ' ' + dfC1['model'])
    .drop(columns=['brand', 'model'])
)

In [None]:
# Preview the first rows of the cleaned second frame instead of dumping it whole.
dfC2.head()

In [None]:
# Columns of dfC2 that are removed before the merge.
columns_to_drop2 = [
    'Location',
    'Owner_Type',
    'Mileage',
    'Power',
    'Seats',
    'New_Price',
]

# Remove the listed columns, then discard any column whose name starts with
# 'Unnamed'.
dfC2 = dfC2.drop(columns=columns_to_drop2)
named_column_mask = ~dfC2.columns.str.contains('^Unnamed')
dfC2 = dfC2.loc[:, named_column_mask]

In [None]:
# Rename dfC1's columns to the names used by dfC2, in a single call.
# NOTE(review): `milage` is relabelled `Kilometers_Driven` without any unit
# conversion, although the values appear to be miles - confirm whether a
# miles-to-kilometres conversion is needed.
dfC1 = dfC1.rename(
    columns={
        'model_year': 'Year',
        'milage': 'Kilometers_Driven',
        'transmission': 'Transmission',
        'fuel_type': 'Fuel_Type',
        'price': 'Price',
        'engine': 'Engine',
    }
)

In [None]:
# Columns of dfC1 that are removed before the merge.
columns_to_drop1 = ['ext_col', 'int_col', 'accident', 'clean_title']

# Remove the listed columns.
dfC1 = dfC1.drop(columns=columns_to_drop1)

In [None]:
# Both frames have now been cleaned and reduced to matching columns.
# Preview the first rows of dfC1 before the two frames are merged.
dfC1.head()

In [None]:
# Print the dtypes of both frames to compare them before merging.
for cleaned_frame in (dfC1, dfC2):
    print(cleaned_frame.dtypes)

In [None]:
# Cast dfC1's numeric columns to int64 in a single call.
dfC1 = dfC1.astype(
    {
        'Year': 'int64',
        'Kilometers_Driven': 'int64',
        'Price': 'int64',
    }
)

In [None]:
# Stack the two cleaned frames on top of each other with a fresh 0..n-1 index.
frames_to_merge = [dfC1, dfC2]
df3 = pd.concat(frames_to_merge, ignore_index=True)

# Show the merged frame.
print(df3)

In [None]:
# Persist the merged frame as CSV, without writing the row index.
df3.to_csv(path_or_buf='output_file.csv', index=False)

> # Data Augmentation _ Regression

>***Feature Engineering***: Creating new feature from existing ones  

Example1: **Car Age**  
➡ Creating a new feature representing the age of the car.  
Example2: **Usage Ratio**  
➡ Presenting how much a car has been driven relative to its age.

In [None]:
# Reference year used to compute the age of each car (fixed, not the system date).
current_year = 2023
# Car_Age = current_year - Year
df3['Car_Age'] = df3['Year'].rsub(current_year)

In [None]:
# Distance driven per year of age.
# Rows with Car_Age == 0 produce infinite values here; they are handled in the
# modelling cell further down.
df3['Usage_Ratio'] = df3['Kilometers_Driven'].div(df3['Car_Age'])

>***Binning***: Converting continuous variables into categorical bins.  
Example: **Price Range Binning**  

➡ Categorizing cars into age groups or price ranges.

In [None]:
# Bin edges and their labels: four 10k-wide ranges plus an open-ended top bin.
price_bin_edges = [0, 10000, 20000, 30000, 40000, np.inf]
price_bin_labels = ['0-10k', '10-20k', '20-30k', '30-40k', '40k+']

df3['Price_Range'] = pd.cut(
    df3['Price'],
    bins=price_bin_edges,
    labels=price_bin_labels,
)

>One-Hot Encoding for Categorical Variables:

➡ one-hot encoding to 'Fuel_Type' and 'Transmission'

In [None]:
# Replace each listed categorical column with one indicator column per category.
categorical_columns = ['Fuel_Type', 'Transmission']
df3 = pd.get_dummies(df3, columns=categorical_columns)

> Interaction Features

➡ Understanding the relationship between the engine specifications and the car's model year.

In [None]:
# Build a text key of the form "<Engine>_<Year>" for every row.
engine_text = df3['Engine'].astype(str)
year_text = df3['Year'].astype(str)
df3['Engine_Year_Interaction'] = engine_text + '_' + year_text

# Data visualisation _ Regression

> **Price Range Distribution** :

* We want to visualize the distribution of cars across different price ranges.
* Plot Type: Bar Chart




In [None]:
# Bar chart: number of cars in each price range.
price_range_ax = sns.countplot(x='Price_Range', data=df3)
price_range_ax.set(
    title='Distribution of Cars Across Price Ranges',
    xlabel='Price Range',
    ylabel='Number of Cars',
)
plt.show()

> **Year /VS/ Kilometers Driven** :

* this plot will help us explore the relationship between the year of the car and the kilometers driven.
* Plot Type: Scatter Plot

In [None]:
# Scatter plot: model year against distance driven.
year_km_ax = sns.scatterplot(x='Year', y='Kilometers_Driven', data=df3, color='#FF1493')
year_km_ax.set(
    title='Year vs. Kilometers Driven',
    xlabel='Year',
    ylabel='Kilometers Driven',
)
plt.show()

> **Fuel Type Distribution** :

* we thought about analyzing the distribution of different fuel types in the dataset  
* Plot Type: Pie Chart

**Important note: creating the pie chart directly fails with an error saying that there is no column named `Fuel_Type`.  
This happens because we performed one-hot encoding on the `Fuel_Type` column earlier, which replaced it with multiple binary columns, one for each fuel type.**
Therefore we need to work with these newly created binary columns instead.

In [None]:
# Pie chart of the fuel type distribution, rebuilt from the one-hot encoded
# 'Fuel_Type_*' indicator columns.
fuel_type_prefix = 'Fuel_Type_'

# Collect all one-hot encoded fuel type columns.
fuel_type_columns = [col for col in df3.columns if col.startswith(fuel_type_prefix)]
if not fuel_type_columns:
    raise ValueError("df3 has no 'Fuel_Type_*' columns to plot")

# Each indicator column sums to the number of cars with that fuel type.
fuel_type_counts = df3[fuel_type_columns].sum()

# Slice labels: the column names without the 'Fuel_Type_' prefix.
labels = [col[len(fuel_type_prefix):] for col in fuel_type_columns]

# Pull the largest slice out of the pie.
# idxmax() returns the *label* (column name) of the largest count, so it has to
# be compared with the index labels; comparing it with integer positions never
# matches and leaves every slice un-exploded.
largest_fuel_type = fuel_type_counts.idxmax()
explode = [0.1 if col == largest_fuel_type else 0 for col in fuel_type_counts.index]

# Larger figure for readability.
plt.figure(figsize=(10, 8))

plt.pie(fuel_type_counts, labels=labels, autopct='%1.1f%%', startangle=140, explode=explode)

# Place the legend outside of the pie chart.
plt.legend(title='Fuel Type', loc='center left', bbox_to_anchor=(1, 0.5))

plt.title('Fuel Type Distribution')

# Adjust the layout to make room for the legend.
plt.tight_layout()

plt.show()

# **The Regression model**

The goal here is to create a regression model to determine the price of a car

In [None]:
# NOTE(review): `X` (feature matrix) and `y` (target) are not defined in any
# visible cell of this notebook - they must be built from df3 (numeric feature
# columns only, target presumably `Price`) before this cell can run on a fresh
# kernel. Confirm the intended feature list.

# Split the data into a training set (70%) and a testing set (30%).
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Work on independent copies so the column replacement below modifies these
# frames themselves and never a temporary view of X.
X_train = X_train.copy()
X_test = X_test.copy()

# Handle infinite values in the "Usage_Ratio" column: replace +inf by the
# largest finite value of the same split.
# The result is assigned back to the column. Calling
# `frame[col].replace(..., inplace=True)` operates on a temporary Series, which
# raises a chained-assignment warning on recent pandas and has no effect at all
# under Copy-on-Write.
max_non_infinite_train = X_train.loc[~np.isinf(X_train['Usage_Ratio']), 'Usage_Ratio'].max()
X_train['Usage_Ratio'] = X_train['Usage_Ratio'].replace(np.inf, max_non_infinite_train)

max_non_infinite_test = X_test.loc[~np.isinf(X_test['Usage_Ratio']), 'Usage_Ratio'].max()
X_test['Usage_Ratio'] = X_test['Usage_Ratio'].replace(np.inf, max_non_infinite_test)

# Standardize the features: the scaler is fitted on the training set only and
# then applied unchanged to the test set.
sca = preprocessing.StandardScaler()
X_train = sca.fit_transform(X_train)
X_test = sca.transform(X_test)
# The scaler returns plain arrays; the test set is wrapped in a DataFrame again.
X_test = pd.DataFrame(X_test)

**Linear Regression**

In [None]:
from sklearn.linear_model import LinearRegression

# Create a linear regression model and fit it on the training data.
lr = LinearRegression()
lr.fit(X=X_train, y=y_train)

**Evaluation**

In [None]:
from sklearn import metrics

# Coefficients learned by the linear model.
print(lr.coef_)

# Mean absolute error of the predictions on the test set.
y_test_pred = lr.predict(X_test)
metrics.mean_absolute_error(y_true=y_test, y_pred=y_test_pred)