# دیتاست خودروهای کارکرده
---
* name - نام خودرو
* year - سال تولید
* selling_price - قیمت فروش
* km_driven - کیلومتر طی‌شده
* fuel - نوع سوخت
* seller_type - نوع فروشنده
* transmission - نوع گیربکس
* owner - تعداد مالکین
* mileage - مصرف سوخت (کیلومتر بر لیتر)
* engine - حجم موتور (سی‌سی)
* max_power - قدرت موتور (اسب بخار)
* torque - گشتاور (در دادهٔ خام به نیوتون‌متر یا کیلوگرم‌متر؛ در پیش‌پردازش به کیلوگرم‌متر تبدیل می‌شود)
* seats - تعداد صندلی‌ها

In [None]:
# Mount Google Drive under /content/drive/ so the dataset file can be read
# (requires the google.colab package).
from google.colab import drive
drive.mount('/content/drive/')

In [None]:
from datetime import datetime
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder,StandardScaler,PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression,Ridge,Lasso,ElasticNet
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.pipeline import make_pipeline
%matplotlib inline

In [None]:
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): a blanket 'ignore' also hides deprecation / future warnings
# raised by the libraries used below -- consider narrowing the filter.
warnings.filterwarnings('ignore')

In [None]:
# Load the used-cars CSV from the mounted Drive folder and preview the first rows.
df = pd.read_csv('/content/drive/MyDrive/datasets/used_cars.csv')
df.head()

### Exploration of the dataset

In [None]:
# Print every summary explicitly: within a single notebook cell only the value
# of the last bare expression is echoed, so unprinted intermediate results
# would be silently discarded.
print(df.shape)
print(df.columns)
print(df.dtypes)
df.info()  # writes its own report to stdout and returns None
print(df.describe())
print(df.describe(exclude=[np.number]))
print(df.nunique())
# Frequency table of each selected column.
for col in ['fuel', 'seller_type', 'transmission', 'owner', 'seats']:
    print(df[col].value_counts())
print(df.isnull().sum())
print(df.duplicated().sum())

### Preprocessing Dataset

In [None]:
# Remove duplicate rows, then re-count duplicates as a sanity check.
df=df.drop_duplicates()
df.duplicated().sum()

In [None]:
# Row / column count after duplicate removal.
df.shape

In [None]:
# Keep only the first number (decimal or integer) found in each value of the
# listed columns and convert it to float; values without a number become NaN.
# The pattern is a raw string so that "\d" is not treated as an (invalid)
# string escape, and expand=False makes str.extract return a Series rather
# than a one-column DataFrame.
number_pattern = r'(\d+\.\d+|\d+)'
for col in ('mileage', 'engine', 'max_power'):
    df[col] = df[col].str.extract(number_pattern, expand=False).astype(float)
df.head()

In [None]:
# Column dtypes after the numeric extraction step.
df.dtypes

In [None]:
# Fill missing values: column mean for mileage / engine / max_power, median
# for seats. The result is assigned back to the column instead of calling
# fillna(inplace=True) on df[col]; that chained in-place form is deprecated
# and does not modify df when pandas Copy-on-Write is enabled.
# NOTE(review): the statistics are computed on the full dataset, i.e. before
# the train/test split -- confirm this is acceptable.
for col in ('mileage', 'engine', 'max_power'):
    df[col] = df[col].fillna(df[col].mean())
df['seats'] = df['seats'].fillna(df['seats'].median())
df.isnull().sum()

In [None]:
nm_to_kgm = 0.10197  # multiplier applied to Nm readings to express them in kgm


def convert_torque(value):
    """Parse a raw torque string and return its magnitude in kgm.

    The first number found in ``value`` is used. If the text contains "nm"
    (case-insensitive) the number is multiplied by ``nm_to_kgm``; otherwise,
    if it contains "kgm", the number is kept unchanged. The result is
    returned as a string formatted with two decimals.

    Returns ``np.nan`` when ``value`` is not a string, holds no number, or
    mentions neither unit.
    """
    if not isinstance(value, str):
        return np.nan

    found = re.search(r'\d+(\.\d+)?', value)
    if found is None:
        return np.nan

    magnitude = float(found.group())
    lowered = value.lower()
    if 'nm' in lowered:
        return f"{magnitude * nm_to_kgm:.2f}"
    if 'kgm' in lowered:
        return f"{magnitude:.2f}"
    return np.nan

# Run every raw torque entry through convert_torque and store the result as
# float; entries convert_torque cannot parse come back as NaN.
df['torque'] = df['torque'].apply(convert_torque).astype(float)
df.head()

In [None]:
# Missing values per column after the torque conversion.
df.isnull().sum()

In [None]:
# Fill missing torque values with the most frequent value (first mode).
# Assign the result back rather than using fillna(inplace=True) on df['torque']:
# the chained in-place form is deprecated and leaves df untouched under
# pandas Copy-on-Write.
df['torque'] = df['torque'].fillna(df['torque'].mode()[0])
df.isnull().sum()

In [None]:
def yearTransform(year):
    """Return the current calendar year minus ``year``."""
    current_year = datetime.now().year
    return current_year - year

# Overwrite 'year' with yearTransform's output (current year minus the
# original value), so the column holds an age in years from here on.
df["year"] = df["year"].apply(yearTransform)
df.head()

In [None]:
# Remove the 'name' column from the frame.
df = df.drop(columns=["name"])
df.head()

In [None]:
# Replace each listed column with the integer codes produced by LabelEncoder.
categorical_columns = ["fuel", "seller_type", "transmission", "owner", "seats"]
labelencoder = LabelEncoder()
for column_name in categorical_columns:
    codes = labelencoder.fit_transform(df[column_name])
    df[column_name] = codes
df.head()

In [None]:
# Side-by-side box plots of every column, with vertical x tick labels.
fig, ax = plt.subplots(figsize=(15, 10))
sns.boxplot(data=df, ax=ax)
ax.tick_params(axis='x', labelrotation=90)
ax.set_title('Boxplot for All Features')
plt.show()

In [None]:
threshold=2  # IQR multiplier used to place the outlier fences


def find_outliers(column):
    """Return the entries of ``column`` that lie outside the IQR fences.

    The fences are ``Q1 - threshold * IQR`` and ``Q3 + threshold * IQR``,
    where ``threshold`` is the module-level multiplier. The returned Series
    keeps the original index labels of the selected entries.
    """
    q1, q3 = column.quantile(0.25), column.quantile(0.75)
    spread = q3 - q1
    too_low = column < q1 - threshold * spread
    too_high = column > q3 + threshold * spread
    return column[too_low | too_high]

In [None]:
# Print, for each column that has any, the number of values outside the fences.
for column in df.columns:
    outliers = find_outliers(df[column])
    if outliers.size == 0:
        continue
    print(f"outliers - {column} :\n{outliers.size}")

In [None]:
def remove_outliers(df, columns=None, iqr_factor=None):
    """Drop every row that has an out-of-fence value in a checked column.

    For each checked column the fences are ``Q1 - k * IQR`` and
    ``Q3 + k * IQR``; a row is removed if any of its checked values falls
    outside its column's fences.

    Parameters:
        df: DataFrame to filter. It is not modified.
        columns: optional iterable of column labels to check. ``None``
            (default) checks every column of ``df``, as before.
        iqr_factor: optional multiplier ``k``. ``None`` (default) uses the
            module-level ``threshold``.

    Returns:
        The subset of ``df`` (all columns kept) without the flagged rows.

    Note:
        A checked column whose IQR is 0 has both fences equal to its
        quartile value, so every row that differs from that value is
        removed. Use ``columns`` to leave such columns (for example
        heavily skewed encoded categories) out of the check.
    """
    if iqr_factor is None:
        iqr_factor = threshold
    checked = df if columns is None else df[list(columns)]

    q1 = checked.quantile(0.25)
    q3 = checked.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - iqr_factor * iqr
    upper_bound = q3 + iqr_factor * iqr

    # Comparison against the per-column bounds aligns on column labels.
    out_of_range = (checked < lower_bound) | (checked > upper_bound)
    return df[~out_of_range.any(axis=1)]

In [None]:
# Keep only the rows that have no out-of-fence value in any column.
# NOTE(review): remove_outliers checks every column, including the
# label-encoded ones and selling_price; a column whose IQR is 0 loses every
# row that differs from its quartile value -- confirm this is intended.
df = remove_outliers(df)

In [None]:
# Re-run the per-column outlier count on the filtered frame.
for column in df.columns:
    outliers = find_outliers(df[column])
    if outliers.size == 0:
        continue
    print(f"outliers - {column} :\n{outliers.size}")

In [None]:
# Same box-plot overview as before, drawn from the filtered frame.
fig, ax = plt.subplots(figsize=(15, 10))
sns.boxplot(data=df, ax=ax)
ax.tick_params(axis='x', labelrotation=90)
ax.set_title('Boxplot for All Features')
plt.show()

In [None]:
# 30-bin histogram of the selling_price column.
fig, ax = plt.subplots(figsize=(10, 6))
df['selling_price'].plot.hist(bins=30, color='skyblue', edgecolor='black', ax=ax)
ax.set_title('Distribution of Selling Prices')
ax.set_xlabel('Selling Price')
ax.set_ylabel('Frequency')
ax.grid(True)
plt.show()


# Scatter of the 'year' column against selling price. By this point 'year'
# has been overwritten with (current year - production year), so the axis is
# labelled as car age rather than calendar year.
plt.figure(figsize=(10, 6))
plt.scatter(df['year'], df['selling_price'], alpha=0.5)
plt.title('Car Age vs Selling Price')
plt.xlabel('Car Age (years)')
plt.ylabel('Selling Price')
plt.grid(True)
plt.show()


# Scatter of engine against max_power.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(df['engine'], df['max_power'], alpha=0.5, color='orange')
ax.set_title('Engine Size vs Max Power')
ax.set_xlabel('Engine Size (CC)')
ax.set_ylabel('Max Power (BHP)')
ax.grid(True)
plt.show()

In [None]:
# Target is selling_price; every remaining column is a predictor.
y = df["selling_price"]
X = df.drop(columns=["selling_price"])

In [None]:
# Hold out 20% of the rows as the test set; random_state=42 fixes the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train.shape , X_test.shape , y_train.shape , y_test.shape

In [None]:
# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
X_train , X_test

In [None]:
# Re-wrap the scaled splits as DataFrames so features can be selected by name.
# The labels are taken from X rather than from a hand-maintained list, so they
# cannot drift out of sync with the columns that were actually scaled.
feature_names = list(X.columns)
X_train = pd.DataFrame(X_train, columns=feature_names)
X_test = pd.DataFrame(X_test, columns=feature_names)


#### Single LinearRegression

In [None]:
# Simple linear regression using 'mileage' as the only predictor.
X_train_single = X_train[['mileage']]
X_test_single = X_test[['mileage']]
model_single = LinearRegression()
model_single.fit(X_train_single, y_train)
y_pred_single = model_single.predict(X_test_single)

# Test-set error metrics.
mse_single = mean_squared_error(y_test, y_pred_single)
r2_single = r2_score(y_test, y_pred_single)
print(f"Mean Squared Error : {mse_single}")
print(f"R-squared : {r2_single}")

# Actual test points versus the fitted line.
plt.scatter(X_test_single, y_test, color='blue', label='Actual')
plt.plot(X_test_single, y_pred_single, color='red', label='Predicted')
# The plotted feature is 'mileage'; the label previously read 'Engine Size'.
plt.xlabel('Mileage')
plt.ylabel('Selling Price')
plt.legend()
plt.show()

In [None]:
# Linear regression on all features; report test-set MSE and R^2.
model_multi = LinearRegression().fit(X_train, y_train)
y_pred_multi = model_multi.predict(X_test)

r2_multi = r2_score(y_test, y_pred_multi)
mse_multi = mean_squared_error(y_test, y_pred_multi)
print(f"Mean Squared Error : {mse_multi}")
print(f"R-squared : {r2_multi}")

In [None]:
# Degree-3 polynomial feature expansion piped into a linear regression.
degree = 3
poly_steps = (PolynomialFeatures(degree), LinearRegression())
model_poly = make_pipeline(*poly_steps).fit(X_train, y_train)
y_pred_poly = model_poly.predict(X_test)

r2_poly = r2_score(y_test, y_pred_poly)
mse_poly = mean_squared_error(y_test, y_pred_poly)
print(f"Mean Squared Error : {mse_poly}")
print(f"R-squared : {r2_poly}")

In [None]:
# Ridge regression (alpha=1.0) on all features.
model_ridge = Ridge(alpha=1.0).fit(X_train, y_train)
y_pred_ridge = model_ridge.predict(X_test)

r2_ridge = r2_score(y_test, y_pred_ridge)
mse_ridge = mean_squared_error(y_test, y_pred_ridge)
print(f"Mean Squared Error : {mse_ridge}")
print(f"R-squared : {r2_ridge}")

In [None]:
# Lasso regression (alpha=0.1) on all features.
model_lasso = Lasso(alpha=0.1).fit(X_train, y_train)
y_pred_lasso = model_lasso.predict(X_test)

r2_lasso = r2_score(y_test, y_pred_lasso)
mse_lasso = mean_squared_error(y_test, y_pred_lasso)
print(f"Mean Squared Error : {mse_lasso}")
print(f"R-squared : {r2_lasso}")

In [None]:
# ElasticNet regression (alpha=0.1, l1_ratio=0.5) on all features.
model_elastic = ElasticNet(alpha=0.1, l1_ratio=0.5).fit(X_train, y_train)
y_pred_elastic = model_elastic.predict(X_test)

r2_elastic = r2_score(y_test, y_pred_elastic)
mse_elastic = mean_squared_error(y_test, y_pred_elastic)
print(f"Mean Squared Error : {mse_elastic}")
print(f"R-squared : {r2_elastic}")

In [None]:
# Collect the test-set metrics of the six fitted models, in matching order.
models = ['Single', 'Multi', 'Poly', 'Ridge', 'Lasso', 'ElasticNet']
mse_values = [mse_single, mse_multi, mse_poly, mse_ridge, mse_lasso, mse_elastic]
r2_values = [r2_single, r2_multi, r2_poly, r2_ridge, r2_lasso, r2_elastic]

# Two bar charts side by side: MSE on the left, R2 on the right.
fig, (ax_mse, ax_r2) = plt.subplots(1, 2, figsize=(14, 6))

ax_mse.bar(models, mse_values, color='skyblue')
ax_mse.set_title('MSE for Different Regression Models')
ax_mse.set_ylabel('Mean Squared Error')
ax_mse.set_xlabel('Models')

ax_r2.bar(models, r2_values, color='lightgreen')
ax_r2.set_title('R2 Score for Different Regression Models')
ax_r2.set_ylabel('R2 Score')
ax_r2.set_xlabel('Models')

plt.tight_layout()
plt.show()