In [1]:
#Multiple Linear Regression – Car Price Prediction

## Uploading the dataset from Kaggle

# Upload the kaggle.json file downloaded from Kaggle before running the commands below.

!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/

!kaggle datasets download -d hellbuoy/car-price-prediction #Datset name with user_id of the person who uploaded this dataset.

## Unzipping the file

from zipfile import ZipFile

# Open the downloaded archive read-only and extract all of its members into the
# current working directory.
archive_path = "/content/car-price-prediction.zip"
with ZipFile(archive_path, mode="r") as zip_file:
    zip_file.extractall()

##Starting Main Program

# Importing all required libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import root_mean_squared_error,r2_score
from sklearn.linear_model import LinearRegression

# Read the extracted CSV file into a DataFrame.
df = pd.read_csv('/content/CarPrice_Assignment.csv')

# Show the first 5 rows of the dataset. A bare `df.head()` is only rendered when
# it is the last expression of a cell, so display() is used to show it here.
display(df.head())

# Print column names, non-null counts and dtypes (info() prints directly).
df.info()

# In this dataset 16 columns are of numerical type and 10 are of object type.

# Show summary statistics; display() is needed for the same reason as above.
display(df.describe())

# Drop rows that contain missing values.
df = df.dropna()

# Predictor columns used as model input, and the target column to predict.
feature_columns = [
    'symboling',
    'enginetype',
    'enginesize',
    'horsepower',
    'peakrpm',
    'highwaympg',
    'citympg',
    'cylindernumber',
]
x = df[feature_columns]
y = df['price']

# Hold out 30% of the rows for testing and train on the remaining 70%;
# the fixed random_state keeps the split repeatable between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=2304,
)

# Numerical predictors, which are standardised before modelling.
num_features = ['symboling','enginesize','horsepower','peakrpm','highwaympg','citympg']
# Categorical predictors, which are one-hot encoded.
cat_features = ['cylindernumber','enginetype']

# One-hot encoder for the categorical columns (dense output).
# handle_unknown='ignore' encodes a category that was not present in the training
# split as an all-zero row instead of raising at predict time, which can otherwise
# happen when a rare category lands only in the test split.
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# StandardScaler for scaling the numerical columns.
scalar = StandardScaler()

# ColumnTransformer applies each transformer to its own subset of columns;
# remainder="drop" discards any column not listed in either subset.
ct = ColumnTransformer(
    transformers=[
        ('ohe_transform', ohe, cat_features),
        ('scalar_transform', scalar, num_features),
    ],
    remainder="drop",
)

# Chain the preprocessing step and the linear regression model into one pipeline,
# so the same transformations are applied at fit time and at predict time.
model_pipeline = Pipeline(steps = [('Preprocess',ct),('model',LinearRegression())])

# Show the pipeline steps. A bare `model_pipeline` expression is only rendered
# when it is the last line of a cell, so display() is used to show it here.
display(model_pipeline)

# Fit the preprocessing step and the regression model on the training split.
model_pipeline.fit(x_train, y_train)

# Predict the target for the held-out test rows with the fitted pipeline.
y_pred = model_pipeline.predict(x_test)

# Compare predictions with the true test values using root mean squared error
# and the R^2 score to check how well the model is predicting.
rmse, r2 = (
    root_mean_squared_error(y_true=y_test, y_pred=y_pred),
    r2_score(y_true=y_test, y_pred=y_pred),
)

# Print the RMSE and R^2 results.
print(f"rmse: {rmse}\nr2: {r2}")

Dataset URL: https://www.kaggle.com/datasets/hellbuoy/car-price-prediction
License(s): unknown
Downloading car-price-prediction.zip to /content
  0% 0.00/18.1k [00:00<?, ?B/s]
100% 18.1k/18.1k [00:00<00:00, 43.7MB/s]
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 26 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   car_ID            205 non-null    int64  
 1   symboling         205 non-null    int64  
 2   CarName           205 non-null    object 
 3   fueltype          205 non-null    object 
 4   aspiration        205 non-null    object 
 5   doornumber        205 non-null    object 
 6   carbody           205 non-null    object 
 7   drivewheel        205 non-null    object 
 8   enginelocation    205 non-null    object 
 9   wheelbase         205 non-null    float64
 10  carlength         205 non-null    float64
 11  carwidth          205 non-null    float64
 12  carheight      