### Import required libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats
import statsmodels.api as sm

#### Read the data

Load the csv file and set the first column as index

In [2]:
df=pd.read_csv('car_price.csv')

In [3]:
df=df.set_index('Car_Name')

In [4]:
df.head(5)

Unnamed: 0_level_0,Year,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner
Car_Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
ritz,2014,3.35,5.59,27000,Petrol,Dealer,Manual,0
sx4,2013,4.75,9.54,43000,Diesel,Dealer,Manual,0
ciaz,2017,7.25,9.85,6900,Petrol,Dealer,Manual,0
wagon r,2011,2.85,4.15,5200,Petrol,Dealer,Manual,0
swift,2014,4.6,6.87,42450,Diesel,Dealer,Manual,0


Our objective is to predict the selling price of the cars data.

**The data definition is as follows:** <br><br>
**Car_Name:** name of the car <br>

**YearThis:** year in which the car was bought <br>

**Present_Price:** current ex-showroom price of the car (in lakhs)<br>

**Kms_Driven:** distance completed by the car in km <br>

**Fuel_Type:** fuel type of the car <br>

**Seller_Type:** defines whether the seller is a dealer or an individual<br>

**Transmission:** defines whether the car is manual or automatic <br>

**Owner:** defines the number of owners the car has previously had <br>

**Selling_Price:** price the owner wants to sell the car at (in lakhs) (response variable)

### Let's begin with some hands-on practice exercises

**1. Build a full model and interpret the beta coefficients**

        Hint: A full model is a model which includes all the features 

In [5]:
df['Seller_Type'].value_counts()

Seller_Type
Dealer        195
Individual    106
Name: count, dtype: int64

In [6]:
df['Transmission'].value_counts()

Transmission
Manual       261
Automatic     40
Name: count, dtype: int64

In [7]:
df['Fuel_Type'].value_counts()

Fuel_Type
Petrol    239
Diesel     60
CNG         2
Name: count, dtype: int64

In [8]:
df1 = df.copy(deep=True)

df1 = pd.get_dummies(df1,columns=['Seller_Type'],dtype='int64',drop_first=True)
df1 = pd.get_dummies(df1,columns=['Transmission'],dtype='int64',drop_first=True)
df1.head()

Unnamed: 0_level_0,Year,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Owner,Seller_Type_Individual,Transmission_Manual
Car_Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
ritz,2014,3.35,5.59,27000,Petrol,0,0,1
sx4,2013,4.75,9.54,43000,Diesel,0,0,1
ciaz,2017,7.25,9.85,6900,Petrol,0,0,1
wagon r,2011,2.85,4.15,5200,Petrol,0,0,1
swift,2014,4.6,6.87,42450,Diesel,0,0,1


In [9]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [10]:
df1['Fuel_Type'] = le.fit_transform(df1['Fuel_Type'])

In [11]:
df1 = pd.get_dummies(df1,columns=['Fuel_Type'],dtype='int64')
df1.head()

Unnamed: 0_level_0,Year,Selling_Price,Present_Price,Kms_Driven,Owner,Seller_Type_Individual,Transmission_Manual,Fuel_Type_0,Fuel_Type_1,Fuel_Type_2
Car_Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
ritz,2014,3.35,5.59,27000,0,0,1,0,0,1
sx4,2013,4.75,9.54,43000,0,0,1,0,1,0
ciaz,2017,7.25,9.85,6900,0,0,1,0,0,1
wagon r,2011,2.85,4.15,5200,0,0,1,0,0,1
swift,2014,4.6,6.87,42450,0,0,1,0,1,0


In [12]:
# Fuel Type
# 0 -> CNG
# 1 -> Diesel
# 2 -> Petrol

In [13]:
df1[['Fuel_Type_0','Fuel_Type_1','Fuel_Type_2']].sum()

Fuel_Type_0      2
Fuel_Type_1     60
Fuel_Type_2    239
dtype: int64

In [14]:
y = df1['Selling_Price']
x = df1.drop('Selling_Price',axis=1)
xc = sm.add_constant(x)

model = sm.OLS(y,xc).fit()
model.summary()

0,1,2,3
Dep. Variable:,Selling_Price,R-squared:,0.883
Model:,OLS,Adj. R-squared:,0.879
Method:,Least Squares,F-statistic:,274.3
Date:,"Mon, 12 May 2025",Prob (F-statistic):,5.71e-131
Time:,18:38:22,Log-Likelihood:,-593.62
No. Observations:,301,AIC:,1205.0
Df Residuals:,292,BIC:,1239.0
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-591.1580,64.903,-9.108,0.000,-718.896,-463.420
Year,0.3931,0.043,9.159,0.000,0.309,0.478
Present_Price,0.4372,0.016,27.355,0.000,0.406,0.469
Kms_Driven,-7.013e-06,3.23e-06,-2.172,0.031,-1.34e-05,-6.57e-07
Owner,-0.6742,0.423,-1.595,0.112,-1.506,0.158
Seller_Type_Individual,-1.1213,0.257,-4.371,0.000,-1.626,-0.616
Transmission_Manual,-1.4482,0.328,-4.417,0.000,-2.093,-0.803
Fuel_Type_0,-198.0785,21.631,-9.157,0.000,-240.651,-155.506
Fuel_Type_1,-195.6086,21.674,-9.025,0.000,-238.265,-152.952

0,1,2,3
Omnibus:,95.107,Durbin-Watson:,1.795
Prob(Omnibus):,0.0,Jarque-Bera (JB):,696.392
Skew:,1.075,Prob(JB):,6.03e-152
Kurtosis:,10.135,Cond. No.,3.8e+20


In [15]:
# Owner is insignificant variable, therefore dropping it.

In [16]:
y = df1['Selling_Price']
x = df1.drop(['Selling_Price','Owner'],axis=1)
xc = sm.add_constant(x)

model = sm.OLS(y,xc).fit()
model.summary()

0,1,2,3
Dep. Variable:,Selling_Price,R-squared:,0.882
Model:,OLS,Adj. R-squared:,0.879
Method:,Least Squares,F-statistic:,311.5
Date:,"Mon, 12 May 2025",Prob (F-statistic):,1.1e-131
Time:,18:38:22,Log-Likelihood:,-594.93
No. Observations:,301,AIC:,1206.0
Df Residuals:,293,BIC:,1236.0
Df Model:,7,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-606.1467,64.389,-9.414,0.000,-732.870,-479.424
Year,0.4030,0.043,9.465,0.000,0.319,0.487
Present_Price,0.4357,0.016,27.237,0.000,0.404,0.467
Kms_Driven,-7.004e-06,3.24e-06,-2.163,0.031,-1.34e-05,-6.32e-07
Seller_Type_Individual,-1.1731,0.255,-4.598,0.000,-1.675,-0.671
Transmission_Manual,-1.4308,0.329,-4.355,0.000,-2.077,-0.784
Fuel_Type_0,-203.0687,21.460,-9.463,0.000,-245.303,-160.834
Fuel_Type_1,-200.6014,21.503,-9.329,0.000,-242.921,-158.281
Fuel_Type_2,-202.4766,21.452,-9.439,0.000,-244.696,-160.258

0,1,2,3
Omnibus:,93.414,Durbin-Watson:,1.785
Prob(Omnibus):,0.0,Jarque-Bera (JB):,697.351
Skew:,1.044,Prob(JB):,3.73e-152
Kurtosis:,10.158,Cond. No.,4.02e+20


In [17]:
# Therefore, Seller Type, Transmission, Fuel Type have negative impact on the selling price of the car.

**2. Is there multicollinearity present? If yes, which variables are involved in multicollinearity?**

In [18]:
from statsmodels.stats.outliers_influence import variance_inflation_factor as vif

In [19]:
y = df1['Selling_Price']
x = df1.drop('Selling_Price',axis=1)
xc = sm.add_constant(x)
vf = [vif(xc.values,1) for i in range(xc.shape[1])]

In [20]:
vdf = pd.DataFrame(vf,index=xc.columns,columns=['vif'])
vdf

Unnamed: 0,vif
const,1.482347
Year,1.482347
Present_Price,1.482347
Kms_Driven,1.482347
Owner,1.482347
Seller_Type_Individual,1.482347
Transmission_Manual,1.482347
Fuel_Type_0,1.482347
Fuel_Type_1,1.482347
Fuel_Type_2,1.482347


In [21]:
# Since vif value is close to 1, there is no multicollinarity.

In [22]:
model = sm.OLS(y,xc).fit()
model.summary()

0,1,2,3
Dep. Variable:,Selling_Price,R-squared:,0.883
Model:,OLS,Adj. R-squared:,0.879
Method:,Least Squares,F-statistic:,274.3
Date:,"Mon, 12 May 2025",Prob (F-statistic):,5.71e-131
Time:,18:38:22,Log-Likelihood:,-593.62
No. Observations:,301,AIC:,1205.0
Df Residuals:,292,BIC:,1239.0
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-591.1580,64.903,-9.108,0.000,-718.896,-463.420
Year,0.3931,0.043,9.159,0.000,0.309,0.478
Present_Price,0.4372,0.016,27.355,0.000,0.406,0.469
Kms_Driven,-7.013e-06,3.23e-06,-2.172,0.031,-1.34e-05,-6.57e-07
Owner,-0.6742,0.423,-1.595,0.112,-1.506,0.158
Seller_Type_Individual,-1.1213,0.257,-4.371,0.000,-1.626,-0.616
Transmission_Manual,-1.4482,0.328,-4.417,0.000,-2.093,-0.803
Fuel_Type_0,-198.0785,21.631,-9.157,0.000,-240.651,-155.506
Fuel_Type_1,-195.6086,21.674,-9.025,0.000,-238.265,-152.952

0,1,2,3
Omnibus:,95.107,Durbin-Watson:,1.795
Prob(Omnibus):,0.0,Jarque-Bera (JB):,696.392
Skew:,1.075,Prob(JB):,6.03e-152
Kurtosis:,10.135,Cond. No.,3.8e+20


**3. What is the impact of present price of the car and seller type on the selling price?**

In [23]:
df1.columns

Index(['Year', 'Selling_Price', 'Present_Price', 'Kms_Driven', 'Owner',
       'Seller_Type_Individual', 'Transmission_Manual', 'Fuel_Type_0',
       'Fuel_Type_1', 'Fuel_Type_2'],
      dtype='object')

In [24]:
y = df1['Selling_Price']
x = df1[['Present_Price','Seller_Type_Individual']]
xc = sm.add_constant(x)

model = sm.OLS(y,xc).fit()
model.summary()

0,1,2,3
Dep. Variable:,Selling_Price,R-squared:,0.786
Model:,OLS,Adj. R-squared:,0.785
Method:,Least Squares,F-statistic:,548.4
Date:,"Mon, 12 May 2025",Prob (F-statistic):,1.34e-100
Time:,18:38:22,Log-Likelihood:,-683.71
No. Observations:,301,AIC:,1373.0
Df Residuals:,298,BIC:,1385.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,1.5423,0.261,5.901,0.000,1.028,2.057
Present_Price,0.4758,0.018,25.956,0.000,0.440,0.512
Seller_Type_Individual,-1.4493,0.331,-4.376,0.000,-2.101,-0.798

0,1,2,3
Omnibus:,72.895,Durbin-Watson:,1.645
Prob(Omnibus):,0.0,Jarque-Bera (JB):,845.311
Skew:,0.581,Prob(JB):,2.77e-184
Kurtosis:,11.127,Cond. No.,33.4


In [25]:
# 78.6% of variation in selling price is impacted by Present Price and Seller Type.
# Price of vehicle sold by individuals are lesser than those with dealers.

**4. Consider all the numeric features in the data. Do all of them significantly contribute to explaining the variation in the selling price?**

In [26]:
df2 = df.copy(deep=True)
df2.head()

Unnamed: 0_level_0,Year,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner
Car_Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
ritz,2014,3.35,5.59,27000,Petrol,Dealer,Manual,0
sx4,2013,4.75,9.54,43000,Diesel,Dealer,Manual,0
ciaz,2017,7.25,9.85,6900,Petrol,Dealer,Manual,0
wagon r,2011,2.85,4.15,5200,Petrol,Dealer,Manual,0
swift,2014,4.6,6.87,42450,Diesel,Dealer,Manual,0


In [27]:
y = df2['Selling_Price']
x = df2.select_dtypes(include=np.number).drop('Selling_Price',axis=1)
xc = sm.add_constant(x)

model = sm.OLS(y,xc).fit()
model.summary()

0,1,2,3
Dep. Variable:,Selling_Price,R-squared:,0.852
Model:,OLS,Adj. R-squared:,0.85
Method:,Least Squares,F-statistic:,426.6
Date:,"Mon, 12 May 2025",Prob (F-statistic):,1.66e-121
Time:,18:38:22,Log-Likelihood:,-628.25
No. Observations:,301,AIC:,1267.0
Df Residuals:,296,BIC:,1285.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-937.7642,94.392,-9.935,0.000,-1123.528,-752.000
Year,0.4661,0.047,9.949,0.000,0.374,0.558
Present_Price,0.5256,0.013,39.067,0.000,0.499,0.552
Kms_Driven,-1.267e-06,3.51e-06,-0.361,0.718,-8.17e-06,5.64e-06
Owner,-0.9513,0.466,-2.042,0.042,-1.868,-0.034

0,1,2,3
Omnibus:,69.45,Durbin-Watson:,1.56
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1362.873
Skew:,0.241,Prob(JB):,1.1400000000000001e-296
Kurtosis:,13.413,Cond. No.,44600000.0


In [28]:
# Kms_Driven is insignificant.

**5. In the model obtained in question 4, consider the interaction effect of the present price of the car and the year in which it was purchased. Compare the resultant model with the model obtained in previous question and give your interpretation**

In [29]:
df3 = df.copy(deep=True)
df3['Year_Present_Price'] = df3['Year']*df3['Present_Price']

In [30]:
df3.columns

Index(['Year', 'Selling_Price', 'Present_Price', 'Kms_Driven', 'Fuel_Type',
       'Seller_Type', 'Transmission', 'Owner', 'Year_Present_Price'],
      dtype='object')

In [31]:
y = df3['Selling_Price']
x = df3[['Year','Present_Price','Year_Present_Price']]
xc = sm.add_constant(x)

model = sm.OLS(y,xc).fit()
model.summary()

0,1,2,3
Dep. Variable:,Selling_Price,R-squared:,0.963
Model:,OLS,Adj. R-squared:,0.963
Method:,Least Squares,F-statistic:,2581.0
Date:,"Mon, 12 May 2025",Prob (F-statistic):,2.58e-212
Time:,18:38:22,Log-Likelihood:,-419.58
No. Observations:,301,AIC:,847.2
Df Residuals:,297,BIC:,862.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,69.7255,52.799,1.321,0.188,-34.182,173.633
Year,-0.0346,0.026,-1.318,0.189,-0.086,0.017
Present_Price,-137.3900,4.576,-30.026,0.000,-146.395,-128.385
Year_Present_Price,0.0685,0.002,30.141,0.000,0.064,0.073

0,1,2,3
Omnibus:,36.951,Durbin-Watson:,1.784
Prob(Omnibus):,0.0,Jarque-Bera (JB):,175.983
Skew:,-0.319,Prob(JB):,6.11e-39
Kurtosis:,6.691,Cond. No.,21700000.0


In [32]:
# The model performs better than the q4 resultant. 
# The interaction variable "Year_Present_Price" has significantly lower impact on the "Selling Price", "Year" is found to be insignificant here.

In [33]:
# The interaction variable created here isn't logically valid and is used to only provide a picture of model creation using interaction of variables.

**6. What is the impact of fuel type of cars on the selling price?**

In [34]:
y = df1['Selling_Price']
x = df1[['Fuel_Type_0','Fuel_Type_1','Fuel_Type_2']]
xc = sm.add_constant(x)

model = sm.OLS(y,xc).fit()
model.summary()

0,1,2,3
Dep. Variable:,Selling_Price,R-squared:,0.305
Model:,OLS,Adj. R-squared:,0.3
Method:,Least Squares,F-statistic:,65.41
Date:,"Mon, 12 May 2025",Prob (F-statistic):,2.7999999999999998e-24
Time:,18:38:22,Log-Likelihood:,-861.21
No. Observations:,301,AIC:,1728.0
Df Residuals:,298,BIC:,1740.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,4.1607,0.767,5.424,0.000,2.651,5.670
Fuel_Type_0,-1.0607,2.260,-0.469,0.639,-5.508,3.387
Fuel_Type_1,6.1178,0.860,7.117,0.000,4.426,7.810
Fuel_Type_2,-0.8965,0.791,-1.133,0.258,-2.454,0.661

0,1,2,3
Omnibus:,149.368,Durbin-Watson:,1.338
Prob(Omnibus):,0.0,Jarque-Bera (JB):,826.132
Skew:,2.022,Prob(JB):,4.05e-180
Kurtosis:,10.036,Cond. No.,9640000000000000.0


In [35]:
# The fuel type diesel has a significant impact on the selling price whereas CNG and Petrol turned out to be insignificant.

**7. Does the model significantly explain variation in the target variable? Justify your answer with analysis of variation**

            Regress the selling price over the transmission.
            
            Selling_Price ~ Transmission

In [36]:
y = df1['Selling_Price']
x = df1['Transmission_Manual']
xc = sm.add_constant(x)

model = sm.OLS(y,xc).fit()
model.summary()

0,1,2,3
Dep. Variable:,Selling_Price,R-squared:,0.135
Model:,OLS,Adj. R-squared:,0.132
Method:,Least Squares,F-statistic:,46.58
Date:,"Mon, 12 May 2025",Prob (F-statistic):,4.9e-11
Time:,18:38:22,Log-Likelihood:,-894.2
No. Observations:,301,AIC:,1792.0
Df Residuals:,299,BIC:,1800.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,9.4200,0.749,12.580,0.000,7.946,10.894
Transmission_Manual,-5.4880,0.804,-6.825,0.000,-7.070,-3.906

0,1,2,3
Omnibus:,143.925,Durbin-Watson:,1.182
Prob(Omnibus):,0.0,Jarque-Bera (JB):,946.055
Skew:,1.855,Prob(JB):,3.6900000000000003e-206
Kurtosis:,10.853,Cond. No.,5.31


In [37]:
# Manual transmission cars tend to have lower price than the cars with automatic.

**8. Regress the selling price over the present price. Compare the 99% and 95% confidence interval of present price of a car**

In [38]:
stats.shapiro(df1['Present_Price'])

ShapiroResult(statistic=0.6911642449420012, pvalue=3.946113617575611e-23)

In [39]:
df['Present_Price_z_score'] = stats.zscore(df['Present_Price'])
df['Present_Price_z_score']

Car_Name
ritz      -0.236215
sx4        0.221505
ciaz       0.257427
wagon r   -0.403079
swift     -0.087890
             ...   
city       0.460214
brio      -0.200292
city       0.390687
city       0.564504
brio      -0.200292
Name: Present_Price_z_score, Length: 301, dtype: float64

In [40]:
popmean = np.mean(df['Present_Price_z_score'])
sd = np.std(df['Present_Price_z_score'])
n = 0.70*df.shape[0]
se = sd/np.sqrt(n)
print("95% CI",stats.norm.interval(0.95,loc=popmean,scale=se))
print()
print("99% CI",stats.norm.interval(0.99,loc=popmean,scale=se))

95% CI (-0.1350255090367089, 0.13502550903670874)

99% CI (-0.17745359896752422, 0.17745359896752405)


**9. Verify the statement: The sum of the residuals in any regression model that contains an intercept β<sub>0</sub> is always zero**

        To verify the result, we will fit a regression model of 'Present_Price' on 'Selling_Price' 

In [41]:
y = df['Selling_Price']
x = df['Present_Price']
xc = sm.add_constant(x)

model = sm.OLS(y,xc).fit()
model.summary()

0,1,2,3
Dep. Variable:,Selling_Price,R-squared:,0.773
Model:,OLS,Adj. R-squared:,0.772
Method:,Least Squares,F-statistic:,1016.0
Date:,"Mon, 12 May 2025",Prob (F-statistic):,3.6e-98
Time:,18:38:22,Log-Likelihood:,-693.08
No. Observations:,301,AIC:,1390.0
Df Residuals:,299,BIC:,1398.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.7185,0.187,3.847,0.000,0.351,1.086
Present_Price,0.5168,0.016,31.874,0.000,0.485,0.549

0,1,2,3
Omnibus:,59.775,Durbin-Watson:,1.533
Prob(Omnibus):,0.0,Jarque-Bera (JB):,926.121
Skew:,0.084,Prob(JB):,7.86e-202
Kurtosis:,11.592,Cond. No.,15.4


In [42]:
residuals = model.resid
print("Sum of Squared Errors:",sum(residuals))

Sum of Squared Errors: -2.6689761511988763e-13


In [43]:
round(sum(residuals))==0

True

Yes, it will always be zero

**10. Consider two models as specified below. Compare the performance of the models**

                First model:
        
        Selling_Price ~ Year + Present_Price + Kms_Driven + Owner + Fuel_Type + Seller_Type + Transmission
        
        
                Second model:
        
        Selling_Price ~ Year + Present_Price + Kms_Driven + Owner 

In [44]:
df1.columns

Index(['Year', 'Selling_Price', 'Present_Price', 'Kms_Driven', 'Owner',
       'Seller_Type_Individual', 'Transmission_Manual', 'Fuel_Type_0',
       'Fuel_Type_1', 'Fuel_Type_2'],
      dtype='object')

In [45]:
y = df1['Selling_Price']
x = df1[['Year','Present_Price','Kms_Driven','Owner','Fuel_Type_0','Fuel_Type_1','Fuel_Type_2','Seller_Type_Individual','Transmission_Manual']]
xc = sm.add_constant(x)

model1 = sm.OLS(y,xc).fit()
model1.summary()

0,1,2,3
Dep. Variable:,Selling_Price,R-squared:,0.883
Model:,OLS,Adj. R-squared:,0.879
Method:,Least Squares,F-statistic:,274.3
Date:,"Mon, 12 May 2025",Prob (F-statistic):,5.71e-131
Time:,18:38:22,Log-Likelihood:,-593.62
No. Observations:,301,AIC:,1205.0
Df Residuals:,292,BIC:,1239.0
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-591.1580,64.903,-9.108,0.000,-718.896,-463.420
Year,0.3931,0.043,9.159,0.000,0.309,0.478
Present_Price,0.4372,0.016,27.355,0.000,0.406,0.469
Kms_Driven,-7.013e-06,3.23e-06,-2.172,0.031,-1.34e-05,-6.57e-07
Owner,-0.6742,0.423,-1.595,0.112,-1.506,0.158
Fuel_Type_0,-198.0785,21.631,-9.157,0.000,-240.651,-155.506
Fuel_Type_1,-195.6086,21.674,-9.025,0.000,-238.265,-152.952
Fuel_Type_2,-197.4709,21.624,-9.132,0.000,-240.030,-154.912
Seller_Type_Individual,-1.1213,0.257,-4.371,0.000,-1.626,-0.616

0,1,2,3
Omnibus:,95.107,Durbin-Watson:,1.795
Prob(Omnibus):,0.0,Jarque-Bera (JB):,696.392
Skew:,1.075,Prob(JB):,6.03e-152
Kurtosis:,10.135,Cond. No.,4.37e+20


In [46]:
y = df1['Selling_Price']
x = df1[['Year','Present_Price','Kms_Driven','Owner']]
xc = sm.add_constant(x)

model2 = sm.OLS(y,xc).fit()
model2.summary()

0,1,2,3
Dep. Variable:,Selling_Price,R-squared:,0.852
Model:,OLS,Adj. R-squared:,0.85
Method:,Least Squares,F-statistic:,426.6
Date:,"Mon, 12 May 2025",Prob (F-statistic):,1.66e-121
Time:,18:38:22,Log-Likelihood:,-628.25
No. Observations:,301,AIC:,1267.0
Df Residuals:,296,BIC:,1285.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-937.7642,94.392,-9.935,0.000,-1123.528,-752.000
Year,0.4661,0.047,9.949,0.000,0.374,0.558
Present_Price,0.5256,0.013,39.067,0.000,0.499,0.552
Kms_Driven,-1.267e-06,3.51e-06,-0.361,0.718,-8.17e-06,5.64e-06
Owner,-0.9513,0.466,-2.042,0.042,-1.868,-0.034

0,1,2,3
Omnibus:,69.45,Durbin-Watson:,1.56
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1362.873
Skew:,0.241,Prob(JB):,1.1400000000000001e-296
Kurtosis:,13.413,Cond. No.,44600000.0


The first model depicts better performance by explaining 88.3% of variation in selling price than the second model with 85.2% R^2. 