# Regression 

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('Car_Price_Prediction.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Make          994 non-null    object 
 1   Model         999 non-null    object 
 2   Year          999 non-null    float64
 3   Engine Size   998 non-null    float64
 4   Mileage       1000 non-null   int64  
 5   Fuel Type     998 non-null    object 
 6   Transmission  999 non-null    object 
 7   Price         996 non-null    float64
dtypes: float64(3), int64(1), object(4)
memory usage: 62.6+ KB


In [4]:
def MissingValues(df):
    for col in df.columns:
        if df[col].isnull().any():
            if df[col].dtype == 'object':
                df[col].fillna(df[col].mode()[0], inplace=True)
            else:
                df[col].fillna(df[col].mean(), inplace=True)

    return df

In [5]:
MissingValues(df)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mean(), inplace=True)


Unnamed: 0,Make,Model,Year,Engine Size,Mileage,Fuel Type,Transmission,Price
0,Honda,Model B,2015.0,3.9,74176,Petrol,Manual,30246.207931
1,Ford,Model C,2014.0,1.7,94799,Electric,Automatic,22785.747684
2,BMW,Model B,2006.0,4.1,98385,Electric,Manual,25760.290347
3,Ford,Model B,2015.0,2.6,88919,Electric,Automatic,25638.003491
4,Honda,Model C,2004.0,3.4,138482,Petrol,Automatic,21021.386657
...,...,...,...,...,...,...,...,...
995,Toyota,Model D,2002.0,1.9,5445,Petrol,Manual,22765.597091
996,Honda,Model B,2020.0,3.1,149112,Diesel,Manual,30392.575567
997,Ford,Model C,2008.0,1.9,195387,Petrol,Automatic,16446.892292
998,Toyota,Model A,2003.0,4.4,246,Petrol,Automatic,27396.156708


In [6]:
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

In [7]:
def Encoding(df):
    encoder = LabelEncoder()
    for col in df.columns:
        if df[col].dtype == 'object':
            if df[col].nunique() <= 5:
                dummies = pd.get_dummies(df[col], prefix=col, dtype=int)
                df = pd.concat([df.drop(columns=[col]), dummies], axis=1)
            else:
                df[col] = encoder.fit_transform(df[col])
    return df

In [8]:
df = Encoding(df)

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 19 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Year                    1000 non-null   float64
 1   Engine Size             1000 non-null   float64
 2   Mileage                 1000 non-null   int64  
 3   Price                   1000 non-null   float64
 4   Make_Audi               1000 non-null   int64  
 5   Make_BMW                1000 non-null   int64  
 6   Make_Ford               1000 non-null   int64  
 7   Make_Honda              1000 non-null   int64  
 8   Make_Toyota             1000 non-null   int64  
 9   Model_Model A           1000 non-null   int64  
 10  Model_Model B           1000 non-null   int64  
 11  Model_Model C           1000 non-null   int64  
 12  Model_Model D           1000 non-null   int64  
 13  Model_Model E           1000 non-null   int64  
 14  Fuel Type_Diesel        1000 non-null   i

In [10]:
def Scaling(df):
    minmax = MinMaxScaler()
    num_col = df.select_dtypes(include=['float64', 'int64']).columns.drop('Price')
    df[num_col] = minmax.fit_transform(df[num_col])
    return df

In [11]:
df = Scaling(df)

In [13]:
from sklearn.model_selection import train_test_split

In [15]:
x = df.drop('Price',axis=1)
y = df['Price']

In [None]:
x_train,x_test,y_train,y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [17]:
from sklearn.linear_model import LinearRegression

In [None]:
lr = LinearRegression()

In [19]:
lr.fit(x_train, y_train)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [20]:
y_pred = lr.predict(x_test)

In [21]:
from sklearn.metrics import r2_score, mean_absolute_error

In [24]:
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

# K-Fold Cross Validation

In [26]:
from sklearn.model_selection import  KFold, cross_val_score

In [27]:
kf = KFold(n_splits=5,shuffle=True,random_state=42)

In [28]:
scores = cross_val_score(lr, x, y, cv=kf, scoring='r2')

In [29]:
print(scores)

[0.81643994 0.83094191 0.84545947 0.83473159 0.816482  ]


In [30]:
import numpy as np

In [32]:
np.mean(scores)

np.float64(0.8288109833197463)

In [33]:
np.std(scores)

np.float64(0.011151799440909778)

In [34]:
from sklearn.metrics import make_scorer

In [35]:
mae = make_scorer(mean_absolute_error, greater_is_better=False)

In [36]:
scores = cross_val_score(lr, x, y, cv=kf, scoring=mae)

In [38]:
print(-scores)

[1810.7825751  1618.86660418 1677.73983478 1616.86552173 1687.08730517]


In [39]:
print(-scores.mean())

1682.2683681901722
