# Write an ML program that will predict the fuel efficiency (MPG) of a car

### Load your data

In [1]:
import pandas as pd

In [2]:
# Read the car fuel-economy table from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer="mpg.csv")

In [3]:
df.head()

Unnamed: 0,MPG,Cylinders,Engine Disp,Horsepower,Weight,Accelerate,Year,Origin
0,15.0,8,390.0,190,3850,8.5,70,American
1,21.0,6,199.0,90,2648,15.0,70,American
2,18.0,6,199.0,97,2774,15.5,70,American
3,16.0,8,304.0,150,3433,12.0,70,American
4,14.0,8,455.0,225,3086,10.0,70,American


### Explore your data (EDA)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 392 entries, 0 to 391
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   MPG          392 non-null    float64
 1   Cylinders    392 non-null    int64  
 2   Engine Disp  392 non-null    float64
 3   Horsepower   392 non-null    int64  
 4   Weight       392 non-null    int64  
 5   Accelerate   392 non-null    float64
 6   Year         392 non-null    int64  
 7   Origin       392 non-null    object 
dtypes: float64(3), int64(4), object(1)
memory usage: 24.6+ KB


In [5]:
df.describe()

Unnamed: 0,MPG,Cylinders,Engine Disp,Horsepower,Weight,Accelerate,Year
count,392.0,392.0,392.0,392.0,392.0,392.0,392.0
mean,23.445918,5.471939,194.41199,104.469388,2977.584184,15.541327,75.979592
std,7.805007,1.705783,104.644004,38.49116,849.40256,2.758864,3.683737
min,9.0,3.0,68.0,46.0,1613.0,8.0,70.0
25%,17.0,4.0,105.0,75.0,2225.25,13.775,73.0
50%,22.75,4.0,151.0,93.5,2803.5,15.5,76.0
75%,29.0,8.0,275.75,126.0,3614.75,17.025,79.0
max,46.6,8.0,455.0,230.0,5140.0,24.8,82.0


In [8]:
# Pairwise correlation between the numeric columns only. 'Origin' holds text,
# and recent pandas versions raise on non-numeric data instead of silently
# dropping it, so the numeric columns are selected explicitly first.
df.select_dtypes(include="number").corr()

Unnamed: 0,MPG,Cylinders,Engine Disp,Horsepower,Weight,Accelerate,Year
MPG,1.0,-0.777618,-0.805127,-0.778427,-0.832244,0.423329,0.580541
Cylinders,-0.777618,1.0,0.950823,0.842983,0.897527,-0.504683,-0.345647
Engine Disp,-0.805127,0.950823,1.0,0.897257,0.932994,-0.5438,-0.369855
Horsepower,-0.778427,0.842983,0.897257,1.0,0.864538,-0.689196,-0.416361
Weight,-0.832244,0.897527,0.932994,0.864538,1.0,-0.416839,-0.30912
Accelerate,0.423329,-0.504683,-0.5438,-0.689196,-0.416839,1.0,0.290316
Year,0.580541,-0.345647,-0.369855,-0.416361,-0.30912,0.290316,1.0


In [9]:
# Correlation of every numeric column with the target (MPG). The text column
# 'Origin' is excluded up front so this also runs on pandas versions where
# corr() no longer drops non-numeric columns automatically.
df.select_dtypes(include="number").corr()["MPG"]

MPG            1.000000
Cylinders     -0.777618
Engine Disp   -0.805127
Horsepower    -0.778427
Weight        -0.832244
Accelerate     0.423329
Year           0.580541
Name: MPG, dtype: float64

### Identify the input and the output

In [16]:
# Target: the MPG column. Features: the six numeric columns, listed by name so
# that re-running this cell after helper columns (predicted_mpg, error, ...)
# have been added to df cannot leak them into the model inputs.
# 'Origin' is text and is not used as a feature.
feature_columns = ['Cylinders', 'Engine Disp', 'Horsepower', 'Weight', 'Accelerate', 'Year']
x = df[feature_columns]
y = df['MPG']

In [17]:
x.head()

Unnamed: 0,Cylinders,Engine Disp,Horsepower,Weight,Accelerate,Year
0,8,390.0,190,3850,8.5,70
1,6,199.0,90,2648,15.0,70
2,6,199.0,97,2774,15.5,70
3,8,304.0,150,3433,12.0,70
4,8,455.0,225,3086,10.0,70


### Choose an ML algorithm

In [18]:
from sklearn.linear_model import LinearRegression

In [19]:
# Create the linear-regression estimator with scikit-learn's default settings.
lr = LinearRegression()

### Train the algorithm

In [20]:
# Fit the regression on the full feature table and the MPG target.
lr.fit(X=x, y=y)

LinearRegression()

### Check the learning accuracy (R² on the training data)

In [21]:
# Score the model on the same rows it was fitted on (a training score,
# not a held-out estimate).
lr.score(X=x, y=y)

0.8092552890383932

### Predict

In [23]:
# Predict MPG for the row at position 4. Double brackets keep the row as a
# one-row DataFrame, so the column names seen during fit are preserved and
# scikit-learn does not warn about missing feature names.
lr.predict(x.iloc[[4]])



array([18.85175174])

In [24]:
lr.coef_

array([-3.29859089e-01,  7.67843024e-03, -3.91355574e-04, -6.79461791e-03,
        8.52732469e-02,  7.53367180e-01])

In [25]:
lr.intercept_

-14.535250480506118

### Metrics

In [26]:
df.head()

Unnamed: 0,MPG,Cylinders,Engine Disp,Horsepower,Weight,Accelerate,Year,Origin
0,15.0,8,390.0,190,3850,8.5,70,American
1,21.0,6,199.0,90,2648,15.0,70,American
2,18.0,6,199.0,97,2774,15.5,70,American
3,16.0,8,304.0,150,3433,12.0,70,American
4,14.0,8,455.0,225,3086,10.0,70,American


In [27]:
# Store the model's prediction for every row next to the actual MPG value.
df["predicted_mpg"] = lr.predict(x)

In [28]:
df.head()

Unnamed: 0,MPG,Cylinders,Engine Disp,Horsepower,Weight,Accelerate,Year,Origin,predicted_mpg
0,15.0,8,390.0,190,3850,8.5,70,American,13.047353
1,21.0,6,199.0,90,2648,15.0,70,American,21.001034
2,18.0,6,199.0,97,2774,15.5,70,American,20.184809
3,16.0,8,304.0,150,3433,12.0,70,American,15.534475
4,14.0,8,455.0,225,3086,10.0,70,American,18.851752


In [29]:
# Residual per row: actual minus predicted (positive = model predicted too low).
df["error"] = df["MPG"].sub(df["predicted_mpg"])

In [30]:
df.head()

Unnamed: 0,MPG,Cylinders,Engine Disp,Horsepower,Weight,Accelerate,Year,Origin,predicted_mpg,error
0,15.0,8,390.0,190,3850,8.5,70,American,13.047353,1.952647
1,21.0,6,199.0,90,2648,15.0,70,American,21.001034,-0.001034
2,18.0,6,199.0,97,2774,15.5,70,American,20.184809,-2.184809
3,16.0,8,304.0,150,3433,12.0,70,American,15.534475,0.465525
4,14.0,8,455.0,225,3086,10.0,70,American,18.851752,-4.851752


In [31]:
# Magnitude of each residual, using the vectorized Series.abs() rather than
# calling Python's abs() element by element through apply().
df["abserror"] = df["error"].abs()

In [32]:
df.head()

Unnamed: 0,MPG,Cylinders,Engine Disp,Horsepower,Weight,Accelerate,Year,Origin,predicted_mpg,error,abserror
0,15.0,8,390.0,190,3850,8.5,70,American,13.047353,1.952647,1.952647
1,21.0,6,199.0,90,2648,15.0,70,American,21.001034,-0.001034,0.001034
2,18.0,6,199.0,97,2774,15.5,70,American,20.184809,-2.184809,2.184809
3,16.0,8,304.0,150,3433,12.0,70,American,15.534475,0.465525,0.465525
4,14.0,8,455.0,225,3086,10.0,70,American,18.851752,-4.851752,4.851752


In [33]:
# Total absolute error over all rows.
df["abserror"].sum()

1026.3595063177515

In [34]:
# Number of rows contributing to the total (non-null absolute errors).
df["abserror"].count()

392

In [35]:
# Mean absolute error (MAE): total absolute error divided by the row count.
df["abserror"].sum() / df["abserror"].count()

2.618264046728958

In [36]:
# Square each residual so that large misses weigh more and signs cancel out.
df["squared_error"] = df["error"].pow(2)

In [37]:
df.head()

Unnamed: 0,MPG,Cylinders,Engine Disp,Horsepower,Weight,Accelerate,Year,Origin,predicted_mpg,error,abserror,squared_error
0,15.0,8,390.0,190,3850,8.5,70,American,13.047353,1.952647,1.952647,3.812829
1,21.0,6,199.0,90,2648,15.0,70,American,21.001034,-0.001034,0.001034,1e-06
2,18.0,6,199.0,97,2774,15.5,70,American,20.184809,-2.184809,2.184809,4.77339
3,16.0,8,304.0,150,3433,12.0,70,American,15.534475,0.465525,0.465525,0.216714
4,14.0,8,455.0,225,3086,10.0,70,American,18.851752,-4.851752,4.851752,23.539495


In [38]:
# Total squared error over all rows.
df["squared_error"].sum()

4543.347024714769

In [39]:
# Mean squared error (MSE): total squared error divided by the row count.
df["squared_error"].sum() / df["squared_error"].count()

11.590170981415227

In [40]:
# Root mean squared error (RMSE): square root of the MSE, which brings the
# error back to the same units as MPG.
(df["squared_error"].sum() / df["squared_error"].count()) ** 0.5

3.4044340177796406

In [41]:
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import median_absolute_error as mde
from sklearn.metrics import r2_score as score

In [42]:
# Tiny hand-made example for trying out the metric functions:
# the prediction is off by 0.2 at two of the five points and exact elsewhere.
original = list(range(1, 6))
predicted = [1, 2.2, 3, 4.2, 5]

In [43]:
# Report each metric for the toy example, one line per metric.
# "MDE" is this notebook's label for the median absolute error.
for label, metric in (
    ("MSE is ", mse),
    ("MAE is ", mae),
    ("MDE is ", mde),
    ("R Squared is ", score),
):
    print(label, metric(original, predicted))

MSE is  0.016000000000000028
MAE is  0.08000000000000007
MDE is  0.0
R Squared is  0.992


In [44]:
# Cross-check the hand-computed MSE against scikit-learn's implementation.
mse(y_true=y, y_pred=lr.predict(x))

11.590170981415227

In [45]:
# Cross-check the hand-computed RMSE: square root of scikit-learn's MSE.
mse(y_true=y, y_pred=lr.predict(x)) ** 0.5

3.4044340177796406