In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [52]:
# Toy sample: five observations of the predictor column 'mesin x'
# and the target column 'harga y'.
data = dict([
    ('mesin x', [1000, 2000, 3000, 4000, 5000]),
    ('harga y', [10, 25, 35, 55, 80]),
])
df = pd.DataFrame(data)
df

Unnamed: 0,mesin x,harga y
0,1000,10
1,2000,25
2,3000,35
3,4000,55
4,5000,80


In [53]:
# Helper columns needed by the closed-form least-squares formulas for the
# slope (m) and the intercept (c): the product x*y and the squares of x and y.
df['xy'] = df['mesin x'].mul(df['harga y'])
df['x^2'] = df['mesin x'].pow(2)
df['y^2'] = df['harga y'].pow(2)
df

Unnamed: 0,mesin x,harga y,xy,x^2,y^2
0,1000,10,10000,1000000,100
1,2000,25,50000,4000000,625
2,3000,35,105000,9000000,1225
3,4000,55,220000,16000000,3025
4,5000,80,400000,25000000,6400


<hr>

### Linear Regression: Gradient (m) & Intercept (c)

In [54]:
# Slope (gradient) m of the least-squares line:
#   m = (n*Σxy - Σx*Σy) / (n*Σx² - (Σx)²)
n_obs = df['mesin x'].count()
sum_x = df['mesin x'].sum()
slope_numerator = (n_obs * df['xy'].sum()) - (sum_x * df['harga y'].sum())
slope_denominator = (n_obs * df['x^2'].sum()) - (sum_x ** 2)
m = slope_numerator / slope_denominator
m

0.017

In [55]:
# Intercept c of the least-squares line:
#   c = (Σy*Σx² - Σx*Σxy) / (n*Σx² - (Σx)²)
sum_x = df['mesin x'].sum()
sum_x_squared = df['x^2'].sum()
intercept_numerator = (df['harga y'].sum() * sum_x_squared) - (sum_x * df['xy'].sum())
intercept_denominator = (df['mesin x'].count() * sum_x_squared) - (sum_x ** 2)
c = intercept_numerator / intercept_denominator
c

-10.0

In [56]:
# Fitted values from the regression line: ŷ = m*x + c
df['harga ŷ'] = df['mesin x'].mul(m).add(c)
df

Unnamed: 0,mesin x,harga y,xy,x^2,y^2,harga ŷ
0,1000,10,10000,1000000,100,7.0
1,2000,25,50000,4000000,625,24.0
2,3000,35,105000,9000000,1225,41.0
3,4000,55,220000,16000000,3025,58.0
4,5000,80,400000,25000000,6400,75.0


In [64]:
# Absolute residual |y - ŷ| between the observed and the predicted value ...
df["|y-ŷ|"] = (df['harga y'] - df['harga ŷ']).abs()
# ... and its square
df["|y-ŷ|^2"] = df["|y-ŷ|"] ** 2

# Absolute deviation |y - y̅| of the observed value from the mean of y ...
df["|y-y̅|"] = (df['harga y'] - df['harga y'].mean()).abs()
# ... and its square
df["|y-y̅|^2"] = df["|y-y̅|"] ** 2

# Natural logarithm ln(1 + y) of the observed value
df['ln(1+y)'] = np.log(df['harga y'] + 1)
# Natural logarithm ln(1 + ŷ) of the predicted value
df['ln(1+ŷ)'] = np.log(df['harga ŷ'] + 1)
# Squared difference between the two logarithms
df['|ln(1+y)-ln(1+ŷ)|^2'] = (df['ln(1+y)'] - df['ln(1+ŷ)']).abs() ** 2

# Relative error |(y - ŷ) / y|, kept as a fraction (not multiplied by 100)
df['%error'] = ((df['harga y'] - df['harga ŷ']) / df['harga y']).abs()
# Squared relative error |(y - ŷ) / y| ** 2
df['%err^2'] = df['%error'] ** 2

df

Unnamed: 0,mesin x,harga y,xy,x^2,y^2,harga ŷ,|y-ŷ|,|y-ŷ|^2,|y-y̅|,|y-y̅|^2,ln(1+y),ln(1+ŷ),|ln(1+y)-ln(1+ŷ)|^2,%error,%err^2
0,1000,10,10000,1000000,100,7.0,3.0,9.0,31.0,961.0,2.397895,2.079442,0.101413,0.3,0.09
1,2000,25,50000,4000000,625,24.0,1.0,1.0,16.0,256.0,3.258097,3.218876,0.001538,0.04,0.0016
2,3000,35,105000,9000000,1225,41.0,6.0,36.0,6.0,36.0,3.583519,3.73767,0.023762,0.171429,0.029388
3,4000,55,220000,16000000,3025,58.0,3.0,9.0,14.0,196.0,4.025352,4.077537,0.002723,0.054545,0.002975
4,5000,80,400000,25000000,6400,75.0,5.0,25.0,39.0,1521.0,4.394449,4.330733,0.00406,0.0625,0.003906


<hr>

### Evaluation Metrics for Linear Regression

#### 1. Max Error

In [65]:
# Max error: the largest absolute residual |y - ŷ| over all rows.
# The value is kept at full precision; rounding it to a whole number would
# misreport the metric whenever residuals are fractional (e.g. 0.4 -> 0).
maxErr = df['|y-ŷ|'].max()
maxErr

6.0

#### 2. MAE (Mean Absolute Error)

In [71]:
# MAE: arithmetic mean of the absolute residuals |y - ŷ|.
# Series.mean() divides by the number of non-missing residuals (instead of
# the row count of an unrelated column) and the metric is stored unrounded;
# round only when formatting it for display.
MAE = df['|y-ŷ|'].mean()
MAE

3.6

#### 3. MSE (Mean Squared Error)

In [72]:
# MSE: arithmetic mean of the squared residuals |y - ŷ|^2.
# Stored unrounded so that values derived from it (such as its square root)
# do not inherit a two-decimal rounding error; round only for display.
MSE = df['|y-ŷ|^2'].mean()
MSE

16.0

#### 4. RMSE (Root Mean Squared Error)

In [73]:
# RMSE: square root of the mean squared residual.
# Computed directly from the squared-residual column so the result does not
# depend on any rounding applied to a previously stored MSE value.
RMSE = np.sqrt(df['|y-ŷ|^2'].mean())
RMSE

4.0

#### 5. MedAE (Median Absolute Error)

In [74]:
# MedAE: median of the absolute residuals |y - ŷ|.
abs_residuals = df['|y-ŷ|']
MedAE = abs_residuals.median()
MedAE

3.0

#### 6. MSLE (Mean Squared Logarithmic Error)

In [75]:
# MSLE: mean of the squared differences between ln(1+y) and ln(1+ŷ),
# i.e. the column sum scaled by 1/n where n is the row count of 'mesin x'.
squared_log_errors = df['|ln(1+y)-ln(1+ŷ)|^2']
MSLE = squared_log_errors.sum() * (1 / df['mesin x'].count())
MSLE

0.026699306626327746

#### 7. RMSLE (Root Mean Squared Logarithmic Error)

In [76]:
# RMSLE: square root of the MSLE value computed in the MSLE cell above.
RMSLE = np.sqrt(MSLE)
RMSLE

0.16339922468092602

#### 8. MAPE (Mean Absolute Percentage Error)

In [78]:
# MAPE: mean of the relative errors |(y - ŷ) / y|.
# The result is a fraction, not a percentage (it is not multiplied by 100).
relative_errors = df['%error']
MAPE = relative_errors.sum() * (1 / df['mesin x'].count())
MAPE

0.12569480519480522

#### 9. MSPE (Mean Squared Percentage Error)

In [79]:
# MSPE: mean of the squared relative errors |(y - ŷ) / y| ** 2 (as a fraction).
squared_relative_errors = df['%err^2']
MSPE = squared_relative_errors.sum() * (1 / df['mesin x'].count())
MSPE

0.025573842342722225

#### 10. R<sup>2</sup> Score (Coefficient of Determination)

In [81]:
# R² (coefficient of determination): 1 - SS_res / SS_tot, where
#   SS_res = Σ(y - ŷ)²  (residual sum of squares)
#   SS_tot = Σ(y - y̅)²  (total sum of squares around the mean of y)
ss_residual = df['|y-ŷ|^2'].sum()
ss_total = df['|y-y̅|^2'].sum()
R2 = 1 - (ss_residual / ss_total)
R2

0.9730639730639731