In [2]:
#make interact plot ==> displaying plot in a separate window
%matplotlib 
import matplotlib.pylab as plt

import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
import numpy as np
# from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import r2_score

Using matplotlib backend: <object object at 0x7f2c42b03fa0>


In [3]:
# load data (from https://www.kaggle.com/datasets/peimandaii/dataset-of-people)
df = pd.read_csv('house-price.csv')
df

Unnamed: 0,location,size,price-krw-million
0,seoul,10,150
1,seoul,15,250
2,seoul,20,550
3,seoul,35,900
4,seoul,45,1500
5,incheon,10,100
6,incheon,15,150
7,incheon,25,300
8,incheon,35,500
9,incheon,50,1200


In [4]:
# prepare and cleaning data before training

# data to be train can only consist of value (integer / float)
# transform all text to value
# tips (not must): recommended not use '0' to avoid strange math calculation if using multiplier.
location_dict = {"seoul": 1, "incheon": 2, "busan":3}
df.loc[:,"location"] = df["location"].map(location_dict) # only run 1 time !!
df


Unnamed: 0,location,size,price-krw-million
0,1,10,150
1,1,15,250
2,1,20,550
3,1,35,900
4,1,45,1500
5,2,10,100
6,2,15,150
7,2,25,300
8,2,35,500
9,2,50,1200


In [5]:
# display plot in 3D to visualize data
fig = plt.figure()
ax = fig.add_subplot(projection='3d')

ax.set_title('Predict House Price')
ax.set_xlabel('Location')
ax.set_ylabel('Size')
ax.set_zlabel('Price (KRW-Million)')

data = df.query('location == 1') # 1 == Seoul
ax.plot(data['location'], data['size'], data['price-krw-million']
        , label='Seoul'
            , color='red'
            , linewidth=2
            , marker="o"
            , markerfacecolor='gray'
            , markersize=5)

data = df.query('location == 2') # 2 == Incheon
ax.plot(data['location'], data['size'], data['price-krw-million']
        , label='Incheon'
            , color='green'
            , linewidth=2
            , marker="o"
            , markerfacecolor='gray'
            , markersize=5)

data = df.query('location == 3') # 3 == Busan
ax.plot(data['location'], data['size'], data['price-krw-million']
        , label='Busan'
            , color='blue'
            , linewidth=2
            , marker="o"
            , markerfacecolor='gray'
            , markersize=5)
    
ax.legend()
#fig.legend()

# in 3D, use mouse to click (empty area) and drag to change viewing point

# important to block and to display plot properly (avoid black empty window in Fedora or Jupyter Notebook)
plt.show(block=True) # code will continue after the window is closed

In [6]:
# define data to train and the target (annotated)

# get all columns except the last column (right side) (==y)
X_train = df.drop(df.columns[-1], axis=1) # X_train will be numpy.ndarray
X_train

Unnamed: 0,location,size
0,1,10
1,1,15
2,1,20
3,1,35
4,1,45
5,2,10
6,2,15
7,2,25
8,2,35
9,2,50


In [7]:
# define target (supervised annotated data), only use 1 column of the most right side
y_train = df.take([-1], axis=1)
y_train

Unnamed: 0,price-krw-million
0,150
1,250
2,550
3,900
4,1500
5,100
6,150
7,300
8,500
9,1200


In [8]:
# create model
modelLR = LinearRegression()

# fit == train
modelLR.fit(X_train.values, y_train.values) # use .values (do not take header text, because next predict data has no header)


In [9]:

# show the coefficient and intercept value
print(f"***\n{modelLR.coef_ = }\n{modelLR.intercept_ = }\n***")



***
modelLR.coef_ = array([[-210.58258606,   27.43303442]])
modelLR.intercept_ = array([173.32889789])
***


In [10]:
# test predict using original train data
y_pred = modelLR.predict(X_train.values) # when predict, only use .values (without header text)

# do not convert, only display without decimal value, convert from float to int (round down)
y_pred.astype(int)


array([[ 237],
       [ 374],
       [ 511],
       [ 922],
       [1197],
       [  26],
       [ 163],
       [ 437],
       [ 712],
       [1123],
       [-184],
       [  90],
       [ 364],
       [ 638],
       [ 913]])

In [11]:
# check accuracy
r2_score(y_train, y_pred) # closer to 1.0 means more accurate, 1.0 == perfect !!

0.8912430648111884

In [12]:
# Q: Why predict the same training-data resulted in different values?
# A: Because the value is calculated using (math) regression using 'coefficient' and 'intercept' values.
# Linear Regression formula: y = (x1 * coef[1]) + (x2 * coef[2]) + intercept

# verify formula == calculate manually
y_manual = (X_train['location'][10] * modelLR.coef_[0][0]) + (X_train['size'][10] * modelLR.coef_[0][1]) + modelLR.intercept_
y_manual

array([-184.08851605])

In [13]:
# use Polynomial Regression
from sklearn.preprocessing import PolynomialFeatures 

# create Polynomial Feature to transform data
# default: degree=2, include_bias=True !
poly = PolynomialFeatures(degree=2, include_bias=False) 

# transform X_train value to poly
X_train_poly = poly.fit_transform(X_train.values)
X_train_poly 

array([[1.000e+00, 1.000e+01, 1.000e+00, 1.000e+01, 1.000e+02],
       [1.000e+00, 1.500e+01, 1.000e+00, 1.500e+01, 2.250e+02],
       [1.000e+00, 2.000e+01, 1.000e+00, 2.000e+01, 4.000e+02],
       [1.000e+00, 3.500e+01, 1.000e+00, 3.500e+01, 1.225e+03],
       [1.000e+00, 4.500e+01, 1.000e+00, 4.500e+01, 2.025e+03],
       [2.000e+00, 1.000e+01, 4.000e+00, 2.000e+01, 1.000e+02],
       [2.000e+00, 1.500e+01, 4.000e+00, 3.000e+01, 2.250e+02],
       [2.000e+00, 2.500e+01, 4.000e+00, 5.000e+01, 6.250e+02],
       [2.000e+00, 3.500e+01, 4.000e+00, 7.000e+01, 1.225e+03],
       [2.000e+00, 5.000e+01, 4.000e+00, 1.000e+02, 2.500e+03],
       [3.000e+00, 1.000e+01, 9.000e+00, 3.000e+01, 1.000e+02],
       [3.000e+00, 2.000e+01, 9.000e+00, 6.000e+01, 4.000e+02],
       [3.000e+00, 3.000e+01, 9.000e+00, 9.000e+01, 9.000e+02],
       [3.000e+00, 4.000e+01, 9.000e+00, 1.200e+02, 1.600e+03],
       [3.000e+00, 5.000e+01, 9.000e+00, 1.500e+02, 2.500e+03]])

In [14]:
# create a new poly (non linear) model
modelPolyR = LinearRegression() # Polynomial Regression is a special case of LinearRegression !!

# train the transformed data (using PolynomialFeatures)
modelPolyR.fit(X_train_poly, y_train.values)

# show the coefficient and intercept value
print(f"***\n{modelLR.coef_ = }\n{modelLR.intercept_ = }\n***")

***
modelLR.coef_ = array([[-210.58258606,   27.43303442]])
modelLR.intercept_ = array([173.32889789])
***


In [15]:
# predict
y_poly_pred = modelPolyR.predict(X_train_poly)
y_poly_pred.astype(int)


array([[ 204],
       [ 301],
       [ 426],
       [ 962],
       [1455],
       [  62],
       [ 111],
       [ 289],
       [ 576],
       [1210],
       [ 116],
       [ 142],
       [ 277],
       [ 520],
       [ 873]])

In [16]:
r2_score(y_train, y_poly_pred)

0.9853957029778391

In [17]:
# next is to predict some external cases (not inside the train data)

# note: must use np.array to make everything as np.array
input = np.array([[1,11],[1,40],[1,60] # Seoul
                  , [2,10],[2,40],[2,60] # Incheon 
                  , [3,10],[3,40],[3,60]] # Busan
                ) 

# predict using Linear Regression
output = modelLR.predict(input)

# display results
print(f"Case 1\nInput: {input}")
print(f"Predicted by LinearRegression: {output}")

#transform input to poly
input_poly = poly.fit_transform(input)

# predict using poly
output_poly = modelPolyR.predict(input_poly)
print(f"Predicted by Polynomial Regression: {output_poly}")


Case 1
Input: [[ 1 11]
 [ 1 40]
 [ 1 60]
 [ 2 10]
 [ 2 40]
 [ 2 60]
 [ 3 10]
 [ 3 40]
 [ 3 60]]
Predicted by LinearRegression: [[ 264.50969048]
 [1060.06768875]
 [1608.72837721]
 [  26.49407   ]
 [ 849.48510269]
 [1398.14579115]
 [-184.08851605]
 [ 638.90251663]
 [1187.56320509]]
Predicted by Polynomial Regression: [[ 221.82902966]
 [1195.24517753]
 [2399.54128954]
 [  62.90839301]
 [ 760.40730234]
 [1769.25823923]
 [ 116.44590224]
 [ 520.77704888]
 [1334.18281066]]


In [None]:
# display plot in 3D to visualize data
fig = plt.figure()
ax = fig.add_subplot(projection='3d')

ax.set_title('Predict House Price')
ax.set_xlabel('Location')
ax.set_ylabel('Size')
ax.set_zlabel('Price (KRW-Million)')


#ax.scatter(input[:,0], input[:,1], output

#LINEAR
ax.plot(input[0:3,0], input[0:3,1], output[0:3,0] 
            , label='Linear Seoul'
            , color='blue'
            , linewidth=2
            , marker="o"
            , markerfacecolor='gray'
            , markersize=5
            )
ax.plot(input[3:6,0], input[3:6,1], output[3:6,0]
            , label='Linear Incheon'
            , color='blue'
            , linewidth=2
            , marker="o"
            , markerfacecolor='gray'
            , markersize=5
            )
ax.plot(input[6:9,0], input[6:9,1], output[6:9,0]
            , label='Linear Busan'
            , color='blue'
            , linewidth=2
            , marker="o"
            , markerfacecolor='gray'
            , markersize=5
            )

# POLYNOMIAL
ax.plot(input[0:3,0], input[0:3,1], output_poly[0:3,0] 
            , label='Polynomial Seoul'
            , color='red'
            , linewidth=2
            , marker="o"
            , markerfacecolor='gray'
            , markersize=5
            )
ax.plot(input[3:6,0], input[3:6,1], output_poly[3:6,0] 
            , label='Polynomial Incheon'
            , color='red'
            , linewidth=2
            , marker="o"
            , markerfacecolor='gray'
            , markersize=5
            )
ax.plot(input[6:9,0], input[6:9,1], output_poly[6:9,0] 
            , label='Polynomial Busan'
            , color='red'
            , linewidth=2
            , marker="o"
            , markerfacecolor='gray'
            , markersize=5
            )
ax.legend()

# in 3D, use mouse to click (empty area) and drag to change viewing point

# important to block and to display plot properly (avoid black empty window in Fedora or Jupyter Notebook)
plt.show(block=True) # code will continue after the window is closed