## Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
sns.set()
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import Ridge

# Exploring Dataset
- Dataset Link: [Real estate price prediction](https://www.kaggle.com/quantbruce/real-estate-price-prediction)

In [6]:
df = pd.read_csv('real_estate.csv')
df.head()

Unnamed: 0,No,X1 transaction date,X2 house age,X3 distance to the nearest MRT station,X4 number of convenience stores,X5 latitude,X6 longitude,Y house price of unit area
0,1,2012.917,32.0,84.87882,10,24.98298,121.54024,37.9
1,2,2012.917,19.5,306.5947,9,24.98034,121.53951,42.2
2,3,2013.583,13.3,561.9845,5,24.98746,121.54391,47.3
3,4,2013.5,13.3,561.9845,5,24.98746,121.54391,54.8
4,5,2012.833,5.0,390.5684,5,24.97937,121.54245,43.1


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 414 entries, 0 to 413
Data columns (total 8 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   No                                      414 non-null    int64  
 1   X1 transaction date                     414 non-null    float64
 2   X2 house age                            414 non-null    float64
 3   X3 distance to the nearest MRT station  414 non-null    float64
 4   X4 number of convenience stores         414 non-null    int64  
 5   X5 latitude                             414 non-null    float64
 6   X6 longitude                            414 non-null    float64
 7   Y house price of unit area              414 non-null    float64
dtypes: float64(6), int64(2)
memory usage: 26.0 KB


In [5]:
df.describe()

Unnamed: 0,No,X1 transaction date,X2 house age,X3 distance to the nearest MRT station,X4 number of convenience stores,X5 latitude,X6 longitude,Y house price of unit area
count,414.0,414.0,414.0,414.0,414.0,414.0,414.0,414.0
mean,207.5,2013.148971,17.71256,1083.885689,4.094203,24.96903,121.533361,37.980193
std,119.655756,0.281967,11.392485,1262.109595,2.945562,0.01241,0.015347,13.606488
min,1.0,2012.667,0.0,23.38284,0.0,24.93207,121.47353,7.6
25%,104.25,2012.917,9.025,289.3248,1.0,24.963,121.528085,27.7
50%,207.5,2013.167,16.1,492.2313,4.0,24.9711,121.53863,38.45
75%,310.75,2013.417,28.15,1454.279,6.0,24.977455,121.543305,46.6
max,414.0,2013.583,43.8,6488.021,10.0,25.01459,121.56627,117.5


In [14]:
X = df.iloc[:, 2:-1]
X.head()

Unnamed: 0,X2 house age,X3 distance to the nearest MRT station,X4 number of convenience stores,X5 latitude,X6 longitude
0,32.0,84.87882,10,24.98298,121.54024
1,19.5,306.5947,9,24.98034,121.53951
2,13.3,561.9845,5,24.98746,121.54391
3,13.3,561.9845,5,24.98746,121.54391
4,5.0,390.5684,5,24.97937,121.54245


In [15]:
Y = df.iloc[:, [-1]]
Y.head()

Unnamed: 0,Y house price of unit area
0,37.9
1,42.2
2,47.3
3,54.8
4,43.1


# Splitting of Datasset

In [22]:
X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size=0.25, random_state=0)
print(X_train.shape, X_val.shape)

(310, 5) (104, 5)


In [23]:
Y_train.head()

Unnamed: 0,Y house price of unit area
322,33.1
208,26.2
56,41.9
8,18.8
312,78.0


In [24]:
Y_train, Y_val = Y_train.to_numpy(), Y_val.to_numpy()

In [25]:
Y_train[:5]

array([[33.1],
       [26.2],
       [41.9],
       [18.8],
       [78. ]])

In [26]:
Y_train.shape

(310, 1)

# Adding Polynomial Features

In [27]:
degree = 5
poly = PolynomialFeatures(degree)
X_train_poly = poly.fit_transform(X_train)
X_val_poly = poly.transform(X_val)
print(X_train_poly.shape)

(310, 252)


# Feature Scaling

In [28]:
sc = StandardScaler()
sc.fit(X_train_poly)
X_train_scaled = sc.transform(X_train_poly)
X_val_scaled = sc.transform(X_val_poly)

In [29]:
X_train_scaled[:5]

array([[ 0.        , -0.40538356, -0.72684272, ...,  0.2496308 ,
         0.10215794, -0.20906313],
       [ 0.        , -0.52936105,  0.19272947, ..., -0.80025317,
        -0.20476421,  1.0018216 ],
       [ 0.        ,  1.42771227, -0.58273643, ...,  0.37890681,
         0.45446382,  0.49226172],
       [ 0.        ,  1.2594571 ,  3.44855944, ..., -2.15162301,
        -2.71066635, -3.14892377],
       [ 0.        ,  1.5871119 , -0.62407855, ...,  0.2615843 ,
         0.37256923,  0.49876869]])

# Training

In [30]:
model = Ridge(alpha=10)
model.fit(X_train_scaled, Y_train)

Ridge(alpha=10)

In [31]:
train_score = model.score(X_train_scaled, Y_train)
print("Train score: ", train_score)

Train score:  0.6715537733322601


In [33]:
val_score = model.score(X_val_scaled, Y_val)
print("Val Score: ",val_score)

Val Score:  0.711021479599499


# HyperParameter Tuning

In [34]:
params = {}
params['degrees'] = [1,2,3,4,5,6,7,8,9,10]
params['alpha'] = [0.01, 0.1, 10, 50]

In [35]:
def fit(X_train_scaled, Y_train, X_val_scaled, Y_val, degree, alpha, grid_search=False, exp_name=None):
    poly = PolynomialFeatures(degree)
    X_train_poly = poly.fit_transform(X_train)
    X_val_poly = poly.transform(X_val)
    
    sc = StandardScaler()
    sc.fit(X_train_poly)
    X_train_scaled = sc.transform(X_train_poly)
    X_val_scaled = sc.transform(X_val_poly)
    
    model = Ridge(alpha)
    model.fit(X_train_scaled, Y_train)
    
    train_score = model.score(X_train_scaled, Y_train)
    val_score = model.score(X_val_scaled, Y_val)
    
    if grid_search:
        scores = [exp_name, degree, alpha, train_score, val_score]
    else:
        print('Train Score: ',train_score)
        print('Val Score: ',val_score)
        return model,poly,sc

In [40]:
from tqdm import tqdm

result = []
exp_id = 1

for i in tqdm(range(len(params['degrees']))):
    degree = params['degrees'][i]
    for j in range(len(params['alpha'])):
        alpha = params['alpha'][j]
        exp_name = 'exp '+str(exp_id)
        exp_id += 1
        result.append(fit(X_train_scaled, Y_train, X_val_scaled, Y_val, degree, alpha, grid_search=True, exp_name=exp_name))
result

100%|██████████| 10/10 [00:00<00:00, 17.42it/s]


[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]