<a href="https://colab.research.google.com/github/HoseinNekouei/Housing-Price-Prediction/blob/main/housin_price_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Preprocess Dataset**

In [15]:
from google.colab import drive
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_absolute_percentage_error,  mean_absolute_error, r2_score

In [2]:
# Mount Google Drive (Colab only), read the raw listings CSV into `df`, and
# print the column dtypes / non-null counts as a first look at the data.
DRIVE_MOUNT_POINT = '/content/drive'
DATA_PATH = '/content/drive/MyDrive/dataset/data.csv'

drive.mount(DRIVE_MOUNT_POINT)
df = pd.read_csv(DATA_PATH)  # comma is already read_csv's default separator

df.info()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22676 entries, 0 to 22675
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Price             22676 non-null  int64  
 1   Apartment type    22676 non-null  object 
 2   Metro station     22676 non-null  object 
 3   Minutes to metro  22676 non-null  int64  
 4   Region            22676 non-null  object 
 5   Number of rooms   22676 non-null  int64  
 6   Area              22676 non-null  float64
 7   Living area       22676 non-null  float64
 8   Kitchen area      22676 non-null  float64
 9   Floor             22676 non-null  int64  
 10  Number of floors  22676 non-null  int64  
 11  Renovation        22676 non-null  object 
dtypes: float64(3), int64(5), object(4)
memory usage: 2.1+ MB


In [3]:

def load_house_attributes(data=None, min_station_count=500, test_size=0.25,
                          random_state=42):
  """Filter out rarely seen metro stations and split the rows into train/test.

  Rows whose 'Metro station' value occurs fewer than ``min_station_count``
  times are removed, then the remaining rows are split with
  ``train_test_split``. The input frame is never modified, so re-running the
  cell always starts from the same raw data.

  Args:
    data: DataFrame with a 'Metro station' column. Defaults to the
      notebook-level ``df`` when omitted.
    min_station_count: Minimum number of rows a station needs to be kept.
    test_size: Fraction of the filtered rows placed in the test split.
    random_state: Seed passed to ``train_test_split`` for a repeatable split.

  Returns:
    Tuple ``(train, test)`` of DataFrames.
  """
  source = df if data is None else data

  # Collect the stations below the threshold once, then filter with a single
  # boolean mask instead of dropping rows station by station.
  station_counts = source['Metro station'].value_counts()
  rare_stations = station_counts[station_counts < min_station_count].index
  filtered = source[~source['Metro station'].isin(rare_stations)]

  train, test = train_test_split(filtered, test_size=test_size,
                                 random_state=random_state)

  return train, test


# Build the filtered train/test frames once; the following cells reuse both.
train, test = load_house_attributes()

# Report the split sizes and show one training record as a sanity check.
first_train_record = train.iloc[0]
print(f'shape of train data: {train.shape}')
print(f'shape of test data: {test.shape}')
print('----------')
print(f'the first record of train dataset is:\n {first_train_record}')

shape of train data: (5946, 12)
shape of test data: (1982, 12)
----------
the first record of train dataset is:
 Price                         5985718
Apartment type           New building
Metro station       Красногвардейская
Minutes to metro                   13
Region                  Moscow region
Number of rooms                     1
Area                            47.71
Living area                      26.3
Kitchen area                     10.6
Floor                               5
Number of floors                   20
Renovation                   Cosmetic
Name: 19520, dtype: object


In [4]:
# normalize attributes
def preprocess_house_attributes(train, test):
  """Convert the raw train/test frames into numeric feature matrices.

  Numerical columns are standardized with ``StandardScaler`` and categorical
  columns are one-hot encoded. Both transformers are fitted on ``train`` only
  and then applied to ``test``, so no test-set statistics leak into training.

  Args:
    train: DataFrame of training rows containing the numerical and
      categorical columns listed below.
    test: DataFrame of held-out rows with the same columns.

  Returns:
    Tuple ``(x_train, x_test)`` of 2-D NumPy arrays whose columns are the
    scaled numerical features followed by the one-hot encoded categories.
  """
  numerical_columns = ['Number of rooms', 'Area', 'Living area', 'Kitchen area',
                       'Floor', 'Number of floors', 'Minutes to metro']
  categorical_columns = ['Apartment type', 'Metro station', 'Region', 'Renovation']

  # Standardize the numerical columns using statistics of the training split.
  scaler = StandardScaler()
  train_numerical = scaler.fit_transform(train[numerical_columns])
  test_numerical = scaler.transform(test[numerical_columns])

  print(f'the first record of the normalized train data:\n{train_numerical[0]}')
  print(f'\nShape of the normalized test data: {test_numerical.shape}')

  # One-hot encode the categorical columns. handle_unknown='ignore' encodes a
  # category that appears only in the test split as all zeros instead of
  # raising during transform.
  encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
  train_categorical = encoder.fit_transform(train[categorical_columns])
  test_categorical = encoder.transform(test[categorical_columns])

  # Concatenate numerical and categorical features horizontally.
  x_train = np.hstack([train_numerical, train_categorical])
  x_test = np.hstack([test_numerical, test_categorical])

  print(f'shape of x_train: {x_train.shape}')
  print(f'shape of x_test: {x_test.shape}')

  return x_train, x_test

# Build the feature matrices; the scaler and encoder are fitted on `train`
# only and then applied to `test` inside the helper.
x_train, x_test = preprocess_house_attributes(train, test)

the first record of the normalized train data:
[-0.61899007 -0.06908953  0.06355223 -0.02898586 -0.90413926  1.14383518
  0.17281351]

Shape of the normalized test data: (1982, 7)
shape of x_train: (5946, 21)
shape of x_train: (1982, 21)


In [5]:
# Normalize Labels

def preprocess_house_label(train, test):
  """Scale the 'Price' column of both splits by the largest training price.

  Prints the first raw training price as a quick sanity check before scaling.

  Args:
    train: DataFrame with a 'Price' column; its maximum is the scale factor.
    test: DataFrame with a 'Price' column, divided by the same factor.

  Returns:
    Tuple ``(y_train, y_test)`` of Series holding the scaled prices.
  """
  train_prices = train['Price']
  print('This is a test for data accuracy', train_prices.iloc[0])

  # The divisor comes from the training split only and is reused for test.
  price_scale = train_prices.max()

  return train_prices / price_scale, test['Price'] / price_scale

# Scale the prices of both splits, then print the first scaled training
# label so it can be compared with the raw price printed by the helper.
y_train, y_test = preprocess_house_label(train, test)
print(f'This is a test for data accuracy after label normalized: {y_train.iloc[0]}')

This is a test for data accuracy 5985718
This is a test for data accuracy after label normalized: 0.1312657456140351


In [6]:
def algorithm(x_train, y_train, random_state=42, tol=0.00001):
  """Fit a linear regressor with stochastic gradient descent.

  Args:
    x_train: Feature matrix used for fitting.
    y_train: Target values aligned with the rows of ``x_train``.
    random_state: Seed for the estimator so repeated runs of the notebook
      produce the same fitted model.
    tol: Stopping tolerance passed to ``SGDRegressor``.

  Returns:
    The fitted ``SGDRegressor``.
  """
  # SGD is stochastic; fixing the seed keeps the fit and the printed
  # iteration count reproducible across kernel restarts.
  model = SGDRegressor(tol=tol, random_state=random_state)

  model.fit(x_train, y_train)
  # Number of passes over the data that were run before training stopped.
  print(model.n_iter_)

  return model

# Fit the SGD regressor on the preprocessed features and scaled prices.
model = algorithm(x_train, y_train)

13


In [19]:
def show_results(estimator=None, features=None, targets=None):
  """Print MAPE, MAE and R2 for a fitted model on a labelled data set.

  Called without arguments it evaluates the notebook-level ``model`` on
  ``x_test`` / ``y_test``. Passing explicit arguments allows scoring any
  fitted estimator on any split (for example the training data).

  Args:
    estimator: Fitted object exposing ``predict``; defaults to ``model``.
    features: Feature matrix to predict on; defaults to ``x_test``.
    targets: True target values for ``features``; defaults to ``y_test``.
  """
  estimator = model if estimator is None else estimator
  features = x_test if features is None else features
  targets = y_test if targets is None else targets

  predictions = estimator.predict(features)

  mape = mean_absolute_percentage_error(targets, predictions)
  mae = mean_absolute_error(targets, predictions)
  r2 = r2_score(targets, predictions)

  # All three metrics are printed multiplied by 100. With the notebook's
  # default y_test (prices divided by the max training price) the MAE line
  # therefore reads as a percentage of that maximum price.
  print(f'MAPE: {(mape * 100):.2f} %')
  print(f'MAE: {(mae * 100):.2f} %')
  print(f'R2_Score: {(r2 * 100):.2f} %')

# Evaluate the fitted model on the held-out test split.
show_results()

MAPE: 9.23 %
MAE: 1.47 %
R2_Score: 85.22 %
