In [102]:
import pandas as pd

from sklearn.model_selection import train_test_split

## Importing the dataset from .csv

In [103]:
housing_data = pd.read_csv('csvDatasets/Housing_Data_Ready')

## Taking the needed variables

In [104]:
X = housing_data[['area','bathrooms','stories','airconditioning','parking','prefarea','furnishingstatus','mainroad']]
y = housing_data['price']

## Train/Test variables

In [105]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3, random_state=42)

## Linear Regression (and Stardantization)

In [106]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [107]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [108]:
modelLin = LinearRegression()
modelLin.fit(X_train_scaled, y_train)

In [109]:
y_pred_lin = modelLin.predict(X_test_scaled)

In [110]:
mse_lin = mean_squared_error(y_test, y_pred_lin)
r2_lin = r2_score(y_test, y_pred_lin)

In [111]:
print('Mean squared error', mse_lin)
print('R-squared',r2_lin)

Mean squared error 1336607494586.3037
R-squared 0.6691025403283974


In result we get, that LinearRegression have 64.8%

## Random Forest Regression

In [112]:
from sklearn.ensemble import RandomForestRegressor

In [113]:
modelRF = RandomForestRegressor(n_estimators=100,random_state=42)

In [114]:
modelRF.fit(X_train,y_train)

In [115]:
y_pred_rf = modelRF.predict(X_test)

In [116]:
mse_rf = mean_squared_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)

In [117]:
print('Mean squared error: ',mse_rf)
print('R_squared',r2_rf)

Mean squared error:  1544847104873.9255
R_squared 0.6175496661104464


Random forest have only 59.2% that is worse than Linear Regression

## Checking on overfitting

In [118]:
y_train_pred = modelLin.predict(X_train)



In [119]:
train_mse = mean_squared_error(y_train, y_train_pred)
train_r2 = r2_score(y_train, y_train_pred)

In [120]:
print('Training Mean Squared Error (MSE):', train_mse)
print('Training R²: ', train_r2)
print('Test Mean Squared Error (MSE): ',mse_lin)
print('Test R²: ', r2_lin)

Training Mean Squared Error (MSE): 8.484047192045778e+18
Training R²:  -2629670.1377147944
Test Mean Squared Error (MSE):  1336607494586.3037
Test R²:  0.6691025403283974


Our model is overfitted

## Optimizing the alpha parameter for Ridge Regression

In [121]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

In [122]:
alphas = [0.01, 0.1, 1, 10, 100, 1000]

In [123]:
ridge_model = Ridge()

In [124]:
param_grid = {'alpha': alphas}

In [125]:
grid_search = GridSearchCV(ridge_model, param_grid, cv=5, scoring='neg_mean_squared_error')

In [126]:
grid_search.fit(X_train, y_train)

In [127]:
print('Optimal variable alpha:',grid_search.best_params_)


Optimal variable alpha: {'alpha': 1}


## Training ridge model with optimal alpha

In [128]:
model_RD = Ridge(alpha=1)

In [129]:
model_RD.fit(X_train,y_train)

In [130]:
y_train_pred = model_RD.predict(X_train)
y_test_pred = model_RD.predict(X_test)

In [131]:
train_mse = mean_squared_error(y_train, y_train_pred)
test_mse = mean_squared_error(y_test, y_test_pred)
train_r2 = r2_score(y_train, y_train_pred)
test_r2 = r2_score(y_test, y_test_pred)

In [132]:
print('Training Mean Squared Error (MSE): ',train_mse)
print('Testing Mean Squared Error (MSE): ',test_mse)
print('Training R²: ',train_r2)
print('Testing R²: ',test_r2)

Training Mean Squared Error (MSE):  1194389076890.7068
Testing Mean Squared Error (MSE):  1340129140746.7104
Training R²:  0.6297933743642994
Testing R²:  0.6682307034031516


Now we see, that our model is get better

## Saving the model

In [133]:
import joblib

In [134]:
filename_model = 'model/linregmodel.sav'

In [135]:
joblib.dump(model_RD, filename_model)

['model/linregmodel.sav']

## Making the dataset with result data

In [143]:
data_result = X_test.copy()

In [144]:
data_result['real_price'] = y_test

In [145]:
data_result['predicted_price'] = y_test_pred

In [150]:
data_result['predicted_price'] = data_result['predicted_price'].round(0)

In [151]:
print(data_result)

     area  bathrooms  stories  airconditioning  parking  prefarea  \
316  4500          1        1                0        0         0   
77   5000          1        4                0        0         0   
360  2145          1        2                0        0         1   
90   7980          1        1                0        2         0   
493  7320          2        2                0        0         0   
..    ...        ...      ...              ...      ...       ...   
395  3240          1        2                0        2         0   
425  3300          1        2                0        1         0   
195  3968          1        2                0        0         0   
452  6000          2        4                1        1         0   
154  9166          1        1                1        2         0   

     furnishingstatus  mainroad  real_price  predicted_price  
316                 2         0     3570000        3196637.0  
77                  2         1     5600000  

## Categorizing prices

In [152]:
def categorized_prices(price):
    if price < 5500000:
        return 'Low'
    elif 5500000 <= price < 6500000:
        return 'Medium'
    else:
        return 'High'

In [154]:
data_result['price_category_real'] = data_result['real_price'].apply(categorized_prices)
data_result['price_category_predicted'] = data_result['predicted_price'].apply(categorized_prices)

In [155]:
data_result

Unnamed: 0,area,bathrooms,stories,airconditioning,parking,prefarea,furnishingstatus,mainroad,real_price,predicted_price,price_category_real,price_category_predicted
316,4500,1,1,0,0,0,2,0,3570000,3196637.0,Low,Low
77,5000,1,4,0,0,0,2,1,5600000,5185723.0,Medium,Low
360,2145,1,2,0,0,1,2,1,3332000,4215946.0,Low,Low
90,7980,1,1,0,2,0,1,1,5495000,4626760.0,Low,Low
493,7320,2,2,0,0,0,2,1,5950000,5782136.0,Medium,Medium
...,...,...,...,...,...,...,...,...,...,...,...,...
395,3240,1,2,0,2,0,1,1,3010000,3972165.0,Low,Low
425,3300,1,2,0,1,0,1,0,2835000,3336642.0,Low,Low
195,3968,1,2,0,0,0,1,0,4410000,3303065.0,Low,Low
452,6000,2,4,1,1,0,1,1,7910000,7170219.0,High,High


In [158]:
data_result.to_csv('csvDatasets/housing_data_Result.csv', index=False)