<h1 style='color:purple' align='center'>Data Science Regression Project: Predicting Home Prices</h1>

In [98]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
import matplotlib 
matplotlib.rcParams["figure.figsize"] = (20,10)

<h2 style='color:blue'>Data Load: Load home prices into a dataframe</h2>

In [99]:
df1 = pd.read_csv("tunisia_house_prices.csv")
df1.head()

Unnamed: 0,Location,Size_sqm,Bedrooms,Bathrooms,Year_Built,Property_Type,Price_TND
0,Sfax,91,5,1,2016,Apartment,196478
1,Bizerte,289,4,1,1985,Studio,387176
2,Kairouan,233,6,1,2002,Apartment,368223
3,Gabes,448,1,1,2009,Villa,734274
4,Kairouan,130,6,2,1983,Apartment,273007


In [100]:
df1.shape

(446, 7)

In [101]:
df1.columns

Index(['Location', 'Size_sqm', 'Bedrooms', 'Bathrooms', 'Year_Built',
       'Property_Type', 'Price_TND'],
      dtype='object')

In [102]:
df1.groupby('Location')['Location'].agg('count')

Location
Aryanah     43
Bizerte     43
Gabes       46
Gafsa       50
Kairouan    56
Monastir    58
Sfax        49
Sousse      51
Tunis       50
Name: Location, dtype: int64

In [103]:
df1['Location'].unique()

array(['Sfax', 'Bizerte', 'Kairouan', 'Gabes', 'Monastir', 'Gafsa',
       'Sousse', 'Aryanah', 'Tunis'], dtype=object)

In [104]:
df1['Location'].value_counts()

Location
Monastir    58
Kairouan    56
Sousse      51
Tunis       50
Gafsa       50
Sfax        49
Gabes       46
Bizerte     43
Aryanah     43
Name: count, dtype: int64

**Drop features that are not required to build our model**

In [105]:
df2 = df1.drop(['Year_Built','Property_Type'],axis='columns')
df2.shape
df2.head()

Unnamed: 0,Location,Size_sqm,Bedrooms,Bathrooms,Price_TND
0,Sfax,91,5,1,196478
1,Bizerte,289,4,1,387176
2,Kairouan,233,6,1,368223
3,Gabes,448,1,1,734274
4,Kairouan,130,6,2,273007


<h2 style='color:blue'>Data Cleaning: Handle NA values</h2>

In [106]:
df2.isnull().sum()

Location     0
Size_sqm     0
Bedrooms     0
Bathrooms    0
Price_TND    0
dtype: int64

In [107]:
df2.shape

(446, 5)

In [108]:
df2[df2.Bedrooms>5]

Unnamed: 0,Location,Size_sqm,Bedrooms,Bathrooms,Price_TND
2,Kairouan,233,6,1,368223
4,Kairouan,130,6,2,273007
5,Monastir,275,6,4,554863
7,Bizerte,202,6,3,436266
9,Gafsa,144,6,2,302385
...,...,...,...,...,...
426,Gabes,497,6,4,665458
437,Kairouan,70,6,3,248576
438,Tunis,156,6,1,283067
441,Bizerte,472,6,1,626213


In [109]:
df2.Size_sqm.unique()

array([ 91, 289, 233, 448, 130, 275, 165, 202,  76, 144, 315, 243, 495,
       379, 269,  61, 229, 486, 246, 239, 299, 357, 324,  90,  97,  63,
       102, 180, 481, 152, 393, 365, 258, 483, 418, 408, 313, 220,  98,
       394, 177, 331, 461, 136, 349, 492, 176, 327, 449, 248, 129, 409,
       383, 317, 226,  60,  51, 235, 347, 251, 443, 337, 125, 342, 401,
       121, 159, 157, 114,  64, 115, 189, 468, 162, 245, 334, 175, 252,
       230, 475, 276, 116,  94, 140, 186, 160, 195,  82, 288, 437, 318,
       207,  52, 429,  62, 237, 142, 291, 416, 150, 199, 141, 110, 458,
       485, 455, 430, 433, 423, 335, 470, 302, 476, 124, 411, 166, 261,
       436, 200, 427, 404,  50, 274, 424, 126,  73, 351, 406, 127,  75,
       487,  84, 467, 363, 309, 328, 403, 367, 279, 215, 391, 154,  88,
        55, 453, 169, 109, 255, 332, 128, 311, 497, 346, 168, 149, 100,
       187, 240, 489, 316,  53, 218,  77, 305, 178, 370, 281, 432, 474,
       340, 388, 456, 156, 211, 193, 375, 182, 325,  69, 297, 41

In [110]:
df2.loc[30]

Location      Gafsa
Size_sqm        152
Bedrooms          4
Bathrooms         3
Price_TND    349233
Name: 30, dtype: object

In [111]:
df2.head(3)

Unnamed: 0,Location,Size_sqm,Bedrooms,Bathrooms,Price_TND
0,Sfax,91,5,1,196478
1,Bizerte,289,4,1,387176
2,Kairouan,233,6,1,368223


In [112]:
df2.Location=df2.Location.apply(lambda x:x.strip())

In [113]:
dummies = pd.get_dummies(df2.Location)
dummies.head(10)

Unnamed: 0,Aryanah,Bizerte,Gabes,Gafsa,Kairouan,Monastir,Sfax,Sousse,Tunis
0,False,False,False,False,False,False,True,False,False
1,False,True,False,False,False,False,False,False,False
2,False,False,False,False,True,False,False,False,False
3,False,False,True,False,False,False,False,False,False
4,False,False,False,False,True,False,False,False,False
5,False,False,False,False,False,True,False,False,False
6,False,False,False,False,True,False,False,False,False
7,False,True,False,False,False,False,False,False,False
8,False,False,False,False,False,True,False,False,False
9,False,False,False,True,False,False,False,False,False


In [114]:
df3 = pd.concat([df2,dummies],axis='columns')
df3.head()

Unnamed: 0,Location,Size_sqm,Bedrooms,Bathrooms,Price_TND,Aryanah,Bizerte,Gabes,Gafsa,Kairouan,Monastir,Sfax,Sousse,Tunis
0,Sfax,91,5,1,196478,False,False,False,False,False,False,True,False,False
1,Bizerte,289,4,1,387176,False,True,False,False,False,False,False,False,False
2,Kairouan,233,6,1,368223,False,False,False,False,True,False,False,False,False
3,Gabes,448,1,1,734274,False,False,True,False,False,False,False,False,False
4,Kairouan,130,6,2,273007,False,False,False,False,True,False,False,False,False


In [115]:
df4 = df3.drop('Location',axis='columns')
df4.head(2)

Unnamed: 0,Size_sqm,Bedrooms,Bathrooms,Price_TND,Aryanah,Bizerte,Gabes,Gafsa,Kairouan,Monastir,Sfax,Sousse,Tunis
0,91,5,1,196478,False,False,False,False,False,False,True,False,False
1,289,4,1,387176,False,True,False,False,False,False,False,False,False


<h2 style='color:blue'>Build a Model</h2>

In [116]:
df4.shape

(446, 13)

In [117]:
X = df4.drop(['Price_TND'],axis='columns')
X.head(3)

Unnamed: 0,Size_sqm,Bedrooms,Bathrooms,Aryanah,Bizerte,Gabes,Gafsa,Kairouan,Monastir,Sfax,Sousse,Tunis
0,91,5,1,False,False,False,False,False,False,True,False,False
1,289,4,1,False,True,False,False,False,False,False,False,False
2,233,6,1,False,False,False,False,True,False,False,False,False


In [118]:
X.shape

(446, 12)

In [119]:
y = df4.Price_TND
y.head(3)

0    196478
1    387176
2    368223
Name: Price_TND, dtype: int64

In [120]:
len(y)

446

In [121]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=10)

In [122]:
from sklearn.linear_model import LinearRegression
lr_clf = LinearRegression()
lr_clf.fit(X_train,y_train)
lr_clf.score(X_test,y_test)

0.8182502088918072

<h2 style='color:blue'>Use K Fold cross validation to measure accuracy of our LinearRegression model</h2>

In [123]:
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score

cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)

cross_val_score(LinearRegression(), X, y, cv=cv)

array([0.76637224, 0.78174554, 0.74283391, 0.78873619, 0.81287594])

**We can see that in 5 iterations we get a score above 70% all the time. This is pretty good but we want to test few other algorithms for regression to see if we can get even better score. We will use GridSearchCV for this purpose**

<h2 style='color:blue'>Find best model using GridSearchCV</h2>

In [124]:
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeRegressor

def find_best_model_using_gridsearchcv(X,y):
    algos = {
        'linear_regression' : {
            'model': LinearRegression(),
            'params': {}
        },
        'lasso': {
            'model': Lasso(max_iter=5000),
            'params': {
                'alpha': [1,2],
                'selection': ['random', 'cyclic']
            }
        },
        'decision_tree': {
            'model': DecisionTreeRegressor(),
            'params': {
                'criterion': ['squared_error', 'friedman_mse'],
                'splitter': ['best','random']
            }
        }
    }
    scores = []
    cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
    for algo_name, config in algos.items():
        gs =  GridSearchCV(config['model'], config['params'], cv=cv, return_train_score=False)
        gs.fit(X,y)
        scores.append({
            'model': algo_name,
            'best_score': gs.best_score_,
            'best_params': gs.best_params_
        })

    return pd.DataFrame(scores,columns=['model','best_score','best_params'])

find_best_model_using_gridsearchcv(X,y)

Unnamed: 0,model,best_score,best_params
0,linear_regression,0.778513,{}
1,lasso,0.778523,"{'alpha': 2, 'selection': 'random'}"
2,decision_tree,0.499797,"{'criterion': 'friedman_mse', 'splitter': 'best'}"


**Based on above results we can say that LinearRegression gives the best score. Hence we will use that.**

<h2 style='color:blue'>Test the model for few properties</h2>

In [125]:
X.columns

Index(['Size_sqm', 'Bedrooms', 'Bathrooms', 'Aryanah', 'Bizerte', 'Gabes',
       'Gafsa', 'Kairouan', 'Monastir', 'Sfax', 'Sousse', 'Tunis'],
      dtype='object')

In [126]:
import warnings
warnings.filterwarnings("ignore", message="X does not have valid feature names")

def predict_price(location,sqm,bath,room):    
    loc_index = np.where(X.columns==location)[0][0]

    x = np.zeros(len(X.columns))
    x[0] = sqm
    x[1] = bath
    x[2] = room
    if loc_index >= 0:
        x[loc_index] = 1
    return lr_clf.predict([x])[0]

In [127]:
predict_price('Sousse',500, 2, 4)

np.float64(696683.4381054549)

<h2 style='color:blue'>Export the tested model to a pickle file</h2>

In [128]:
import pickle
with open('Tunisia_home_prices_model.pickle','wb') as f:
    pickle.dump(lr_clf,f)

<h2 style='color:blue'>Export location and column information to a file that will be useful later on in our prediction application</h2>

In [129]:
import json
columns = {
    'data_columns' : [col.lower() for col in X.columns]
}
with open("columns.json","w") as f:
    f.write(json.dumps(columns))