<a href="https://colab.research.google.com/github/MichaelAshton/house-prices-prediction/blob/master/housing_price_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### imports

In [0]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.linear_model import *
import numpy as np
from sklearn.metrics import *
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.pipeline import make_pipeline

## Download Dataset using Kaggle API##


### Step 1: Create a API Token in Kaggle Account 

* Go to Kaggle --> My Account --> Create New API Token

* Reference:  https://stackoverflow.com/questions/49310470/using-kaggle-datasets-into-google-colab


### Step 2: Upload your `kaggle.json` to Google Colab
 
 * Your kaggle.json is a "key" to let you use Kaggle API to download dataset from their website.
 
 
 ### Step 3: Download `Housing Price Prediction` Dataset 
 #### (you  need to have joined and accepted the rules for the competition else the download will fail)



In [1]:
# Install the Kaggle API client
!pip install -q kaggle
!apt-get install pv  >> /dev/null    # extract .tar file
# Upload Json File
import os
if not os.path.isfile('kaggle.json'):
  from google.colab import files
  files.upload() 

Saving kaggle.json to kaggle.json


In [2]:
# The Kaggle API client expects this file to be in ~/.kaggle 
%%bash

mkdir -p ~/.kaggle
cp kaggle.json ~/.kaggle/
chmod 600 ~/.kaggle/kaggle.json  #  avoids a warning on Kaggle tool startup.

printf "Santander Value Prediction Dataset:\n\n"
kaggle competitions files house-prices-advanced-regression-techniques

printf "\nStarting to download all dataset...\n"
kaggle competitions download -c house-prices-advanced-regression-techniques



Santander Value Prediction Dataset:

name                    size  creationDate         
---------------------  -----  -------------------  
sample_submission.csv   31KB  2018-11-28 22:31:52  
test.csv               441KB  2018-11-28 22:31:52  
train.csv              450KB  2018-11-28 22:31:49  
data_description.txt    13KB  2018-11-28 22:31:50  

Starting to download all dataset...
Downloading sample_submission.csv to /content

Downloading test.csv to /content

Downloading train.csv to /content

Downloading data_description.txt to /content



  0%|          | 0.00/31.2k [00:00<?, ?B/s]100%|##########| 31.2k/31.2k [00:00<00:00, 17.6MB/s]
  0%|          | 0.00/441k [00:00<?, ?B/s]100%|##########| 441k/441k [00:00<00:00, 48.6MB/s]
  0%|          | 0.00/450k [00:00<?, ?B/s]100%|##########| 450k/450k [00:00<00:00, 45.8MB/s]
  0%|          | 0.00/13.1k [00:00<?, ?B/s]100%|##########| 13.1k/13.1k [00:00<00:00, 7.94MB/s]


### Exploratory Data Analysis

In [0]:
train = pd.read_csv('train.csv', index_col=0)

In [0]:
test = pd.read_csv('test.csv', index_col=0)

In [7]:
train.head()

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000


In [0]:
y = train['SalePrice']

In [0]:
X = train.drop('SalePrice', axis=1)

In [0]:
X.info()

In [0]:
# drop_cols = ['FireplaceQu', 'Alley', 'PoolQC', 'Fence', 'MiscFeature']

In [0]:
# X.drop(drop_cols, axis=1, inplace=True)

In [61]:
#missing data
total = X.isnull().sum().sort_values(ascending=False)
percent = (X.isnull().sum()/X.isnull().count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data.head(20)

Unnamed: 0,Total,Percent
PoolQC,1453,0.995205
MiscFeature,1406,0.963014
Alley,1369,0.937671
Fence,1179,0.807534
FireplaceQu,690,0.472603
LotFrontage,259,0.177397
GarageCond,81,0.055479
GarageType,81,0.055479
GarageYrBlt,81,0.055479
GarageFinish,81,0.055479


In [62]:
#dealing with missing data
X = X.drop((missing_data[missing_data['Total'] > 1]).index,1)
X = X.drop(X.loc[X['Electrical'].isnull()].index)
X.isnull().sum().max() #just checking that there's no missing data missing...

0

In [0]:
lin = LinearRegression()

In [0]:
X = pd.get_dummies(X)

In [0]:
indices = X.index.intersection(y.index)
X = X.loc[indices].copy(deep=True)
y = y.loc[indices].copy(deep=True)

In [0]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [0]:
#Validation function
n_folds = 5

def rmsle_cv(model):
    kf = KFold(n_folds, shuffle=True, random_state=42).get_n_splits(train.values)
    rmse= np.sqrt(-cross_val_score(model, X, y, scoring="r2", cv = kf))
    return(rmse)

In [0]:
lin = make_pipeline(StandardScaler(), lin)

In [0]:
lasso = make_pipeline(StandardScaler(), Lasso())

In [91]:
score = rmsle_cv(lin)


  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)


\Linear Regression score: 119064236810094208.0000 (168330068097910144.0000)



  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)


In [0]:
print("\Linear Regression score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

In [98]:
score = rmsle_cv(lasso)


  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  """


In [99]:
print("\Lasso Regression score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

\Lasso Regression score: nan (nan)

