## Regression Task 1

##  Prepare Problem


In [1]:
# Load libraries
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # MATLAB-like way of plotting


In [2]:
# sklearn package for machine learning in python:
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LinearRegression
from sklearn.datasets import make_regression
from sklearn.metrics import mean_squared_error, r2_score
from pandas import set_option

In [3]:
# Load dataset
house_price = pd.read_csv('houseprice_data.csv')

## Summarize Data

In [4]:
# display dataset in rows and columns
display(house_price.tail())



Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
21608,360000.0,3,2.5,1530,1131,3.0,0,0,3,8,1530,0,2009,0,98103,47.6993,-122.346,1530,1509
21609,400000.0,4,2.5,2310,5813,2.0,0,0,3,8,2310,0,2014,0,98146,47.5107,-122.362,1830,7200
21610,402101.0,2,0.75,1020,1350,2.0,0,0,3,7,1020,0,2009,0,98144,47.5944,-122.299,1020,2007
21611,400000.0,3,2.5,1600,2388,2.0,0,0,3,8,1600,0,2004,0,98027,47.5345,-122.069,1410,1287
21612,325000.0,2,0.75,1020,1076,2.0,0,0,3,7,1020,0,2008,0,98144,47.5941,-122.299,1020,1357


In [5]:
# list the column names
house_price.columns

Index(['price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
       'waterfront', 'view', 'condition', 'grade', 'sqft_above',
       'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode', 'lat', 'long',
       'sqft_living15', 'sqft_lot15'],
      dtype='object')

## Descriptive statistics

In [6]:
# descriptions
set_option('precision', 1)
print(house_price.describe())

         price  bedrooms  bathrooms  sqft_living  sqft_lot   floors  \
count  2.2e+04   21613.0    21613.0      21613.0   2.2e+04  21613.0   
mean   5.4e+05       3.4        2.1       2079.9   1.5e+04      1.5   
std    3.7e+05       0.9        0.8        918.4   4.1e+04      0.5   
min    7.5e+04       0.0        0.0        290.0   5.2e+02      1.0   
25%    3.2e+05       3.0        1.8       1427.0   5.0e+03      1.0   
50%    4.5e+05       3.0        2.2       1910.0   7.6e+03      1.5   
75%    6.4e+05       4.0        2.5       2550.0   1.1e+04      2.0   
max    7.7e+06      33.0        8.0      13540.0   1.7e+06      3.5   

       waterfront     view  condition    grade  sqft_above  sqft_basement  \
count     2.2e+04  21613.0    21613.0  21613.0     21613.0        21613.0   
mean      7.5e-03      0.2        3.4      7.7      1788.4          291.5   
std       8.7e-02      0.8        0.7      1.2       828.1          442.6   
min       0.0e+00      0.0        1.0      1.0      

## Data Analysis

In [7]:
# correlation
set_option('precision', 3)
print(house_price.corr(method='pearson'))

# Considering attributes that are strongly correlated to price
# Using values not less that 0.25 to select the attributes that are strongly correlated to price ,
# Attributes that are strongly correlated with price are :
# bedrooms,bathrooms,sqft_living,floors, waterfront,view,condition,grade,sqft_basement  


               price  bedrooms  bathrooms  sqft_living  sqft_lot  floors  \
price          1.000     0.308      0.525        0.702     0.090   0.257   
bedrooms       0.308     1.000      0.516        0.577     0.032   0.175   
bathrooms      0.525     0.516      1.000        0.755     0.088   0.501   
sqft_living    0.702     0.577      0.755        1.000     0.173   0.354   
sqft_lot       0.090     0.032      0.088        0.173     1.000  -0.005   
floors         0.257     0.175      0.501        0.354    -0.005   1.000   
waterfront     0.266    -0.007      0.064        0.104     0.022   0.024   
view           0.397     0.080      0.188        0.285     0.075   0.029   
condition      0.036     0.028     -0.125       -0.059    -0.009  -0.264   
grade          0.667     0.357      0.665        0.763     0.114   0.458   
sqft_above     0.606     0.478      0.685        0.877     0.184   0.524   
sqft_basement  0.324     0.303      0.284        0.435     0.015  -0.246   
yr_built    

In [8]:

house_price_df=house_price.drop(['sqft_lot','sqft_above','yr_built','yr_renovated','zipcode','lat','long','sqft_living15','sqft_lot15'],axis=1)

In [9]:
house_price_df.head()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,floors,waterfront,view,condition,grade,sqft_basement
0,221900.0,3,1.0,1180,1.0,0,0,3,7,0
1,538000.0,3,2.25,2570,2.0,0,0,3,7,400
2,180000.0,2,1.0,770,1.0,0,0,3,6,0
3,604000.0,4,3.0,1960,1.0,0,0,5,7,910
4,510000.0,3,2.0,1680,1.0,0,0,3,8,0


In [10]:
# split the dataset between X and y
# column 1 to column 9 goes to X
# column 0 goes to y,in pandas 0 begins counting 

data = house_price_df.values
X= data[:,1:9]
y =data[:,0]

In [11]:
# split the data into training and test sets:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 1/3, random_state=0)

In [19]:
# fit the linear least-squres regression line to the training data:
regr = LinearRegression()
regr.fit(X_train, y_train)

LinearRegression()

In [13]:
# The coefficients
print('Coefficients: ', regr.coef_)

# The intercept
print('Intercept: ', regr.intercept_)

# The mean squared error
print('Mean squared error: %.8f' % mean_squared_error(y_test, regr.predict(X_test)))

# The R^2 value:
print('Coefficient of determination: %.2f' % r2_score(y_test, regr.predict(X_test)))

Coefficients:  [-2.55896995e+04 -1.44711642e+04  1.79868159e+02 -6.93629616e+03
  6.15071895e+05  6.27212981e+04  5.79747982e+04  1.02884487e+05]
Intercept:  -711395.620268748
Mean squared error: 58746875703.93355560
Coefficient of determination: 0.59


## Inclusion of many features

In [14]:
# split the dataset between X and y
# column 1 to column 19 goes to X
# column 0 goes to y,in pandas 0 begins counting 
data2 = house_price.values
X_new= data2[:,1:19]
y_new =data2[:,0]

In [15]:
# split the data into training and test sets:
X_new_train, X_new_test, y_new_train, y_new_test = train_test_split(X_new, y_new, test_size= 1/3, random_state=0)

In [16]:
# fit the linear least-squres regression line to the training data:
regr = LinearRegression()
regr.fit(X_new_train, y_new_train)

LinearRegression()

In [17]:
# The coefficients
print('Coefficients: ', regr.coef_)

# The intercept
print('Intercept: ', regr.intercept_)

# The mean squared error
print('Mean squared error: %.8f' % mean_squared_error(y_new_test, regr.predict(X_new_test)))

# The R^2 value:
print('Coefficient of determination: %.2f' % r2_score(y_new_test, regr.predict(X_new_test)))

Coefficients:  [-2.90159750e+04  3.62350707e+04  1.00277487e+02  2.60557590e-01
  1.20714321e+04  6.23902318e+05  5.24545624e+04  2.84274069e+04
  9.62392553e+04  6.32953609e+01  3.69821263e+01 -2.53449507e+03
  1.30628298e+01 -5.13110492e+02  6.08507465e+05 -2.07692124e+05
  3.36366489e+01 -4.85347802e-01]
Intercept:  290782.7101999346
Mean squared error: 45437269303.41967010
Coefficient of determination: 0.69


## A way of visualising the Datasets