# House Sales in King County, USA


In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler,PolynomialFeatures
from sklearn.linear_model import LinearRegression


# Module 1: Importing Data Sets


Load the csv:


In [4]:
# Location of the King County house-sales CSV.
# The path is a plain string with forward slashes: no embedded quote characters
# (they would become part of the file name) and no backslashes (which Python
# would try to read as escape sequences).
# NOTE(review): this is a machine-specific absolute path; point it at wherever
# house_price_data.csv lives on the machine running the notebook.
file_name = 'C:/Users/ACER/Downloads/house_price_data.csv'
df = pd.read_csv(file_name)

FileNotFoundError: [Errno 2] No such file or directory: 'house_price_data.csv'

We use the method <code>head</code> to display the first 5 rows of the dataframe.


In [None]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

### Question 1

Display the data types of each column using the attribute dtypes, then take a screenshot and submit it; include your code in the image.


In [None]:
# Show the dtype pandas assigned to each column of df.
df.dtypes

We use the method describe to obtain a statistical summary of the dataframe.


In [None]:
# Statistical summary of df as produced by describe().
df.describe()

# Module 2: Data Wrangling


### Question 2

Drop the columns <code>"id"</code>  and <code>"Unnamed: 0"</code> from axis 1 using the method <code>drop()</code>, then use the method <code>describe()</code> to obtain a statistical summary of the data. Take a screenshot and submit it, make sure the <code>inplace</code> parameter is set to <code>True</code>


In [None]:
# Remove the two identifier columns from df in place (the exercise requires
# inplace=True), then preview the trimmed frame.
df.drop(columns=['id', 'Unnamed: 0'], inplace=True)
df.head(n=5)

We can see we have missing values for the columns <code> bedrooms</code>  and <code> bathrooms </code>


In [None]:
# Statistical summary of df after the identifier columns were dropped.
df.describe()

In [None]:
# Count the missing entries in the two columns that are imputed next.
bedrooms_missing = df['bedrooms'].isna().sum()
bathrooms_missing = df['bathrooms'].isna().sum()
print("number of NaN values for the column bedrooms :", bedrooms_missing)
print("number of NaN values for the column bathrooms :", bathrooms_missing)

We can replace the missing values of the column <code>'bedrooms'</code> with the mean of the column  <code>'bedrooms' </code> using the method <code>replace()</code>. Don't forget to set the <code>inplace</code> parameter to <code>True</code>


In [None]:
# Impute missing 'bedrooms' values with the column mean.
mean = df['bedrooms'].mean()
# Call replace() on the DataFrame itself, restricted to the 'bedrooms' column.
# Calling replace(..., inplace=True) on df['bedrooms'] would modify a temporary
# Series: pandas 2.2 warns about that chained form, and with Copy-on-Write
# (the default in pandas 3) it leaves df unchanged.
df.replace({'bedrooms': np.nan}, mean, inplace=True)

We also replace the missing values of the column <code>'bathrooms'</code> with the mean of the column <code>'bathrooms'</code> using the method <code>replace()</code>. Don't forget to set the <code>inplace</code> parameter to <code>True</code>.


In [None]:
# Impute missing 'bathrooms' values with the column mean.
mean = df['bathrooms'].mean()
# Call replace() on the DataFrame itself, restricted to the 'bathrooms' column.
# Calling replace(..., inplace=True) on df['bathrooms'] would modify a temporary
# Series: pandas 2.2 warns about that chained form, and with Copy-on-Write
# (the default in pandas 3) it leaves df unchanged.
df.replace({'bathrooms': np.nan}, mean, inplace=True)

In [None]:
# Re-count the missing entries to confirm the imputation took effect.
bedrooms_missing = df['bedrooms'].isna().sum()
bathrooms_missing = df['bathrooms'].isna().sum()
print("number of NaN values for the column bedrooms :", bedrooms_missing)
print("number of NaN values for the column bathrooms :", bathrooms_missing)

# Module 3: Exploratory Data Analysis


### Question 3

Use the method <code>value_counts</code> to count the number of houses with unique floor values, use the method <code>.to_frame()</code> to convert it to a dataframe.


In [None]:
# Tally the houses per distinct 'floors' value and show the tally as a DataFrame.
floor_counts = df['floors'].value_counts()
floor_counts.to_frame()

### Question 4

Use the function <code>boxplot</code> in the seaborn library  to  determine whether houses with a waterfront view or without a waterfront view have more price outliers.


In [None]:
# Price distribution split by the 'waterfront' flag; outliers appear as points
# beyond the whiskers.
sns.boxplot(data=df, x='waterfront', y='price')

### Question 5

Use the function <code>regplot</code>  in the seaborn library  to  determine if the feature <code>sqft_above</code> is negatively or positively correlated with price.


In [None]:
# Scatter of price against sqft_above with a fitted regression line; the sign
# of the slope shows the direction of the correlation.
sns.regplot(data=df, x='sqft_above', y='price')

We can use the Pandas method <code>corr()</code>  to find the feature other than price that is most correlated with price.


In [None]:
# Correlation of every numeric column with price, sorted ascending: price
# itself (correlation 1.0) is last, and the row just above it is the feature
# most positively correlated with price.
# numeric_only=True keeps corr() from raising on pandas >= 2.0 when the frame
# contains a non-numeric column.
# NOTE(review): presumably df still holds at least one string column (e.g. a
# date) at this point; confirm against the df.dtypes output.
df.corr(numeric_only=True)['price'].sort_values()

# Module 4: Model Development


We can fit a linear regression model using the longitude feature <code>'long'</code> and calculate the R^2.


In [None]:
# Baseline: regress price on longitude alone and report the in-sample R^2.
Y = df['price']
X = df[['long']]
lm = LinearRegression().fit(X, Y)
lm.score(X, Y)

### Question  6

Fit a linear regression model to predict the <code>'price'</code> using the feature <code>'sqft_living'</code> then calculate the R^2. Take a screenshot of your code and the value of the R^2.


In [None]:
# Single-feature model: regress price on sqft_living and report the in-sample R^2.
Y = df['price']
X = df[['sqft_living']]
lm = LinearRegression().fit(X, Y)
lm.score(X, Y)

### Question 7

Fit a linear regression model to predict the <code>'price'</code> using the list of features:


In [None]:
# Predictor columns used by the multi-feature models below.
features = [
    "floors",
    "waterfront",
    "lat",
    "bedrooms",
    "sqft_basement",
    "view",
    "bathrooms",
    "sqft_living15",
    "sqft_above",
    "grade",
    "sqft_living",
]

Then calculate the R^2. Take a screenshot of your code.


In [None]:
# Multi-feature model: regress price on every column in `features` and report
# the in-sample R^2.
X = df[features]
Y = df['price']
lm = LinearRegression().fit(X, Y)
lm.score(X, Y)

### This will help with Question 8

Create a list of tuples, the first element in the tuple contains the name of the estimator:

<code>'scale'</code>

<code>'polynomial'</code>

<code>'model'</code>

The second element in the tuple  contains the model constructor

<code>StandardScaler()</code>

<code>PolynomialFeatures(include_bias=False)</code>

<code>LinearRegression()</code>


In [None]:
# (name, estimator) steps for the pipeline: scale, expand to polynomial terms
# without a bias column, then fit a linear regression.
Input = [
    ('scale', StandardScaler()),
    ('polynomial', PolynomialFeatures(include_bias=False)),
    ('model', LinearRegression()),
]

### Question 8

Use the list to create a pipeline object to predict the 'price', fit the object using the features in the list <code>features</code>, and calculate the R^2.


In [None]:
# Build the pipeline from the step list, fit it on the selected features and
# report the in-sample R^2.
X = df[features]
Y = df['price']
pipe = Pipeline(steps=Input)
pipe.fit(X, Y)
# X already holds exactly the `features` columns, so it is passed as-is.
yhat = pipe.predict(X)
pipe.score(X, Y)

# Module 5: Model Evaluation and Refinement


Import the necessary modules:


In [None]:
# Helpers for cross-validation and for splitting data into train/test sets.
from sklearn.model_selection import cross_val_score, train_test_split

print("done")

We will split the data into training and testing sets:


In [None]:
# Rebuild the feature matrix and target, then hold out 15% of the rows for testing.
features = [
    "floors",
    "waterfront",
    "lat",
    "bedrooms",
    "sqft_basement",
    "view",
    "bathrooms",
    "sqft_living15",
    "sqft_above",
    "grade",
    "sqft_living",
]
X = df[features]
Y = df['price']

# A fixed random_state makes the split reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.15, random_state=1
)

print("number of test samples:", len(x_test))
print("number of training samples:", len(x_train))

### Question 9

Create and fit a Ridge regression object using the training data, set the regularization parameter to 0.1, and calculate the R^2 using the test data.


In [None]:
from sklearn.linear_model import Ridge

In [None]:
# Ridge regression with regularization strength 0.1, fitted on the training
# split and scored (R^2) on the held-out test split.
ridge = Ridge(alpha=0.1).fit(x_train, y_train)
ridge.score(x_test, y_test)

### Question 10

Perform a second order polynomial transform on both the training data and testing data. Create and fit a Ridge regression object using the training data, set the regularisation parameter to 0.1, and calculate the R^2 utilising the test data provided. Take a screenshot of your code and the R^2.


In [None]:
# Second-order polynomial expansion followed by Ridge regression (alpha=0.1),
# scored with R^2 on the held-out test split.
pf = PolynomialFeatures(degree=2, include_bias=False)

# Learn the expansion from the training split only and keep the transformed
# arrays; the same fitted transformer is then applied to the test split.
x_train_pf = pf.fit_transform(x_train)
x_test_pf = pf.transform(x_test)

# The estimator gets its own lowercase name so the imported Ridge class is not
# shadowed and the cell can be re-run.
ridge_poly = Ridge(alpha=0.1)
ridge_poly.fit(x_train_pf, y_train)
ridge_poly.score(x_test_pf, y_test)