![](https://www.reno.gov/Home/ShowImage?id=7739&t=635620964226970000)

**Competition Description from Kaggle**  
With 79 explanatory variables describing (almost) every aspect of residential homes in Ames, Iowa, this competition challenges you to predict the final price of each home.

**Data description**  
This is a detailed description of the 79 features and their entries, quite important for this competition.  
You can download the txt file here: [**download**](https://www.kaggle.com/c/5407/download/data_description.txt)

**Here's a simplified version of the same problem (selected variables) for our homework**

## Load the dataset

In [1]:
import pandas as pd
import numpy as np
import csv

In [2]:
# Load the house-price table from CSV into a DataFrame.
data = pd.read_csv('../input/house_prices.csv')

In [3]:
# Preview the first rows to check that the file was parsed as expected.
data.head()

Unnamed: 0,LotFrontage,LotArea,Street,OverallQual,OverallCond,YearBuilt,MoSold,YrSold,SalePrice
0,65.0,8450,Pave,7,5,2003,2,2008,208500
1,80.0,9600,Pave,6,8,1976,5,2007,181500
2,68.0,11250,Pave,7,5,2001,9,2008,223500
3,60.0,9550,Pave,7,5,1915,2,2006,140000
4,84.0,14260,Pave,8,5,2000,12,2008,250000


In [None]:
# Summary statistics of the columns (pandas describes the numeric columns by default).
data.describe()

In [None]:
# Column dtypes and non-null counts; a quick way to spot columns with missing values.
data.info()

In [None]:
# Only the last expression of a cell is displayed, so the per-column count of
# distinct values is printed explicitly; as a bare expression it would be
# computed and then discarded.
print(data.select_dtypes('object').nunique())
# Distinct categories of the Street column.
data['Street'].unique()

## Exploratory Data Analysis

In [None]:
# Check and manage missing data

In [4]:
# Share of missing values per column: null count divided by the number of rows.
# shape[0] is the row count (shape[1] would be the column count).
no_of_rows = data.shape[0]
# Values are fractions in [0, 1], not percentages (0.177 means 17.7 % missing).
percentage_of_missing_data = data.isnull().sum()/no_of_rows
print(percentage_of_missing_data)

LotFrontage    0.177397
LotArea        0.000000
Street         0.000000
OverallQual    0.000000
OverallCond    0.000000
YearBuilt      0.000000
MoSold         0.000000
YrSold         0.000000
SalePrice      0.000000
dtype: float64


In [5]:
# Impute missing numeric values with the mean of their column.
# numeric_only=True restricts the mean to numeric columns: the frame still holds
# the text column Street at this point, and recent pandas versions raise a
# TypeError when asked for the mean of a non-numeric column.
# Rebinding instead of inplace=True keeps the cell safe to re-run.
data = data.fillna(data.mean(numeric_only=True))

In [6]:
from sklearn.preprocessing import LabelEncoder

# Turn the Street categories into integer codes: learn the category set first,
# then map the column through the fitted encoder.
le = LabelEncoder()
le.fit(data['Street'])
data['Street'] = le.transform(data['Street'])

# Preview the frame with the encoded column.
data.head()

Unnamed: 0,LotFrontage,LotArea,Street,OverallQual,OverallCond,YearBuilt,MoSold,YrSold,SalePrice
0,65.0,8450,1,7,5,2003,2,2008,208500
1,80.0,9600,1,6,8,1976,5,2007,181500
2,68.0,11250,1,7,5,2001,9,2008,223500
3,60.0,9550,1,7,5,1915,2,2006,140000
4,84.0,14260,1,8,5,2000,12,2008,250000


In [8]:
from sklearn.preprocessing import scale

In [9]:
# Standardise every column to zero mean and unit variance (the defaults of
# sklearn.preprocessing.scale, spelled out here).
# NOTE(review): the whole frame is passed in, so the target column (SalePrice)
# is standardised together with the features.
data_scaled = scale(data, axis=0, with_mean=True, with_std=True)

In [12]:
# Show the standardised array, followed by its number of columns.
print(data_scaled)
print(data_scaled.shape[1])

[[-0.22937175 -0.20714171  0.06423821 ... -1.5991111   0.13877749
   0.34727322]
 [ 0.4519361  -0.09188637  0.06423821 ... -0.48911005 -0.61443862
   0.00728832]
 [-0.09311018  0.07347998  0.06423821 ...  0.99089135  0.13877749
   0.53615372]
 ...
 [-0.18395123 -0.14781027  0.06423821 ... -0.48911005  1.64520971
   1.07761115]
 [-0.09311018 -0.08016039  0.06423821 ... -0.8591104   1.64520971
  -0.48852299]
 [ 0.22483348 -0.05811155  0.06423821 ... -0.1191097   0.13877749
  -0.42084081]]
9


In [13]:
with open('../input/house_prices.csv', 'w') as file:
        writer = csv.writer(file)
        for i in range(len(data_scaled[0])):
            writer.writerows(data_scaled)

In [14]:
data = pd.read_csv('../input/house_prices.csv')
data.head()

Unnamed: 0,-0.22937175311444924,-0.20714170777431132,0.06423820868685405,0.6514792433257054,-0.5171998069472914,1.0509937888999856,-1.599111099180035,0.1387774889497933,0.34727321973650555
0,0.451936,-0.091886,0.064238,-0.071836,2.179628,0.156734,-0.48911,-0.614439,0.007288
1,-0.09311,0.07348,0.064238,0.651479,-0.5172,0.984752,0.990891,0.138777,0.536154
2,-0.456474,-0.096897,0.064238,0.651479,-0.5172,-1.863632,-1.599111,-1.367655,-0.515281
3,0.633618,0.375148,0.064238,1.374795,-0.5172,0.951632,2.100892,0.138777,0.869843
4,0.679039,0.360616,0.064238,-0.795151,-0.5172,0.719786,1.360892,0.891994,-0.477505


In [None]:
# Explore the relationship between the independent variables and the target (SalePrice)

In [None]:
# The target (SalePrice) is the last column; every other column is a feature.
# Slicing with :-1 keeps the second-to-last column (YrSold) in the feature
# matrix, whereas :-2 would silently drop it.
x = data.iloc[:, :-1]
y = data.iloc[:, -1]
print(x.shape)
print(y.shape)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25 % of the rows for testing; the fixed seed makes the split repeatable.
split_parts = train_test_split(x, y, test_size=0.25, random_state=45)
x_train, x_test, y_train, y_test = split_parts

# Report the shape of each part in the order: x_train, x_test, y_train, y_test.
for part in split_parts:
    print(part.shape)

In [None]:
from sklearn.preprocessing import MinMaxScaler
# Min-max scaler applied to the feature matrices after the train/test split.
mm = MinMaxScaler()

In [None]:
# Learn each feature's min/max from the training split only, then apply that
# same mapping to both splits so no test-set statistics reach the scaler.
mm.fit(x_train)
x_train, x_test = mm.transform(x_train), mm.transform(x_test)

In [None]:
from sklearn.linear_model import  LinearRegression

# Ordinary least-squares linear regression, constructed with default settings.
model_lr = LinearRegression()

In [None]:
# Fit on the training split and predict the held-out rows in one chain
# (fit returns the fitted estimator itself).
y_pred = model_lr.fit(x_train, y_train).predict(x_test)

In [None]:
# LinearRegression.score returns the coefficient of determination (R^2), not a
# classification accuracy, so the output is labelled accordingly.
print("Training R^2 :", model_lr.score(x_train,y_train))
print("Testing R^2 :", model_lr.score(x_test,y_test))

In [None]:
# Pair every feature name with its fitted regression coefficient.
feature_importances = {
    column: coefficient
    for column, coefficient in zip(x.columns, model_lr.coef_)
}

# List the (feature, coefficient) pairs in ascending order of the signed coefficient.
sorted(feature_importances.items(), key=lambda pair: pair[1])

## Create a model to predict SalePrice

## Check the model's performance
Let's measure the prediction error using the Root Mean Squared Error (RMSE) metric.

In [None]:
from sklearn.metrics import mean_squared_error

# Root Mean Squared Error: the square root of the mean squared error, which
# puts the error back in the same units as the values in y_test.
np.sqrt(mean_squared_error(y_test, y_pred))