# Boston House dataset Analysis

## 1- Import Modules

In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

## 2- read data

In [2]:
from sklearn.datasets import load_boston
boston_dataset = load_boston()
print(boston_dataset.DESCR)

.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pu

In [3]:
type(boston_dataset)

sklearn.utils.Bunch

In [4]:
boston_dataset.filename

'/home/ashkan/anaconda3/envs/mapsa-ml/lib/python3.9/site-packages/sklearn/datasets/data/boston_house_prices.csv'

In [5]:
# boston_dataset.data
boston_dataset.keys()
boston_dataset.feature_names
# boston_dataset['target']

array(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD',
       'TAX', 'PTRATIO', 'B', 'LSTAT'], dtype='<U7')

In [6]:
boston_df = pd.DataFrame(data= boston_dataset.data , columns=boston_dataset.feature_names)

In [7]:
boston_df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


## scaling features to a range

In [8]:
from sklearn import preprocessing
x = boston_df.values
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
boston_df = pd.DataFrame(x_scaled)
boston_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,0.0,0.18,0.067815,0.0,0.314815,0.577505,0.641607,0.269203,0.0,0.208015,0.287234,1.0,0.08968
1,0.000236,0.0,0.242302,0.0,0.17284,0.547998,0.782698,0.348962,0.043478,0.104962,0.553191,1.0,0.20447
2,0.000236,0.0,0.242302,0.0,0.17284,0.694386,0.599382,0.348962,0.043478,0.104962,0.553191,0.989737,0.063466
3,0.000293,0.0,0.06305,0.0,0.150206,0.658555,0.441813,0.448545,0.086957,0.066794,0.648936,0.994276,0.033389
4,0.000705,0.0,0.06305,0.0,0.150206,0.687105,0.528321,0.448545,0.086957,0.066794,0.648936,1.0,0.099338


In [9]:
boston_df['price'] = boston_dataset.target
boston_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,price
0,0.000000,0.18,0.067815,0.0,0.314815,0.577505,0.641607,0.269203,0.000000,0.208015,0.287234,1.000000,0.089680,24.0
1,0.000236,0.00,0.242302,0.0,0.172840,0.547998,0.782698,0.348962,0.043478,0.104962,0.553191,1.000000,0.204470,21.6
2,0.000236,0.00,0.242302,0.0,0.172840,0.694386,0.599382,0.348962,0.043478,0.104962,0.553191,0.989737,0.063466,34.7
3,0.000293,0.00,0.063050,0.0,0.150206,0.658555,0.441813,0.448545,0.086957,0.066794,0.648936,0.994276,0.033389,33.4
4,0.000705,0.00,0.063050,0.0,0.150206,0.687105,0.528321,0.448545,0.086957,0.066794,0.648936,1.000000,0.099338,36.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.000633,0.00,0.420455,0.0,0.386831,0.580954,0.681771,0.122671,0.000000,0.164122,0.893617,0.987619,0.219095,22.4
502,0.000438,0.00,0.420455,0.0,0.386831,0.490324,0.760041,0.105293,0.000000,0.164122,0.893617,1.000000,0.202815,20.6
503,0.000612,0.00,0.420455,0.0,0.386831,0.654340,0.907312,0.094381,0.000000,0.164122,0.893617,1.000000,0.107892,23.9
504,0.001161,0.00,0.420455,0.0,0.386831,0.619467,0.889804,0.114514,0.000000,0.164122,0.893617,0.991301,0.131071,22.0


In [10]:
boston_df.describe()


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,price
count,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0
mean,0.040544,0.113636,0.391378,0.06917,0.349167,0.521869,0.676364,0.242381,0.371713,0.422208,0.622929,0.898568,0.301409,22.532806
std,0.096679,0.233225,0.251479,0.253994,0.238431,0.134627,0.289896,0.191482,0.378576,0.321636,0.230313,0.230205,0.197049,9.197104
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
25%,0.000851,0.0,0.173387,0.0,0.131687,0.445392,0.433831,0.088259,0.130435,0.175573,0.510638,0.94573,0.14404,17.025
50%,0.002812,0.0,0.338343,0.0,0.314815,0.507281,0.76828,0.188949,0.173913,0.272901,0.68617,0.986232,0.265728,21.2
75%,0.041258,0.125,0.646628,0.0,0.49177,0.586798,0.93898,0.369088,1.0,0.914122,0.808511,0.998298,0.420116,25.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,50.0


In [11]:
new_bosto_df = boston_df[boston_df['CRIM']<10]

plt.scatter(y=new_bosto_df['price'], x=new_bosto_df['CRIM'])

KeyError: 'CRIM'

In [None]:
boston_df.describe()

## 2- Data Visualization

In [None]:
sns.pairplot(boston_df)

In [None]:
sns.distplot(boston_df['price'])

In [None]:
boston_data_corr = boston_df.corr()

In [None]:
sns.heatmap(boston_data_corr, cmap='BuGn')

## 3- Trainig Section

In [None]:
X = boston_df.iloc[:, :-1].values
y = boston_df.iloc[:,-1].values
X.shape

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y , test_size=0.4 , random_state=85)
X_train.shape

In [None]:
X_train

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
lm_boston = LinearRegression()

In [None]:
lm_boston.fit(X_train,y_train)

In [None]:
lm_boston.intercept_

In [None]:
lm_boston.coef_

In [None]:
#  TODO dataframe with column name COEFF and index's name features and value of coef per row

## 4- Prediction and Evaluation

In [None]:
prediction = lm_boston.predict(X_test)

In [None]:
sns.regplot(y_test,prediction)

In [None]:
plt.hist(y_test-prediction)

In [None]:
np.corrcoef(y_test, prediction)

In [None]:
sns.distplot(y_test-prediction)

In [None]:
from sklearn import metrics

In [None]:
y_test.mean()

In [None]:
 MSE = metrics.mean_squared_error(y_test,prediction)

In [None]:
import math

In [None]:
math.sqrt(MSE)