In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn as skt

%matplotlib inline

In [2]:
housing = pd.read_csv("housingdata.csv")

In [3]:
housing.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


In [4]:
housing.info() #shows the no. of entries
#506 rows, 14 columns

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   CRIM     506 non-null    float64
 1   ZN       506 non-null    float64
 2   INDUS    506 non-null    float64
 3   CHAS     506 non-null    int64  
 4   NOX      506 non-null    float64
 5   RM       506 non-null    float64
 6   AGE      506 non-null    float64
 7   DIS      506 non-null    float64
 8   RAD      506 non-null    int64  
 9   TAX      506 non-null    int64  
 10  PTRATIO  506 non-null    float64
 11  B        506 non-null    float64
 12  LSTAT    506 non-null    float64
 13  MEDV     506 non-null    float64
dtypes: float64(11), int64(3)
memory usage: 55.5 KB


In [5]:
housing['CHAS'].value_counts() #gives the frequencies of values of a feature

0    471
1     35
Name: CHAS, dtype: int64

In [6]:
housing.describe() 

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
count,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0
mean,3.613524,11.363636,11.136779,0.06917,0.554695,6.284634,68.574901,3.795043,9.549407,408.237154,18.455534,356.674032,12.653063,22.532806
std,8.601545,23.322453,6.860353,0.253994,0.115878,0.702617,28.148861,2.10571,8.707259,168.537116,2.164946,91.294864,7.141062,9.197104
min,0.00632,0.0,0.46,0.0,0.385,3.561,2.9,1.1296,1.0,187.0,12.6,0.32,1.73,5.0
25%,0.082045,0.0,5.19,0.0,0.449,5.8855,45.025,2.100175,4.0,279.0,17.4,375.3775,6.95,17.025
50%,0.25651,0.0,9.69,0.0,0.538,6.2085,77.5,3.20745,5.0,330.0,19.05,391.44,11.36,21.2
75%,3.677083,12.5,18.1,0.0,0.624,6.6235,94.075,5.188425,24.0,666.0,20.2,396.225,16.955,25.0
max,88.9762,100.0,27.74,1.0,0.871,8.78,100.0,12.1265,24.0,711.0,22.0,396.9,37.97,50.0


In [7]:
#housing.hist(bins = 50, figsize = (20, 15))

## Train Test Splitting

In [8]:
from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(housing, test_size = 0.2, random_state = 42)
print(f"Train Set: {len(train_set)}\nTest Set: {len(test_set)}")

Train Set: 404
Test Set: 102


In [9]:
from sklearn.model_selection import StratifiedShuffleSplit
split = StratifiedShuffleSplit(n_splits = 1, train_size = 0.8, random_state = 42)
for train_index, test_index in split.split(housing, housing['CHAS']):
    split_train_set = housing.loc[train_index]
    split_test_set = housing.loc[test_index]

In [10]:
split_test_set['CHAS'].value_counts()

0    95
1     7
Name: CHAS, dtype: int64

In [11]:
housing = split_train_set.copy()
housing_test = split_test_set.copy()

## Looking for Correlations

In [12]:
corr_matrix = housing.corr()
corr_matrix['MEDV'].sort_values(ascending = False)

MEDV       1.000000
RM         0.679894
B          0.361761
ZN         0.339741
DIS        0.240451
CHAS       0.205066
AGE       -0.364596
RAD       -0.374693
CRIM      -0.393715
NOX       -0.422873
TAX       -0.456657
INDUS     -0.473516
PTRATIO   -0.493534
LSTAT     -0.740494
Name: MEDV, dtype: float64

# TAXRM ATTRIBUTE

In [13]:
#housing["TAXRM"] = housing['TAX']/housing['RM']

In [14]:
housing.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
254,0.04819,80.0,3.64,0,0.392,6.108,32.0,9.2203,1,315,16.4,392.89,6.57,21.9
348,0.01501,80.0,2.01,0,0.435,6.635,29.7,8.344,4,280,17.0,390.94,5.99,24.5
476,4.87141,0.0,18.1,0,0.614,6.484,93.6,2.3053,24,666,20.2,396.21,18.68,16.7
321,0.18159,0.0,7.38,0,0.493,6.376,54.3,4.5404,5,287,19.6,396.9,6.87,23.1
326,0.30347,0.0,7.38,0,0.493,6.312,28.9,5.4159,5,287,19.6,396.9,6.15,23.0


In [15]:
housing = split_train_set.drop('MEDV', axis = 1)
housing_labels = split_train_set['MEDV'].copy()

## Missing Attributes

In [16]:
#To take care of missing attributes,
    #1. Get rid of missing data points
    #2. Get rid of whole attribute
    #3. Set the value to -> 0, mean, median

In [17]:
#Option1 - data_name.dropna(subset = [Feature])
#Option2 - data_name.drop(Feature, axis = 1{axis = 1 to remove columns})
#Option3 - data_name[Feature].median()[or]mean()
           #To fill empty cells, data_name[Feature].fillna(value{median, mean, 0})

In [18]:
#For Option3, to add median in every empty cell in every column
##from sklearn.impute import SimpleImputer
##imputer = SimpleImputer(strategy = "median")
##imputer.fit(housing)

## Scikit - Learn Design

Three types of objects
1. Estimators
2. Transformers
3. Predictors

## Making Pipeline

In [19]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
my_pipeline = Pipeline([('imputer', SimpleImputer(strategy = "median")), ('std_scaler', StandardScaler()),])

In [20]:
housing_tr_trans = my_pipeline.fit_transform(housing)

In [21]:
housing_tr_trans.shape

(404, 13)

# Model

In [22]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
#model = LinearRegression()
#model = DecisionTreeRegressor()
model = RandomForestRegressor()
model.fit(housing_tr_trans, housing_labels)

## Evaluating the model

In [23]:
from sklearn.metrics import mean_squared_error
housing_predict = model.predict(housing_tr_trans)
mse = mean_squared_error(housing_labels, housing_predict)
rmse = np.sqrt(mse)

In [24]:
rmse

1.2317494722451625

In [25]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(model, housing_tr_trans, housing_labels, scoring = "neg_mean_squared_error")
rmse_scores = np.sqrt(-scores)

In [26]:
rmse_scores

array([2.88875551, 3.5483514 , 3.05697482, 4.1095744 , 3.18727542])

In [27]:
def print_scores(scores):
    print("Scores: ", scores)
    print("Mean: ", scores.mean())
    print("Std Dev: ", scores.std())

In [28]:
print_scores(rmse_scores)

Scores:  [2.88875551 3.5483514  3.05697482 4.1095744  3.18727542]
Mean:  3.358186311749539
Std Dev:  0.43383199601137107


## Testing

In [31]:
X_test = split_test_set.drop("MEDV", axis = 1)
Y_test = split_test_set["MEDV"].copy()
X_test_prepared = my_pipeline.transform(X_test)
final_predict = model.predict(X_test_prepared)
final_mse = mean_squared_error(Y_test, final_predict)
final_rmse = np.sqrt(final_mse)
#print(final_predict, list(Y_test))
final_rmse

2.940666407732884