In [1]:
import sklearn
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import numpy as np
import pandas as pd
from pandas.plotting import scatter_matrix
import matplotlib.pyplot as plt

In [2]:
# Load the dataset; the path is relative to the current working directory.
housing = pd.read_csv("Housing.csv")

In [3]:
# Preview the first rows to check which columns were loaded.
housing.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,1,0,0,0,1,2,1,1.0
1,12250000,8960,4,4,4,1,0,0,0,1,3,0,1.0
2,12250000,9960,3,2,2,1,0,1,0,0,2,1,0.5
3,12215000,7500,4,2,2,1,0,1,0,1,3,1,1.0
4,11410000,7420,4,1,2,1,1,1,0,1,2,0,1.0


In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) per numeric column.
housing.describe()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
count,545.0,545.0,545.0,545.0,545.0,545.0,545.0,545.0,545.0,545.0,545.0,545.0,545.0
mean,4766729.0,5150.541284,2.965138,1.286239,1.805505,0.858716,0.177982,0.350459,0.045872,0.315596,0.693578,0.234862,0.465138
std,1870440.0,2170.141023,0.738064,0.50247,0.867492,0.348635,0.382849,0.477552,0.209399,0.46518,0.861586,0.424302,0.380686
min,1750000.0,1650.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,3430000.0,3600.0,2.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,4340000.0,4600.0,3.0,1.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5
75%,5740000.0,6360.0,3.0,2.0,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0
max,13300000.0,16200.0,6.0,4.0,4.0,1.0,1.0,1.0,1.0,1.0,3.0,1.0,1.0


In [5]:
#Train-Test Splitting

In [6]:
# Hand-rolled shuffle-and-slice split, disabled by wrapping it in a string
# literal: nothing in it runs, and the cell merely echoes the string.
# train_set / test_set are produced by train_test_split in the next cell instead.
'''def split_train_test(data, test_ratio):
    np.random.seed(42)
    shuffled = np.random.permutation(len(data))
    test_set_size = int(len(data) * test_ratio)
    test_indices = shuffled[:test_set_size]
    train_indices = shuffled[test_set_size:]
    
    return data.iloc[train_indices], data.iloc[test_indices]

train_set, test_set = split_train_test(housing, 0.2) '''

'def split_train_test(data, test_ratio):\n    np.random.seed(42)\n    shuffled = np.random.permutation(len(data))\n    test_set_size = int(len(data) * test_ratio)\n    test_indices = shuffled[:test_set_size]\n    train_indices = shuffled[test_set_size:]\n    \n    return data.iloc[train_indices], data.iloc[test_indices]\n\ntrain_set, test_set = split_train_test(housing, 0.2) '

In [7]:
# Purely random 80/20 partition; the fixed seed makes it repeatable.
train_set, test_set = train_test_split(housing, random_state=42, test_size=0.2)

# Report how many rows ended up on each side of the split.
n_train, n_test = len(train_set), len(test_set)
print(f"Rows in train set: {n_train}\nRows in test set: {n_test}")

Rows in train set: 436
Rows in test set: 109


In [8]:
# Stratify on 'hotwaterheating' so its rare positive class keeps the same
# proportion in the train and test partitions.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# n_splits=1, so take the single (train, test) pair directly.
train_index, test_index = next(split.split(housing, housing['hotwaterheating']))

# split() yields positional row numbers, so rows must be picked with .iloc.
# Label-based .loc only gives the same rows while the frame still has its
# default 0..n-1 index, and fails or mis-selects otherwise.
strat_train_set = housing.iloc[train_index]
strat_test_set = housing.iloc[test_index]

In [9]:
# Class balance of the stratification column within the training partition.
strat_train_set['hotwaterheating'].value_counts()

hotwaterheating
0    416
1     20
Name: count, dtype: int64

In [10]:
## Looking for correlations
# corr_matrix = housing.corr()
# corr_matrix['price'].sort_values(ascending=False)
# attributes = ["area", "price", "basement", "bedrooms"]
# scatter_matrix(housing[attributes], figsize=(12,8))
# housing.plot(kind="scatter", x="area", y="price", alpha=0.8)

In [11]:
# Split the training partition into the regression target and its predictors.
# NOTE(review): the target chosen here is "area", while the commented-out
# correlation exploration ranks features against "price" -- confirm which
# column is actually meant to be predicted.
# From this cell on, `housing` no longer names the full dataset but the
# training features only.
housing_labels = strat_train_set["area"].copy()
housing = strat_train_set.drop(columns="area")

In [12]:
# Fill missing values with each column's median, then wrap the resulting
# array back into a labelled DataFrame.
imputer = SimpleImputer(strategy='median')
X = imputer.fit_transform(housing)
# Carry over the original row index: without it the frame is renumbered
# 0..n-1 and is no longer label-aligned with housing_labels.
housing_tr = pd.DataFrame(X, columns=housing.columns, index=housing.index)

# Reusable preprocessing for training and new data: median imputation
# followed by standardisation.
# NOTE(review): housing_tr is already imputed, so when the pipeline is fitted
# on it the imputer step has nothing left to fill; it only learns the medians
# used by later transform() calls.
my_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('std_scaler', StandardScaler()),
])

In [13]:
# Fit the preprocessing pipeline on the training features and keep the
# transformed result; this is what the model is trained on.
housing_num_tr = my_pipeline.fit_transform(housing_tr)

In [14]:
# Sanity check: (rows, feature columns) after preprocessing.
housing_num_tr.shape

(436, 12)

In [15]:
# Alternative estimators that can be swapped in for comparison:
# model = DecisionTreeRegressor()
# model = LinearRegression()
# Seed the forest (same seed as the data splits) so the fitted model, its
# predictions and the cross-validation scores are repeatable after a
# kernel restart; an unseeded forest gives different numbers on every run.
model = RandomForestRegressor(random_state=42)
model.fit(housing_num_tr, housing_labels)

In [16]:
# First five training rows (unprocessed features) used as a quick spot check.
some_data = housing.head(5)

In [17]:
# Target values belonging to the same five spot-check rows.
some_labels = housing_labels.head(5)

In [18]:
# Apply the already-fitted preprocessing (transform only, no refit) to the
# spot-check rows.
prepared_data = my_pipeline.transform(some_data)

In [19]:
# Model predictions for the spot-check rows.
model.predict(prepared_data)

array([3546.77, 3180.51, 7832.8 , 6408.07, 4805.71])

In [20]:
# Actual target values for the same rows, to compare with the predictions.
list(some_labels)

[3120, 3000, 8880, 5960, 4600]

In [21]:
# Error measured on the very rows the model was fitted on, so this is an
# optimistic figure compared with the cross-validated error.
housing_predictions = model.predict(housing_num_tr)
mse = mean_squared_error(y_true=housing_labels, y_pred=housing_predictions)
# Root of the MSE puts the error back in the units of the target.
rmse = np.sqrt(mse)

In [22]:
# Root-mean-squared error on the training data.
rmse

729.3887833034026

In [23]:
# Cross-validate the model. The scorer returns the *negated* MSE (so that
# larger is better), hence the sign flip before taking the square root.
scores = cross_val_score(
    estimator=model,
    X=housing_num_tr,
    y=housing_labels,
    scoring="neg_mean_squared_error",
)
rmse_scores = np.sqrt(np.negative(scores))

In [24]:
# Per-fold RMSE values.
list(rmse_scores)

[1954.4052570926087,
 1916.5212095577097,
 1863.1367023057303,
 1920.073492723957,
 2120.6183134645307]

In [25]:
def print_scores(scores):
    """Print a set of scores together with their mean and standard deviation.

    Parameters
    ----------
    scores : array-like
        Typically the NumPy array of per-fold scores. Objects that already
        provide ``mean()``/``std()`` are used as they are; plain sequences
        such as lists or tuples are converted to an array for the statistics
        only, and are still printed in their original form.

    Returns
    -------
    None
        The three lines are written to standard output.
    """
    # Lists and tuples have no .mean()/.std(); convert those, but leave
    # array-likes alone so their own statistics are reported unchanged.
    stats = scores if hasattr(scores, "mean") and hasattr(scores, "std") else np.asarray(scores)
    print("Scores: ", scores)
    print("Mean: ", stats.mean())
    print("Standard deviations: ", stats.std())

In [26]:
# Summarise the cross-validated RMSE: individual folds, mean and spread.
print_scores(rmse_scores)

Scores:  [1954.40525709 1916.52120956 1863.13670231 1920.07349272 2120.61831346]
Mean:  1954.9509950289073
Standard deviations:  87.8283680301492
