## Machine learning notebook

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

#Data prep
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

from DataPrepUtils import CombinedAttributesAdder as CAA

#Models
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor


#Evaluation
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score


In [None]:
def display_scores(scores):
    """Print a score array followed by its mean and its standard deviation.

    Parameters
    ----------
    scores : array-like exposing ``.mean()`` and ``.std()``
        The scores to summarize (e.g. a NumPy array of RMSE values).
    """
    # Each entry is evaluated only when its line is printed, one line per entry.
    summary = {
        "Scores:": lambda s: s,
        "Mean:": lambda s: s.mean(),
        "Standard deviation:": lambda s: s.std(),
    }
    for label, compute in summary.items():
        print(label, compute(scores))

## Read in data

In [None]:
""" 
Pandas here is a bit like Xarray, i.e., it lets us read in data
and manipulate it with statements like data['XYZ'] etc.
"""

# Location of the housing CSV, relative to this notebook.
file='../Datasets/housing/housing.csv'

housing = pd.read_csv( file )

# Keep an independent snapshot of the raw data. A plain assignment would only
# create a second name for the same DataFrame, so any column added to
# 'housing' afterwards would also show up in the "original" frame.
housing_original_DF = housing.copy()

print("got housing data")



## setting up a new category based on income

In [None]:
""" 
Set up new income category 'income_cat' as a new key of 'housing'.
See, just like xarray dataset.
Note: median_income is reported as a numerical value [0,14] i.e. 
units of 10K$. Hence the odd bin edges for 'bins'.
"""
# Bin median income into five ordered categories (labels 1..5) so that the
# train/test split below can be stratified on income.
housing["income_cat"] = pd.cut( housing["median_income"],
                                bins=[0., 1.5, 3.0, 4.5, 6.0, np.inf],
                                labels=[1, 2, 3, 4, 5] )

# Naive random split
train_set_1, test_set_1 = train_test_split( housing, test_size=0.2, random_state=42 )

# Better splitting that recognizes the need to preserve income stats
split = StratifiedShuffleSplit( n_splits=1, test_size=0.2, random_state=42 )

# split() yields *positional* row numbers, so rows are selected with .iloc
# (label-based .loc only gives the same rows while the index happens to be
# the default 0..n-1 range). With n_splits=1 there is exactly one pair.
train_index, test_index = next( split.split( housing, housing["income_cat"] ) )
strat_train_set = housing.iloc[train_index]
strat_test_set  = housing.iloc[test_index]

    



In [None]:
#print(np.shape(housing))
# Bin edges 0.5, 1.5, ..., 5.5: one bin centred on each category label 1..5.
bins=np.arange(6)*1.0+.5

# Fraction of all rows that falls into each income category.
d0=housing["income_cat"]
hd0=np.histogram(d0,bins=bins)
print(hd0[0]/len(d0))

# DataFrame.hist() opens its own figure, so no plt.figure() call is made
# beforehand; doing so only leaves an extra, empty figure in the output.
housing.hist(figsize=(9,9))
plt.suptitle("Full data",fontsize=24)
plt.show()

strat_train_set.hist(figsize=(9,9))
plt.suptitle("Training data",fontsize=24)
plt.show()

## Replace the 'housing' DataFrame with the training data

In [None]:

# Learning target for the training rows, as an independent pandas Series.
# (The ML book calls this 'housing_labels', a name borrowed from
# classification problems where the target is a 'label'.)
housing_median_value = strat_train_set["median_house_value"].copy()

# From here on 'housing' is the training set without the learning target.
housing = strat_train_set.drop( columns="median_house_value" )

# Same frame without the categorical variable 'ocean_proximity'.
housing_num = housing.drop( columns="ocean_proximity" )

In [None]:
# Column labels of the stratified training set.
strat_train_set.columns

In [None]:
# Column labels of 'housing', for comparison with strat_train_set.columns.
housing.columns

### These Pandas objects hide a lot of information.  
### 'housing_median_value' is a 'Series', which has an index field and a values field.  
### So, for example,

`plt.plot(housing_median_value)`

### is equivalent to

`plt.plot(housing_median_value.index, housing_median_value.values)`

In [None]:
# Draw the target twice: once from explicit index/values arrays ('x' markers)
# and once straight from the training-set column ('.' markers).
target_index = housing_median_value.index
plt.plot(target_index, housing_median_value.values, 'x')
plt.plot(strat_train_set['median_house_value'], '.')
print(target_index.shape)

## Fancy plot

In [None]:
# Geographic scatter plot: marker size is population/100 and marker colour
# follows 'median_income' on the "jet" colormap.
scatter_options = dict(
    kind="scatter", x="longitude", y="latitude", alpha=0.4,
    s=housing["population"]/100, label="population", figsize=(10,7),
    c="median_income", cmap=plt.get_cmap("jet"), colorbar=True,
    sharex=False,
)
housing.plot(**scatter_options)
plt.legend()

#save_fig("housing_prices_scatterplot")


In [None]:
# Single-column DataFrame holding the categorical attribute.
housing_cat = housing[["ocean_proximity"]]
# Only the last expression of a cell is rendered automatically, so the
# preview has to be displayed explicitly or its result is discarded.
display(housing_cat.head(10))
housing_cat.columns

In [None]:
#import importlib
#importlib.reload(CAA)

# Fill missing entries of housing_num with the per-column median, learning
# the medians and applying them in a single call.
imputer = SimpleImputer( strategy="median" )
X = imputer.fit_transform( housing_num )  # X is an np ndarray


#attr_adder = CAA(add_bedrooms_per_room=False)
#housing_extra_attribs_values = attr_adder.transform(X)

In [None]:
# (rows, columns) of the imputed array.
print(X.shape)
#print(X[0:10,:])
#print(np.shape(housing_extra_attribs_values))

In [None]:

# Preprocessing steps, applied in order: median imputation, derived
# attributes, then standardization.
num_steps = [
    ('imputer', SimpleImputer(strategy="median")),
    ('attribs_adder', CAA(add_bedrooms_per_room=True)),
    ('std_scaler', StandardScaler()),
]
num_pipeline = Pipeline(num_steps)

housing_num_tr = num_pipeline.fit_transform(housing_num)

# NOTE(review): assumes CAA appends exactly these three columns, in this
# order, after the original ones -- confirm against DataPrepUtils.
extra_columns = ["rooms_per_household", "population_per_household","bedrooms_per_room"]
housing_num_tr_DF = pd.DataFrame(
    housing_num_tr,
    columns=[*housing_num.columns, *extra_columns],
    index=housing.index)
housing_num_tr_DF.head()

In [None]:
# (rows, columns) of the transformed numerical array.
print(housing_num_tr.shape)

In [None]:
# Route columns by type: the categorical column is one-hot encoded, every
# column of housing_num goes through the numerical pipeline.
cat_attribs = ["ocean_proximity"]
num_attribs = housing_num.columns.tolist()

full_pipeline = ColumnTransformer(transformers=[
        ("num", num_pipeline, num_attribs),
        ("cat", OneHotEncoder(), cat_attribs),
    ])

housing_prepared = full_pipeline.fit_transform(housing)

In [None]:
# Array view of the untransformed numerical training data.
XX = np.asarray(housing_num)
# (rows, columns) of the fully prepared training matrix.
print(housing_prepared.shape)

In [None]:
# Look the column position up by name instead of hard-coding it, so the
# panels keep showing 'median_income' if the column layout ever changes.
# NOTE(review): reusing this position for housing_num_tr / housing_prepared
# assumes the numerical columns come first and keep their order there.
income_col = housing_num.columns.get_loc('median_income')

fig=plt.figure(figsize=(20,9))

# Four views of 'median_income': raw, after the numerical pipeline (array
# and DataFrame form), and inside the fully prepared matrix.
ax=fig.add_subplot(2,2,1)
p1=ax.plot(housing_num['median_income'],'x')
ax.set_title("median_income: housing_num (raw)")
ax=fig.add_subplot(2,2,2)
p2=ax.plot(housing_num_tr[:,income_col])
ax.set_title("median_income: housing_num_tr (array)")
ax=fig.add_subplot(2,2,3)
p3=ax.plot(housing_num_tr_DF['median_income'],'o')
ax.set_title("median_income: housing_num_tr_DF")
ax=fig.add_subplot(2,2,4)
p4=ax.plot(housing_prepared[:,income_col])
ax.set_title("median_income: housing_prepared")

# Distribution and summary statistics of the raw (unscaled) values.
his=np.histogram(XX[:,income_col])
print(np.mean(XX[:,income_col]))
print(np.std(XX[:,income_col]))


In [None]:
# np.histogram returns (counts, bin_edges); the counts are plotted against
# the upper edge of each bin.
counts, edges = his
plt.plot(edges[1:], counts)

Now test out some ML models ...

In [None]:
# Random forest of 100 trees with a fixed seed, fitted on the prepared
# training matrix against the median house values.
forest_reg = RandomForestRegressor(random_state=42, n_estimators=100)
forest_reg.fit(X=housing_prepared, y=housing_median_value)

In [None]:
# Predict on the same prepared matrix the forest was fitted on, i.e. these
# are in-sample (training-set) predictions.
housing_prediction=forest_reg.predict(housing_prepared)

In [None]:
# Shape of the target Series.
housing_median_value.shape

In [None]:
# Predicted (x) against actual (y) values for the random forest; the axes
# are labelled so the figure can be read on its own.
plt.scatter(housing_prediction,housing_median_value)
plt.xlabel("Predicted median house value (random forest)")
plt.ylabel("Actual median house value")
plt.show()

In [None]:
# scikit-learn metrics take (y_true, y_pred) in that order. MSE itself is
# symmetric, so the value is unchanged, but the documented order keeps the
# call correct if the metric is ever swapped for an asymmetric one.
error=mean_squared_error(housing_median_value, housing_prediction)

In [None]:
# Square root of the mean squared error, i.e. the RMSE.
print(np.sqrt(error))

In [None]:
#This cell takes a while to finish

# 10-fold cross-validation of the forest. The scorer returns *negative* MSE,
# so the sign is flipped before taking the square root to obtain RMSE.
scores = cross_val_score(forest_reg, housing_prepared, housing_median_value,
                         scoring="neg_mean_squared_error", cv=10)
forest_rmse_scores = np.sqrt(-scores)
# Use the notebook's own helper so the mean and standard deviation of the
# fold scores are reported, not just the raw array.
display_scores(forest_rmse_scores)

In [None]:
# Single decision tree with a fixed seed; max_depth=None places no limit on
# the depth. Fitted and evaluated on the same prepared training matrix, so
# the predictions are in-sample.
tree_reg = DecisionTreeRegressor(random_state=42,max_depth=None)
tree_reg.fit(housing_prepared, housing_median_value)
housing_prediction_tree = tree_reg.predict(housing_prepared)


In [None]:
# Predicted (x) against actual (y) values for the decision tree; the axes
# are labelled so the figure can be read on its own.
plt.scatter(housing_prediction_tree,housing_median_value)
plt.xlabel("Predicted median house value (decision tree)")
plt.ylabel("Actual median house value")
# Depth the fitted tree actually reached.
print(tree_reg.get_depth())
