### Import Necessary Libraries and Initialize the Random Seed

In [2]:
from sklearn.model_selection import GroupKFold, cross_val_predict, cross_val_score
from sklearn.cluster import KMeans
from sklearn.linear_model import LinearRegression
import pandas
import numpy
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
# Seed NumPy's legacy global RNG so NumPy-based randomness is repeatable.
# The seed function has to be *called*: assigning an int to
# `numpy.random.seed` replaces the function and leaves the RNG unseeded.
numpy.random.seed(41)

### Load the Dataframe, Normalize, and Perform Label Preprocessing

In [3]:
# Load the housing table, integer-encode the categorical column, remove rows
# with a missing `total_bedrooms`, and min-max scale every column.
encoder = LabelEncoder()
scaler = MinMaxScaler()

HousingData = pandas.read_csv("../Datasets/housing.csv")
HousingData['ocean_proximity'] = encoder.fit_transform(HousingData['ocean_proximity'])
# Non-in-place drop of the rows whose total_bedrooms is NaN.
HousingData = HousingData.dropna(subset=['total_bedrooms'])

# MinMaxScaler scales each feature (column) independently, so one fit on the
# whole frame yields the same values as fitting column by column. It also
# leaves `scaler` fitted to *all* columns, so it can inverse-transform the
# scaled frame later. The result gets a fresh 0..n-1 index.
ScaledHousingData = pandas.DataFrame(
    scaler.fit_transform(HousingData),
    columns=HousingData.columns,
)


### Prepare Data for CV

In [20]:
# Target vector: the scaled median house value.
Y = ScaledHousingData['median_house_value'].values
# Feature matrix: every scaled column except the target and the encoded
# ocean_proximity category.
X = ScaledHousingData.drop(columns=['median_house_value', 'ocean_proximity']).values

# Group labels for grouped cross-validation: KMeans cluster ids computed on
# the full scaled frame (all columns, including the target).
group_model = KMeans(n_clusters=100, random_state=41)
group_model.fit(ScaledHousingData)
classes = group_model.labels_
classes


array([78, 80, 29, ..., 67, 67, 67], dtype=int32)

### Perform CV With 4 Folds and Get Average Score

In [21]:
# Estimator and grouped splitter; `classes` supplies the group label of each row.
model = LinearRegression()
gkf = GroupKFold(n_splits=4)

# Materialise the folds once so every scoring call below reuses the exact
# same (train, test) index pairs.
train_indices, test_indices = [], []
for fold_train, fold_test in gkf.split(X, Y, classes):
    train_indices.append(fold_train)
    test_indices.append(fold_test)
housing_cv = list(zip(train_indices, test_indices))

# Out-of-fold predictions and per-fold scores on the shared folds.
predictions = cross_val_predict(model, X, Y, cv=housing_cv)
r2 = cross_val_score(model, X, Y, cv=housing_cv, scoring='r2')
# The scorer reports negated RMSE; flip the sign to get positive errors.
RMSE = -cross_val_score(model, X, Y, cv=housing_cv, scoring='neg_root_mean_squared_error')

print("Fold RMSE Scores: ", RMSE)
print("Average RMSE Score: ", numpy.average(RMSE))


Fold RMSE Scores:  [0.15004119 0.13804321 0.14880209 0.14427074]
Average RMSE Score:  0.14528930585064764


In [22]:
# Report the per-fold R^2 values followed by their mean.
mean_r2 = numpy.average(r2)
print("Fold R^2 Scores: ", r2)
print("Average R^2 Score: ", mean_r2)

Fold R^2 Scores:  [0.60554051 0.64654624 0.58959232 0.65886563]
Average R^2 Score:  0.6251361739024704


In [23]:
%matplotlib qt
imp_features = ['median_income', 'longitude', 'latitude']
# fig = plt.figure(figsize=(10, 20))
# plot_3d = fig.add_subplot(projection='3d')
# xs = ScaledHousingData[imp_features[0]].head(100)
ys = ScaledHousingData[imp_features[1]].head(10000)
zs = ScaledHousingData[imp_features[2]].head(10000)
plt.scatter(ys, zs, c=classes[0:10000], cmap='Paired')

plt.show()