In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the raw housing table from a CSV path resolved against the working directory.
data = pd.read_csv(filepath_or_buffer="housing.csv")

In [None]:
# Display the freshly loaded frame to eyeball its columns and sample values.
data

In [None]:
# Summarize column dtypes and non-null counts to spot columns with missing values.
data.info()

In [None]:
# Drop every row that contains at least one missing (null) value.
# Rebinding the name instead of using inplace=True avoids mutating the frame
# that earlier cells already displayed and keeps the step chain-friendly.
data = data.dropna()

In [None]:
# Re-check the summary after the rows with missing values were dropped.
data.info()

In [None]:
from sklearn.model_selection import train_test_split

# Separate the prediction target from the remaining feature columns.
y = data['median_house_value']
X = data.drop(columns=['median_house_value'])

In [None]:
# Hold out 20% of the rows for testing. A fixed random_state makes the split,
# and every score computed from it, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Recombine the training features and target (matched on the row index)
# into a single frame that is convenient for exploration.
train_data = X_train.join(other=y_train, how="left")

In [None]:
# Display the combined training frame (features plus target).
train_data

In [None]:
# Plot a histogram per column to inspect the raw feature distributions.
train_data.hist(figsize=(15,8))

In [None]:
# Correlation heatmap restricted to the numeric columns; any non-numeric
# column is excluded because it cannot take part in corr().
numeric_columns = train_data.select_dtypes(include="number")
plt.figure(figsize=(15, 8))
sns.heatmap(data=numeric_columns.corr(), cmap="YlGnBu", annot=True)

In [None]:
# Replace each count-like column with log(x + 1); the +1 keeps a zero count finite.
for count_col in ['total_rooms', 'total_bedrooms', 'population', 'households']:
    train_data[count_col] = np.log(train_data[count_col] + 1)

In [None]:
# Re-plot the histograms now that the count columns have been log-transformed,
# to compare against the distributions plotted before the transform.
train_data.hist(figsize=(15,8))

In [None]:
# Count how many training rows fall into each ocean_proximity category.
train_data['ocean_proximity'].value_counts()

In [None]:
# One-hot encode ocean_proximity: build one indicator column per category,
# remove the original column, and append the indicators at the end.
proximity_dummies = pd.get_dummies(train_data['ocean_proximity'])
train_data = train_data.drop(columns=['ocean_proximity']).join(proximity_dummies)

In [None]:
# Display the training frame after one-hot encoding to verify the new indicator columns.
train_data

In [None]:
# Correlation heatmap over every column, now including the one-hot
# proximity indicators created from ocean_proximity.
plt.figure(figsize=(15, 8))
sns.heatmap(data=train_data.corr(), cmap="YlGnBu", annot=True)

In [None]:
# Scatter the districts geographically, colored by median house value.
# Longitude goes on the x-axis and latitude on the y-axis so the plot is
# oriented like a conventional map (north up, west on the left). The previous
# call had the two axes swapped, which transposed the map of California.
plt.figure(figsize=(15, 8))
sns.scatterplot(x="longitude", y="latitude", data=train_data, hue="median_house_value", palette="coolwarm")

In [None]:
# Derive two ratio features. The count columns were log-transformed in an
# earlier cell, so these are ratios of the log-scaled values.
train_data = train_data.assign(
    bedroom_ratio=train_data['total_bedrooms'] / train_data['total_rooms'],
    household_rooms=train_data['total_rooms'] / train_data['households'],
)

In [None]:
# Correlation heatmap again, this time with the bedroom_ratio and
# household_rooms features included.
plt.figure(figsize=(15, 8))
sns.heatmap(data=train_data.corr(), cmap="YlGnBu", annot=True)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

# Split the engineered training frame back into target and features.
y_train = train_data['median_house_value']
X_train = train_data.drop(columns=['median_house_value'])

# Standardize the features; the fitted scaler is reused later on the test split.
scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)

# Fit a plain linear regression on the scaled features as a baseline model.
reg = LinearRegression()
reg.fit(X_train_s, y_train)

In [None]:
# Rebuild the held-out frame and push it through the same preprocessing steps
# that were applied to the training data.
test_data = X_test.join(y_test)

# Same log(x + 1) transform that was applied to the training count columns.
for count_col in ['total_rooms', 'total_bedrooms', 'population', 'households']:
    test_data[count_col] = np.log(test_data[count_col] + 1)

# One-hot encode the proximity category.
test_data = test_data.join(pd.get_dummies(test_data.ocean_proximity)).drop(['ocean_proximity'], axis=1)

# Same ratio features as on the training frame.
test_data['bedroom_ratio'] = test_data['total_bedrooms'] / test_data['total_rooms']
test_data['household_rooms'] = test_data['total_rooms'] / test_data['households']

# get_dummies only creates columns for categories present in *this* split, so a
# rare category can be missing here (or columns can come out in a different
# order) depending on the random split. Align to the training frame's columns
# so the scaler and models fitted on the training data receive exactly the
# same feature layout; an absent indicator column is filled with 0.
test_data = test_data.reindex(columns=train_data.columns, fill_value=0)

In [None]:
# Split the preprocessed test frame into target and features.
y_test = test_data['median_house_value']
X_test = test_data.drop(columns=['median_house_value'])

In [None]:
# Scale the test features with the scaler that was fitted on the training
# features (transform only, so the scaler is not refitted on the test split).
X_test_s = scaler.transform(X_test)

In [None]:
# Evaluate the linear model on the held-out split; score() on a scikit-learn
# regressor reports the R^2 coefficient of determination.
reg.score(X_test_s, y_test)

In [None]:
#trying second model
from sklearn.ensemble import RandomForestRegressor

# Random forest with default hyperparameters. Seeding the estimator makes the
# fitted trees, and therefore the reported score, repeatable between runs.
forest = RandomForestRegressor(random_state=42)

forest.fit(X_train_s, y_train)

In [None]:
# Evaluate the random forest on the same held-out split used for the linear model.
forest.score(X_test_s, y_test)

In [None]:
from sklearn.model_selection import GridSearchCV

# Base estimator for the search, seeded so that every candidate is evaluated on
# reproducible forests and the selected hyperparameters are stable across runs.
# NOTE: this rebinds `forest` to a new, unfitted estimator; GridSearchCV fits
# clones of it, so use grid_search.best_estimator_ for predictions afterwards.
forest = RandomForestRegressor(random_state=42)

# 3 x 2 x 3 = 18 hyperparameter combinations.
param_grid = {
    "n_estimators": [100, 200, 300],
    "min_samples_split": [2, 4],
    "max_depth": [None, 4, 8]
}

# 5-fold cross-validated search ranked by negative MSE (higher is better).
# n_jobs=-1 spreads the independent fits over all CPU cores; it only changes
# wall-clock time, not the results.
grid_search = GridSearchCV(
    forest,
    param_grid,
    cv=5,
    scoring="neg_mean_squared_error",
    return_train_score=True,
    n_jobs=-1,
)

grid_search.fit(X_train_s, y_train)

In [None]:
# Show the estimator configured with the best hyperparameter combination found by the search.
grid_search.best_estimator_

In [None]:
# Score the tuned forest on the held-out split. Note that the grid search
# ranked candidates by negative MSE, while score() on a regressor reports R^2.
grid_search.best_estimator_.score(X_test_s,y_test)