In [None]:
# Importing necessary libraries
import numpy as np                  # For numerical computations
import pandas as pd                 # For data manipulation using dataframes
import matplotlib.pyplot as plt     # For data visualization
import seaborn as sns               # For data visualization
from sklearn.datasets import fetch_california_housing
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:

# Load the California housing dataset bundled with scikit-learn and assemble one
# DataFrame: the feature columns named by `data.feature_names` plus the regression
# target stored under `MedHouseVal`.
# (`fetch_california_housing` is imported in the notebook's top import cell.)
data = fetch_california_housing()
housing_df = pd.DataFrame(data.data, columns=data.feature_names).assign(
    MedHouseVal=data.target
)



In [None]:
# Preview the assembled frame; pandas elides the middle rows of a long frame when rendering it
housing_df

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422
...,...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09,0.781
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21,0.771
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22,0.923
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32,0.847


In [None]:
# Peek at the top five records to sanity-check column names and value scales
housing_df.head(n=5)

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [None]:
# Dimensions of the data frame as a (n_rows, n_columns) tuple
housing_df.shape

(20640, 9)

In [None]:
# Column dtypes and non-null counts; used here to confirm every column is numeric
# and that no column has missing entries.
housing_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   MedInc       20640 non-null  float64
 1   HouseAge     20640 non-null  float64
 2   AveRooms     20640 non-null  float64
 3   AveBedrms    20640 non-null  float64
 4   Population   20640 non-null  float64
 5   AveOccup     20640 non-null  float64
 6   Latitude     20640 non-null  float64
 7   Longitude    20640 non-null  float64
 8   MedHouseVal  20640 non-null  float64
dtypes: float64(9)
memory usage: 1.4 MB


In [None]:
# Frequency of each distinct target value, most common first.
# NOTE(review): the most frequent value (5.00001) is also the column maximum reported by
# describe(), which suggests the target is capped at that value — TODO confirm.
housing_df["MedHouseVal"].value_counts()

Unnamed: 0_level_0,count
MedHouseVal,Unnamed: 1_level_1
5.00001,965
1.37500,122
1.62500,117
1.12500,103
1.87500,93
...,...
3.59200,1
0.54900,1
3.77600,1
0.81200,1


In [None]:
# NOTE(review): this cell repeats the previous value_counts() call verbatim and produces
# the same table; it is a candidate for removal.
housing_df["MedHouseVal"].value_counts()

Unnamed: 0_level_0,count
MedHouseVal,Unnamed: 1_level_1
5.00001,965
1.37500,122
1.62500,117
1.12500,103
1.87500,93
...,...
3.59200,1
0.54900,1
3.77600,1
0.81200,1


In [None]:
# Summary statistics for every column. Compare each column's max against its 75th
# percentile to spot heavy right tails (e.g. AveRooms, AveBedrms, Population, AveOccup).
housing_df.describe()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
count,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0
mean,3.870671,28.639486,5.429,1.096675,1425.476744,3.070655,35.631861,-119.569704,2.068558
std,1.899822,12.585558,2.474173,0.473911,1132.462122,10.38605,2.135952,2.003532,1.153956
min,0.4999,1.0,0.846154,0.333333,3.0,0.692308,32.54,-124.35,0.14999
25%,2.5634,18.0,4.440716,1.006079,787.0,2.429741,33.93,-121.8,1.196
50%,3.5348,29.0,5.229129,1.04878,1166.0,2.818116,34.26,-118.49,1.797
75%,4.74325,37.0,6.052381,1.099526,1725.0,3.282261,37.71,-118.01,2.64725
max,15.0001,52.0,141.909091,34.066667,35682.0,1243.333333,41.95,-114.31,5.00001


In [None]:
# Create one histogram per column of `housing_df`
# figsize=(16, 12) sets the size of the whole figure to 16 inches by 12 inches
housing_df.hist(figsize=(16, 12))

# Display the histograms
# plt.show() renders the figure in the cell output
plt.show()

In [None]:
# Pairwise correlations between all columns, target included, so the
# `MedHouseVal` row/column shows how each feature relates to the target.
corr_matrix = housing_df.corr()

# Draw the matrix as an annotated heatmap on an explicitly created 10x8 inch axes
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Matrix')
plt.show()

In [None]:
# Separate the target column (y) from the feature matrix (X)
y = housing_df['MedHouseVal']
X = housing_df.drop(columns='MedHouseVal')

In [None]:
# Up to five rows that contain at least one missing value in any column
# (an empty frame means there is nothing to impute)
sample_incomplete_rows = housing_df.loc[housing_df.isna().any(axis="columns")].head()
sample_incomplete_rows

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Shapes of the full, training and test feature matrices, printed on one line
print(*(part.shape for part in (X, X_train, X_test)))

In [None]:
# Learn the per-feature mean and standard deviation from the training split only,
# then apply that same transformation to both splits so no test-set statistics leak in.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Ordinary least-squares baseline, fitted on the standardized training features
model = LinearRegression()
model.fit(X=X_train_scaled, y=y_train)


In [None]:
# In-sample check: score the fitted model on the training split it was trained on
y_pred = model.predict(X_train_scaled)

mse = mean_squared_error(y_train, y_pred)
rmse = np.sqrt(mse)  # same units as the target
r2 = r2_score(y_train, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"Root Mean Squared Error: {rmse}")
print(f"R-squared: {r2}")


In [None]:
# Predicted vs. actual target values for the training split (`y_pred` comes from the
# preceding cell). Points lying on the diagonal would be perfect predictions.
fig, ax = plt.subplots()
ax.scatter(y_train, y_pred)
ax.set_xlabel("Actual Prices")
ax.set_ylabel("Predicted Prices")
ax.set_title("Actual Price vs Predicted Price")  # title typo ("Preicted") corrected
plt.show()

In [None]:
# Out-of-sample check: score the fitted model on the held-out test split
y_pred = model.predict(X_test_scaled)

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # same units as the target
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"Root Mean Squared Error: {rmse}")
print(f"R-squared: {r2}")

In [None]:
# Predicted vs. actual target values for the test split (`y_pred` comes from the
# preceding cell). Points lying on the diagonal would be perfect predictions.
fig, ax = plt.subplots()
ax.scatter(y_test, y_pred)
ax.set_xlabel("Actual Prices")
ax.set_ylabel("Predicted Prices")
ax.set_title("Actual Price vs Predicted Price")  # title typo ("Preicted") corrected
plt.show()

In [None]:
def predict_house_value(input_data, estimator=None, feature_scaler=None):
    """Predict, print and return the median house value for a single observation.

    Parameters
    ----------
    input_data : sequence of numbers
        Feature values for one observation, in the same column order as the
        data the scaler was fitted on.
    estimator : object with ``predict``, optional
        Fitted regressor to use. When omitted, the notebook-level ``model`` is
        looked up at call time, so the most recently fitted model is used.
    feature_scaler : object with ``transform``, optional
        Fitted scaler to use. When omitted, the notebook-level ``scaler`` is used.

    Returns
    -------
    float
        The prediction multiplied by 100,000, i.e. the value printed as USD.
    """
    # Fall back to the notebook globals only when no explicit objects are supplied.
    if estimator is None:
        estimator = model
    if feature_scaler is None:
        feature_scaler = scaler

    # A single observation has to be presented as a 2-D (1, n_features) array.
    row = np.asarray(input_data, dtype=float).reshape(1, -1)

    # A scaler fitted on a DataFrame remembers its column names and warns when it is
    # later handed a bare array; restore those names so the transform is warning-free.
    fitted_names = getattr(feature_scaler, "feature_names_in_", None)
    if fitted_names is not None:
        row = pd.DataFrame(row, columns=list(fitted_names))

    # Scale with the same transformation used for training, then predict.
    row_scaled = feature_scaler.transform(row)
    prediction = estimator.predict(row_scaled)

    predicted_usd = float(prediction[0]) * 100000
    print(f"Predicted Median House Value: {predicted_usd:.2f} USD")
    return predicted_usd


# Feature values for one observation, listed in the column order of `X`.
# They correspond to row 0 of `housing_df` (to the precision displayed above).
input_data = (
    8.3252,            # MedInc
    41.0,              # HouseAge
    6.98412698412698,  # AveRooms
    1.02380952380952,  # AveBedrms
    322.0,             # Population
    2.55555555555556,  # AveOccup
    37.88,             # Latitude
    -122.23,           # Longitude
)
# The trailing semicolon keeps the cell from echoing a return value; the function prints its result.
predict_house_value(input_data);

In [None]:
# Reuses `predict_house_value` from the preceding cell instead of re-declaring an
# identical copy of the function.
# NOTE(review): with the same model and inputs this repeats the previous prediction.
input_data = (8.3252, 41.0, 6.98412698412698, 1.02380952380952, 322.0, 2.55555555555556, 37.88, -122.23)
predict_house_value(input_data);

In [None]:
# Initializing the Random Forest Regression model
# (`RandomForestRegressor` is imported in the notebook's top import cell.)
# A fixed random_state makes the fitted forest, and every metric derived from it,
# repeatable across kernel restarts; 42 matches the seed used for the train/test split.
# Note: this rebinds `model`, replacing the linear regression fitted earlier.
model = RandomForestRegressor(random_state=42)
model.fit(X_train_scaled, y_train)


In [None]:
# In-sample check: score the fitted model on the training split it was trained on
y_pred = model.predict(X_train_scaled)

mse = mean_squared_error(y_train, y_pred)
rmse = np.sqrt(mse)  # same units as the target
r2 = r2_score(y_train, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"Root Mean Squared Error: {rmse}")
print(f"R-squared: {r2}")

In [None]:
# Predicted vs. actual target values for the training split (`y_pred` comes from the
# preceding cell). Points lying on the diagonal would be perfect predictions.
fig, ax = plt.subplots()
ax.scatter(y_train, y_pred)
ax.set_xlabel("Actual Prices")
ax.set_ylabel("Predicted Prices")
ax.set_title("Actual Price vs Predicted Price")  # title typo ("Preicted") corrected
plt.show()

In [None]:
# Out-of-sample check: score the fitted model on the held-out test split
y_pred = model.predict(X_test_scaled)

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # same units as the target
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"Root Mean Squared Error: {rmse}")
print(f"R-squared: {r2}")

In [None]:
# Predicted vs. actual target values for the test split (`y_pred` comes from the
# preceding cell). Points lying on the diagonal would be perfect predictions.
fig, ax = plt.subplots()
ax.scatter(y_test, y_pred)
ax.set_xlabel("Actual Prices")
ax.set_ylabel("Predicted Prices")
ax.set_title("Actual Price vs Predicted Price")  # title typo ("Preicted") corrected
plt.show()

In [None]:
# Reuses `predict_house_value` defined earlier in this notebook rather than declaring a
# third identical copy. The function looks up `model` when it is called, so this
# prediction comes from whichever estimator `model` is currently bound to.
input_data = (8.3252, 41.0, 6.98412698412698, 1.02380952380952, 322.0, 2.55555555555556, 37.88, -122.23)
predict_house_value(input_data);

In [None]:
# Hyper-parameter search for the random forest.
# (`RandomForestRegressor` and `GridSearchCV` are imported in the notebook's top import cell.)
#
# Cost warning: the grid below has 3 * 3 * 4 * 3 * 3 = 324 candidates; with cv=5 that is
# 1620 forest fits plus one final refit, so expect this cell to run for a long time.

# Initialize the RandomForestRegressor model; the fixed seed makes the search repeatable.
# NOTE: this rebinds `model` to an unfitted template — GridSearchCV fits copies of it —
# so use `best_model` (below) for predictions after this cell.
model = RandomForestRegressor(random_state=42)

# Define the parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],  # Number of trees in the forest
    # Number of features to consider at each split. 1.0 means "all features", which is
    # what 'auto' meant for regressors; the string 'auto' is rejected by
    # scikit-learn >= 1.3, so every candidate using it would fail to fit.
    'max_features': [1.0, 'sqrt', 'log2'],
    'max_depth': [None, 10, 20, 30],  # Maximum depth of the tree
    'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 4]  # Minimum number of samples required to be at a leaf node
}

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=model, param_grid=param_grid,
                           cv=5,  # Number of folds in cross-validation
                           n_jobs=-1,  # Use all available cores
                           verbose=1,  # Print progress
                           scoring='neg_mean_squared_error')  # Higher is better, hence the negated MSE

# Fit GridSearchCV to the training data only; the test split stays untouched
grid_search.fit(X_train_scaled, y_train)

# Get the best parameters
best_params = grid_search.best_params_

# Get the best model (refitted on the whole training split with the best parameters)
best_model = grid_search.best_estimator_

print("Best parameters found: ", best_params)
print("Best model trained with the parameters.")

In [None]:
# Out-of-sample check: score the tuned estimator (`best_model`) on the held-out test split
y_pred = best_model.predict(X_test_scaled)

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # same units as the target
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"Root Mean Squared Error: {rmse}")
print(f"R-squared: {r2}")