# Assignment2 - Supervised Learning flow

# Part 1 - Student details:
* Please write the first name and the last 4 digits of the ID of each student. For example:
<pre>Israel 9812</pre>

In [None]:
# student 1: idan 4202
# student 2: guy 6303

## Part 2 - Initial Preparations 
You may add as many code cells as needed.

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler

# Read the train and test splits from CSV files located in the working directory
train_data = pd.read_csv('housing_train.csv')
test_data = pd.read_csv('housing_test.csv')

# Preview the top rows of the training frame, then of the test frame
print(train_data.head(), test_data.head(), sep='\n')

## Part 3 - Experiments
You may add as many code cells as needed.

In [None]:
# Feature Engineering
def add_engineered_features(df):
    """Return a copy of ``df`` with the derived numeric features appended.

    The same function is applied to the training and the test frame so that
    both always receive exactly the same transformations.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing the columns 'MedInc', 'Population', 'AveRooms',
        'AveOccup' and 'AveBedrms'.

    Returns
    -------
    pd.DataFrame
        A new frame with the columns 'MedInc^2', 'PopRoom', 'log_MedInc',
        'RoomsPerHouse' and 'BedroomsPerRoom' added (or overwritten if they
        already exist, which keeps the cell safe to re-run). The input frame
        is not modified.
    """
    return df.assign(**{
        'MedInc^2': df['MedInc'] ** 2,                        # square of 'MedInc'
        'PopRoom': df['Population'] / df['AveRooms'],         # 'Population' divided by 'AveRooms'
        'log_MedInc': np.log(df['MedInc']),                   # natural log of 'MedInc'
        'RoomsPerHouse': df['AveRooms'] / df['AveOccup'],     # ratio of 'AveRooms' to 'AveOccup'
        'BedroomsPerRoom': df['AveBedrms'] / df['AveRooms'],  # ratio of 'AveBedrms' to 'AveRooms'
    })


train_data = add_engineered_features(train_data)
test_data = add_engineered_features(test_data)

# Clustering to create 'LocationCluster'
# n_init is set explicitly because its default changed between scikit-learn
# releases; together with random_state this keeps cluster labels reproducible.
kmeans = KMeans(n_clusters=5, n_init=10, random_state=42)
# Centroids are learned from the training coordinates only ...
train_data['LocationCluster'] = kmeans.fit_predict(train_data[['Latitude', 'Longitude']])
# ... and the test rows are merely assigned to the nearest learned centroid.
test_data['LocationCluster'] = kmeans.predict(test_data[['Latitude', 'Longitude']])

print(train_data.head())

In [None]:
# Separate the predictors from the target column 'MedHouseVal'
y_train = train_data['MedHouseVal']
X_train = train_data.drop(columns='MedHouseVal')
X_test = test_data.drop(columns='MedHouseVal')

# Standardise the features: the scaler is fitted on the training features only
# and the fitted scaler is then applied to both the training and the test features
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Random Forest Regressor model (baseline with 100 trees)
# random_state is fixed (same seed as the KMeans step) so that the fitted
# forest, and therefore the printed predictions, are identical on every run.
model_rf = RandomForestRegressor(n_estimators=100, random_state=42)
model_rf.fit(X_train_scaled, y_train)  # Fit on the scaled training features
y_pred = model_rf.predict(X_test_scaled)  # Predict for the scaled test features

# Print the first five predictions
print("Predictions on test set: ", y_pred[:5])

In [None]:
# Summary statistics of the training data
print(train_data.describe())

# Scatter plot: Median Income against Median House Value
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=train_data, x='MedInc', y='MedHouseVal', ax=ax)
ax.set_title('Median Income vs Median House Value')
plt.show()

# Histogram of House Age (30 bins) overlaid with a kernel density estimate
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data=train_data, x='HouseAge', bins=30, kde=True, ax=ax)
ax.set_title('Distribution of House Age')
plt.show()

# Heatmap of the pairwise correlations between all columns,
# annotated with two decimals and drawn with the 'coolwarm' colour map
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(train_data.corr(), annot=True, cmap="coolwarm", fmt='.2f', ax=ax)
ax.set_title('Correlation Heatmap of Features')
plt.show()

# Box plot of Median House Value for every House Age value
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=train_data, x='HouseAge', y='MedHouseVal', ax=ax)
ax.set_title('Boxplot of House Value vs House Age')
plt.xticks(rotation=90)  # vertical tick labels so the many age values stay legible
plt.show()

# Box plot of Median House Value per Location Cluster (the KMeans-derived feature)
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=train_data, x='LocationCluster', y='MedHouseVal', ax=ax)
ax.set_title('Boxplot of House Value vs Location Cluster')
plt.show()


## Part 4 - Training 
Use the best combination of feature engineering and model (algorithm and hyperparameters) found in the experiments part (Part 3).

In [None]:
# Part 4 - Training

# Hyperparameter grid explored by the search (3 x 3 = 9 combinations)
param_grid = {
    'n_estimators': [50, 100, 200],  # number of trees in the forest
    'max_depth': [10, 20, None]  # maximum tree depth; None lets trees grow fully
}

# random_state is fixed so that the cross-validation scores, and therefore the
# selected hyperparameters, are the same on every run.
model_rf = RandomForestRegressor(random_state=42)

# 5-fold cross-validated grid search scored by negative MSE (higher is better).
# n_jobs=-1 runs the candidate fits in parallel on all cores; it does not
# change the results, only the wall-clock time.
grid_search = GridSearchCV(
    model_rf,
    param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
)
grid_search.fit(X_train_scaled, y_train)  # Fit the search on the scaled training data

# Displaying the best hyperparameters
print("Best Parameters: ", grid_search.best_params_)

## Part 5 - Apply on test and show model performance estimation

In [None]:
# Part 5 - Apply on test and show model performance estimation

# A fitted GridSearchCV (refit left at its default) forwards predict() to its
# best estimator, so this uses the best model found by the search
y_pred = grid_search.predict(X_test_scaled)

# Root-mean-squared error of the predictions against the true test targets
rmse = np.sqrt(mean_squared_error(y_true=test_data['MedHouseVal'], y_pred=y_pred))
print("Test RMSE: ", rmse)

# Show the first five predictions
print("Predictions on test set: ", y_pred[:5])
