<a href="https://colab.research.google.com/github/NandhanaRameshkumar/-Home-Value-Prediction-Model/blob/main/Task1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import joblib


In [None]:
# Build a reproducible synthetic housing dataset.
# NOTE: every draw below consumes the global NumPy random stream seeded here,
# so the order of the np.random.* calls determines the generated values.
np.random.seed(42)
n_samples = 1000

# Integer-valued attributes; the upper bound of randint is exclusive.
square_feet = np.random.randint(low=500, high=5000, size=n_samples)
num_bedrooms = np.random.randint(low=1, high=6, size=n_samples)
num_bathrooms = np.random.randint(low=1, high=4, size=n_samples)
lot_size = np.random.randint(low=1000, high=10000, size=n_samples)
year_built = np.random.randint(low=1900, high=2021, size=n_samples)
garage_size = np.random.randint(low=0, high=4, size=n_samples)

# Categorical attributes sampled uniformly from fixed label sets.
neighborhoods = ['A', 'B', 'C', 'D']
neighborhood = np.random.choice(neighborhoods, size=n_samples)
styles = ['Ranch', 'Colonial', 'Victorian', 'Modern']
home_style = np.random.choice(styles, size=n_samples)

# Target = linear combination of the numeric attributes + Gaussian noise.
# The categorical attributes do not enter the formula.
noise = np.random.normal(loc=0, scale=10000, size=n_samples)
deterministic_value = (
    100 * square_feet
    + 10000 * num_bedrooms
    + 5000 * num_bathrooms
    + 10 * lot_size
    + 2000 * garage_size
    + 300 * (year_built - 1900)
)
home_value = deterministic_value + noise

# Assemble everything into one frame; the target is the last column.
data = pd.DataFrame(dict(
    square_feet=square_feet,
    num_bedrooms=num_bedrooms,
    num_bathrooms=num_bathrooms,
    lot_size=lot_size,
    year_built=year_built,
    garage_size=garage_size,
    neighborhood=neighborhood,
    home_style=home_style,
    home_value=home_value,
))

In [None]:
# Name the column to predict; every other column of `data` is a feature.
target = 'home_value'
features = data.columns.drop([target])


In [None]:
# Slice the frame into the feature matrix (X) and the target vector (y).
X = data.loc[:, features]
y = data.loc[:, target]

In [None]:
# Preprocess the data
#
# Columns are routed by dtype *family* rather than by exact dtype name.
# Matching on 'int64'/'float64'/'object' literally is fragile: integer columns
# can be int32 on some platforms, and string columns are not guaranteed to be
# stored as 'object' in every pandas version. Any column missed by both
# selections would be silently dropped by the ColumnTransformer (its default
# is to drop unlisted columns).
# Every non-numeric column is treated as categorical.
numeric_features = X.select_dtypes(include='number').columns
categorical_features = X.select_dtypes(exclude='number').columns

# Numeric columns: fill gaps with the median, then standardize.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

# Categorical columns: fill gaps with the most frequent label, then one-hot
# encode. handle_unknown='ignore' keeps prediction from failing on labels that
# were not present when the encoder was fitted.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Apply each sub-pipeline to its own group of columns.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
# Create a pipeline that includes preprocessing and the model.
# The forest is given an explicit random_state: an unseeded forest draws from
# NumPy's global random state, which is not guaranteed to be the seeded one
# inside the parallel workers used by the grid search (n_jobs=-1), so results
# would otherwise differ from run to run.
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('regressor', RandomForestRegressor(random_state=42))])

In [None]:
# Search space for the random forest. GridSearchCV addresses parameters of a
# pipeline step as '<step name>__<parameter>', so each key is prefixed with
# the name of the 'regressor' step.
rf_search_space = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
}
param_grid = {
    f'regressor__{name}': candidates
    for name, candidates in rf_search_space.items()
}


In [None]:
# Exhaustive 5-fold cross-validated search over param_grid.
# Scoring is negative MSE (higher is better); n_jobs=-1 uses all CPU cores.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

In [None]:

# Get the best model: the pipeline refit by GridSearchCV with the
# hyperparameter combination that achieved the best cross-validated score.
best_model = grid_search.best_estimator_

In [None]:
# Score the tuned pipeline on the held-out test split.
y_pred = best_model.predict(X_test)
mse, r2 = (metric(y_test, y_pred) for metric in (mean_squared_error, r2_score))

print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')

Mean Squared Error: 273480135.4046684
R-squared: 0.9845607574776657


In [None]:
# Baseline for comparison: ordinary linear regression behind the same
# preprocessing. Note that this reuses the very same `preprocessor` object
# that was passed to the random-forest pipeline.
pipeline_lr = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('regressor', LinearRegression()),
    ]
)

In [None]:
# Train the linear regression model on the training split (preprocessing
# steps are fitted as part of the pipeline; no hyperparameter search here).
pipeline_lr.fit(X_train, y_train)

In [None]:
# Evaluate the linear regression model on the same held-out test split that
# was used for the tuned random forest, so the two sets of metrics are
# directly comparable.
y_pred_lr = pipeline_lr.predict(X_test)
mse_lr = mean_squared_error(y_test, y_pred_lr)
r2_lr = r2_score(y_test, y_pred_lr)

print(f'Linear Regression Mean Squared Error: {mse_lr}')
print(f'Linear Regression R-squared: {r2_lr}')
# Model persistence is intentionally not done here: the dedicated save cell
# below writes 'best_home_value_model.pkl', so dumping it in this evaluation
# cell as well would only repeat the same file write.

Linear Regression Mean Squared Error: 96366997.81335427
Linear Regression R-squared: 0.9945596288074522


In [None]:
# Save the best model to a file for future use.
# `best_model` is the estimator selected by the grid search over the
# random-forest pipeline; the linear regression pipeline is not saved.
# NOTE(review): in the recorded outputs the linear regression baseline has a
# lower test MSE than this model — confirm which model should be persisted.
joblib.dump(best_model, 'best_home_value_model.pkl')

['best_home_value_model.pkl']

In [None]:

# Documentation
# Markdown-formatted summary of the workflow, built as an f-string.
# It interpolates mse, r2, mse_lr and r2_lr, so this cell must run after both
# evaluation cells have defined those variables.
documentation = f"""
# Home Value Prediction Model

## Model Architecture
The model uses a RandomForestRegressor for predicting home values. It includes data preprocessing steps such as handling missing values, encoding categorical variables, and standardizing numerical features.

## Training Process
1. Load and preprocess the data.
2. Split the data into training and testing sets.
3. Train the model using GridSearchCV to find the best hyperparameters.
4. Evaluate the model using metrics like Mean Squared Error and R-squared.
5. Fine-tune the model if necessary.

## Usage Instructions
1. Load the saved model using joblib:
    ```python
    import joblib
    model = joblib.load('best_home_value_model.pkl')
    ```
2. Prepare the input data for prediction:
    ```python
    # Ensure your input data has the same structure as the training data
    input_data = pd.DataFrame([...])  # Replace with your actual input data
    ```
3. Make predictions:
    ```python
    predictions = model.predict(input_data)
    ```

## Hyperparameter Optimization
The hyperparameters of the RandomForestRegressor were optimized using GridSearchCV with the following parameter grid:
- `n_estimators`: [100, 200, 300]
- `max_depth`: [None, 10, 20, 30]
- `min_samples_split`: [2, 5, 10]

## Evaluation
The model was evaluated using Mean Squared Error and R-squared on the test set. The results are as follows:
- Mean Squared Error: {mse}
- R-squared: {r2}

For the Linear Regression model:
- Mean Squared Error: {mse_lr}
- R-squared: {r2_lr}
"""

# Print the documentation (emitted as plain text, not rendered Markdown)
print(documentation)


# Home Value Prediction Model

## Model Architecture
The model uses a RandomForestRegressor for predicting home values. It includes data preprocessing steps such as handling missing values, encoding categorical variables, and standardizing numerical features.

## Training Process
1. Load and preprocess the data.
2. Split the data into training and testing sets.
3. Train the model using GridSearchCV to find the best hyperparameters.
4. Evaluate the model using metrics like Mean Squared Error and R-squared.
5. Fine-tune the model if necessary.

## Usage Instructions
1. Load the saved model using joblib:
    ```python
    import joblib
    model = joblib.load('best_home_value_model.pkl')
    ```
2. Prepare the input data for prediction:
    ```python
    # Ensure your input data has the same structure as the training data
    input_data = pd.DataFrame([...])  # Replace with your actual input data
    ```
3. Make predictions:
    ```python
    predictions = model.predict(input_data)
    `