In [None]:

# Step 1: Import Libraries and Load Dataset
import os

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Load the dataset.
# The CSV location can be overridden with the HOUSE_PRICE_CSV environment
# variable so the notebook is not tied to a single machine; when the variable
# is unset, the original local path is used unchanged.
DATA_PATH = os.environ.get(
    "HOUSE_PRICE_CSV",
    r"C:\Users\gupta\Downloads\archive\House Price India.csv",
)
data = pd.read_csv(DATA_PATH)

# Display the first few rows of the dataset
data.head()

In [None]:
# Step 2: Exploratory Data Analysis (EDA)
# Overview of the dataset (column names, dtypes, non-null counts)
data.info()

# Statistical summary.
# A bare expression in the middle of a cell is evaluated and then discarded,
# so the summary is printed explicitly to make it appear in the output.
print(data.describe())

# Check for missing values
missing_values = data.isnull().sum()
print("Missing values:\n", missing_values)

# Visualize the distribution of the target variable (Price)
plt.figure(figsize=(8, 6))
sns.histplot(data['Price'], kde=True, bins=30, color='blue')
plt.title('Distribution of House Prices')
plt.xlabel('Price')
plt.ylabel('Frequency')
plt.show()

# Correlation heatmap.
# Only numeric columns are passed to corr(): pandas 2.x raises on non-numeric
# columns instead of silently skipping them.
numeric_data = data.select_dtypes(include='number')
plt.figure(figsize=(12, 8))
sns.heatmap(numeric_data.corr(), annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Matrix')
plt.show()

# Scatter plot: Living Area vs Price
plt.figure(figsize=(8, 6))
sns.scatterplot(x=data['living area'], y=data['Price'])
plt.title('Living Area vs Price')
plt.xlabel('Living Area')
plt.ylabel('Price')
plt.show()


In [None]:
# Step 3: Data Preprocessing
# NOTE: this cell modifies `data` in place (imputation, encoding, scaling,
# column drops). Run it exactly once after loading the dataset; a second run
# fails because the dropped columns no longer exist.
# NOTE(review): the imputation medians and the scaler are fitted on the full
# dataset before the train/test split in Step 4, so test-set statistics leak
# into preprocessing. Consider fitting them on the training split only.

# Handle missing values: fill gaps in numeric columns with the column median.
# numeric_only=True keeps this working if any non-numeric column is present.
data.fillna(data.median(numeric_only=True), inplace=True)

# Encode categorical variables
if 'waterfront present' in data.columns:
    le = LabelEncoder()
    data['waterfront present'] = le.fit_transform(data['waterfront present'])

# Create a new feature (e.g., total area).
# This is computed from the raw, unscaled measurements: adding two
# standardised columns would give a sum of z-scores, not an area.
data['total_area'] = data['living area'] + data['Area of the basement']

# Feature scaling
scaler = StandardScaler()
numerical_features = ['living area', 'lot area', 'Distance from the airport', 
                      'Number of schools nearby', 'Area of the house(excluding basement)', 
                      'Area of the basement']
# The engineered feature is standardised together with the raw area columns so
# all area-like predictors end up on the same scale.
columns_to_scale = numerical_features + ['total_area']
data[columns_to_scale] = scaler.fit_transform(data[columns_to_scale])

# Drop unnecessary columns
data.drop(['id', 'Date', 'Postal Code', 'Lattitude', 'Longitude'], axis=1, inplace=True)

# Verify the dataset after preprocessing
data.head()


In [13]:
# Step 4: Train-Test Split
# The 'Price' column is the target; every remaining column is a predictor.
y = data['Price']
X = data.drop(columns='Price')

# Hold out 20% of the rows for testing, with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Step 5: Model Development
# Both estimators are built with a fixed seed and trained on the training split.
# `fit` returns the estimator itself, so construction and training are chained.
rf_model = RandomForestRegressor(random_state=42).fit(X_train, y_train)      # Random Forest
gb_model = GradientBoostingRegressor(random_state=42).fit(X_train, y_train)  # Gradient Boosting

# Leave the fitted Gradient Boosting estimator as the cell's final expression
# so its repr is still shown as the cell output.
gb_model



In [None]:
# Steps 6 and 7: evaluate Random Forest and Gradient Boosting on the test split
rf_preds = rf_model.predict(X_test)
gb_preds = gb_model.predict(X_test)

# Same three metrics for each model; the heading strings carry the spacing
# between the two reports.
for heading, predictions in (
    ("Random Forest Performance:", rf_preds),
    ("\nGradient Boosting Performance:", gb_preds),
):
    print(heading)
    print("MAE:", mean_absolute_error(y_test, predictions))
    print("MSE:", mean_squared_error(y_test, predictions))
    print("R2 Score:", r2_score(y_test, predictions))


In [None]:
# Step 8: Feature Importance
# Bar chart of the Random Forest's importance score for each predictor column
importance = rf_model.feature_importances_
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=importance, y=X.columns, ax=ax)
ax.set_title('Feature Importance (Random Forest)')
plt.show()

In [None]:
# Model Evaluation Code: step 9
def report_regression_metrics(title, y_true, y_pred):
    """Print MAE, MSE, RMSE and R2 for one set of predictions.

    Parameters
    ----------
    title : str
        Heading printed before the metrics.
    y_true : array-like
        Ground-truth target values.
    y_pred : array-like
        Predicted target values, aligned with ``y_true``.

    Returns
    -------
    None
        The metrics are written to standard output only.
    """
    # MSE is computed once and reused for RMSE instead of being recalculated.
    mse = mean_squared_error(y_true, y_pred)
    print(title)
    print("MAE:", mean_absolute_error(y_true, y_pred))
    print("MSE:", mse)
    print("RMSE:", np.sqrt(mse))
    print("R2 Score:", r2_score(y_true, y_pred))


# Evaluate Random Forest
rf_preds = rf_model.predict(X_test)
report_regression_metrics("Random Forest Performance:", y_test, rf_preds)

# Evaluate Gradient Boosting (leading newline separates the two reports)
gb_preds = gb_model.predict(X_test)
report_regression_metrics("\nGradient Boosting Performance:", y_test, gb_preds)

In [None]:
# Step 10: persist the chosen model to disk with joblib
# (joblib is imported with the other libraries at the top of the notebook).

# Single source of truth for the output file, so the path that is written and
# the path that is reported cannot drift apart.
MODEL_PATH = 'best_model.pkl'

# Save the best model (Random Forest in this case)
best_model = rf_model  # Replace with gb_model if it's better
joblib.dump(best_model, MODEL_PATH)
print(f"Model saved as '{MODEL_PATH}'.")
