<a href="https://colab.research.google.com/github/Harish050906/Medi_bot/blob/main/HousePrice.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import files
import pandas as pd
import io
uploaded = files.upload()
file_name = list(uploaded.keys())[0]
df = pd.read_csv(io.BytesIO(uploaded[file_name]))
print(df.head())

Saving Chennai houseing sale.csv to Chennai houseing sale.csv
   PRT_ID        AREA  INT_SQFT   DATE_SALE  DIST_MAINROAD  N_BEDROOM  \
0  P03210  Karapakkam      1004  04-05-2011            131        1.0   
1  P09411  Anna Nagar      1986  19-12-2006             26        2.0   
2  P01812       Adyar       909  04-02-2012             70        1.0   
3  P05346   Velachery      1855  13-03-2010             14        3.0   
4  P06210  Karapakkam      1226  05-10-2009             84        1.0   

   N_BATHROOM  N_ROOM SALE_COND PARK_FACIL  ... UTILITY_AVAIL  STREET MZZONE  \
0         1.0       3  AbNormal        Yes  ...        AllPub   Paved      A   
1         1.0       5  AbNormal         No  ...        AllPub  Gravel     RH   
2         1.0       3  AbNormal        Yes  ...           ELO  Gravel     RL   
3         2.0       5    Family         No  ...       NoSewr    Paved      I   
4         1.0       3  AbNormal        Yes  ...        AllPub  Gravel      C   

  QS_ROOMS QS_BATH

In [2]:
# Select only numerical columns for a quick first test
# Typical columns in this dataset: INT_SQFT, N_BEDROOM, N_BATHROOM, SALES_PRICE
numerical_df = df.select_dtypes(include=['int64', 'float64']).dropna()

# Define features (X) and target (y)
# In the Chennai dataset, the target column is usually 'SALES_PRICE'
X = numerical_df.drop('SALES_PRICE', axis=1)
y = numerical_df['SALES_PRICE']

In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

# 1. CLEANING & FEATURE ENGINEERING
# Converting date columns to calculate house age (Crucial for price!)
df['DATE_SALE'] = pd.to_datetime(df['DATE_SALE'], dayfirst=True)
df['DATE_BUILD'] = pd.to_datetime(df['DATE_BUILD'], dayfirst=True)
df['HOUSE_AGE'] = (df['DATE_SALE'] - df['DATE_BUILD']).dt.days // 365

# Fix spelling errors common in the Chennai dataset (e.g., 'Karapakam' vs 'Karapakkam')
df['AREA'] = df['AREA'].str.lower().str.replace('karapakam', 'karapakkam').str.replace('ana nagar', 'anna nagar')

# 2. ENCODING CATEGORICAL DATA
# This turns names like 'Chrompet' into numbers the model can understand
categorical_cols = ['AREA', 'PARK_FACIL', 'BUILDTYPE', 'UTILITY_AVAIL', 'STREET', 'MZZONE']
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# Select features and target
# We drop non-numeric columns and IDs
X = df_encoded.select_dtypes(include=[np.number]).drop(['SALES_PRICE'], axis=1)
y = df_encoded['SALES_PRICE']

# 3. TRAIN-TEST SPLIT
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 4. HYPERPARAMETER TUNING (Grid Search)
# Instead of guessing, we tell the computer to find the best settings
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5]
}

rf = RandomForestRegressor(random_state=42)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, n_jobs=-1, verbose=1)
grid_search.fit(X_train, y_train)

# 5. FINAL EVALUATION
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
new_r2 = r2_score(y_test, y_pred)

print(f"Best Parameters: {grid_search.best_params_}")
print(f"New Improved R^2 Score: {new_r2:.4f}")

Fitting 3 folds for each of 12 candidates, totalling 36 fits
Best Parameters: {'max_depth': 20, 'min_samples_split': 5, 'n_estimators': 200}
New Improved R^2 Score: 0.8657


INCREASE TO 90


In [8]:
!pip install xgboost



In [9]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

# 1. ENHANCED FEATURE ENGINEERING
# Calculate age of house (as done in popular Chennai Kaggle notebooks)
df['DATE_SALE'] = pd.to_datetime(df['DATE_SALE'])
df['DATE_BUILD'] = pd.to_datetime(df['DATE_BUILD'])
df['HOUSE_AGE'] = (df['DATE_SALE'] - df['DATE_BUILD']).dt.days // 365

# Encode categorical data (Location is huge in Chennai)
categorical_cols = ['AREA', 'PARK_FACIL', 'BUILDTYPE', 'UTILITY_AVAIL', 'STREET', 'MZZONE']
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# Select features and target
X = df_encoded.select_dtypes(include=[np.number]).drop(['SALES_PRICE'], axis=1)
y = df_encoded['SALES_PRICE']

# 2. DATA CLEANING: REMOVE OUTLIERS
# Removing houses where square footage per bedroom is unrealistic (standard < 300)
# df = df[~(df['INT_SQFT']/df['N_BEDROOM'] < 300)] # Optional: based on domain knowledge

# 3. XGBOOST TRAINING
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Parameters optimized for 0.90+ accuracy
model_xgb = xgb.XGBRegressor(
    n_estimators=1000,   # More trees for deeper learning
    learning_rate=0.05,  # Smaller steps prevent overfitting
    max_depth=6,         # Standard depth for complex patterns
    subsample=0.8,       # Randomly uses 80% of data for each tree
    colsample_bytree=0.8,
    random_state=42
)

model_xgb.fit(X_train, y_train)

# 4. FINAL TEST
y_pred = model_xgb.predict(X_test)
print(f"XGBoost R^2 Score: {r2_score(y_test, y_pred):.4f}")

XGBoost R^2 Score: 0.8677


In [10]:
import numpy as np

# 1. Transform the target (Sales Price) to Log scale
y_train_log = np.log1p(y_train)
y_test_log = np.log1p(y_test)

# 2. Retrain XGBoost on the Log scale
model_xgb_log = xgb.XGBRegressor(
    n_estimators=1000,
    learning_rate=0.03, # Lower learning rate often helps with Log data
    max_depth=7,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

model_xgb_log.fit(X_train, y_train_log)

# 3. Predict and transform back to original scale
y_pred_log = model_xgb_log.predict(X_test)
y_pred_original = np.expm1(y_pred_log) # Convert log back to actual price

# 4. Calculate final R2 on the original scale
final_r2 = r2_score(y_test, y_pred_original)
print(f"Final R^2 Score with Log-Transformation: {final_r2:.4f}")

Final R^2 Score with Log-Transformation: 0.8681
