In [10]:
import joblib as jb
import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

## Import the Cleaned Dataset (CleanedData.csv)

In [None]:
# Path of the pre-cleaned dataset, resolved relative to the notebook's working directory.
DATA_PATH = 'CleanedData.csv'
df = pd.read_csv(DATA_PATH)

## Change the dtype of categorical columns to 'category'

In [3]:
# Cast the label-valued columns to pandas' 'category' dtype one by one.
# LightGBM's scikit-learn API picks up 'category' columns as categorical
# features by default, so no manual encoding is done here.
categorical_cols = ['location', 'Transaction', 'Furnishing', 'facing', 'Ownership']
for col in categorical_cols:
    df[col] = df[col].astype('category')

## Separate the input features from the target and perform the train/test split

In [None]:
# Name of the column the model learns to predict.
TARGET_COL = 'Amount(in rupees)'

# Everything except the target is used as an input feature.
y = df[TARGET_COL]
X = df.drop(columns=TARGET_COL)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## Train the Model (LightGBM)

In [6]:
# Fit a LightGBM regressor with the library's default hyperparameters on the
# training split. `fit` returns the estimator itself, so the call can be chained.
lgbm = lgb.LGBMRegressor().fit(X_train, y_train)
lgbm  # last expression: show the fitted estimator as the cell output

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003032 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 436
[LightGBM] [Info] Number of data points in the train set: 142269, number of used features: 10
[LightGBM] [Info] Start training from score 15.922183


0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,-1
,learning_rate,0.1
,n_estimators,100
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


## Make predictions and convert both the predictions and the test targets back from the log-transformed scale, so accuracy is checked on the original price scale

In [8]:
# Predict on the held-out features; outputs are on the same scale as the training target.
y_pred = lgbm.predict(X_test)
# expm1 is the inverse of log1p. NOTE(review): this assumes the target column in
# CleanedData.csv was log1p-transformed during cleaning — confirm against that step.
final_pred = np.expm1(y_pred)

In [None]:
# Apply the same expm1 inverse to the held-out targets so they are directly
# comparable with final_pred (same assumption: target was log1p-transformed upstream).
y_test_original = np.expm1(y_test)

## Compute evaluation metrics (RMSE and R²)

In [13]:
# mean_squared_error and r2_score come from the notebook's top import cell, so
# every dependency is declared in one place.
#
# Both metrics are computed on the back-transformed (expm1) values, so RMSE is
# expressed on the original price scale rather than on the log scale the model
# was trained on. A few very large errors therefore dominate the RMSE.
rmse = np.sqrt(mean_squared_error(y_test_original, final_pred))
r2 = r2_score(y_test_original, final_pred)

In [14]:
# Emit the three report lines with a single print call; sep="\n" puts each on its own line.
print(
    "Model Performance:",
    f"Root Mean Squared Error (RMSE): {rmse:.2f}",
    f"R-squared (R²): {r2:.2f}",
    sep="\n",
)

Model Performance:
Root Mean Squared Error (RMSE): 6335519.30
R-squared (R²): 0.80


In [15]:
# Side-by-side table of true and predicted prices, indexed like the test split.
comparison_df = y_test_original.to_frame(name='Actual Price')
comparison_df['Predicted Price'] = final_pred

print("\n--- Sample Predictions ---")
print(comparison_df.head())


--- Sample Predictions ---
        Actual Price  Predicted Price
123789    15600000.0     1.597919e+07
71212      5600000.0     5.533493e+06
104130     4200000.0     4.124815e+06
2721       5100000.0     5.829158e+06
157129     3400000.0     3.915377e+06


### Here are the 5 worst predictions. They show that the model itself isn't flawed: it works well on most of the data and only falls short on the very high-end homes, which explains the high RMSE value.

In [18]:

# 1. Rebuild the actual-vs-predicted table so this cell starts from a fresh frame
comparison_df = pd.DataFrame({'Actual Price': y_test_original, 'Predicted Price': final_pred})

# 2. Absolute error of every prediction
comparison_df['Error'] = (
    comparison_df['Actual Price'] - comparison_df['Predicted Price']
).abs()

# 3. Largest errors first, so the head of the frame holds the worst predictions
worst_predictions = comparison_df.sort_values('Error', ascending=False)

print("--- Top 5 Worst Predictions ---")
print(worst_predictions.head())

--- Top 5 Worst Predictions ---
        Actual Price  Predicted Price         Error
51423   4.200000e+08     1.032563e+08  3.167437e+08
111332  3.000000e+08     4.489555e+07  2.551045e+08
50142   2.700000e+08     4.583406e+07  2.241659e+08
111883  3.200000e+08     9.633245e+07  2.236675e+08
109243  4.200000e+08     1.981104e+08  2.218896e+08


## Exporting the model

In [20]:
# joblib is imported as `jb` in the notebook's top import cell, so every
# dependency is declared in one place.
#
# Persist the fitted estimator next to the notebook.
# NOTE: joblib artifacts are pickle-based; loading one executes arbitrary code,
# so only load 'LightGBM_model.pkl' from a source you trust.
jb.dump(lgbm, 'LightGBM_model.pkl')
print("Model saved successfully!")

Model saved successfully!
