In [15]:
#imoport libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score
from sklearn.linear_model import LinearRegression,Lasso,Ridge
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

In [16]:
#read the data set
df=pd.read_csv('cleaning_data.csv')
df.head()

Unnamed: 0,location,total_sqft,bath,balcony,price,bhk
0,Electronic City Phase II,1056.0,2.0,1.0,39.07,2
1,Chikka Tirupathi,2600.0,5.0,3.0,120.0,4
2,Uttarahalli,1440.0,2.0,3.0,62.0,3
3,Lingadheeranahalli,1521.0,3.0,1.0,95.0,3
4,Kothanur,1200.0,2.0,1.0,51.0,2


In [17]:
x=df.drop('price',axis=1)

In [18]:
y=df['price']

In [19]:
y.head()

0     39.07
1    120.00
2     62.00
3     95.00
4     51.00
Name: price, dtype: float64

In [20]:
num_features=x.select_dtypes(exclude='object').columns
cat_features=x.select_dtypes(include='object').columns

newtansform=OneHotEncoder()
ohtransform=StandardScaler()

processor=ColumnTransformer(
[
    ("OneHotEncoder",newtansform,cat_features),
    ("StandardScaler",ohtransform,num_features)
    ]
)

In [21]:
x=processor.fit_transform(x)

In [22]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)

In [23]:
x_test.shape

(2437, 1269)

In [24]:
x_train.shape

(9748, 1269)

In [25]:
def evauate(true,predicted):
    mse = mean_squared_error(true,predicted)
    mae= mean_absolute_error(true,predicted)
    r2_scor= r2_score(true,predicted)
    return mse,mae,r2_scor

In [26]:
from sklearn.impute import SimpleImputer

# Define your models
models = {
    'LinearRegression': LinearRegression(),
    'Lasso': Lasso(),
    'Ridge': Ridge(),
    'RandomForestRegressor': RandomForestRegressor(),
    'AdaBoostRegressor': AdaBoostRegressor(),
    'DecisionTreeRegressor': DecisionTreeRegressor(),
    'KNeighborsRegressor': KNeighborsRegressor(),
    'XGBRegressor': XGBRegressor(),
}

# Impute missing values in X_train and X_test
imputer = SimpleImputer(strategy='mean')
x_train_imputed = imputer.fit_transform(x_train)
x_test_imputed = imputer.transform(x_test)

model_list = []
evaluation_results = []

for model_name, model in models.items():
    model.fit(x_train_imputed, y_train)
    
    y_train_predict = model.predict(x_train_imputed)
    y_test_predict = model.predict(x_test_imputed)
    
    train_mse, train_mae, train_r2 = evauate(y_train, y_train_predict)
    test_mse, test_mae, test_r2 = evauate(y_test, y_test_predict)
    
    model_list.append(model)
    evaluation_results.append({
        'model_name': model_name,
        'train_mse': train_mse,
        'train_mae': train_mae,
        'train_r2': train_r2,
        'test_mse': test_mse,
        'test_mae': test_mae,
        'test_r2': test_r2
    })

# Print the results
for result in evaluation_results:
    print(f"Model: {result['model_name']}\n")
    print("The performance of the training data:\n")
    print(f"Mean squared error: {result['train_mse']}\nMean absolute error: {result['train_mae']}\nR2: {result['train_r2']}\n")
    print("The performance of the test data:\n")
    print(f"Mean squared error: {result['test_mse']}\nMean absolute error: {result['test_mae']}\nR2: {result['test_r2']}")
    print("----------------------------------------------------")


Model: LinearRegression

The performance of the training data:

Mean squared error: 6635.09234953249
Mean absolute error: 31.977156961888312
R2: 0.637551107711262

The performance of the test data:

Mean squared error: 11338.73784302222
Mean absolute error: 40.91337383328446
R2: 0.315809585091562
----------------------------------------------------
Model: Lasso

The performance of the training data:

Mean squared error: 10839.821934196498
Mean absolute error: 41.70982652769872
R2: 0.4078633354766319

The performance of the test data:

Mean squared error: 12449.692031954226
Mean absolute error: 44.13478015010588
R2: 0.24877353416659764
----------------------------------------------------
Model: Ridge

The performance of the training data:

Mean squared error: 6898.797114973849
Mean absolute error: 33.34744966557333
R2: 0.6231459577765792

The performance of the test data:

Mean squared error: 10996.056012668785
Mean absolute error: 39.87697731299341
R2: 0.3364873383774256
--------------

In [27]:
# Convert evaluation results into a DataFrame
df_results = pd.DataFrame(evaluation_results)
# Sort the DataFrame by 'test_r2' in descending order
df_results.sort_values(by='test_r2', ascending=False)

Unnamed: 0,model_name,train_mse,train_mae,train_r2,test_mse,test_mae,test_r2
6,KNeighborsRegressor,4827.836115,26.655787,0.736274,7975.721371,34.454713,0.518737
7,XGBRegressor,1850.026661,24.528843,0.89894,8040.032954,34.675834,0.514857
3,RandomForestRegressor,843.520905,11.10155,0.953922,9407.049036,32.980675,0.43237
2,Ridge,6898.797115,33.34745,0.623146,10996.056013,39.876977,0.336487
0,LinearRegression,6635.09235,31.977157,0.637551,11338.737843,40.913374,0.31581
5,DecisionTreeRegressor,33.822987,1.14088,0.998152,12328.327563,37.992035,0.256097
1,Lasso,10839.821934,41.709827,0.407863,12449.692032,44.13478,0.248774
4,AdaBoostRegressor,12734.265899,71.217193,0.304377,16727.065932,74.778451,-0.009327
