In [1]:
#imoport libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score
from sklearn.linear_model import LinearRegression,Lasso,Ridge
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

In [2]:
#read the data set
df=pd.read_csv('cleaning_data.csv')
df.head()

Unnamed: 0,location,total_sqft,bath,balcony,price,bhk,per_squre_price
0,Electronic City Phase II,1056.0,2.0,1.0,39.07,2,3699.810606
1,Chikka Tirupathi,2600.0,5.0,3.0,120.0,4,4615.384615
2,Uttarahalli,1440.0,2.0,3.0,62.0,3,4305.555556
3,Lingadheeranahalli,1521.0,3.0,1.0,95.0,3,6245.890861
4,Kothanur,1200.0,2.0,1.0,51.0,2,4250.0


In [3]:
x=df.drop('price',axis=1)

In [4]:
df.columns

Index(['location', 'total_sqft', 'bath', 'balcony', 'price', 'bhk',
       'per_squre_price'],
      dtype='object')

In [5]:
y=df['price']

In [6]:
y.head()

0     39.07
1    120.00
2     62.00
3     95.00
4     51.00
Name: price, dtype: float64

In [7]:
num_features=x.select_dtypes(exclude='object').columns
cat_features=x.select_dtypes(include='object').columns

newtansform=OneHotEncoder()
ohtransform=StandardScaler()

processor=ColumnTransformer(
[
    ("OneHotEncoder",newtansform,cat_features),
    ("StandardScaler",ohtransform,num_features)
    ]
)

In [8]:
x=processor.fit_transform(x)

In [9]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)

In [10]:
x_test.shape

(2437, 1270)

In [11]:
x_train.shape

(9748, 1270)

In [12]:
def evauate(true,predicted):
    mse = mean_squared_error(true,predicted)
    mae= mean_absolute_error(true,predicted)
    r2_scor= r2_score(true,predicted)
    return mse,mae,r2_scor

In [13]:
from sklearn.impute import SimpleImputer

# Define your models
models = {
    'LinearRegression': LinearRegression(),
    'Lasso': Lasso(),
    'Ridge': Ridge(),
    'RandomForestRegressor': RandomForestRegressor(),
    'AdaBoostRegressor': AdaBoostRegressor(),
    'DecisionTreeRegressor': DecisionTreeRegressor(),
    'KNeighborsRegressor': KNeighborsRegressor(),
    'XGBRegressor': XGBRegressor(),
}

# Impute missing values in X_train and X_test
imputer = SimpleImputer(strategy='mean')
x_train_imputed = imputer.fit_transform(x_train)
x_test_imputed = imputer.transform(x_test)

model_list = []
evaluation_results = []

for model_name, model in models.items():
    model.fit(x_train_imputed, y_train)
    
    y_train_predict = model.predict(x_train_imputed)
    y_test_predict = model.predict(x_test_imputed)
    
    train_mse, train_mae, train_r2 = evauate(y_train, y_train_predict)
    test_mse, test_mae, test_r2 = evauate(y_test, y_test_predict)
    
    model_list.append(model)
    evaluation_results.append({
        'model_name': model_name,
        'train_mse': train_mse,
        'train_mae': train_mae,
        'train_r2': train_r2,
        'test_mse': test_mse,
        'test_mae': test_mae,
        'test_r2': test_r2
    })

# Print the results
for result in evaluation_results:
    print(f"Model: {result['model_name']}\n")
    print("The performance of the training data:\n")
    print(f"Mean squared error: {result['train_mse']}\nMean absolute error: {result['train_mae']}\nR2: {result['train_r2']}\n")
    print("The performance of the test data:\n")
    print(f"Mean squared error: {result['test_mse']}\nMean absolute error: {result['test_mae']}\nR2: {result['test_r2']}")
    print("----------------------------------------------------")


Model: LinearRegression

The performance of the training data:

Mean squared error: 6607.742739127861
Mean absolute error: 31.63270639749799
R2: 0.6390451089208133

The performance of the test data:

Mean squared error: 11222.039911740012
Mean absolute error: 40.425276040959524
R2: 0.32285125120363856
----------------------------------------------------
Model: Lasso

The performance of the training data:

Mean squared error: 10721.086532992764
Mean absolute error: 40.832849375534174
R2: 0.4143493815442175

The performance of the test data:

Mean squared error: 12218.081733414641
Mean absolute error: 43.18359381843705
R2: 0.2627491237294506
----------------------------------------------------
Model: Ridge

The performance of the training data:

Mean squared error: 6867.529033878217
Mean absolute error: 32.92062731234821
R2: 0.6248540095654748

The performance of the test data:

Mean squared error: 10867.01908121575
Mean absolute error: 39.34570147965064
R2: 0.34427355170130924
---------

In [14]:
# Convert evaluation results into a DataFrame
df_results = pd.DataFrame(evaluation_results)
# Sort the DataFrame by 'test_r2' in descending order
df_results.sort_values(by='test_r2', ascending=False)

Unnamed: 0,model_name,train_mse,train_mae,train_r2,test_mse,test_mae,test_r2
7,XGBRegressor,24.659887,2.526044,0.998653,901.381798,5.058289,0.94561
3,RandomForestRegressor,84.467279,1.038694,0.995386,903.913003,2.823939,0.945457
5,DecisionTreeRegressor,0.00212,0.000752,1.0,1067.659205,4.052028,0.935576
6,KNeighborsRegressor,2864.8464,18.923125,0.843505,4292.801142,23.523668,0.740968
4,AdaBoostRegressor,6140.885475,73.909785,0.664548,7287.853755,74.757002,0.560244
2,Ridge,6867.529034,32.920627,0.624854,10867.019081,39.345701,0.344274
0,LinearRegression,6607.742739,31.632706,0.639045,11222.039912,40.425276,0.322851
1,Lasso,10721.086533,40.832849,0.414349,12218.081733,43.183594,0.262749


In [15]:
predict

NameError: name 'predict' is not defined