In [21]:
import pandas as pd
from ydata_profiling import ProfileReport
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
import numpy as np



In [37]:
data = pd.read_csv('cleaned_dataset.csv')
data.head()

Unnamed: 0,OverallQual,GarageCars,ExterQual,GrLivArea,FullBath,KitchenQual,YearBuilt,1stFlrSF,BsmtQual,Fireplaces,SalePrice
0,7,2,Gd,1710,2,Gd,2003,856,Gd,0,208500
1,6,2,TA,1262,2,TA,1976,1262,Gd,1,181500
2,7,2,Gd,1786,2,Gd,2001,920,Gd,1,223500
3,7,3,TA,1717,1,Gd,1915,961,TA,1,140000
4,8,3,Gd,2198,2,Gd,2000,1145,Gd,1,250000


In [38]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   OverallQual  1460 non-null   int64 
 1   GarageCars   1460 non-null   int64 
 2   ExterQual    1460 non-null   object
 3   GrLivArea    1460 non-null   int64 
 4   FullBath     1460 non-null   int64 
 5   KitchenQual  1460 non-null   object
 6   YearBuilt    1460 non-null   int64 
 7   1stFlrSF     1460 non-null   int64 
 8   BsmtQual     1423 non-null   object
 9   Fireplaces   1460 non-null   int64 
 10  SalePrice    1460 non-null   int64 
dtypes: int64(8), object(3)
memory usage: 125.6+ KB


In [24]:
data.drop(columns=["Id"], inplace=True)

KeyError: "['Id'] not found in axis"

In [32]:
profile = ProfileReport(data, title="House Prices")
profile.to_notebook_iframe()
profile.to_file("reports/profile_report.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [39]:
X = data.drop(columns=['SalePrice'])
y = data['SalePrice']

categorical_cols = X.select_dtypes(include=['object']).columns
numerical_cols = X.select_dtypes(exclude=['object']).columns

numerical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_pipeline, numerical_cols),
        ('cat', categorical_pipeline, categorical_cols)
    ]
)

X = preprocessor.fit_transform(X)

model = RandomForestRegressor()



In [40]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

model.fit(X_train, y_train)
y_pred = model.predict(X_test)


In [41]:
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

In [42]:
evaluation_results = {
    "MAE": mae,
    "MSE": mse,
    "RMSE": rmse,
    "R2": r2
}

evaluation_results

{'MAE': 19666.885122635358,
 'MSE': 992808140.8454765,
 'RMSE': np.float64(31508.858132999307),
 'R2': 0.8562366477666841}