In [2]:
#import all necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# Data preprocessing and data transformation
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline

# Model selection and evaluation
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Models
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV, ElasticNetCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

import warnings
# Silence all Python warnings for the rest of the session: no category, module or
# message filter is given, so every warning matches the 'ignore' action.
# NOTE(review): this also hides deprecation/convergence-style warnings raised by
# the libraries imported above — confirm that blanket suppression is intended.
warnings.filterwarnings('ignore')

In [4]:
# Load the dataset.
# Both CSV files are read from the same folder, so the folder is kept in a single
# constant: pointing the notebook at another copy of the data is a one-line edit.
DATA_DIR = "C:/Users/Admin/Downloads"

train_data = pd.read_csv(f"{DATA_DIR}/train.csv")  # includes the SalePrice column (see the preview below)
test_data = pd.read_csv(f"{DATA_DIR}/test.csv")

In [5]:
# Preview the first five rows of the training data (head() defaults to 5 rows).
train_data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


Analyse the Dataset

In [6]:
# Print a per-column summary: column name, non-null count and dtype.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [7]:
# Count how many columns there are of each dtype.
train_data.dtypes.value_counts()

object     43
int64      35
float64     3
Name: count, dtype: int64

We're dealing with three data types: object, int64, and float64.

In [8]:
# (number of rows, number of columns) of the training data.
train_data.shape

(1460, 81)

In [9]:
# Separate the target variable from the features, then split both into
# training and testing subsets (20% of the rows are held out for testing).

target_column = 'SalePrice'
y = train_data[target_column]                  # Target variable
X = train_data.drop(columns=[target_column])   # Features (every column except the target)

# A fixed random_state keeps the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1998,
)

In [10]:
# Count the missing values in each training column and keep only the columns
# that have at least one.
null_counts = X_train.isna().sum()
missing_values = null_counts[null_counts.gt(0)]
print(missing_values)

LotFrontage      197
Alley           1096
MasVnrType       698
MasVnrArea         8
BsmtQual          30
BsmtCond          30
BsmtExposure      31
BsmtFinType1      30
BsmtFinType2      31
Electrical         1
FireplaceQu      551
GarageType        56
GarageYrBlt       56
GarageFinish      56
GarageQual        56
GarageCond        56
PoolQC          1163
Fence            937
MiscFeature     1121
dtype: int64


The dataset has many missing values in the columns listed above, so we need to check the missing values in each column and handle them appropriately.

In [11]:
# Drop sparsely populated columns from the training features.
# A column is dropped when X_train has more than MISSING_THRESHOLD missing values
# in it; the counts come from `missing_values`, computed in the previous cell.
MISSING_THRESHOLD = 20
columns_to_drop = missing_values[missing_values > MISSING_THRESHOLD].index

# Rebind instead of dropping in place, and ignore columns that are already gone,
# so re-running this cell does not raise KeyError on the second execution.
X_train = X_train.drop(columns=columns_to_drop, errors="ignore")

# NOTE(review): X_test is not modified here and still contains these columns —
# confirm that later cells account for the difference.
print(X_train)

        Id  MSSubClass MSZoning  LotArea Street LotShape LandContour  \
768    769          20       RL     9100   Pave      Reg         Lvl   
1102  1103          20       RL     7000   Pave      Reg         Lvl   
816    817          20       RL    11425   Pave      IR1         Lvl   
391    392          60       RL    12209   Pave      IR1         Lvl   
747    748          70       RM    11700   Pave      IR1         Lvl   
...    ...         ...      ...      ...    ...      ...         ...   
224    225          20       RL    13472   Pave      Reg         Lvl   
1188  1189          60       RL     8935   Pave      IR1         Lvl   
897    898          90       RL     7018   Pave      Reg         Lvl   
673    674          20       RL    14442   Pave      Reg         Lvl   
840    841          70       RH    12155   Pave      IR1         Lvl   

     Utilities LotConfig LandSlope  ... OpenPorchSF EnclosedPorch 3SsnPorch  \
768     AllPub    Inside       Gtl  ...          33     