In [20]:
import pandas as pd

# Load the training and test sets from CSV files in the working directory
# (the generator is consumed left to right, so train.csv is read first)
train_data, test_data = (
    pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv")
)

# Preview the leading rows of each frame: training first, then test
print(train_data.head(), test_data.head(), sep="\n")

   Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape   
0   1          60       RL         65.0     8450   Pave   NaN      Reg  \
1   2          20       RL         80.0     9600   Pave   NaN      Reg   
2   3          60       RL         68.0    11250   Pave   NaN      IR1   
3   4          70       RL         60.0     9550   Pave   NaN      IR1   
4   5          60       RL         84.0    14260   Pave   NaN      IR1   

  LandContour Utilities  ... PoolArea PoolQC Fence MiscFeature MiscVal MoSold   
0         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2  \
1         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      5   
2         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      9   
3         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
4         Lvl    AllPub  ...        0    NaN   NaN         NaN       0     12   

  YrSold  SaleType  SaleCondition  SalePrice  
0   2008        WD   

In [21]:
# Checking the shape of the training data
print("Training data shape: ", train_data.shape)

# Inspecting the column names and data types.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly: wrapping it in print() would append a stray "None" line
# after the report.
train_data.info()


Training data shape:  (1460, 81)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460

In [22]:
# Count the NaN entries in every column of the training frame, then report
# only the columns that actually contain gaps
missing_values_train = train_data.isna().sum()
print("Missing values in training data:", missing_values_train[missing_values_train.gt(0)])

# Same per-column NaN count for the test frame
missing_values_test = test_data.isna().sum()
print("Missing values in test data:", missing_values_test[missing_values_test.gt(0)])



Missing values in training data: LotFrontage      259
Alley           1369
MasVnrType       872
MasVnrArea         8
BsmtQual          37
BsmtCond          37
BsmtExposure      38
BsmtFinType1      37
BsmtFinType2      38
Electrical         1
FireplaceQu      690
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
PoolQC          1453
Fence           1179
MiscFeature     1406
dtype: int64
Missing values in test data: MSZoning           4
LotFrontage      227
Alley           1352
Utilities          2
Exterior1st        1
Exterior2nd        1
MasVnrType       894
MasVnrArea        15
BsmtQual          44
BsmtCond          45
BsmtExposure      44
BsmtFinType1      42
BsmtFinSF1         1
BsmtFinType2      42
BsmtFinSF2         1
BsmtUnfSF          1
TotalBsmtSF        1
BsmtFullBath       2
BsmtHalfBath       2
KitchenQual        1
Functional         2
FireplaceQu      730
GarageType        76
GarageYrBlt       78
GarageFinish      78
G

In [23]:
# Object-typed columns of the training frame are treated as categorical
categorical_columns = train_data.select_dtypes(include="object").columns
print("Categorical Columns:", categorical_columns)

# Every other dtype is treated as numerical; the list is printed before the
# target is removed, so 'SalePrice' still appears in this output
numerical_columns = train_data.select_dtypes(exclude="object").columns
print("Numerical Columns: ", numerical_columns)

# 'SalePrice' is the target variable, not a feature, so take it out of the
# numerical feature list
numerical_columns = numerical_columns.drop("SalePrice")

Categorical Columns: Index(['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities',
       'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
       'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
       'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation',
       'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
       'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',
       'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual',
       'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature',
       'SaleType', 'SaleCondition'],
      dtype='object')
Numerical Columns:  Index(['Id', 'MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual',
       'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullB

In [24]:
# Impute missing values using statistics learned from the TRAINING data only.
# The same fill values are applied to both frames so that train and test are
# preprocessed identically; imputing the test set with its own mean/mode would
# make the two frames inconsistent with each other.

# Numerical columns: fill with the training-set column mean.
# The means are captured before train_data is modified.
numerical_fill_values = train_data[numerical_columns].mean()
train_data[numerical_columns] = train_data[numerical_columns].fillna(numerical_fill_values)

# Only touch numerical columns that actually exist in the test frame
common_numerical_columns = numerical_columns.intersection(test_data.columns)
test_data[common_numerical_columns] = test_data[common_numerical_columns].fillna(
    numerical_fill_values[common_numerical_columns]
)

# Categorical columns: fill with the training-set mode (most frequent value).
# mode() returns the most frequent value(s); [0] takes the first one.
for col in categorical_columns:
    train_mode = train_data[col].mode()[0]
    train_data[col] = train_data[col].fillna(train_mode)
    if col in test_data.columns:
        test_data[col] = test_data[col].fillna(train_mode)

In [25]:
# One-hot encode the categorical columns of both frames, then align them on
# their columns. The inner join (axis=1) keeps only the columns that exist in
# BOTH encoded frames, so the model sees the same feature set for train and test.
train_data_encode, test_data_encode = pd.get_dummies(train_data).align(
    pd.get_dummies(test_data),
    join="inner",
    axis=1,
)

# Put the target variable back on the encoded training frame
train_data_encode["SalePrice"] = train_data["SalePrice"]

In [26]:
# Separate the target variable from the features used for training
y = train_data_encode["SalePrice"]  # target variable
X = train_data_encode.drop(columns=["SalePrice"])  # every remaining column is a feature
# NOTE(review): 'Id' is a numerical column of train_data, so it is presumably
# still present in X if the column alignment kept it. A row identifier is not
# a meaningful predictor; consider dropping it. TODO confirm.

In [27]:
# Split the data into training and test sets
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation; the fixed random_state makes the
# split reproducible across runs
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Report the (rows, columns) shape of each feature split
print("Training data shape: ", X_train.shape)
print("Test data shape: ", X_test.shape)

Training data shape:  (1022, 270)
Test data shape:  (438, 270)


In [28]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score # r2_score is the coefficient of determination

# R2 score is the proportion of the variance in the dependent variable that is predictable from the independent variables
# It provides an indication of goodness of fit and therefore a measure of how well unseen samples are likely to be predicted by the model


# Build a linear regression model and fit it on the training split
# (fit() returns the estimator itself, so construction and fitting are chained)
model = LinearRegression().fit(X_train, y_train)

# Predict the target for the held-out split
y_pred = model.predict(X_test)

# Score the predictions against the held-out targets
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error: ", mse)
print("R2 Score: ", r2)

Mean Squared Error:  947391354.5559202
R2 Score:  0.8642335017252055
