# Let's start the process by importing necessary modules

In [1]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Define the RMSLE (root mean squared logarithmic error) computation function

In [2]:
def compute_rmsle(y_test, y_pred, precision=2):
    """Return the root mean squared logarithmic error, rounded.

    Parameters
    ----------
    y_test : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values, in the same order as ``y_test``.
    precision : int, default 2
        Number of decimal places kept in the returned value.

    Returns
    -------
    The square root of ``mean_squared_log_error(y_test, y_pred)``,
    rounded to ``precision`` decimal places.
    """
    msle = mean_squared_log_error(y_test, y_pred)
    return round(np.sqrt(msle), precision)

# Load the datasets.
# The data directory is defined once and can be overridden through the
# HOUSE_DATA_DIR environment variable, so the notebook can run on machines
# other than the one it was written on. When the variable is unset, the
# original location is used.
DATA_DIR = Path(os.environ.get(
    'HOUSE_DATA_DIR',
    '/home/kutty/Desktop/github project/dsp-kishorekumar-mourougane/data',
))
df_train = pd.read_csv(DATA_DIR / 'house_train.csv')
df_test = pd.read_csv(DATA_DIR / 'house_test.csv')

df_train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


# For simplicity we will use 2 continuous and 2 categorical features for this demonstration
Continuous features: "LotArea", "OverallQual"
Categorical features: "MSZoning", "Street"


In [3]:
# Keep only the columns used for modelling, then separate the target from the features
target_column = "SalePrice"
feature_columns = ["LotArea", "OverallQual", "MSZoning", "Street"]

selected_with_target = df_train[feature_columns + [target_column]]
y = selected_with_target[target_column]
df_selected = selected_with_target.drop(columns=target_column)

df_selected.head()

Unnamed: 0,LotArea,OverallQual,MSZoning,Street
0,8450,7,RL,Pave
1,9600,6,RL,Pave
2,11250,7,RL,Pave
3,9550,7,RL,Pave
4,14260,8,RL,Pave


In [4]:
# Show data types
# Displays the dtype of every column in the raw training frame; `object`
# columns hold the string-valued (categorical) variables.
df_train.dtypes

Id                 int64
MSSubClass         int64
MSZoning          object
LotFrontage      float64
LotArea            int64
                  ...   
MoSold             int64
YrSold             int64
SaleType          object
SaleCondition     object
SalePrice          int64
Length: 81, dtype: object

In [5]:
# Count the missing values in each column of the training frame
df_train.isna().sum()

Id                 0
MSSubClass         0
MSZoning           0
LotFrontage      259
LotArea            0
                ... 
MoSold             0
YrSold             0
SaleType           0
SaleCondition      0
SalePrice          0
Length: 81, dtype: int64

# Select important features (based on initial investigation)
I'll select the first 2 numerical and 2 categorical features that don't have any missing values. 

For the numerical features, let's choose "LotArea" and "OverallQual", which represent the lot size in square feet and the rating of the overall material and finish of the house, respectively.

For the categorical features, let's choose "MSZoning" and "Street", which identify the general zoning classification of the sale and the type of road access to the property, respectively.

In [6]:
# Feature groups; the later scaling and encoding steps refer to these lists
numerical_features = ['LotArea', 'OverallQual']
categorical_features = ['MSZoning', 'Street']

# Final X and Y
feature_columns = [*numerical_features, *categorical_features]
X = df_train[feature_columns]
y = df_train['SalePrice']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

X_train.head()

Unnamed: 0,LotArea,OverallQual,MSZoning,Street
921,8777,5,RL,Pave
520,10800,4,RL,Pave
401,8767,7,RL,Pave
280,11287,7,RL,Pave
1401,7415,6,RL,Pave


For numerical features, we'll use the StandardScaler which standardizes the features by removing the mean and scaling to unit variance. 

For categorical features, let's use OneHotEncoder which creates binary features for each category/label present in the column.

In [7]:
# Scale numerical features.
# The scaler is fitted on the training split only and then applied to both
# splits, so the test rows are transformed with the training statistics.
scaler = StandardScaler().fit(X_train[numerical_features])
for split in (X_train, X_test):
    split[numerical_features] = scaler.transform(split[numerical_features])

X_train.head()

Unnamed: 0,LotArea,OverallQual,MSZoning,Street
921,-0.17883,-0.828481,RL,Pave
520,0.026099,-1.558795,RL,Pave
401,-0.179843,0.632147,RL,Pave
280,0.075432,0.632147,RL,Pave
1401,-0.3168,-0.098167,RL,Pave


In [8]:
# Encode categorical features.
# The `sparse` keyword was renamed to `sparse_output` in scikit-learn 1.2 and
# removed in 1.4, so try the current keyword first and fall back to the old
# one on installations that do not know it yet.
try:
    encoder = OneHotEncoder(drop='first', sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(drop='first', sparse=False)

# Fit on the training split only; the test split reuses the learned categories
X_train_encoded = encoder.fit_transform(X_train[categorical_features])
X_test_encoded = encoder.transform(X_test[categorical_features])

# Convert these arrays into DataFrames that keep the original row index,
# so they line up with the numerical columns when concatenated
encoded_columns = encoder.get_feature_names_out(categorical_features)
X_train_encoded_df = pd.DataFrame(X_train_encoded, index=X_train.index, columns=encoded_columns)
X_test_encoded_df = pd.DataFrame(X_test_encoded, index=X_test.index, columns=encoded_columns)

# Replace the original categorical columns with their encoded counterparts.
# `drop` returns a new frame here instead of mutating the split in place.
X_train = pd.concat([X_train.drop(columns=categorical_features), X_train_encoded_df], axis=1)
X_test = pd.concat([X_test.drop(columns=categorical_features), X_test_encoded_df], axis=1)

X_train.head()



Unnamed: 0,LotArea,OverallQual,MSZoning_FV,MSZoning_RH,MSZoning_RL,MSZoning_RM,Street_Pave
921,-0.17883,-0.828481,0.0,0.0,1.0,0.0,1.0
520,0.026099,-1.558795,0.0,0.0,1.0,0.0,1.0
401,-0.179843,0.632147,0.0,0.0,1.0,0.0,1.0
280,0.075432,0.632147,0.0,0.0,1.0,0.0,1.0
1401,-0.3168,-0.098167,0.0,0.0,1.0,0.0,1.0


# The numerical features are scaled, and the categorical features have been transformed using one-hot encoding.
# Train model using Linear Regression

In [9]:
# Fit on the log1p-transformed target, then map predictions back with expm1
log_y_train = np.log1p(y_train)
model = LinearRegression()
model.fit(X_train, log_y_train)

y_pred_train = np.expm1(model.predict(X_train))
y_pred_test = np.expm1(model.predict(X_test))

# Calculate RMSLE on both the training set and the test set
rmsle_train = compute_rmsle(y_train, y_pred_train)
rmsle_test = compute_rmsle(y_test, y_pred_test)

(rmsle_train, rmsle_test)

(0.21, 0.21)

# The root mean squared logarithmic error (RMSLE), i.e. the RMSE between the logarithms of the predicted and observed sale prices, is approximately 0.21 on both the training and test sets.

In [10]:
# Compute average RMSLE using cross-validation.
# The target passed in is log1p-transformed, so the square root of each fold's
# mean squared error is that fold's RMSLE.
# NOTE(review): the scaler and encoder were fitted on all of X_train before
# the folds are made, so each validation fold has influenced its own
# preprocessing statistics — confirm whether that is acceptable here.
neg_mse_per_fold = cross_val_score(
    model,
    X_train,
    np.log1p(y_train),
    cv=5,
    scoring='neg_mean_squared_error',
)
cv_scores = np.sqrt(-neg_mse_per_fold)

cv_scores.mean()

0.21018939567317765