# Let's start the process by importing necessary modules

In [2]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import mean_squared_log_error
from joblib import dump, load

# Define the RMSE computation function

In [3]:
def compute_rmsle(y_test: np.ndarray, y_pred: np.ndarray, precision: int = 2) -> float:
    rmsle = np.sqrt(mean_squared_log_error(y_test, y_pred))
    return round(rmsle, precision)

#Assigning data path for DATASETS
Data_path = '/home/kutty/Desktop/github project/dsp-kishorekumar-mourougane/data/'
Dataset_path = Data_path + 'house_train.csv'
# load the datasets
df_original = pd.read_csv(Dataset_path)
df = df_original.copy()
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


We need to select two continuous and two categorical features for our model. Let's take a closer look at the contents of the data, including details about the features and their types. We can get a better understanding of the data through summary statistics. Let's display this

In [4]:
# Display general information about the dataframe
df.info()
# Display summary statistics
df.describe(include='all')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
count,1460.0,1460.0,1460,1201.0,1460.0,1460,91,1460,1460,1460,...,1460.0,7,281,54,1460.0,1460.0,1460.0,1460,1460,1460.0
unique,,,5,,,2,2,4,4,2,...,,3,4,4,,,,9,6,
top,,,RL,,,Pave,Grvl,Reg,Lvl,AllPub,...,,Gd,MnPrv,Shed,,,,WD,Normal,
freq,,,1151,,,1454,50,925,1311,1459,...,,3,157,49,,,,1267,1198,
mean,730.5,56.89726,,70.049958,10516.828082,,,,,,...,2.758904,,,,43.489041,6.321918,2007.815753,,,180921.19589
std,421.610009,42.300571,,24.284752,9981.264932,,,,,,...,40.177307,,,,496.123024,2.703626,1.328095,,,79442.502883
min,1.0,20.0,,21.0,1300.0,,,,,,...,0.0,,,,0.0,1.0,2006.0,,,34900.0
25%,365.75,20.0,,59.0,7553.5,,,,,,...,0.0,,,,0.0,5.0,2007.0,,,129975.0
50%,730.5,50.0,,69.0,9478.5,,,,,,...,0.0,,,,0.0,6.0,2008.0,,,163000.0
75%,1095.25,70.0,,80.0,11601.5,,,,,,...,0.0,,,,0.0,8.0,2009.0,,,214000.0


let's use 'GrLivArea' and 'LotArea' as continuous features, and 'Neighborhood' and 'MSZoning' as categorical features. 'GrLivArea' and 'LotArea' are numerical and represent areas of the house which can be important for predicting the house price, whereas 'Neighborhood' and 'MSZoning' are nominal and represent geographical and zoning classifications, respectively. Let's now create a new DataFrame with only these selected features along with our target feature, 'SalePrice'. 

In [5]:
# Selecting The Features
features = ['GrLivArea', 'LotArea', 'Neighborhood', 'MSZoning']
target = ['SalePrice']

# Creating A New DataFrame With Selected Features
df_selected = df[features + target]

# Removing Missing Values
df_selected = df_selected.dropna()

# Display the first few rows of the new dataframe
df_selected.head()

Unnamed: 0,GrLivArea,LotArea,Neighborhood,MSZoning,SalePrice
0,1710,8450,CollgCr,RL,208500
1,1262,9600,Veenker,RL,181500
2,1786,11250,CollgCr,RL,223500
3,1717,9550,Crawfor,RL,140000
4,2198,14260,NoRidge,RL,250000


# Let's start by splitting the dataset into a train and test set. 
We'll use a standard split of 70% for training and 30% for testing. After that, we'll perform the scaling

In [6]:
# Splitting the dataset into train and test sets
train_df, test_df = train_test_split(df_selected, test_size=0.3, random_state=42)

# Separating Features and the Target Variable For Train and Test sets
X_train = train_df[features]
y_train = train_df[target]
X_test = test_df[features]
y_test = test_df[target]


In [7]:
# Define columns to be scaled/encoded
scale_cols = ['GrLivArea', 'LotArea']
encode_cols = ['Neighborhood', 'MSZoning']

# Scaling Continuous Variables
scaler = StandardScaler()
scaler.fit(X_train[scale_cols])
X_train_scaled = scaler.transform(X_train[scale_cols])
X_test_scaled = scaler.transform(X_test[scale_cols])


For numerical features, we'll use the StandardScaler which standardizes the features by removing the mean and scaling to unit variance. 

For categorical features, let's use OneHotEncoder which creates binary features for each category/label present in the column.

In [9]:
# One-hot Encoding Categorical Variables
encoder = OneHotEncoder(drop='first', sparse=False)
encoder.fit(X_train[encode_cols])
X_train_encoded = encoder.transform(X_train[encode_cols])
X_test_encoded = encoder.transform(X_test[encode_cols])

# Create dataframes from numpy arrays returned by scaler and encoder
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=scale_cols, index=X_train.index)
X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=scale_cols, index=X_test.index)
X_train_encoded_df = pd.DataFrame(X_train_encoded, columns=encoder.get_feature_names_out(encode_cols), index=X_train.index)
X_test_encoded_df = pd.DataFrame(X_test_encoded, columns=encoder.get_feature_names_out(encode_cols), index=X_test.index)

# concat scaled and encoded dataframes to have features ready for training/testing
X_train = pd.concat([X_train_scaled_df, X_train_encoded_df], axis=1)
X_test = pd.concat([X_test_scaled_df, X_test_encoded_df], axis=1)

The numerical features are scaled, and the categorical features have been transformed using one-hot encoding.

# Model Training
Train model using Linear Regression

In [10]:
# Initialize the Linear Regression Model
model = LinearRegression()
# Fit the Model on the Training Data
model.fit(X_train, y_train)

# Predict on the Test Set
y_pred = model.predict(X_test)

# Show the first few predictions
y_pred[:5]

array([[125676.07828921],
       [336436.75612153],
       [ 93902.20426864],
       [157887.19010127],
       [221516.06689756]])

The results are the predicted 'SalePrice' for the given house features in the test data.
# Model Evaluation
The Root-Mean-Squared-Error (RMSE)
between the logarithm of the predicted value and the logarithm of the observed sales price.

In [11]:
# Evaluate the Model
rmsle = compute_rmsle(y_test, y_pred)
rmsle

0.19

observed sales prices in both the training and validation sets is approximately 0.19.

# Model Inference

In [12]:
# Load the new unseen data
new_data = pd.read_csv(Data_path + 'house_test.csv')
# Display the first few rows of the dataframe
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [13]:
# Display general information about the dataframe
df.info()
# Display summary statistics
df.describe(include='all')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
count,1460.0,1460.0,1460,1201.0,1460.0,1460,91,1460,1460,1460,...,1460.0,7,281,54,1460.0,1460.0,1460.0,1460,1460,1460.0
unique,,,5,,,2,2,4,4,2,...,,3,4,4,,,,9,6,
top,,,RL,,,Pave,Grvl,Reg,Lvl,AllPub,...,,Gd,MnPrv,Shed,,,,WD,Normal,
freq,,,1151,,,1454,50,925,1311,1459,...,,3,157,49,,,,1267,1198,
mean,730.5,56.89726,,70.049958,10516.828082,,,,,,...,2.758904,,,,43.489041,6.321918,2007.815753,,,180921.19589
std,421.610009,42.300571,,24.284752,9981.264932,,,,,,...,40.177307,,,,496.123024,2.703626,1.328095,,,79442.502883
min,1.0,20.0,,21.0,1300.0,,,,,,...,0.0,,,,0.0,1.0,2006.0,,,34900.0
25%,365.75,20.0,,59.0,7553.5,,,,,,...,0.0,,,,0.0,5.0,2007.0,,,129975.0
50%,730.5,50.0,,69.0,9478.5,,,,,,...,0.0,,,,0.0,6.0,2008.0,,,163000.0
75%,1095.25,70.0,,80.0,11601.5,,,,,,...,0.0,,,,0.0,8.0,2009.0,,,214000.0


In [15]:
# Selecting the same features as used for training
new_data_selected = new_data[features]
new_data_selected.head()

Unnamed: 0,GrLivArea,LotArea,Neighborhood,MSZoning
0,896,11622,NAmes,RH
1,1329,14267,NAmes,RL
2,1629,13830,Gilbert,RL
3,1604,9978,Gilbert,RL
4,1280,5005,StoneBr,RL


It appears there are missing values (`NaN`) in the categorical columns of the new data which weren't handled during encoding. Typically, we need to ensure that missing values are either filled or removed and that the new data categories are aligned with those in the training data. Let's address the issue by removing or imputing missing values in our new data before encoding

In [16]:
# Removing rows with missing values in the categorical columns for the new data
new_data_selected_clean = new_data_selected.dropna(subset=encode_cols)

# Preprocessing the cleaned new data
# Scaling Continuous Variables
new_data_scaled_clean = scaler.transform(new_data_selected_clean[scale_cols])

# One-hot Encoding Categorical Variables
new_data_encoded_clean = encoder.transform(new_data_selected_clean[encode_cols])

# Create dataframes from numpy arrays
new_data_scaled_df_clean = pd.DataFrame(new_data_scaled_clean, columns=scale_cols, index=new_data_selected_clean.index)
new_data_encoded_df_clean = pd.DataFrame(new_data_encoded_clean, columns=encoder.get_feature_names_out(encode_cols), index=new_data_selected_clean.index)

# Concat scaled and encoded dataframes
new_data_processed_clean = pd.concat([new_data_scaled_df_clean, new_data_encoded_df_clean], axis=1)

# Predicting the house prices for the new unseen data
y_pred_new_clean = model.predict(new_data_processed_clean)

# Display the first few predictions
y_pred_new_clean[:5]

array([[ 86599.53618678],
       [149959.00170879],
       [197363.79769082],
       [193495.3798033 ],
       [260224.92489878]])

# Persist the trained model, encoder and scaler objects

In [17]:
New_Path = '/home/kutty/Desktop/github project/dsp-kishorekumar-mourougane/models'
# Import joblib for persisting python objects
import joblib
joblib.dump(model, New_Path + '/model.joblib')
joblib.dump(encoder, New_Path + '/encoder.joblib')
joblib.dump(scaler, New_Path + '/scaler.joblib')

['/home/kutty/Desktop/github project/dsp-kishorekumar-mourougane/models/scaler.joblib']

The model, encoder, and scaler objects have been saved successfully in the 'models' directory. Now, let's refactor the model inference section to use these on-disk objects instead of the ones in memory.
# Loading the model, encoder, and scaler from disk

In [31]:
model_disk = joblib.load('/home/kutty/Desktop/github project/dsp-kishorekumar-mourougane/models/model.joblib')
encoder_disk = joblib.load('/home/kutty/Desktop/github project/dsp-kishorekumar-mourougane/models/encoder.joblib')
scaler_disk = joblib.load('/home/kutty/Desktop/github project/dsp-kishorekumar-mourougane/models/scaler.joblib')
# Preprocessing the cleaned new data
# Scaling Continuous Variables using the scaler loaded from disk
new_data_scaled_clean_disk = scaler_disk.transform(new_data_selected_clean[scale_cols])

# One-hot Encoding Categorical Variables using the encoder loaded from disk
new_data_encoded_clean_disk = encoder_disk.transform(new_data_selected_clean[encode_cols])

# Create dataframes from numpy arrays
new_data_scaled_df_clean_disk = pd.DataFrame(new_data_scaled_clean_disk, columns=scale_cols, index=new_data_selected_clean.index)
new_data_encoded_df_clean_disk = pd.DataFrame(new_data_encoded_clean_disk, columns=encoder_disk.get_feature_names_out(encode_cols), index=new_data_selected_clean.index)

# Concat scaled and encoded dataframes
new_data_processed_clean_disk = pd.concat([new_data_scaled_df_clean_disk, new_data_encoded_df_clean_disk], axis=1)

# Predicting the house prices for the new unseen data using the model loaded from disk
y_pred_new_clean_disk = model_disk.predict(new_data_processed_clean_disk)

# Display the first few predictions
y_pred_new_clean_disk[:5]

array([[ 86599.53618678],
       [149959.00170879],
       [197363.79769082],
       [193495.3798033 ],
       [260224.92489878]])

# code refactoring
Function to build the model

In [32]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from joblib import dump

def build_model(data: pd.DataFrame) -> dict:
    # Feature Selection
    features = ["GrLivArea", "LotArea", "Neighborhood", "MSZoning"]  
    target = ["SalePrice"]

    # Split the data into train and test sets
    X = data[features]
    y = data[target]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    # Encoding and Scaling Initialization
    encode_cols = ["Neighborhood", "MSZoning"]  # Columns to be one-hot encoded
    encoder = OneHotEncoder(sparse=False, handle_unknown="ignore")
    scale_cols = ["GrLivArea", "LotArea"]  # Columns to be scaled
    scaler = StandardScaler()

    # Encoding categorical columns
    encoder.fit(X_train[encode_cols])
    X_train_encoded = encoder.transform(X_train[encode_cols])
    X_test_encoded = encoder.transform(X_test[encode_cols])

    # Scaling numerical columns
    scaler.fit(X_train[scale_cols])
    X_train_scaled = scaler.transform(X_train[scale_cols])  
    X_test_scaled = scaler.transform(X_test[scale_cols])

    # Concatenating (joining) the scaled and encoded variables
    X_train_prepared = np.hstack((X_train_scaled, X_train_encoded))
    X_test_prepared = np.hstack((X_test_scaled, X_test_encoded))

    # Train the model
    model = LinearRegression()
    model.fit(X_train_prepared, y_train)
    
    dump(model, '/home/kutty/Desktop/github project/dsp-kishorekumar-mourougane/models/model.joblib')
    dump(encoder, '/home/kutty/Desktop/github project/dsp-kishorekumar-mourougane/models/encoder.joblib')
    dump(scaler, '/home/kutty/Desktop/github project/dsp-kishorekumar-mourougane/models/scaler.joblib')
    
    return {"message": "Model has been successfully trained and saved."}


Function to make prediction

In [33]:
def make_predictions(input_data: pd.DataFrame) -> np.ndarray:
    # Loading model artifacts
    model = load('/home/kutty/Desktop/github project/dsp-kishorekumar-mourougane/models/model.joblib')
    encoder = load('/home/kutty/Desktop/github project/dsp-kishorekumar-mourougane/models/encoder.joblib')
    scaler = load('/home/kutty/Desktop/github project/dsp-kishorekumar-mourougane/models/scaler.joblib')
    
    # Using the same features as training
    features = ["GrLivArea", "LotArea", "Neighborhood", "MSZoning"]
    encode_cols = ["Neighborhood", "MSZoning"]  # Columns to be one-hot encoded
    scale_cols = ["GrLivArea", "LotArea"]  # Columns to be scaled

    # Select the features
    input_data_selected = input_data[features].dropna(subset=encode_cols)

    # Preprocessing for new data
    input_data_encoded = encoder.transform(input_data_selected[encode_cols])
    input_data_scaled = scaler.transform(input_data_selected[scale_cols])

    # Concat scaled and encoded dataframes
    input_data_prepared = np.hstack((input_data_scaled, input_data_encoded))

    # Predicting the house prices for the new unseen data
    y_pred = model.predict(input_data_prepared)
    
    return y_pred

In [34]:
# Load the training data
data = pd.read_csv(Data_path + 'house_train.csv')

# Training the model
build_model(data)

# Load the new unseen data
new_data = pd.read_csv(Data_path + 'house_test.csv')

# Making predictions
y_pred = make_predictions(new_data)

# First 5 predictions
y_pred[:5]

array([[ 86599.53618678],
       [149959.00170879],
       [197363.79769082],
       [193495.3798033 ],
       [260224.92489878]])

# Testing dataframe equality

In [35]:
# Save the dataframes as .parquet
train_df.to_parquet(Data_path + 'train_df.parquet', index=False)
test_df.to_parquet(Data_path + 'test_df.parquet', index=False)

# Load the saved dataframes
train_df_loaded = pd.read_parquet(Data_path + 'train_df.parquet')
test_df_loaded = pd.read_parquet(Data_path + 'test_df.parquet')

# Check that the loaded dataframes are equal to the original
pd.testing.assert_frame_equal(train_df, train_df_loaded)
pd.testing.assert_frame_equal(test_df, test_df_loaded)

AssertionError: DataFrame.index are different

DataFrame.index values are different (99.90215 %)
[left]:  Int64Index([ 135, 1452,  762,  932,  435,  629, 1210, 1118, 1084,  158,
            ...
             330, 1238,  466,  121, 1044, 1095, 1130, 1294,  860, 1126],
           dtype='int64', length=1022)
[right]: RangeIndex(start=0, stop=1022, step=1)

It seems that there was a problem with comparing our saved and loaded data. The issue is the inconsistency in index between the saved and loaded dataframes. This happened because the index was reset during the train-test split and was not saved when writing to .parquet file.

To solve this, I will reset the index before saving the dataframe. Let's try to save, load, and compare the dataframes again after resetting the index.

In [36]:
# Reset index of the dataframes before saving
train_df_reset = train_df.reset_index(drop=True)
test_df_reset = test_df.reset_index(drop=True)

# Save the dataframes as .parquet again
train_df_reset.to_parquet(Data_path + 'train_df.parquet', index=False)
test_df_reset.to_parquet(Data_path + 'test_df.parquet', index=False)

# Load the saved dataframes
train_df_loaded = pd.read_parquet(Data_path + 'train_df.parquet')
test_df_loaded = pd.read_parquet(Data_path + 'test_df.parquet')

# Check that the loaded dataframes are equal to the original
pd.testing.assert_frame_equal(train_df_reset, train_df_loaded)
pd.testing.assert_frame_equal(test_df_reset, test_df_loaded)

Finally, we confirmed the reproducibility of our data processing by saving and reloading the processed data.