In [122]:
# Import libraries

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import joblib
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_log_error


# Model Building

## MODEL TRAINING 

### Dataset loading and splitting into train and test

In [102]:
# Loading data

df = pd.read_csv('../../data/train.csv')
df.head()


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [81]:
# splitting into train and test

from sklearn.model_selection import train_test_split

X, y = df.drop(columns=['SalePrice']), df['SalePrice']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

X_train

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
1023,1024,120,RL,43.0,3182,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,5,2008,WD,Normal
810,811,20,RL,78.0,10140,Pave,,Reg,Lvl,AllPub,...,0,648,Fa,GdPrv,,0,1,2006,WD,Normal
1384,1385,50,RL,60.0,9060,Pave,,Reg,Lvl,AllPub,...,0,0,,MnPrv,,0,10,2009,WD,Normal
626,627,20,RL,,12342,Pave,,IR1,Lvl,AllPub,...,0,0,,GdWo,Shed,600,8,2007,WD,Normal
813,814,20,RL,75.0,9750,Pave,,Reg,Lvl,AllPub,...,0,0,,,Shed,500,4,2007,COD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1095,1096,20,RL,78.0,9317,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,3,2007,WD,Normal
1130,1131,50,RL,65.0,7804,Pave,,Reg,Lvl,AllPub,...,0,0,,MnPrv,,0,12,2009,WD,Normal
1294,1295,20,RL,60.0,8172,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,4,2006,WD,Normal
860,861,50,RL,55.0,7642,Pave,,Reg,Lvl,AllPub,...,0,0,,GdPrv,,0,6,2007,WD,Normal


### Preprocessing and feature engineering of the train set

In [103]:
def select_features(X_train, y_train):
    # Feature selection
    features = ['Foundation', 'KitchenQual', 'TotRmsAbvGrd', 'WoodDeckSF', 'YrSold', '1stFlrSF']
    df_train = X_train[features].join(y_train)
    
    # Removing duplicates
    df_train = df_train[~df_train[features].duplicated(keep='first')]
    df_train = df_train.reset_index(drop=True)
    
    y_train = df_train['SalePrice']
    df_train = df_train.drop(columns=['SalePrice'])
    return df_train, y_train


In [83]:
df_train , y_train = select_features(X_train, y_train)

In [104]:
def scale_continuous_features(df_train, path):
    # Continuous feature scaling
    continuous_columns_train = df_train.select_dtypes(include='number').columns
    
    scaler = StandardScaler()
    joblib.dump(scaler, path+'scaler.joblib')
    
    scaler.fit(df_train[continuous_columns_train])
    scaled_columns_train = scaler.transform(df_train[continuous_columns_train])
    
    df_continuous_train = pd.DataFrame(data=scaled_columns_train, columns=continuous_columns_train)
    
    return df_continuous_train




In [85]:
path = 'C:/Users/jayes/Desktop/DSP_GIT/dsp-jayeshkaushik-narayanareddy/models/'
df_continuous_train = scale_continuous_features(df_train, path)

In [120]:
def scale_categorical_features(df_train, path):
    # Categorical feature scaling
    categorical_columns_train = df_train.select_dtypes(include='object').columns
    
    encoder = OneHotEncoder()
    joblib.dump(encoder, path+'encoder.joblib')
    
    encoder.fit(df_train[categorical_columns_train])
    categorical_features_encoded = encoder.transform(df_train[categorical_columns_train])
    
    df_categorical_train = pd.DataFrame(categorical_features_encoded.toarray(), columns=encoder.get_feature_names_out(categorical_columns_train))
    if 'Foundation_Wood' in df_categorical_train.columns:
        df_categorical_train.drop(['Foundation_Wood'], axis=1, inplace=True)

    
    return df_categorical_train


In [87]:
df_categorical_train = scale_categorical_features(df_train, path)

In [106]:
def combine_features(df_continuous_train, df_categorical_train):
    # Combining scaled features
    X_train = df_continuous_train.join(df_categorical_train)
    
    return X_train

In [89]:
X_train = combine_features(df_continuous_train, df_categorical_train)

In [107]:
def train_model(X_train, y_train, model_path):
    # Train the model
    model = LinearRegression()
    model.fit(X_train, y_train)
    # Save the model 
    joblib.dump(model, model_path+'model.joblib')

In [91]:
train_model(X_train, y_train, path)

In [108]:
def make_predictions(input_data: pd.DataFrame) -> np.ndarray:
    
    path = 'C:/Users/jayes/Desktop/DSP_GIT/dsp-jayeshkaushik-narayanareddy/models/'
    model_saved = joblib.load(path+'model.joblib')
    predictions = model_saved.predict(input_data)
    predictions[predictions < 0] = 0
    
    return predictions

In [112]:
#make_predictions(X_test)

In [109]:
def compute_rmsle(y_test: np.ndarray, y_pred: np.ndarray, precision: int = 2) -> float:
    rmsle = np.sqrt(mean_squared_log_error(y_test, y_pred))
    return round(rmsle, precision)

In [116]:
def build_model(data: pd.DataFrame) -> dict[str, str]:
    
    from sklearn.model_selection import train_test_split

    X, y = df.drop(columns=['SalePrice']), df['SalePrice']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

    
    path = 'C:/Users/jayes/Desktop/DSP_GIT/dsp-jayeshkaushik-narayanareddy/models/'
    df_train , y_train = select_features(X_train, y_train)
    df_continuous_train = scale_continuous_features(df_train, path)
    df_categorical_train = scale_categorical_features(df_train, path)
    X_train = combine_features(df_continuous_train, df_categorical_train)
    train_model(X_train, y_train, path)
    
    df_test , y_test = select_features(X_test, y_test)
    df_continuous_test = scale_continuous_features(df_test, path)
    df_categorical_test = scale_categorical_features(df_test, path)
    X_test = combine_features(df_continuous_test, df_categorical_test)
    
    y_pred = make_predictions(X_test)
    
    evaluate= compute_rmsle(y_test, y_pred)
    
    return evaluate

In [126]:
build_model(df)

0.22

### model training 

In [154]:
from sklearn.linear_model import LinearRegression

model = LinearRegression()
model.fit(X_train, y_train)

In [12]:
import joblib

path = 'C:/Users/jayes/Desktop/DSP_GIT/dsp-jayeshkaushik-narayanareddy/models/'
joblib.dump(model, path+'model.joblib')

NameError: name 'model' is not defined

## MODEL EVALUATION 

### Preprocessing and feature engineering of the test set

In [15]:
#feature selection 
features = ['Foundation', 'KitchenQual', 'TotRmsAbvGrd', 'WoodDeckSF', 'YrSold', '1stFlrSF']
df_test = X_test[features].join(y_test)
df_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 365 entries, 892 to 1418
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Foundation    365 non-null    object
 1   KitchenQual   365 non-null    object
 2   TotRmsAbvGrd  365 non-null    int64 
 3   WoodDeckSF    365 non-null    int64 
 4   YrSold        365 non-null    int64 
 5   1stFlrSF      365 non-null    int64 
 6   SalePrice     365 non-null    int64 
dtypes: int64(5), object(2)
memory usage: 30.9+ KB


In [16]:
# checking dulicates 
df_test[features].duplicated(keep='first').sum()

#removing dulpicates 
df_test = df_test[~df_test[features].duplicated(keep='first')]
df_test = df_test.reset_index(drop=True)

In [17]:
#continuous feature scaling 
continuous_columns_test = df_test[features].select_dtypes(include='number').columns
print(continuous_columns_test)

from sklearn.preprocessing import StandardScaler
scaler_test = StandardScaler()

scaler_test.fit(df_test[continuous_columns_test])
scaled_columns_test = scaler_test.transform(df_test[continuous_columns_test])

df_continuous_test = pd.DataFrame(data=scaled_columns_test, columns=continuous_columns_test)
df_continuous_test.head()

Index(['TotRmsAbvGrd', 'WoodDeckSF', 'YrSold', '1stFlrSF'], dtype='object')


Unnamed: 0,TotRmsAbvGrd,WoodDeckSF,YrSold,1stFlrSF
0,-0.233638,0.954269,-1.354405,-0.183993
1,1.620229,0.899006,1.641704,0.939331
2,-0.851594,-0.814167,1.641704,-0.288005
3,0.384318,-0.814167,-1.354405,-0.350412
4,-0.233638,1.285851,0.892676,1.251365


In [18]:
# categorical feature scaling 
categorical_columns_test = df_test[features].select_dtypes(include='object').columns
print(categorical_columns_test)

from sklearn.preprocessing import OneHotEncoder
encoder_test = OneHotEncoder()

encoder_test.fit(df_test[categorical_columns_test])
categorical_features_encoded_test = encoder_test.transform(df_test[categorical_columns_test])

df_categorical_test = pd.DataFrame(categorical_features_encoded_test.toarray(), columns=encoder_test.get_feature_names_out(categorical_columns_test))
df_categorical_test.head()

Index(['Foundation', 'KitchenQual'], dtype='object')


Unnamed: 0,Foundation_BrkTil,Foundation_CBlock,Foundation_PConc,Foundation_Slab,Foundation_Stone,KitchenQual_Ex,KitchenQual_Fa,KitchenQual_Gd,KitchenQual_TA
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0


In [19]:
X_test = df_continuous_test.join(df_categorical_test)
X_test

Unnamed: 0,TotRmsAbvGrd,WoodDeckSF,YrSold,1stFlrSF,Foundation_BrkTil,Foundation_CBlock,Foundation_PConc,Foundation_Slab,Foundation_Stone,KitchenQual_Ex,KitchenQual_Fa,KitchenQual_Gd,KitchenQual_TA
0,-0.233638,0.954269,-1.354405,-0.183993,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,1.620229,0.899006,1.641704,0.939331,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,-0.851594,-0.814167,1.641704,-0.288005,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.384318,-0.814167,-1.354405,-0.350412,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,-0.233638,1.285851,0.892676,1.251365,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
360,1.002273,-0.814167,-0.605378,-0.053979,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
361,-0.233638,-0.814167,0.892676,-1.333321,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
362,0.384318,0.954269,-0.605378,0.377669,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
363,0.384318,0.733215,0.892676,-0.415419,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [20]:
y_test=df_test['SalePrice']
y_test

0      154500
1      325000
2      115000
3      159000
4      315500
        ...  
360    195000
361    120000
362    228500
363    248000
364    124000
Name: SalePrice, Length: 365, dtype: int64

### Model predictions on the test set

In [120]:
y_pred = model.predict(X_test)
y_pred[y_pred < 0] = 0

### Model evaluation

In [121]:
import numpy as np
from sklearn.metrics import mean_squared_log_error

def compute_rmsle(y_test: np.ndarray, y_pred: np.ndarray, precision: int = 2) -> float:
    rmsle = np.sqrt(mean_squared_log_error(y_test, y_pred))
    return round(rmsle, precision)

compute_rmsle(y_test, y_pred)

0.22

# MODEL INFERENCE

In [122]:
# Loading data

df_test_data = pd.read_csv('../../data/test.csv')
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


### Preprocessing and feature engineering of this data

In [123]:
#feature selection 
features = ['Foundation', 'KitchenQual', 'TotRmsAbvGrd', 'WoodDeckSF', 'YrSold', '1stFlrSF']
df_test_file = df_test_data[features]
df_test_file.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Foundation    1459 non-null   object
 1   KitchenQual   1458 non-null   object
 2   TotRmsAbvGrd  1459 non-null   int64 
 3   WoodDeckSF    1459 non-null   int64 
 4   YrSold        1459 non-null   int64 
 5   1stFlrSF      1459 non-null   int64 
dtypes: int64(4), object(2)
memory usage: 68.5+ KB


In [124]:
# checking dulicates 
df_test_file[features].duplicated(keep='first').sum()

#removing dulpicates 
df_test_file = df_test_file[~df_test_file[features].duplicated(keep='first')]
df_test_file= df_test_file.reset_index(drop=True)

In [161]:
#continuous feature scaling 
continuous_columns_test_file = df_test_file[features].select_dtypes(include='number').columns
print(continuous_columns_test_file)

scaler_test_file = joblib.load(path+'scaler.joblib')

scaler_test_file.fit(df_test_file[continuous_columns_test_file])
scaled_columns_test_file = scaler_test_file.transform(df_test_file[continuous_columns_test_file])

df_continuous_test_file = pd.DataFrame(data=scaled_columns_test_file, columns=continuous_columns_test_file)
df_continuous_test_file.head()

Index(['TotRmsAbvGrd', 'WoodDeckSF', 'YrSold', '1stFlrSF'], dtype='object')


Unnamed: 0,TotRmsAbvGrd,WoodDeckSF,YrSold,1stFlrSF
0,-0.92489,0.359033,1.71568,-0.667328
1,-0.263462,2.333145,1.71568,0.420259
2,-0.263462,0.920836,1.71568,-0.586952
3,0.397966,2.075652,1.71568,-0.591975
4,-0.92489,-0.733361,1.71568,0.297183


In [162]:
# categorical feature scaling 
categorical_columns_test_file = df_test_file[features].select_dtypes(include='object').columns
print(categorical_columns_test_file)


encoder_test_file = joblib.load(path+'encoder.joblib')

encoder_test_file.fit(df_test_file[categorical_columns_test_file])
categorical_features_encoded_test_file = encoder_test_file.transform(df_test_file[categorical_columns_test_file])

df_categorical_test_file = pd.DataFrame(categorical_features_encoded_test_file.toarray(), columns=encoder_test_file.get_feature_names_out(categorical_columns_test_file))
df_categorical_test_file.drop(['Foundation_Wood'],axis=1,inplace=True)
df_categorical_test_file.drop(['KitchenQual_nan'],axis=1,inplace=True)
df_categorical_test_file.head()

Index(['Foundation', 'KitchenQual'], dtype='object')


Unnamed: 0,Foundation_BrkTil,Foundation_CBlock,Foundation_PConc,Foundation_Slab,Foundation_Stone,KitchenQual_Ex,KitchenQual_Fa,KitchenQual_Gd,KitchenQual_TA
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [143]:
X_test_file = df_continuous_test_file.join(df_categorical_test_file)
X_test_file

Unnamed: 0,TotRmsAbvGrd,WoodDeckSF,YrSold,1stFlrSF,Foundation_BrkTil,Foundation_CBlock,Foundation_PConc,Foundation_Slab,Foundation_Stone,KitchenQual_Ex,KitchenQual_Fa,KitchenQual_Gd,KitchenQual_TA
0,-0.924890,0.359033,1.715680,-0.667328,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,-0.263462,2.333145,1.715680,0.420259,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,-0.263462,0.920836,1.715680,-0.586952,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.397966,2.075652,1.715680,-0.591975,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,-0.924890,-0.733361,1.715680,0.297183,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1426,1.059394,0.515089,-1.361898,0.498123,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
1427,-0.263462,-0.733361,-1.361898,-1.546439,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1428,0.397966,2.965173,-1.361898,0.156525,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1429,-0.263462,-0.109136,-1.361898,-0.481459,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0


### Predicting the house prices of this data

In [158]:
model_saved = joblib.load(path+'model.joblib')

y_pred_test_file = model_saved.predict(X_test_file)
y_pred_test_file[y_pred_test_file < 0] = 0

predicted_df = pd.DataFrame({'y_pred_test_file': y_pred_test_file})
final_predicted_values = pd.concat([X_test_file, predicted_df], axis=1)

final_predicted_values

Unnamed: 0,TotRmsAbvGrd,WoodDeckSF,YrSold,1stFlrSF,Foundation_BrkTil,Foundation_CBlock,Foundation_PConc,Foundation_Slab,Foundation_Stone,KitchenQual_Ex,KitchenQual_Fa,KitchenQual_Gd,KitchenQual_TA,y_pred_test_file
0,-0.924890,0.359033,1.715680,-0.667328,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,120357.717039
1,-0.263462,2.333145,1.715680,0.420259,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,208210.409804
2,-0.263462,0.920836,1.715680,-0.586952,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,166305.033059
3,0.397966,2.075652,1.715680,-0.591975,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,224826.606814
4,-0.924890,-0.733361,1.715680,0.297183,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,194267.529305
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1426,1.059394,0.515089,-1.361898,0.498123,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,214679.779883
1427,-0.263462,-0.733361,-1.361898,-1.546439,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,108803.070831
1428,0.397966,2.965173,-1.361898,0.156525,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,187154.409367
1429,-0.263462,-0.109136,-1.361898,-0.481459,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,162388.383684
