# Loading Dataset

In [3]:
import numpy as np
import pandas as pd

dataset = pd.read_csv('./train.csv')
dataset.head()


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [4]:
# let's split the target variable 'SalePrice' from our dataset
y = dataset['SalePrice']
X = dataset.drop(['SalePrice'],axis = 1)



# Splitting Data 

In [5]:
# i will use scikit learn to split the dataset into training and test set
from sklearn.model_selection import train_test_split


# spliting data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train.head()



Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
254,255,20,RL,70.0,8400,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
1066,1067,60,RL,59.0,7837,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,5,2009,WD,Normal
638,639,30,RL,67.0,8777,Pave,,Reg,Lvl,AllPub,...,0,0,,MnPrv,,0,5,2008,WD,Normal
799,800,50,RL,60.0,7200,Pave,,Reg,Lvl,AllPub,...,0,0,,MnPrv,,0,6,2007,WD,Normal
380,381,50,RL,50.0,5000,Pave,Pave,Reg,Lvl,AllPub,...,0,0,,,,0,5,2010,WD,Normal


In [6]:
X_train.dtypes

Id                 int64
MSSubClass         int64
MSZoning          object
LotFrontage      float64
LotArea            int64
                  ...   
MiscVal            int64
MoSold             int64
YrSold             int64
SaleType          object
SaleCondition     object
Length: 80, dtype: object

In [7]:
X_train_new = X_train[['LotArea','TotalBsmtSF','Neighborhood','BldgType']]


X_train_new.head()

Unnamed: 0,LotArea,TotalBsmtSF,Neighborhood,BldgType
254,8400,1314,NAmes,1Fam
1066,7837,799,Gilbert,1Fam
638,8777,796,Edwards,1Fam
799,7200,731,SWISU,1Fam
380,5000,1026,SWISU,1Fam


# Data preprocessing

In [8]:
X_train_new.isnull().sum()


LotArea         0
TotalBsmtSF     0
Neighborhood    0
BldgType        0
dtype: int64

### Scaling numerical Features

In [9]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

#fitting training data
scaler.fit(X_train_new[['LotArea','TotalBsmtSF']])

#transforming training data
X_train_new[['LotArea','TotalBsmtSF']] = scaler.transform(X_train_new[['LotArea','TotalBsmtSF']])

print(X_train_new.head())
print(f"shape of X_train_new is {X_train_new.shape}")


#selecting the features in test data
X_test = X_test[['LotArea','TotalBsmtSF','Neighborhood','BldgType']]


#transforming test data
X_test[['LotArea','TotalBsmtSF']] = scaler.transform(X_test[['LotArea','TotalBsmtSF']])



       LotArea  TotalBsmtSF Neighborhood BldgType
254  -0.212896     0.572612        NAmes     1Fam
1066 -0.265245    -0.596547      Gilbert     1Fam
638  -0.177841    -0.603357      Edwards     1Fam
799  -0.324474    -0.750921        SWISU     1Fam
380  -0.529035    -0.081209        SWISU     1Fam
shape of X_train_new is (1168, 4)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_new[['LotArea','TotalBsmtSF']] = scaler.transform(X_train_new[['LotArea','TotalBsmtSF']])


### Encoding Categorical Features

In [10]:
from sklearn.preprocessing import OneHotEncoder

enc = OneHotEncoder()

# Fit the encoder on both 'BldgType' and 'Neighborhood'
enc.fit(X_train_new[['BldgType', 'Neighborhood']])

#feature_aray is the encoded features without label
feature_array = enc.transform(X_train_new[['BldgType', 'Neighborhood']]).toarray()

# Get the categories of 'BldgType' and 'Neighborhood'
feature_labels = enc.categories_

# Flatten the array of categories
#we use flatting because the two features neighborhood and BldgType are not in the same dimension, bldgtype has 5 categories and neighborhood has 25
feature_labels_flat = [label for sublist in feature_labels for label in sublist]

#after flatting we get one list with 30 elements


# Create a DataFrame with the feature array and proper column names
df = pd.DataFrame(feature_array, columns=feature_labels_flat)

df.head()

Unnamed: 0,1Fam,2fmCon,Duplex,Twnhs,TwnhsE,Blmngtn,Blueste,BrDale,BrkSide,ClearCr,...,NoRidge,NridgHt,OldTown,SWISU,Sawyer,SawyerW,Somerst,StoneBr,Timber,Veenker
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
## Task the teacher about this code, without resetting index, the number of rows increased, check the shape of X_train_encoded before and after this code 
X_train_new.reset_index(drop=True, inplace=True)
df.reset_index(drop=True, inplace=True)

X_train_encoded = pd.concat([X_train_new, df], axis=1)


# Drop the original features : 'Neighborhood' and 'BldfType'
X_train_encoded.drop(['Neighborhood', 'BldgType'], axis=1, inplace=True)

X_train_encoded.head()



Unnamed: 0,LotArea,TotalBsmtSF,1Fam,2fmCon,Duplex,Twnhs,TwnhsE,Blmngtn,Blueste,BrDale,...,NoRidge,NridgHt,OldTown,SWISU,Sawyer,SawyerW,Somerst,StoneBr,Timber,Veenker
0,-0.212896,0.572612,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-0.265245,-0.596547,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-0.177841,-0.603357,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-0.324474,-0.750921,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-0.529035,-0.081209,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
# encoding the test data

X_test_features_array = enc.transform(X_test[['BldgType', 'Neighborhood']]).toarray()

X_test_features_labels = enc.categories_

X_test_labels_flat = [label for sublist in X_test_features_labels for label in sublist]

df2 = pd.DataFrame(X_test_features_array, columns=X_test_labels_flat)

## ask the teacher about this code, without resetting index, the number of rows increased, check the shape of X_train_encoded before and after this code 
X_test.reset_index(drop=True, inplace=True)
df2.reset_index(drop=True, inplace=True)


X_test_encoded = pd.concat([X_test, df2], axis=1)

X_test_encoded.drop(['Neighborhood', 'BldgType'], axis=1, inplace=True)

X_test_encoded.head()





Unnamed: 0,LotArea,TotalBsmtSF,1Fam,2fmCon,Duplex,Twnhs,TwnhsE,Blmngtn,Blueste,BrDale,...,NoRidge,NridgHt,OldTown,SWISU,Sawyer,SawyerW,Somerst,StoneBr,Timber,Veenker
0,-0.211594,-0.006292,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,0.145643,0.910874,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-0.160826,-0.122072,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-0.529035,-0.131153,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.205338,1.267297,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [13]:
print(X_train_encoded.columns)

Index(['LotArea', 'TotalBsmtSF', '1Fam', '2fmCon', 'Duplex', 'Twnhs', 'TwnhsE',
       'Blmngtn', 'Blueste', 'BrDale', 'BrkSide', 'ClearCr', 'CollgCr',
       'Crawfor', 'Edwards', 'Gilbert', 'IDOTRR', 'MeadowV', 'Mitchel',
       'NAmes', 'NPkVill', 'NWAmes', 'NoRidge', 'NridgHt', 'OldTown', 'SWISU',
       'Sawyer', 'SawyerW', 'Somerst', 'StoneBr', 'Timber', 'Veenker'],
      dtype='object')


In [14]:
X_test_encoded.columns

Index(['LotArea', 'TotalBsmtSF', '1Fam', '2fmCon', 'Duplex', 'Twnhs', 'TwnhsE',
       'Blmngtn', 'Blueste', 'BrDale', 'BrkSide', 'ClearCr', 'CollgCr',
       'Crawfor', 'Edwards', 'Gilbert', 'IDOTRR', 'MeadowV', 'Mitchel',
       'NAmes', 'NPkVill', 'NWAmes', 'NoRidge', 'NridgHt', 'OldTown', 'SWISU',
       'Sawyer', 'SawyerW', 'Somerst', 'StoneBr', 'Timber', 'Veenker'],
      dtype='object')

# Model Training and Evaluation

In [15]:
from sklearn.metrics import mean_squared_log_error
from sklearn.linear_model import LinearRegression


#training the model 
model = LinearRegression()
model.fit(X_train_encoded,y_train)
y_pred = model.predict(X_test_encoded)

def compute_rmsle(y_test: np.ndarray, y_pred: np.ndarray, precision: int = 2) -> float:
    rmsle = np.sqrt(mean_squared_log_error(y_test, y_pred))
    return round(rmsle, precision)
    

In [16]:
print(f" the rmsle error is  {compute_rmsle(y_test,y_pred)}")

 the rmsle error is  0.25
