<a href="https://colab.research.google.com/github/MrBigBrane/Machine-Learning/blob/main/Kaggle_Predicting_House_Prices.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [631]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [632]:
train_data = pd.read_csv("/content/train.csv")
test_data = pd.read_csv("/content/test.csv")

In [633]:
# Drop the label column and get label set
X_train_data = train_data.drop("SalePrice", axis=1)
y_train_data = train_data["SalePrice"]

X_test_data = test_data.drop(columns='Id')
X_test_copy = test_data.copy()

# Id is not useful for prediction
X_train_data.drop(columns='Id',inplace=True)
X_test_data.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,Inside,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,6,2010,WD,Normal
4,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,Inside,...,144,0,,,,0,1,2010,WD,Normal


In [634]:
# Dropping any column that has more than 50% n/a values
for col in X_train_data.columns:
    if (X_train_data[col].isnull().sum()/X_train_data.shape[0])*100>50:
        X_train_data.drop(columns=col,inplace=True)

for col in X_test_data.columns:
    if (X_test_data[col].isnull().sum()/X_test_data.shape[0])*100>50:
        X_test_data.drop(columns=col,inplace=True)

In [635]:
# Get column names that have string values for onehotencoding
string_cols = X_train_data.select_dtypes(include=['object']).columns
string_cols_test = X_test_data.select_dtypes(include=['object']).columns

# Get column names that have numerical data
num_cols = X_train_data.select_dtypes(include=['int64', 'float64']).columns
num_cols_test = X_test_data.select_dtypes(include=['int64', 'float64']).columns

In [636]:
# Filling n/a values in columns with numerical data with median of each column
X_train_data[num_cols_test] = X_train_data[num_cols_test].fillna(X_train_data[num_cols_test].median())
X_test_data[num_cols_test] = X_test_data[num_cols_test].fillna(X_test_data[num_cols_test].median())

# Filling n/a values in columns with letter/string data with the most common value of each column
X_train_data[string_cols_test] = X_train_data[string_cols_test].fillna(X_train_data[string_cols_test].mode().iloc[0])
X_test_data[string_cols_test] = X_test_data[string_cols_test].fillna(X_test_data[string_cols_test].mode().iloc[0])

In [637]:
# Check that there are no N/A values left
for col in X_train_data.columns:
    if X_train_data[col].isnull().sum()>0:
        print(col,'->',X_train_data[col].value_counts())

for col in X_test_data.columns:
    if X_test_data[col].isnull().sum()>0:
        print(col,'->',X_test_data[col].value_counts())

FireplaceQu -> FireplaceQu
Gd    380
TA    313
Fa     33
Ex     24
Po     20
Name: count, dtype: int64


In [638]:
X_train_data.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,...,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,RL,65.0,8450,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,61,0,0,0,0,0,2,2008,WD,Normal
1,20,RL,80.0,9600,Pave,Reg,Lvl,AllPub,FR2,Gtl,...,0,0,0,0,0,0,5,2007,WD,Normal
2,60,RL,68.0,11250,Pave,IR1,Lvl,AllPub,Inside,Gtl,...,42,0,0,0,0,0,9,2008,WD,Normal
3,70,RL,60.0,9550,Pave,IR1,Lvl,AllPub,Corner,Gtl,...,35,272,0,0,0,0,2,2006,WD,Abnorml
4,60,RL,84.0,14260,Pave,IR1,Lvl,AllPub,FR2,Gtl,...,84,0,0,0,0,0,12,2008,WD,Normal


### One Hot Encoding

In [639]:
# One hot encode only the string value columns
X_train_encoded = pd.get_dummies(X_train_data, columns=string_cols_test, drop_first=True)
X_test_encoded = pd.get_dummies(X_test_data, columns=string_cols_test, drop_first=True)

# Columns with non-binary values (to distinguish for scaling)
non_binary_columns = [
    col for col in X_train_encoded.columns if X_train_encoded[col].nunique() != 2
]
# Columns with only binary values (to distinguish for scaling)
binary_columns = [
    col for col in X_train_encoded.columns if X_train_encoded[col].nunique() == 2
]

non_binary_columns_test = [
    col for col in X_test_encoded.columns if X_test_encoded[col].nunique() != 2
]
binary_columns_test = [
    col for col in X_test_encoded.columns if X_test_encoded[col].nunique() == 2
]

# Cast boolean ohe to int
X_train_encoded[binary_columns_test] = X_train_encoded[binary_columns_test].astype(int)
X_test_encoded[binary_columns_test] = X_test_encoded[binary_columns_test].astype(int)

In [640]:
common_string_cols = list(set(X_train_encoded) & set(X_test_encoded.columns))
X_train_encoded = X_train_encoded[common_string_cols]
X_test_encoded = X_test_encoded[common_string_cols]

### Split and Scale Data

In [641]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

X_train, X_test, y_train, y_test = train_test_split(X_train_encoded, y_train_data, test_size=0.2)


# Scale non-binary data columns with min max scaler (worked best)
scaler = MinMaxScaler()
X_train[non_binary_columns_test] = scaler.fit_transform(X_train[non_binary_columns_test])
X_test[non_binary_columns_test] = scaler.transform(X_test[non_binary_columns_test])

# Scale test data
X_test_encoded[non_binary_columns_test] = scaler.transform(X_test_encoded[non_binary_columns_test])

## Testing Lasso Model

In [642]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso

# Define parameter grid
param_grid = {'alpha': [0.01, 0.1, 1, 10, 100, 150, 200, 300, 500]}

# Grid search for Lasso
lasso = Lasso(max_iter=10000)
grid_search = GridSearchCV(lasso, param_grid, cv=5, scoring='neg_root_mean_squared_error')
grid_search.fit(X_train, y_train)

# Best alpha
print("Best alpha:", grid_search.best_params_['alpha'])

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Best alpha: 300


In [643]:
best_params = grid_search.best_params_
print("Best Parameters:", best_params)

Best Parameters: {'alpha': 300}


In [644]:
final_model = Lasso(
    alpha=best_params['alpha'],
    max_iter=10000
)

# Train the model
final_model.fit(X_train, y_train)

In [645]:
from sklearn import metrics

#### Test dataset - metrics ####
y_test_pred = final_model.predict(X_test)
r2_score = round(metrics.r2_score(y_test, y_test_pred),2)
rmse = round(np.sqrt(metrics.mean_squared_error(y_test, y_test_pred)),2)
print(f'r2: {r2_score}')
print(f'rmse: {rmse}')

r2: 0.9
rmse: 23263.86


In [646]:
submission_pred = final_model.predict(X_test_encoded)

print(submission_pred)

[ 99305.05930802 151846.85200316 174842.35027293 ... 160535.35647001
 112430.93651399 229983.9492406 ]


In [647]:
ans=pd.DataFrame({'Id':X_test_copy['Id'],'SalePrice':submission_pred})
ans.to_csv('submission.csv',index=False)

## ElasticNet

In [334]:
from sklearn.linear_model import ElasticNet

# Define the parameter grid
param_grid = {
    'alpha': [0.01, 0.1, 1, 10],
    'l1_ratio': [0.1, 0.5, 0.7, 0.9]
}

# ElasticNet model
elastic_net = ElasticNet(max_iter=10000)

# Grid search
grid_search = GridSearchCV(elastic_net, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

# Best parameters
print("Best parameters:", grid_search.best_params_)

best_params = grid_search.best_params_

Best parameters: {'alpha': 0.01, 'l1_ratio': 0.1}


In [335]:
final_model = ElasticNet(
    alpha=best_params['alpha'],
    l1_ratio=best_params['l1_ratio'],
    max_iter=10000
)

# Train the model
final_model.fit(X_train, y_train)

In [336]:
from sklearn import metrics

#### Test dataset - metrics ####
y_test_pred = final_model.predict(X_test)
r2_score = round(metrics.r2_score(y_test, y_test_pred),2)
rmse = round(np.sqrt(metrics.mean_squared_error(y_test, y_test_pred)),2)
print(f'r2: {r2_score}')
print(f'rmse: {rmse}')

r2: 0.74
rmse: 37731.52


## Ridge

In [337]:
from sklearn.linear_model import Ridge

# Define the parameter grid
param_grid = {'alpha': [0.01, 0.1, 1, 10, 100]}

# Ridge model
ridge = Ridge()

# Grid search
grid_search = GridSearchCV(ridge, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

# Best parameters
print("Best parameters:", grid_search.best_params_)

best_params = grid_search.best_params_

Best parameters: {'alpha': 10}


In [338]:
final_model = Ridge(
    alpha=best_params['alpha'],
)

# Train the model
final_model.fit(X_train, y_train)

In [339]:
#### Test dataset - metrics ####
y_test_pred = final_model.predict(X_test)
r2_score = round(metrics.r2_score(y_test, y_test_pred),2)
rmse = round(np.sqrt(metrics.mean_squared_error(y_test, y_test_pred)),2)
print(f'r2: {r2_score}')
print(f'rmse: {rmse}')

r2: 0.74
rmse: 37958.7


## Random Forest

In [340]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor

# Define the RandomForestRegressor model
rf = RandomForestRegressor(random_state=42)

# Define the parameter distribution
param_dist = {
    'n_estimators': [100, 200, 500],
    'max_depth': [None, 10, 20, 30, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2'],
    'bootstrap': [True, False]
}

# Set up RandomizedSearchCV with cross-validation
random_search = RandomizedSearchCV(estimator=rf, param_distributions=param_dist,
                                   n_iter=25, cv=5, scoring='neg_mean_squared_error',
                                   n_jobs=-1, verbose=2, random_state=42)

# Fit the random search to the data
random_search.fit(X_train, y_train)

# Get the best parameters and model
print("Best parameters found: ", random_search.best_params_)
best_model_random = random_search.best_estimator_


Fitting 5 folds for each of 25 candidates, totalling 125 fits
Best parameters found:  {'n_estimators': 500, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': None, 'bootstrap': False}


In [346]:
best_params = random_search.best_params_

final_model = RandomForestRegressor(
    n_estimators=best_params['n_estimators'],
    max_depth=best_params['max_depth'],
    min_samples_split=best_params['min_samples_split'],
    min_samples_leaf=best_params['min_samples_leaf'],
    max_features=best_params['max_features'],
    bootstrap=best_params['bootstrap'],
    random_state=42
)

# Train the model
final_model.fit(X_train, y_train)

In [347]:
#### Test dataset - metrics ####
y_test_pred = final_model.predict(X_test)
r2_score = round(metrics.r2_score(y_test, y_test_pred),2)
rmse = round(np.sqrt(metrics.mean_squared_error(y_test, y_test_pred)),2)
print(f'r2: {r2_score}')
print(f'rmse: {rmse}')

r2: 0.3
rmse: 62363.65
