In [17]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

from sklearn.model_selection import GridSearchCV




In [19]:
df_train = pd.read_csv("train.csv")
df_train

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,175000
1456,1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,2,2010,WD,Normal,210000
1457,1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,...,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1458,1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2010,WD,Normal,142125


In [22]:
df_train.shape

(1460, 81)

In [4]:
df_test = pd.read_csv("test.csv")
df_test.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal


In [5]:
sample_submission = pd.read_csv("sample_submission.csv")
sample_submission.head()

Unnamed: 0,Id,SalePrice
0,1461,169277.052498
1,1462,187758.393989
2,1463,183583.68357
3,1464,179317.477511
4,1465,150730.079977


In [6]:
#df_train.columns

In [7]:
#df_test.columns

In [8]:
# Model 1: Random Forest

# Preprocessing data to make rf model


target_column = 'SalePrice'

# Extract features and target variable from the training set
X = df_train.drop(target_column, axis=1)
y = df_train[target_column]

# Identify categorical columns in the dataset
categorical_cols = X.select_dtypes(include=['object']).columns

# Identify quantative columns in the dataset
numerical_cols = X.select_dtypes(exclude=['object']).columns

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# Create transformers for numerical and categorical columns
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Create a preprocessor to apply transformers to appropriate columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])



In [9]:
# Create a Random Forest regressor model
rf_model = RandomForestRegressor(random_state=12) 

# Create a pipeline with the preprocessor and Random Forest model
# model = Pipeline(steps=[
#     ('preprocessor', preprocessor),
#     ('regressor', rf_model)
# ])

# # Train the model on the training set
# model.fit(X_train, y_train)

# # Make predictions on the test set
# y_test_pred = model.predict(X_test)


In [11]:
# Find the best parameters for the Random Forest model using Cross Validation!

# Define the parameter grid
param_grid = {
    'regressor__n_estimators': [1, 5, 10, 25, 50],
    'regressor__max_depth': [None, 2, 5, 7, 10],
    'regressor__min_samples_split': [2, 4, 5, 7, 10],
    'regressor__min_samples_leaf': [1, 2, 3, 5, 8]
}

# Create a Random Forest regressor model
rf_model = RandomForestRegressor(random_state=12)

# Create a pipeline with the preprocessor and Random Forest model
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', rf_model)
])

# Instantiate GridSearchCV with the pipeline and parameter grid
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# Print the best parameters found
print("Best Parameters:", grid_search.best_params_)




Best Parameters: {'regressor__max_depth': 10, 'regressor__min_samples_leaf': 1, 'regressor__min_samples_split': 4, 'regressor__n_estimators': 50}


In [13]:

# Get the best model from the grid search
best_model = grid_search.best_estimator_

#best_model

# Make predictions on the test set using the best model
y_test_pred = best_model.predict(X_test)

In [25]:
# Calculate RMSE using cross-validation on the training data
cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
cv_rmse = np.sqrt(-np.mean(cv_scores))

print("RMSE using cross-validation on training data:", cv_rmse)



# Make predictions on the training set
y_train_pred = best_model.predict(X_train)

# Calculate RMSE on the entire training set
train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))

print("\nRMSE on the entire training set:", train_rmse)

RMSE using cross-validation on training data: 31026.438022257378

RMSE on the entire training set: 13861.158264096135


In [23]:
# get the RMSE value for rf model

# Calculate Mean Squared Error
mse = mean_squared_error(y_test, y_test_pred)

# Calculate Root Mean Squared Error
rmse = np.sqrt(mse)

print(f'Root Mean Squared Error (RMSE): ${rmse:.2f}')


Root Mean Squared Error (RMSE): $28281.04


In [None]:
# Model 2

In [31]:
import pandas as pd
import warnings
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

In [44]:
#Elastic Reg

# Assuming df_train is your training dataset
# Replace 'SalePrice' with the actual name of your target variable column
target_column = 'SalePrice'

# Extract features and target variable from the dataset
X = df_train.drop(target_column, axis=1)
y = df_train[target_column]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Identify categorical columns in the dataset
categorical_cols = X_train.select_dtypes(include=['object']).columns

# Identify quantitative columns in the dataset
numerical_cols = X_train.select_dtypes(exclude=['object']).columns

# Create transformers for numerical and categorical columns
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Create a preprocessor to apply transformers to appropriate columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Create an Elastic Net regression model
elastic_net_model = ElasticNet(alpha=1.0, l1_ratio=0.5, random_state=2)

# You can adjust the alpha and l1_ratio parameters based on your needs

# Create a pipeline with the preprocessor and Elastic Net model
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', elastic_net_model)
])

# Train the model on the training set
model.fit(X_train, y_train)

# Make predictions on the training set
y_train_pred = model.predict(X_train)

# Make predictions on the test set
y_test_pred = model.predict(X_test)

# Evaluate performance on training set
mse_train = mean_squared_error(y_train, y_train_pred)
rmse_train = np.sqrt(mse_train)
r2_train = r2_score(y_train, y_train_pred)

# Evaluate performance on test set
mse_test = mean_squared_error(y_test, y_test_pred)
rmse_test = np.sqrt(mse_test)
r2_test = r2_score(y_test, y_test_pred)

# Reset warnings to default
warnings.resetwarnings()

print(f"Training Set Metrics:")
print(f"  RMSE: {rmse_train}")
print(f"  R-squared: {r2_train}")

print(f"\nTest Set Metrics:")
print(f"  RMSE: {rmse_test}")
print(f"  R-squared: {r2_test}")




Training Set Metrics:
  RMSE: 32360.736110918246
  R-squared: 0.8244264571572313

Test Set Metrics:
  RMSE: 36236.382686460376
  R-squared: 0.8288110055907076


See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  common = np.find_common_type(
See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  return np.find_common_type(types, [])  # type: ignore[arg-type]
See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  return np.find_common_type(types, [])  # type: ignore[arg-type]
See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  return np.find_common_type(types, [])  # type: ignore[arg-type]
See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  return np.find_common_type(types, [])  # type: ignore[arg-type]
See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  return

In [48]:
import numpy as np
import pandas as pd
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import mean_squared_error, r2_score

# Assuming df_train is your training dataset
# Replace 'SalePrice' with the actual name of your target variable column
target_column = 'SalePrice'

# Extract features and target variable from the dataset
X = df_train.drop(target_column, axis=1)
y = df_train[target_column]

# Identify categorical columns in the dataset
categorical_cols = X.select_dtypes(include=['object']).columns

# Identify quantitative columns in the dataset
numerical_cols = X.select_dtypes(exclude=['object']).columns

# Create transformers for numerical and categorical columns
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Create a preprocessor to apply transformers to appropriate columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Create an Elastic Net regression model
elastic_net_model = ElasticNet(alpha=1.0, l1_ratio=0.5, random_state=2)

# You can adjust the alpha and l1_ratio parameters based on your needs

# Create a pipeline with the preprocessor and Elastic Net model
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', elastic_net_model)
])

# Perform k-fold cross-validation and obtain predictions
y_cv_pred = cross_val_predict(model, X, y, cv=5)

# Evaluate performance on the entire dataset
mse_cv = mean_squared_error(y, y_cv_pred)
rmse_cv = np.sqrt(mse_cv)
r2_cv = r2_score(y, y_cv_pred)

print(f"Cross-Validated Metrics:")
print(f"  RMSE: {rmse_cv}")
print(f"  R-squared: {r2_cv}")

warnings.resetwarnings()


See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  common = np.find_common_type(
See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  return np.find_common_type(types, [])  # type: ignore[arg-type]
See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  return np.find_common_type(types, [])  # type: ignore[arg-type]
See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  return np.find_common_type(types, [])  # type: ignore[arg-type]
See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  return np.find_common_type(types, [])  # type: ignore[arg-type]
See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  return

Cross-Validated Metrics:
  RMSE: 34620.132851502414
  R-squared: 0.8099581772187635


See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  return np.find_common_type(types, [])  # type: ignore[arg-type]
See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  return np.find_common_type(types, [])  # type: ignore[arg-type]
See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  return np.find_common_type(types, [])  # type: ignore[arg-type]
See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  return np.find_common_type(types, [])  # type: ignore[arg-type]
See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  return np.find_common_type(types, [])  # type: ignore[arg-type]
See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.

In [None]:
# Model 3: Gradient Boosting Machine

In [None]:
# Model 4: Support Vector Machines

In [None]:
# Model 5 Nerual Nets