In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('data/train.csv')

In [72]:
test = pd.read_csv('data/test.csv')

# Linear Regression

In [29]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import r2_score

In [30]:
features_used = ['MSSubClass', 'OverallQual', 'GrLivArea', 'GarageCars', 'Neighborhood', 'TotalBsmtSF', '1stFlrSF', 'FullBath', 'YearBuilt']

X = df[features_used]
y = df['SalePrice']

# Neigborhood is a categorical value
X_encoded = pd.get_dummies(X, columns=['Neighborhood', 'MSSubClass'])
label_encoder = LabelEncoder()
X['Neighborhood_encoded'] = label_encoder.fit_transform(X['Neighborhood'])
X['MSSubClass_encoded'] = label_encoder.fit_transform(X['MSSubClass'])
X.drop(columns=['Neighborhood', 'MSSubClass_encoded'], inplace=True)

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the normalizer and fit it to the training data
scaler = Normalizer()
X_train_scaled = scaler.fit_transform(X_train)

# Transform the testing data using the same normalizer
X_test_scaled = scaler.transform(X_test)

# Initialize and fit the linear regression model
model = LinearRegression()
model.fit(X_train_scaled, y_train)

# Make predictions on the testing set
y_pred = model.predict(X_test_scaled)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['Neighborhood_encoded'] = label_encoder.fit_transform(X['Neighborhood'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['MSSubClass_encoded'] = label_encoder.fit_transform(X['MSSubClass'])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.drop(columns=['Neighborhood', 'MSSubClass_encoded'], inplace=True)


In [31]:
# Scoring 

# Calculate the mean squared error of the model
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error: {:,}".format(mse))

# Calculate the root mean squared error
rmse = np.sqrt(mse)
print("Root Mean Squared Error:", rmse)

r2 = r2_score(y_test, y_pred)

# Print the R-squared score
print("R-squared Score:", r2)

Mean Squared Error: 1,919,296,497.122615
Root Mean Squared Error: 43809.776273368654
R-squared Score: 0.7497764182018456


In [31]:
test_x = test[features_used]
test_ids = test['Id']

# imputation
column_means = test_x.mean()
test_x.fillna(column_means, inplace=True)

test_y = model.predict(test_x)

results_submission = pd.DataFrame({
    'Id': test_ids,
    'SalePrice': test_y
})



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_x.fillna(column_means, inplace=True)


In [32]:
results_submission.to_csv('submission.csv', index=False)

In [25]:
test_ids

0       1461
1       1462
2       1463
3       1464
4       1465
        ... 
1454    2915
1455    2916
1456    2917
1457    2918
1458    2919
Name: Id, Length: 1459, dtype: int64

# Decision Tree

In [32]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error

In [33]:
features_used = ['MSSubClass', 'OverallQual', 'GrLivArea', 'GarageCars', 'Neighborhood', 'TotalBsmtSF', '1stFlrSF', 'FullBath', 'YearBuilt']

X = df[features_used]
y = df['SalePrice']

# Neigborhood is a categorical value
X_encoded = pd.get_dummies(X, columns=['Neighborhood', 'MSSubClass'])
label_encoder = LabelEncoder()
X['Neighborhood_encoded'] = label_encoder.fit_transform(X['Neighborhood'])
X['MSSubClass_encoded'] = label_encoder.fit_transform(X['MSSubClass'])
X.drop(columns=['Neighborhood', 'MSSubClass_encoded'], inplace=True)


# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

regressor = DecisionTreeRegressor(max_depth=8)

regressor.fit(X_train, y_train)

y_pred = regressor.predict(X_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['Neighborhood_encoded'] = label_encoder.fit_transform(X['Neighborhood'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['MSSubClass_encoded'] = label_encoder.fit_transform(X['MSSubClass'])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.drop(columns=['Neighborhood', 'MSSubClass_encoded'], inplace=True)


In [34]:
# Scoring 

# Calculate the mean squared error of the model
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error: {:,}".format(mse))

# Calculate the root mean squared error
rmse = np.sqrt(mse)
print("Root Mean Squared Error:", rmse)

r2 = r2_score(y_test, y_pred)

# Print the R-squared score
print("R-squared Score:", r2)

Mean Squared Error: 1,681,136,420.6839397
Root Mean Squared Error: 41001.66363312518
R-squared Score: 0.7808259550801492


In [18]:
test_x = test[features_used]
test_ids = test['Id']

test_y = regressor.predict(test_x)

results_submission = pd.DataFrame({
    'Id': test_ids,
    'SalePrice': test_y
})


In [19]:
results_submission.to_csv('submission.csv', index=False)

# Random Forest

In [77]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [78]:
features_used = ['MSZoning', 'Utilities', 'BldgType', 'HouseStyle', 'OverallQual', 'OverallCond','GrLivArea', 'GarageCars', 'Neighborhood', 'TotalBsmtSF', '1stFlrSF', 'FullBath', 'YearBuilt']

X = df[features_used]
y = df['SalePrice']

# categorical values
# Selecting the categorical columns for one-hot encoding
categorical_cols = ['Neighborhood', 'MSZoning', 'Utilities', 'BldgType', 'HouseStyle']

# Extracting the categorical columns and the remaining numerical columns
X_categorical = X[categorical_cols]
X_numerical = X.drop(columns=categorical_cols)

# Performing one-hot encoding on the categorical columns
encoder = OneHotEncoder()
X_encoded = encoder.fit_transform(X_categorical)

# Creating a DataFrame with the encoded columns
encoded_df = pd.DataFrame(X_encoded.toarray(), columns=encoder.get_feature_names_out(categorical_cols))


# Concatenating the encoded DataFrame with the numerical features
X_final = pd.concat([X_numerical.reset_index(drop=True), encoded_df.reset_index(drop=True)], axis=1)

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_final, y, test_size=0.2, random_state=42)

rf_regressor = RandomForestRegressor(n_estimators=100, max_depth=None, random_state=42)

# Train the regressor on the training data
rf_regressor.fit(X_train, y_train)

# Make predictions on the testing data
y_pred = rf_regressor.predict(X_test)

In [79]:
# Scoring 

# Calculate the mean squared error of the model
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error: {:,}".format(mse))

# Calculate the root mean squared error
rmse = np.sqrt(mse)
print("Root Mean Squared Error:", rmse)

r2 = r2_score(y_test, y_pred)

# Print the R-squared score
print("R-squared Score:", r2)

Mean Squared Error: 712,456,661.3281975
Root Mean Squared Error: 26691.883810031046
R-squared Score: 0.9071152070872002


In [80]:
test_x = test[features_used]
test_ids = test['Id']

# Extracting the categorical columns and the remaining numerical columns
X_categorical = test_x[categorical_cols]
X_numerical = test_x.drop(columns=categorical_cols)

X_categorical.fillna('missing', inplace=True)  # Replace NaNs with a placeholder value
X_numerical.fillna(X_numerical.mean(), inplace=True)  # Example: Fill numerical NaNs with mean

# Performing one-hot encoding on the categorical columns
encoder = OneHotEncoder(handle_unknown='ignore')
X_encoded = encoder.fit_transform(X_categorical)

# Creating a DataFrame with the encoded columns
encoded_df = pd.DataFrame(X_encoded.toarray(), columns=encoder.get_feature_names_out(categorical_cols))
X_final = pd.concat([X_numerical.reset_index(drop=True), encoded_df.reset_index(drop=True)], axis=1)

test_y = rf_regressor.predict(X_final)

results_submission = pd.DataFrame({
    'Id': test_ids,
    'SalePrice': test_y
})


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_categorical.fillna('missing', inplace=True)  # Replace NaNs with a placeholder value


ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- MSZoning_missing
- Utilities_missing
Feature names seen at fit time, yet now missing:
- HouseStyle_2.5Fin
- Utilities_NoSeWa


In [80]:
test_x = test[features_used]
test_ids = test['Id']

# Extracting the categorical columns and the remaining numerical columns
X_categorical = test_x[categorical_cols]
X_numerical = test_x.drop(columns=categorical_cols)

X_categorical.fillna('missing', inplace=True)  # Replace NaNs with a placeholder value
X_numerical.fillna(X_numerical.mean(), inplace=True)  # Example: Fill numerical NaNs with mean

# Performing one-hot encoding on the categorical columns
encoder = OneHotEncoder(handle_unknown='ignore')
X_encoded = encoder.fit_transform(X_categorical)

# Creating a DataFrame with the encoded columns
encoded_df = pd.DataFrame(X_encoded.toarray(), columns=encoder.get_feature_names_out(categorical_cols))
X_final = pd.concat([X_numerical.reset_index(drop=True), encoded_df.reset_index(drop=True)], axis=1)

test_y = rf_regressor.predict(X_final)

results_submission = pd.DataFrame({
    'Id': test_ids,
    'SalePrice': test_y
})


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_categorical.fillna('missing', inplace=True)  # Replace NaNs with a placeholder value


ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- MSZoning_missing
- Utilities_missing
Feature names seen at fit time, yet now missing:
- HouseStyle_2.5Fin
- Utilities_NoSeWa


In [81]:
X_final.columns

Index(['OverallQual', 'OverallCond', 'GrLivArea', 'GarageCars', 'TotalBsmtSF',
       '1stFlrSF', 'FullBath', 'YearBuilt', 'Neighborhood_Blmngtn',
       'Neighborhood_Blueste', 'Neighborhood_BrDale', 'Neighborhood_BrkSide',
       'Neighborhood_ClearCr', 'Neighborhood_CollgCr', 'Neighborhood_Crawfor',
       'Neighborhood_Edwards', 'Neighborhood_Gilbert', 'Neighborhood_IDOTRR',
       'Neighborhood_MeadowV', 'Neighborhood_Mitchel', 'Neighborhood_NAmes',
       'Neighborhood_NPkVill', 'Neighborhood_NWAmes', 'Neighborhood_NoRidge',
       'Neighborhood_NridgHt', 'Neighborhood_OldTown', 'Neighborhood_SWISU',
       'Neighborhood_Sawyer', 'Neighborhood_SawyerW', 'Neighborhood_Somerst',
       'Neighborhood_StoneBr', 'Neighborhood_Timber', 'Neighborhood_Veenker',
       'MSZoning_C (all)', 'MSZoning_FV', 'MSZoning_RH', 'MSZoning_RL',
       'MSZoning_RM', 'MSZoning_missing', 'Utilities_AllPub',
       'Utilities_missing', 'BldgType_1Fam', 'BldgType_2fmCon',
       'BldgType_Duplex', 'BldgT

In [53]:
results_submission.to_csv('submission.csv', index=False)

# Gradient Boosting

In [18]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import r2_score

In [19]:
features_used = ['OverallQual', 'GrLivArea', 'GarageCars', 'Neighborhood', 'TotalBsmtSF', '1stFlrSF', 'FullBath', 'YearBuilt']

X = df[features_used]
y = df['SalePrice']

# Neigborhood is a categorical value
X_encoded = pd.get_dummies(X, columns=['Neighborhood'])
label_encoder = LabelEncoder()
X['Neighborhood_encoded'] = label_encoder.fit_transform(X['Neighborhood'])
X.drop(columns=['Neighborhood'], inplace=True)

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

gb_regressor = GradientBoostingRegressor(n_estimators=100, max_depth=3, learning_rate=0.1, random_state=42)

# Train the regressor on the training data
gb_regressor.fit(X_train, y_train)

# Make predictions on the testing data
y_pred = gb_regressor.predict(X_test)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['Neighborhood_encoded'] = label_encoder.fit_transform(X['Neighborhood'])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.drop(columns=['Neighborhood'], inplace=True)


In [20]:
# Scoring 

# Calculate the mean squared error of the model
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error: {:,}".format(mse))

# Calculate the root mean squared error
rmse = np.sqrt(mse)
print("Root Mean Squared Error:", rmse)

r2 = r2_score(y_test, y_pred)

# Print the R-squared score
print("R-squared Score:", r2)

Mean Squared Error: 765,902,295.6661631
Root Mean Squared Error: 27674.939849368475
R-squared Score: 0.9001473633613509


In [15]:
test_x = test[features_used]

# handles categorical values
X_encoded = pd.get_dummies(test_x, columns=['Neighborhood'])
label_encoder = LabelEncoder()
test_x['Neighborhood_encoded'] = label_encoder.fit_transform(test_x['Neighborhood'])
test_x.drop(columns=['Neighborhood'], inplace=True)

# missing values
test_x.fillna(test_x.mean(), inplace=True)

test_ids = test['Id']

test_y = gb_regressor.predict(test_x)

results_submission = pd.DataFrame({
    'Id': test_ids,
    'SalePrice': test_y
})


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_x.loc[:, 'Neighborhood_encoded'] = label_encoder.fit_transform(test_x['Neighborhood'])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_x.drop(columns=['Neighborhood'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_x.fillna(test_x.mean(), inplace=True)


In [16]:
results_submission.to_csv('submission.csv', index=False)

# Support Vector Machine (SVM)

In [7]:
from sklearn.svm import SVR
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.metrics import r2_score

In [5]:
features_used = ['OverallQual', 'GrLivArea', 'GarageCars', 'TotalBsmtSF', '1stFlrSF', 'FullBath', 'YearBuilt']

X = df[features_used]
y = df['SalePrice']
# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the Support Vector Regression (SVR) model
svm_regressor = SVR(kernel='linear')

# Fit the model to the training data
svm_regressor.fit(X_train, y_train)

y_pred = svm_regressor.predict(X_test)


In [8]:
# Scoring 

# Calculate the mean squared error of the model
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error: {:,}".format(mse))

# Calculate the root mean squared error
rmse = np.sqrt(mse)
print("Root Mean Squared Error:", rmse)

r2 = r2_score(y_test, y_pred)

# Print the R-squared score
print("R-squared Score:", r2)

Mean Squared Error: 2,082,664,413.6420221
Root Mean Squared Error: 45636.218222394615
R-squared Score: 0.7284777260593484
