In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate
from sklearn.naive_bayes import GaussianNB
import pathlib

### Load data, and preprocess
The data has already been cleaned; we just need to drop the Id column and remove any columns that were added during export/import from R to Python. We also use the `pd.get_dummies` function to encode discrete values using one-hot encoding.

In [3]:
# Read the exported data with "Id" as the index, discard the leftover
# "Unnamed: 0" column, then swap the Id index for a fresh default one.
houseprices = (
    pd.read_csv("houseprices_data_for_models.csv", index_col="Id")
    .drop(columns=["Unnamed: 0"])
    .reset_index(drop=True)
)

# One-hot encode the categorical/text columns; the remaining columns pass through.
houseprices_clean = pd.get_dummies(houseprices)
houseprices_clean.head()

Unnamed: 0,LotArea,OverallQual,OverallCond,YearBuilt,TotalBsmtSF,GrLivArea,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,...,KitchenQual_Fa,KitchenQual_Gd,KitchenQual_TA,GarageType_2Types,GarageType_Attchd,GarageType_Basment,GarageType_BuiltIn,GarageType_CarPort,GarageType_Detchd,GarageType_None
0,8450,7,5,2003,856,1710,2,1,3,1,...,0,1,0,0,1,0,0,0,0,0
1,9600,6,8,1976,1262,1262,2,0,3,1,...,0,0,1,0,1,0,0,0,0,0
2,11250,7,5,2001,920,1786,2,1,3,1,...,0,1,0,0,1,0,0,0,0,0
3,9550,7,5,1915,756,1717,1,0,3,1,...,0,1,0,0,0,0,0,0,1,0
4,14260,8,5,2000,1145,2198,2,1,4,1,...,0,1,0,0,1,0,0,0,0,0


### Check the data is ready for model building
Check the target is in the correct form

In [4]:
# Validate the target explicitly instead of relying on the truncated display:
# the models below are scored with binary "precision", so SalePrice must
# contain only the class labels 0 and 1.
assert houseprices["SalePrice"].isin([0, 1]).all(), "SalePrice must be binary (0/1)"
houseprices["SalePrice"]

0       1
1       1
2       1
3       0
4       1
       ..
1454    0
1455    1
1456    1
1457    0
1458    0
Name: SalePrice, Length: 1459, dtype: int64

### Logistic Regression model building and evaluation (using 10-fold cross validation)

In [20]:
# Number of cross-validation folds; the Naive Bayes evaluation reuses this value.
k = 10

# SalePrice is the target; every other encoded column is a predictor.
y_logistic_regression = houseprices_clean["SalePrice"]
X_logistic_regression = houseprices_clean.drop("SalePrice", axis="columns")

# Cross-validate the classifier, recording precision and accuracy for each fold.
logistic_regression_model = LogisticRegression(random_state=1, max_iter=10000)
cv_results_logistic_regression = cross_validate(
    logistic_regression_model,
    X_logistic_regression,
    y_logistic_regression,
    scoring=["precision", "accuracy"],
    cv=k,
)


In [21]:
# Tabulate the cross-validation results: one row per fold, one column per
# timing/score entry returned by cross_validate.
cv_results_logistic_regression_df = pd.DataFrame(cv_results_logistic_regression)
cv_results_logistic_regression_df

Unnamed: 0,fit_time,score_time,test_precision,test_accuracy
0,0.132981,0.001184,0.925926,0.931507
1,0.235259,0.001056,0.875,0.90411
2,0.632942,0.001017,0.927273,0.938356
3,0.414528,0.000984,0.859649,0.89726
4,0.193052,0.001951,0.896552,0.931507
5,0.044447,0.00099,0.980392,0.952055
6,0.904646,0.001068,0.896552,0.931507
7,0.355378,0.003064,0.888889,0.90411
8,0.801974,0.003247,0.881356,0.924658
9,0.865224,0.001136,0.862069,0.903448


### Naive Bayes model building and evaluation (using 10-fold cross validation)

In [22]:
# NOTE: plain assignment, so this is a second name for `houseprices`, not a copy.
NaiveBayes_data = houseprices

# Columns used for the Naive Bayes model. The list also carries the target
# ("SalePrice"), which is separated from the predictors in the next step.
features_selected = [
    "LotArea",
    "OverallQual",
    "YearBuilt",
    "Neighborhood",
    "TotRmsAbvGrd",
    "SalePrice",
]

In [23]:
# Keep only the selected columns, then one-hot encode the categorical/text ones.
naive_bayes_data = houseprices.loc[:, features_selected]
naive_bayes_data_clean = pd.get_dummies(naive_bayes_data)

# Separate the SalePrice target from the predictor columns.
y_NB = naive_bayes_data_clean["SalePrice"]
X_NB = naive_bayes_data_clean.drop("SalePrice", axis="columns")

In [24]:
# Evaluate Gaussian Naive Bayes with the same fold count (`k`) and the same
# metrics that were used for the logistic regression model.
GaussianNB_model = GaussianNB()
cv_results_GaussianNB = cross_validate(
    GaussianNB_model,
    X_NB,
    y_NB,
    scoring=["precision", "accuracy"],
    cv=k,
)

In [25]:
# Tabulate the Naive Bayes cross-validation results: one row per fold, one
# column per timing/score entry returned by cross_validate.
cv_results_Gaussian_NB_df = pd.DataFrame(cv_results_GaussianNB)
cv_results_Gaussian_NB_df

Unnamed: 0,fit_time,score_time,test_precision,test_accuracy
0,0.004366,0.003265,0.86,0.863014
1,0.003886,0.002993,0.872727,0.89726
2,0.003612,0.00184,0.822581,0.890411
3,0.002459,0.001638,0.833333,0.863014
4,0.002052,0.001763,0.836364,0.869863
5,0.002278,0.001853,0.881356,0.924658
6,0.002461,0.001886,0.775862,0.835616
7,0.002151,0.0015,0.862745,0.869863
8,0.001935,0.001461,0.819672,0.883562
9,0.001936,0.001428,0.814815,0.848276


## Export model results to csv, to import to R

In [26]:
# Write each results table to its own CSV file (relative to the working
# directory). The DataFrame index is written as the first column, which is
# the pandas default for to_csv.
for results_df, csv_name in (
    (cv_results_logistic_regression_df, "Logistic_Regression_Results.csv"),
    (cv_results_Gaussian_NB_df, "Naive_Bayes_results.csv"),
):
    results_df.to_csv(csv_name)