In [25]:
import pandas as pd
import numpy as np

In [26]:
data = pd.read_excel("Sample_Files/houses_for_rent_madrid.xlsx")

In [27]:
data.dtypes

Id                int64
District         object
Address          object
Number           object
Area             object
Rent              int64
Bedrooms        float64
Sq.Mt             int64
Floor           float64
Outer           float64
Elevator        float64
Penthouse         int64
Cottage           int64
Duplex            int64
Semidetached      int64
dtype: object

In [28]:
data.describe(include='all')

Unnamed: 0,Id,District,Address,Number,Area,Rent,Bedrooms,Sq.Mt,Floor,Outer,Elevator,Penthouse,Cottage,Duplex,Semidetached
count,2089.0,2089,2089,747.0,2085,2089.0,2000.0,2089.0,1948.0,1927.0,1956.0,2089.0,2089.0,2089.0,2089.0
unique,,20,1336,137.0,140,,,,,,,,,,
top,,Salamanca,Piso en Recoletos,1.0,Recoletos,,,,,,,,,,
freq,,313,25,29.0,93,,,,,,,,,,
mean,1094.026807,,,,,1932.249402,2.483,128.919579,25.662731,0.867151,0.880879,0.0809,0.042125,0.030637,0.013404
std,630.612544,,,,,1495.474485,1.305206,115.745014,975.06535,0.3395,0.324013,0.272747,0.200923,0.172373,0.115023
min,1.0,,,,,450.0,0.0,15.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,550.0,,,,,950.0,2.0,65.0,2.0,1.0,1.0,0.0,0.0,0.0,0.0
50%,1094.0,,,,,1400.0,2.0,90.0,3.0,1.0,1.0,0.0,0.0,0.0,0.0
75%,1636.0,,,,,2500.0,3.0,147.0,5.0,1.0,1.0,0.0,0.0,0.0,0.0


## Data manipulation
1. Throw away the column "Number", "Adress". "Id"
2. Throw away all columns with missing values.
3. Replace Rent (target) with a binary variable, which is True when the rent is above (or equal) its median, and False otherwise
4. Replace the types of Penthouse, Cottage, Duplex, Semidetached, Outer and Elevator to binary.
5. Replace the type of Bedrooms, District and Area to categoricals (Note: Bedrooms should have values in the range 0..8, inclusive)
6. Split data to 70% train and 30% val

No operation to be done in-place.  the result of steps 1,2,3,4,5 should be data_prepared.  The result of step 6 should be data_train and data_test.

In [29]:
# Your code here
from sklearn.model_selection import train_test_split
data_prepared = data.drop(columns=["Number", "Address", "Id"])
data_prepared = data_prepared.dropna(axis=0)
data_prepared["Rent"] = (data_prepared["Rent"] >= data_prepared["Rent"].median())
data_prepared["Penthouse"] = data["Penthouse"].astype(bool)
data_prepared["Cottage"] = data["Cottage"].astype(bool)
data_prepared["Duplex"] = data["Duplex"].astype(bool)
data_prepared["Semidetached"] = data["Semidetached"].astype(bool)
data_prepared["Outer"] = data["Outer"].astype(bool)
data_prepared["Elevator"] = data["Elevator"].astype(bool)
data_prepared["Bedrooms"] = data["Bedrooms"].astype("category")
data_prepared["District"] = data["District"].astype("category")
data_prepared["Area"] = data["Area"].astype("category")
data_train, data_test = train_test_split(data_prepared, train_size = 0.7)

In [30]:
data_prepared.dtypes

District        category
Area            category
Rent                bool
Bedrooms        category
Sq.Mt              int64
Floor            float64
Outer               bool
Elevator            bool
Penthouse           bool
Cottage             bool
Duplex              bool
Semidetached        bool
dtype: object

## Categorical Naive Bayes
Using sklearn's Categorical Naive Bayes we will run the algorithm only on the categorical variables.
Note that sklearn's implementation only works with integer calues categoricals, so you will have to do ordinal encoding.

Print the training and the test accuracy that you get.  Use the score() method of the Naive Bayes objet to compute the accuracy.

In [31]:
from sklearn.naive_bayes import CategoricalNB

In [32]:
# The parameter alpha is the Laplace smoothing parameter (default 1)
cat_nb = CategoricalNB(alpha=1)

In [41]:
# Your code here: Extract the categorical features, and fit cat_nb.
# Then compute the accuracy of the trained cat_nb on the training and on the
# test set.
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import accuracy_score

categorical_columns = ["District", "Area", "Bedrooms"]
X_train_categorical = data_train[categorical_columns]
X_test_categorical = data_test[categorical_columns]

all_data = pd.concat([X_train_categorical, X_test_categorical])

ordinal_encoder = OrdinalEncoder()
all_data_encoded = ordinal_encoder.fit_transform(all_data)

X_train_encoded = all_data_encoded[:len(X_train_categorical)]
X_test_encoded = all_data_encoded[len(X_train_categorical):]

y_train = data_train['Rent']
y_test = data_test['Rent']

cat_nb = CategoricalNB(alpha=1)
cat_nb.fit(X_train_encoded, y_train)

train_accuracy = cat_nb.score(X_train_encoded, y_train)
print(f"Training Accuracy: {train_accuracy:.2f}")

test_accuracy = cat_nb.score(X_test_encoded, y_test)
print(f"Test Accuracy: {test_accuracy:.2f}")

Training Accuracy: 0.83
Test Accuracy: 0.79


### Gaussian Naive Bayes (for the numerical features)
Train a Gaussian Baive Bayes classifier using the features "Floor"
and "Sq.Mt" (the two remaining features).  Print the training and testing
error.

In [42]:
from sklearn.naive_bayes import GaussianNB

In [43]:
gauss_nb = GaussianNB()

In [44]:
# Your code here
from sklearn.metrics import accuracy_score

X_train_numeric = data_train[['Floor', 'Sq.Mt']]
X_test_numeric = data_test[['Floor', 'Sq.Mt']]

y_train = data_train['Rent']
y_test = data_test['Rent']

gnb = GaussianNB()

gnb.fit(X_train_numeric, y_train)

y_train_pred = gnb.predict(X_train_numeric)

y_test_pred = gnb.predict(X_test_numeric)

train_error = 1 - accuracy_score(y_train, y_train_pred)
test_error = 1 - accuracy_score(y_test, y_test_pred)

print(f"Training Error: {train_error:.2f}")
print(f"Testing Error: {test_error:.2f}")

Training Error: 0.20
Testing Error: 0.19


## Comparing with Logistic Regression
Run logistic regression on the numerical features (Floor, Sq.Mt) and compare your result with GaussianNB.  
Documentation for Logistic Regression:
https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
No need to regularize - you have enough data, and only two features.  Also no need to play with the implementation parameters for now.  Just use the usual
"fit" and "predict" to train and to get a (hard) prediction.
Did you get a better or a worse result compared to Gaussian NB?

It is likely that Logistic Regression would give a better result, because Logistic Regression makes fewer assumptions on the data, while Niave Bayes makes strong assumptions.  It is folklore in the ML community though that Naive Bayes would be typically better than logistic regression on small datasets, and logistic regression better on large datasets.  

Therefore, if you have enough data, as is the case here, logistic regression is probably better.

In [46]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()

# Your code here: fit the lr to the numerical features, compute accuracy for train, test
lr.fit(X_train_numeric, y_train)

y_train_pred_lr = lr.predict(X_train_numeric)

y_test_pred_lr = lr.predict(X_test_numeric)

train_error_lr = 1 - accuracy_score(y_train, y_train_pred_lr)
test_error_lr = 1 - accuracy_score(y_test, y_test_pred_lr)

print("Logistic Regression:")
print(f"Training Error: {train_error_lr:.2f}")
print(f"Testing Error: {test_error_lr:.2f}")

print("\nComparison with Gaussian Naive Bayes:")
print(f"Training Error (GaussianNB): {train_error:.2f}")
print(f"Testing Error (GaussianNB): {test_error:.2f}")

Logistic Regression:
Training Error: 0.17
Testing Error: 0.19

Comparison with Gaussian Naive Bayes:
Training Error (GaussianNB): 0.20
Testing Error (GaussianNB): 0.19


### Combining categorical and Gaussian NB
Although sklearn does not offer this out of the box, it turns out not to be very difficult to combine the Gaussian and categorical NB outputs into one
NB "mixture".  This method is described below.  It is not mandatoery to do it, and in case you want to do something easier, skip to "Easier Solution" below.


#### Non-mandatory method (with theoretical justification)
Remember that the Naive Bayes assumption is, that the
features are conditionally independent.  This means that the numerical features are independent of the categorical features.  And this means that, conditioned on the label y, the likelihood of the features is the product of the likelihood of the categoricals and the likelihood of the numericals.  Luckily, sklearn gives us the log likelihoods, so we can just add them up to get the log of the product of the likelihoods.

Given a dataset X of sample points, the method predict_joint_log_proba(X) of sklearn's NB returns for each sample point x in X and each class y, the joint log probability:

log p(x,y) = log p(x|y) + log p(y)

Applying predict_joint_log_proba for both the categorical and the gaussian NB's, we will get from the Categorical NB the expression log p(x_cat, y) for all the point x in X, where x_cat are the categorical features of x.  This also equals:

(1)  log p(x_cat,y) = log p(x_cat|y) + log p(y)

From Gaussian NB we get the expression log p(x_num, y), where x_num are the numerical features of x.  This also equals:

(2) log p(x_num,y) = log p(x_num|y) + log p(y)

The probabilities of the combined model (due to conditional independence of x_cat and x_num, by Naive Bayes assumption) are given by:

log p(x,y) = log p(x_cat|y) + log p(x_num|y) + log p(y)

Therefore, using (1) and (2):

log p(x,y) = log p(x_cat, y) + log p(x_num, y) - log p(y)

The expression log p(y) is given by the prior of y, which is close to 50%/50% (because the way we created a binary target from the Rent - using the median).  Therefore we will use p(y) = log(0.5) for both y=+1 and y=-1.  [Generally, you could get p(y) from the data, but we don't really need it here because it is pretty much known]

Write a function predict_from_combined_NBs(X) which takes a set of samples (which would be either data_train or data_test) and returns (hard) predictions using the combined NB classifiers, as described above.  If you did it write, you should get accuracy which is better than both what you got for GaussianNB and CategorialNB.

#### Easier Solution
Use the CategoricalNB soft-prediction (predict_proba method, see Naive Bayes documentation for sklearn) to add a feature to GaussianNB.  This means that you have to retrain GaussianNB using the extra feature, given by the soft-prediction of CategoricalNB!  \

In [7]:
# Your code here (for one of the two methods above)

In [47]:
cat_nb = CategoricalNB(alpha=1)
cat_nb.fit(X_train_encoded, y_train)

X_train_cat_prob = cat_nb.predict_proba(X_train_encoded)
X_test_cat_prob = cat_nb.predict_proba(X_test_encoded)

X_train_augmented = np.concatenate([X_train_numeric, X_train_cat_prob], axis=1)
X_test_augmented = np.concatenate([X_test_numeric, X_test_cat_prob], axis=1)

gnb_augmented = GaussianNB()
gnb_augmented.fit(X_train_augmented, y_train)

y_train_pred_gnb_augmented = gnb_augmented.predict(X_train_augmented)

y_test_pred_gnb_augmented = gnb_augmented.predict(X_test_augmented)

train_error_gnb_augmented = 1 - accuracy_score(y_train, y_train_pred_gnb_augmented)
test_error_gnb_augmented = 1 - accuracy_score(y_test, y_test_pred_gnb_augmented)

print("Gaussian Naive Bayes with Added Feature from CategoricalNB Soft Predictions:")
print(f"Training Error: {train_error_gnb_augmented:.2f}")
print(f"Testing Error: {test_error_gnb_augmented:.2f}")

Gaussian Naive Bayes with Added Feature from CategoricalNB Soft Predictions:
Training Error: 0.14
Testing Error: 0.18
