## 1. Import Necessary Libraries

In [1]:
import numpy as np

import pandas as pd
pd.options.mode.chained_assignment = None
# Turn the warning back on (optional): pd.options.mode.chained_assignment = 'warn'

import statsmodels.api as sm

import matplotlib.pyplot as plt

import seaborn as sns
sns.set()

## 2. Load the data

In [2]:
# Read the dataset
raw_data = pd.read_csv("data/TrustProperty.csv")

# Display the first few rows
raw_data.head()

Unnamed: 0,Price,Resid_area,Air_qual,Num_rooms,Age,Teachers,Poor_prop,N_hos_beds,N_hot_rooms,Rainfall,Parks,Airport,Sold
0,24.0,32.31,0.538,6.575,65.2,24.7,4.98,5.48,11.192,23,0.049347,Yes,0
1,21.6,37.07,0.469,6.421,78.9,22.2,9.14,7.332,12.1728,42,0.046146,No,1
2,34.7,37.07,0.469,7.185,61.1,22.2,4.03,7.394,101.12,38,0.045764,No,0
3,33.4,32.18,0.458,6.998,45.8,21.3,2.94,9.268,11.2672,45,0.047151,Yes,0
4,36.2,32.18,0.458,7.147,54.2,21.3,5.33,8.824,11.2896,55,0.039474,No,0


## 3. Declare the target and the independent variables

After talking with stalkholders we thing that the following features are the most important so we will start with them:
- Price
- Num_rooms
- Poor_prop
- Airport

And our target is the following:
- Sold

In [3]:
# copy the data
data = raw_data.copy()

In [4]:
data.columns

Index(['Price', 'Resid_area', 'Air_qual', 'Num_rooms', 'Age', 'Teachers',
       'Poor_prop', 'N_hos_beds', 'N_hot_rooms', 'Rainfall', 'Parks',
       'Airport', 'Sold'],
      dtype='object')

In [5]:
# Define a subset of columns for analysis
selected_cols = ['Price', 'Num_rooms', 'Teachers', 'Poor_prop', 'N_hos_beds', 'Sold']

# Create a DataFrame 'data_new' to include only these selected columns for further processing.
data_new = data[selected_cols]

# Display the first few rows
data_new.head()

Unnamed: 0,Price,Resid_area,Air_qual,Num_rooms,Age,Teachers,Poor_prop,N_hos_beds,N_hot_rooms,Rainfall,Parks,Airport,Sold
0,24.0,32.31,0.538,6.575,65.2,24.7,4.98,5.48,11.192,23,0.049347,Yes,0
1,21.6,37.07,0.469,6.421,78.9,22.2,9.14,7.332,12.1728,42,0.046146,No,1
2,34.7,37.07,0.469,7.185,61.1,22.2,4.03,7.394,101.12,38,0.045764,No,0
3,33.4,32.18,0.458,6.998,45.8,21.3,2.94,9.268,11.2672,45,0.047151,Yes,0
4,36.2,32.18,0.458,7.147,54.2,21.3,5.33,8.824,11.2896,55,0.039474,No,0


In [6]:
data_new["Airport"] = data_new["Airport"].map({"Yes": 1, "No": 0})
data_new.head()

Unnamed: 0,Price,Resid_area,Air_qual,Num_rooms,Age,Teachers,Poor_prop,N_hos_beds,N_hot_rooms,Rainfall,Parks,Airport,Sold
0,24.0,32.31,0.538,6.575,65.2,24.7,4.98,5.48,11.192,23,0.049347,1,0
1,21.6,37.07,0.469,6.421,78.9,22.2,9.14,7.332,12.1728,42,0.046146,0,1
2,34.7,37.07,0.469,7.185,61.1,22.2,4.03,7.394,101.12,38,0.045764,0,0
3,33.4,32.18,0.458,6.998,45.8,21.3,2.94,9.268,11.2672,45,0.047151,1,0
4,36.2,32.18,0.458,7.147,54.2,21.3,5.33,8.824,11.2896,55,0.039474,0,0


In [7]:
# Target Variable
y = data_new["Sold"]

# Independent Variables (Features)
x1 = data_new.drop(columns=["Sold"])

## 4. Build the model

In [8]:
# Add constant array of ones
x = sm.add_constant(x1)

# Intialize the model
log_reg_model = sm.Logit(y, x)

# Fit the model
results_log = log_reg_model.fit()

# Display the summary table
results_log.summary()

Optimization terminated successfully.
         Current function value: 0.581359
         Iterations 7


0,1,2,3
Dep. Variable:,Sold,No. Observations:,300.0
Model:,Logit,Df Residuals:,287.0
Method:,MLE,Df Model:,12.0
Date:,"Sun, 06 Oct 2024",Pseudo R-squ.:,0.156
Time:,01:53:54,Log-Likelihood:,-174.41
converged:,True,LL-Null:,-206.64
Covariance Type:,nonrobust,LLR p-value:,3.437e-09

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-9.2246,3.501,-2.634,0.008,-16.087,-2.362
Price,-0.2644,0.048,-5.550,0.000,-0.358,-0.171
Resid_area,0.0332,0.030,1.101,0.271,-0.026,0.092
Air_qual,-3.8683,3.370,-1.148,0.251,-10.473,2.736
Num_rooms,1.3215,0.502,2.632,0.008,0.338,2.306
Age,0.0069,0.007,0.930,0.353,-0.008,0.021
Teachers,0.2873,0.076,3.775,0.000,0.138,0.436
Poor_prop,-0.1976,0.045,-4.373,0.000,-0.286,-0.109
N_hos_beds,0.2028,0.085,2.400,0.016,0.037,0.369


## 5. Evaluate the model

In [9]:
# Display the confussion matrix using pred_table() method
results_log.pred_table()

array([[118.,  46.],
       [ 51.,  85.]])

In [10]:
# Some neat formatting to read the confusion matrix
cm_df = pd.DataFrame(results_log.pred_table())

# Rename the columns
cm_df.columns = ["Predicted_0", "Predicted_1"]

# Rename the indexes
cm_df.rename(index={0: "Actual_0", 1: "Actual_1"}, inplace=True)

# Display the confusion matrix
cm_df

Unnamed: 0,Predicted_0,Predicted_1
Actual_0,118.0,46.0
Actual_1,51.0,85.0


### Calculating Accuracy:
$$
\text{Accuracy} = \frac{TP + TN}{TP + TN + FP + FN}
$$

Where:

- TP = True Positives
- TN = True Negatives
- FP = False Positives
- FN = False Negatives

In [11]:
cm_array = np.array(cm_df)
accuracy = ((cm_array[0, 0] + cm_array[1, 1]) / cm_array.sum()) * 100
accuracy

67.66666666666666

## 6. Testing the model