In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's default theme so that subsequent matplotlib figures use seaborn styling
sns.set()

ModuleNotFoundError: No module named 'statsmodels'

In [None]:
# Load the data used to fit the models below.
# The path is relative, so the notebook must be run from the folder that contains `Dataset/`.
raw_data = pd.read_csv('Dataset/2.02. Binary predictors.csv')

In [None]:
# Preview the first rows of the raw data before the categorical columns are encoded
raw_data.head()

In [None]:
# Build the modelling frame from the raw data without touching `raw_data`:
# both categorical columns are replaced by 0/1 dummies (Yes/Female -> 1, No/Male -> 0).
data = raw_data.assign(
    Admitted=raw_data['Admitted'].map({'Yes': 1, 'No': 0}),
    Gender=raw_data['Gender'].map({'Female': 1, 'Male': 0}),
)

### **Regression with only 1 predictor**

In [None]:
# Dependent variable (Admitted) and the single predictor (Gender)
y, x1 = data['Admitted'], data['Gender']

In [None]:
# Spot-check the encoded columns; head() keeps the output short instead of dumping every row
data.head()

In [None]:
# Add an intercept column so that the model estimates a constant term
x = sm.add_constant(x1)
# Logistic regression of Admitted on the intercept and Gender
reg_log = sm.Logit(y, x)
# fit() estimates the coefficients by maximum likelihood
results_log = reg_log.fit()
# Show the regression summary table
results_log.summary()

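How to interpret this table?<br>
The fitted model is $log(odds) = -0.64 + 2.08\cdot gender$. Writing it for two observations, $log(odds_2) = -0.64 + 2.08\cdot gender_2$ and $log(odds_1) = -0.64 + 2.08\cdot gender_1$, and subtracting the second from the first gives<br>
<br>
$log(\frac{odds_2}{odds_1}) = 2.08\cdot(gender_2-gender_1)$<br>
$log(\frac{odds_{female}}{odds_{male}}) = 2.08\cdot(1-0)$<br>
$odds_{female} = 7.99 \cdot odds_{male}$<br>
In other words, the odds of admission for a female applicant are roughly 8 times the odds for a male applicant.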


### **Regression with 2 predictors (SAT and Gender)**

In [None]:
# Same dependent variable as before; the predictors are now SAT and Gender together
y, x1 = data['Admitted'], data[['SAT', 'Gender']]

In [None]:
# Add an intercept column to the two-predictor design matrix
x = sm.add_constant(x1)
# Logistic regression of Admitted on the intercept, SAT and Gender
reg_log = sm.Logit(y, x)
# Note: this rebinds `results_log`, replacing the single-predictor fit from the earlier cell
results_log = reg_log.fit()
# Show the regression summary table
results_log.summary()

### Checking the Accuracy

In [None]:
# Called without arguments, predict() returns the fitted probabilities for the data the model was trained on
results_log.predict()

This looks awful, so let's apply some formatting.

In [None]:
# Print floats inside numpy arrays with two decimals.
# This is a global numpy setting, so it also affects every array printed later in the notebook.
two_decimals = "{0:0.2f}".format
np.set_printoptions(formatter={'float': two_decimals})
results_log.predict()

Compare these predictions with the actual values:

In [None]:
# Actual 0/1 outcomes as a plain array, for side-by-side comparison with the predicted probabilities above
np.array(data['Admitted'])

We can compare it easily in a table, using a **confusion matrix**


```python
sm.LogitResults.pred_table()
```
returns a table which compares the predicted and actual values

In [None]:
# Confusion matrix on the training data: entry [i, j] counts observations with actual class i
# that the model predicted as class j (pred_table's default threshold is 0.5)
results_log.pred_table()

In [None]:
# Wrap the training confusion matrix in a labelled DataFrame (rows = actual, columns = predicted).
cm_df = pd.DataFrame(results_log.pred_table())
# The labels must be an ordered sequence: a set literal has no defined order, so the two
# "Predicted" labels could end up swapped (or be rejected by pandas as an unordered type).
cm_df.columns = ['Predicted 0', 'Predicted 1']
cm_df = cm_df.rename(index={0: 'Actual 0', 1: 'Actual 1'})
cm_df

Checking the accuracy

In [None]:
# Training accuracy = correctly classified observations (main diagonal) / all observations
cm = np.array(cm_df)
acc_train = np.trace(cm) / cm.sum()
acc_train

### Testing the model and assessing its accuracy

In [None]:
# Load the held-out test set (relative path: run the notebook from the folder that contains `Dataset/`)
test_dataset = pd.read_csv('Dataset/2.03. Test dataset.csv')
# Display it as loaded, i.e. before the categorical columns are encoded
test_dataset

In [None]:
# Encode the test set exactly like the training data: Yes/Female -> 1, No/Male -> 0
test_dataset = test_dataset.assign(
    Admitted=test_dataset['Admitted'].map({'Yes': 1, 'No': 0}),
    Gender=test_dataset['Gender'].map({'Female': 1, 'Male': 0}),
)

In [None]:
# Display the test set after encoding to confirm that Admitted and Gender now hold 0/1 values
test_dataset

In [None]:
# Look at the training design matrix to see the column order the fitted model expects;
# head() is enough for that and avoids dumping every training row
x.head()

In [None]:
# Split the test set into the actual outcomes and the model inputs
test_actual = test_dataset['Admitted']
test_data = test_dataset.drop(['Admitted'], axis=1)
# has_constant='add' always appends the intercept column; the default ('skip') would silently
# omit it if one of the test columns happened to be constant
test_data = sm.add_constant(test_data, has_constant='add')
# predict() matches columns to coefficients by position, so put the test columns in exactly
# the same order as the training design matrix `x`
test_data = test_data[x.columns.values]
test_data

In [None]:
def confusion_matrix(data, actual_values, model):
    """Build a 2x2 confusion matrix for a fitted binary model and compute its accuracy.

    Parameters
    ----------
    data : data frame or array
        Model inputs, laid out exactly like the data the model was fitted on
        (e.g. const, var1, var2, ...) and without the actual outcomes.
        Column order matters.
    actual_values : data frame or array
        The actual outcomes for ``data``; for a logistic regression this is a
        single column of 0s and 1s.
    model : LogitResults-like object
        The fitted model (e.g. ``results_log`` in this notebook); only its
        ``predict(data)`` method is used.

    Returns
    -------
    tuple
        ``(cm, accuracy)`` where ``cm[i, j]`` counts the observations with
        actual class ``i`` and predicted class ``j``, and ``accuracy`` is the
        share of observations on the main diagonal.
    """
    # Predicted values from the model for every row of `data`
    predicted = model.predict(data)

    # Two bins per axis: values in [0, 0.5) count as class 0, values in [0.5, 1] as class 1.
    # histogram2d does not count values that fall outside the bin edges.
    edges = np.array([0, 0.5, 1])
    counts, _, _ = np.histogram2d(actual_values, predicted, bins=edges)

    # Correct predictions sit on the main diagonal
    correct = counts[0, 0] + counts[1, 1]
    return counts, correct / counts.sum()

In [None]:
# Create a confusion matrix with the test data
# Note: confusion_matrix returns a (matrix, accuracy) tuple, so from here on `cm` is that tuple
# (cm[0] is the 2x2 matrix, cm[1] the accuracy), not the plain array assigned to `cm` earlier
cm = confusion_matrix(test_data,test_actual,results_log)
cm

In [None]:
# Format for easier understanding (not needed later on):
# label the matrix part of the result (cm[0]) with rows = actual class, columns = predicted class
cm_df = pd.DataFrame(
    cm[0],
    index=['Actual 0', 'Actual 1'],
    columns=['Predicted 0', 'Predicted 1'],
)
cm_df

In [None]:
# Check the misclassification rate
# Note that Accuracy + Misclassification rate = 1 = 100%
# Derive the rate from the off-diagonal cells of the test confusion matrix (cm[0]) instead of
# hard-coding the counts, so the number stays correct if the data or the model change.
confusion_counts = cm[0]
misclassified = confusion_counts[0, 1] + confusion_counts[1, 0]
print('Missclassification rate: ' + str(misclassified / confusion_counts.sum()))