# Multinomial Logistic Regression

**Sections**
- [1.0 Synthetic Data & Model](#1.0-Synthetic-Data-&-Model)
- [2.0 Newton Raphson Algorithm](#2.0-Newton-Raphson-Algorithm)
- [3.0 Newton Raphson Implementation](#3.0-Newton-Raphson-Implementation)
    - [3.1 Checking Convergence](#3.1-Checking-Convergence)
- [4.0 Prediction at X values](#4.0-Prediction-at-X-values)

### 0. Importing Modules

In [1]:
import math
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import bokeh
from bokeh.plotting import figure, show
from bokeh.models import tickers, ranges
from bokeh.io import output_notebook
output_notebook()

## Illustration

In [2]:
# Number of simulated observations.
n = 10000
# A single standard-normal feature per observation, stored as an (n, 1) column.
x_i = np.random.normal(loc=0, scale=1, size=(n, 1))
# Coefficients of the four non-reference categories, stored as a (1, 4) row.
Bj = np.array([[-0.2, 0, 0.2, 0.4]])

In [3]:
def get_p_j_given_x(x, Bj):
    """Calculates P( y = j | x) for the K-1 non-reference categories.

    The reference category K has its linear score fixed at 0, so it adds
    exp(0) = 1 to the denominator:

        P(y = j | x) = exp(x @ Bj)[j] / (1 + sum_l exp(x @ Bj)[l])

    The computation is shifted by the largest score in each row (never less
    than 0, the reference score) so that ``np.exp`` cannot overflow; the
    shift cancels between numerator and denominator.

    Args:
        x: N x M features
        Bj: M features x K-1 coefficients

    Returns:
        np.ndarray: N x K-1 matrix with the probabilities of each observation
        to be classified as a given category.
    """
    scores = x @ Bj  # N x K-1 matrix of linear scores

    # Per-row max of the scores and the reference score 0. `initial=0.0`
    # supplies the floor and also keeps the reduction valid when K-1 == 0.
    shift = np.max(scores, axis=1, keepdims=True, initial=0.0)  # N x 1

    numerator = np.exp(scores - shift)  # every exponent is <= 0
    # Note: It is critical to sum over axis 1 because it is only within an
    # observation that the probabilities must add up to 1.
    # exp(-shift) is the reference category's term (the "1") after shifting.
    denominator = np.exp(-shift) + np.sum(numerator, axis=1, keepdims=True)

    return numerator / denominator

def get_pK(x, Bj):
    """Calculates P( y = K | x), the probability of the last category.

    Args:
        x: N x M features
        Bj: M features x K-1 coefficients

    Returns:
        np.ndarray: N x 1 column holding 1 / (1 + sum_j exp(x @ Bj)[j])
        for every observation.
    """
    exp_scores = np.exp(x @ Bj)
    # Sum within each observation (axis 1), then add the "1" term.
    row_totals = 1 + exp_scores.sum(axis=1)
    return 1 / row_totals.reshape(-1, 1)

In [4]:
# Probabilities j = 1 through K-1 (one column per category, one row per
# observation)
p_array_K_minus_one = get_p_j_given_x(x_i, Bj)

# Probability j = K, obtained as the complement so that each row sums to 1
p_K = 1 - np.sum(p_array_K_minus_one, axis=1, keepdims=True)

# Same as computing it directly from its closed form. The two routes differ
# only by floating-point rounding, so compare with a tolerance that sits
# comfortably above a few float64 ulps. np.max reduces the whole array in one
# vectorised call and propagates NaN, so a NaN anywhere fails the check.
p_K_v2 = get_pK(x_i, Bj)
assert np.max(np.abs(p_K - p_K_v2)) < 1e-12

# Full N x K array: categories 1..K-1 followed by category K
p_array = np.concatenate([p_array_K_minus_one, p_K], axis=1)

In [5]:
# Preview the first ten rows of p_array (columns: categories 1..K-1, then K).
p_array[:10]

array([[0.16776522, 0.18917843, 0.21332478, 0.24055313, 0.18917843],
       [0.23058352, 0.20744459, 0.18662764, 0.16789966, 0.20744459],
       [0.26040118, 0.21244267, 0.17331675, 0.14139672, 0.21244267],
       [0.17656359, 0.19246692, 0.20980268, 0.2286999 , 0.19246692],
       [0.18450631, 0.19521305, 0.20654109, 0.21852649, 0.19521305],
       [0.17906851, 0.19335524, 0.20878182, 0.22543919, 0.19335524],
       [0.19736606, 0.19923728, 0.20112625, 0.20303312, 0.19923728],
       [0.14612516, 0.17989107, 0.22145945, 0.27263324, 0.17989107],
       [0.16433521, 0.18782319, 0.21466824, 0.24535018, 0.18782319],
       [0.18205408, 0.19438709, 0.20755558, 0.22161616, 0.19438709]])

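Generating the Y values: each label is drawn at random, using the corresponding row of `p_array` as the category probabilities.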

In [6]:
# Category labels 1..K, derived from the number of probability columns so the
# sampling keeps working if the number of categories changes.
category_labels = np.arange(1, p_array.shape[1] + 1)

# One draw per observation: each row of p_array is that observation's
# categorical distribution over the labels. With no `size` argument,
# np.random.choice returns a single value rather than a length-1 array.
y_i = [np.random.choice(category_labels, p=probabilities)
       for probabilities in p_array]

## Generative classifier

In [7]:
from scipy.stats import norm

In [8]:
def get_prob_array(x, means, stds, Pj):
    """Normalises the products N(x; mean_j, std_j) * Pj_j across j.

    Args:
        x: point (or array of points) at which the normal densities are
            evaluated.
        means: iterable with one normal mean per class.
        stds: iterable with one normal standard deviation per class.
        Pj: iterable with one weight per class, multiplied into the density.

    Returns:
        np.ndarray: one entry per class along the first axis; each entry is
        that class's weighted density divided by the sum over all classes.
    """
    weighted_densities = [
        norm.pdf(x, loc=mean, scale=std) * pj
        for mean, std, pj in zip(means, stds, Pj)
    ]

    # Note: The denominator is common to all classes. Builtin sum starts
    # from 0 and adds the terms in order.
    normalizing_factor = sum(weighted_densities)

    return np.array(weighted_densities) / normalizing_factor

In [9]:
# Per-class normal parameters and weights (five classes).
means = [-0.26, -0.10, 0.12, 0.27, 0.10]
stds = [0.97, 1.00, 0.97, 0.99, 0.9749]
Pj = [0.2] * 5

# Grid of x values from -3 up to (not including) 3 in steps of 0.05.
xArr = np.arange(-3, 3, 0.05)

# For every grid point keep the 1-based index of the class whose normalised
# weighted density is largest.
classes = []
for x in xArr:
    class_probabilities = get_prob_array(x, means, stds, Pj)
    classes.append(class_probabilities.argmax() + 1)

In [10]:
# Line plot of the selected class (1-based) against the x grid.
p = figure(outline_line_color='black', toolbar_location=None)
p.line(
    x=xArr,
    y=classes,
    color='firebrick',
    line_width=1,
    legend_label="Data",
)

# Axis titles.
p.xaxis.axis_label = 'x'
p.yaxis.axis_label = 'Classification'

# Legend in the bottom-right corner with a fully opaque black border.
p.legend.location = 'bottom_right'
p.legend.border_line_color = "black"
p.legend.border_line_alpha = 1

show(p)