In [None]:
#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
#A code to fit maximum likelihood models (logit, Poisson, ordered probit, Weibull duration) on simulated data, with associated statistics
#Jeremy Kedziora
#24 March 2016
#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@

#import libraries
import numpy as np    #import for arrays
from scipy.optimize import minimize    #import for optimization
import scipy.stats
import statsmodels as sm
from scipy.stats import t

In [None]:
#define functions
def maker(N,n_vars,kind = 'linear',n_cat = 0):
    """Generate Monte Carlo data for one of several regression models.

    Draws n_vars standard normal regressors with standard normal slopes and a
    uniform(0,1) intercept, then maps the linear predictor xb to a dependent
    variable according to kind.

    Parameters
    ----------
    N : int
        number of observations
    n_vars : int
        number of random regressors; a column of ones is appended as the last column
    kind : str
        'linear'   - y = xb + standard normal noise
        'logit'    - y is 0/1 with P(y = 1) = exp(xb)/(1 + exp(xb))
        'ordered'  - latent xb + normal noise cut into n_cat categories 0..n_cat-1
        'count'    - y ~ Poisson(exp(xb)), intercept redrawn from uniform(-1,0)
        'duration' - Weibull durations with rate exp(xb) and shape alpha
    n_cat : int
        number of outcome categories, only used when kind == 'ordered' (must be >= 2)

    Returns
    -------
    list
        [X, y, b, mu] where X is (N, n_vars + 1), y has length N, b holds the
        slopes followed by the intercept (and by alpha for 'duration'), and mu
        holds the ordered-model cutoffs ([0.0] for every other kind).

    Raises
    ------
    ValueError
        if kind is not one of the supported names, or kind == 'ordered' with n_cat < 2
    """
    kinds = ('linear','logit','ordered','count','duration')
    if kind not in kinds:    #an unknown kind used to return the noise-free predictor silently
        raise ValueError("kind must be one of " + str(kinds) + ", got " + repr(kind))
    if kind == 'ordered' and n_cat < 2:    #fewer than two categories used to produce a category of -1
        raise ValueError("kind = 'ordered' needs n_cat >= 2, got " + repr(n_cat))

    x = []    #list of regressor columns
    b = []    #list of true effects
    mu = [0.0]    #true cutoffs of the ordered model, the first one is fixed at 0
    y = np.zeros(N)    #running linear predictor
    for _ in range(n_vars):    #loop over the variables we want to create
        x_i = np.random.normal(loc = 0.0, scale = 1.0, size = N)    #generate the data
        x.append(x_i)
        b_i = np.random.normal(loc = 0.0, scale = 1.0)    #draw a random effect for this variable
        b.append(b_i)
        y = y + b_i*x_i    #add the variable effect to the linear predictor

    x.append(np.ones(N))    #a column of ones for the constant
    b_i = np.random.uniform(0.0,1.0)    #draw a random intercept

    if kind == 'linear':
        y = b_i + y + np.random.normal(loc = 0.0, scale = 1.0, size = N)    #intercept plus normal error
    elif kind == 'logit':
        y = (np.random.uniform(0,1,len(y)) < np.exp(b_i + y)/(1 + np.exp(b_i + y)))*1    #draw 0/1 outcomes
    elif kind == 'ordered':
        y = b_i + y + np.random.normal(loc = 0.0, scale = 1.0, size = N)    #latent variable
        for _ in range(n_cat-2):    #one extra cutoff per category beyond the second
            mu.append(mu[-1] + np.random.uniform(0.25,1,1)[0])    #cutoffs are strictly increasing
        y = np.searchsorted(mu,y)    #category = number of cutoffs lying below the latent value
    elif kind == 'count':
        b_i = np.random.uniform(-1.0,0)    #redraw a (negative) intercept to keep the counts small
        y = np.random.poisson(np.exp(b_i + y))    #draw the counts
    elif kind == 'duration':
        alpha = np.random.uniform(0,2)    #draw the shape parameter
        lambda_xb = np.exp(b_i + y)    #rate parameter, varies by observation
        #inverse-CDF draw with S(t) = exp(-(lambda*t)**alpha); dividing by lambda makes
        #exp(xb) a rate, which is the parameterisation the Weibull likelihood assumes
        y = (-1*np.log(np.random.uniform(0,1,N)))**(1/alpha)/lambda_xb

    b.append(b_i)    #append the intercept to the effects
    if kind == 'duration':
        b.append(alpha)    #and the shape parameter for the duration model
    return [np.array(x).T,np.array(y),np.array(b),np.array(mu)]

In [None]:
def logit_mle(b,X,y):
    """Negative logit log-likelihood, the objective handed to a minimizer.

    Parameters
    ----------
    b : array of coefficients, one per column of X
    X : (n_obs, n_coef) design matrix
    y : array of 0/1 outcomes, length n_obs

    Returns
    -------
    float
        -sum(y*xb - log(1 + exp(xb)))
    """
    xb = X.dot(b)    #linear predictor
    #logaddexp(0, xb) = log(1 + exp(xb)) without overflowing to inf when xb is large
    return -1*np.sum(y*xb - np.logaddexp(0,xb))

In [None]:
def logit_effects(b,X,which_variable):
    """Predicted logit probability as one regressor sweeps its observed range.

    Every other column of X is held at its sample mean while column
    which_variable moves over 100 evenly spaced points between its minimum
    and maximum. Returns a list with the 100 predicted probabilities.
    """
    column = X[:,which_variable]    #the regressor being varied
    profile = np.mean(X,0)    #start from the column means
    probs = []
    for value in np.linspace(min(column),max(column),100):    #walk the grid
        profile[which_variable] = value    #plug the grid point into the profile
        odds = np.exp(profile.dot(b))
        probs.append(odds/(1 + odds))    #logistic transform of the linear predictor
    return probs

In [None]:
def poisson_mle(b,X,y):
    """Negative Poisson log-likelihood, the objective handed to a minimizer.

    The log(y!) term is left out because it does not depend on b.

    Parameters
    ----------
    b : array of coefficients, one per column of X
    X : (n_obs, n_coef) design matrix
    y : array of counts, length n_obs

    Returns
    -------
    float
        -sum(y*xb - exp(xb))
    """
    xb = X.dot(b)    #log of the conditional mean
    #np.sum keeps the reduction vectorised; the builtin sum loops element by element
    return -1*np.sum(y*xb - np.exp(xb))

In [None]:
def poisson_effects(b,X,which_variable):
    """Predicted Poisson mean as one regressor sweeps its observed range.

    Every other column of X is held at its sample mean while column
    which_variable moves over 100 evenly spaced points between its minimum
    and maximum. Returns a list with the 100 predicted means exp(xb).
    """
    column = X[:,which_variable]    #the regressor being varied
    profile = np.mean(X,0)    #start from the column means
    lambdas = []
    for value in np.linspace(min(column),max(column),100):    #walk the grid
        profile[which_variable] = value    #plug the grid point into the profile
        lambdas.append(np.exp(profile.dot(b)))    #exponential link gives the expected count
    return lambdas

In [None]:
def ordered_probit_mle(b,X,y):
    """Negative ordered probit log-likelihood, the objective handed to a minimizer.

    Parameters
    ----------
    b : array
        the first X.shape[1] entries are the coefficients; the remaining entries
        are the increments between successive cutoffs (the first cutoff is fixed at 0)
    X : (n_obs, n_coef) design matrix
    y : array of category labels, length n_obs; the sorted distinct labels are
        matched to the intervals between cutoffs from lowest to highest

    Returns
    -------
    float
        minus the summed log-probability of the observed categories, or +inf
        when some observation gets a non-positive probability (for example
        because a negative increment put the cutoffs out of order)
    """
    b = np.asarray(b)
    n_features = X.shape[1]
    xb = X.dot(b[0:n_features])    #linear predictor
    #cutoffs: -inf, 0, then the running sum of the increments, then +inf
    cutoffs = np.concatenate(([-np.inf,0.0],np.cumsum(b[n_features:]),[np.inf]))
    #np.unique returns the labels sorted, so the k-th label always maps to the k-th
    #interval; iterating over set(y) gives no such ordering guarantee
    categories,which = np.unique(y,return_inverse = True)
    upper = scipy.stats.norm.cdf(cutoffs[which + 1] - xb)
    lower = scipy.stats.norm.cdf(cutoffs[which] - xb)
    probs = upper - lower    #probability of each observation's own category
    if not np.all(probs > 0):    #infeasible parameters: give the optimiser a clean barrier instead of NaN
        return np.inf
    return -1*np.sum(np.log(probs))

In [None]:
def OP_predicted_values(b,X):
    """Predicted category of each observation in the ordered probit.

    The first X.shape[1] entries of b are the coefficients and the rest are the
    increments between successive cutoffs (first cutoff fixed at 0). An
    observation is assigned the index of the interval that strictly contains
    its linear predictor; the result is a float array with one entry per row of X.
    """
    n_features = X.shape[1]
    xb = X.dot(b[0:n_features])    #linear predictor
    cutoffs = [float('-inf'),0]    #lower bound of the first category and the fixed first cutoff
    for step in b[n_features:]:    #each remaining entry of b widens the next category
        cutoffs.append(cutoffs[-1] + step)
    cutoffs.append(float('inf'))    #upper bound of the last category
    y_pred = np.zeros(len(xb))
    for category,(lower,upper) in enumerate(zip(cutoffs[:-1],cutoffs[1:])):
        inside = (lower<xb)*1*(upper>xb)*1    #1 where xb lies strictly inside the interval
        y_pred = y_pred + inside*category
    return y_pred

In [None]:
def weibull_mle(b,X,t):
    """Negative Weibull log-likelihood, the objective handed to a minimizer.

    Per-observation log density used here:
        log(alpha) + alpha*xb + (alpha - 1)*log(t) - (t*exp(xb))**alpha
    so exp(xb) enters as a rate that multiplies the duration.

    Parameters
    ----------
    b : array
        the first X.shape[1] entries are the covariate effects; the last entry
        is log(alpha), which keeps the shape parameter positive
    X : (n_obs, n_coef) design matrix
    t : array of durations, length n_obs

    Returns
    -------
    float
        minus the summed log density
    """
    alpha = np.exp(b[-1])    #shape parameter, estimated on the log scale
    xb = X.dot(b[0:(X.shape[1])])    #covariate effects
    log_density = np.log(alpha) + alpha*xb + (alpha - 1)*(np.log(t)) - (t*np.exp(xb))**alpha
    #np.sum keeps the reduction vectorised; the builtin sum loops element by element
    return -1*np.sum(log_density)

In [None]:
def mle_inferences(model,n_obs = None):
    """Print coefficients, standard errors, t-statistics, p-values and 95% CIs.

    Parameters
    ----------
    model : optimisation result exposing .x (estimates) and .hess_inv (a dense
        inverse-Hessian array, as returned by minimize with method = 'BFGS')
    n_obs : int, optional
        sample size; the p-values use a t distribution with n_obs - 1 degrees of
        freedom. When omitted, the notebook-level variable N is used if it
        exists, otherwise the standard normal distribution is used.

    Returns
    -------
    None
        the results are printed, nothing is returned
    """
    #imported locally so the function does not depend on the global name t,
    #which the notebook later rebinds to an array of durations
    from scipy import stats

    Coefficients = model.x    #pull out the coefficients
    SE = np.diag(model.hess_inv)**0.5    #standard errors from the inverse Hessian
    t_stat = Coefficients/SE    #compute the t statistics for each variable
    if n_obs is None:
        n_obs = globals().get('N')    #fall back on the sample size defined in the notebook
    #two-sided p-value = twice the upper tail probability (sf), not twice the density (pdf)
    if n_obs is None:
        p_values = 2*stats.norm.sf(np.abs(t_stat))
    else:
        p_values = 2*stats.t.sf(np.abs(t_stat),df = n_obs - 1)
    print('Coefficients are:         ',np.round(Coefficients,4))    #print coefficients rounded to 4th decimal
    print('Standard Errors are:      ',np.round(SE,4))    #print standard errors round to 4th decimal
    print('t-statistics are:         ',np.round(t_stat,4))    #print tstat rounded to 4th decimal
    print('p-values are:             ',np.round(p_values,4))    #print p-value rounded to 4th decimal
    CI = []    #an empty list to hold the CI strings
    for i,j in zip(Coefficients,SE):    #loop over variables
        CI.append('[' + str(round(i-1.96*j,4)) + ',' + str(round(i+1.96*j,4)) + ']')    #create the CI
    print('Confidence Intervals are: ',CI)    #print the 95% CI rounded to 4th decimal


In [None]:
#simulate logit data, fit the model by maximum likelihood and report inferences
N = 1000
n_vars = 3
Data = maker(N,n_vars,kind = 'logit')    #make logit data
X = Data[0]    #pull out explanatory variables
y = Data[1]    #pull out dependent variable
b_true = Data[2]    #keep the true coefficients for comparison with the estimates

b = np.random.uniform(0,1,X.shape[1])*0.01    #starting values, one per column of X
model = minimize(logit_mle, x0 = b, args = (X,y), method = 'BFGS')    #minimize the negative log-likelihood
coefficients = model.x    #estimates, used by the effects cell that follows
mle_inferences(model)    #prints its own report and returns None, so it is not wrapped in print

In [None]:
logit_effects(model.x,X,0)    #effect of the first variable on the predicted probability, using the fit from the cell above

In [None]:
#simulate count data and fit the Poisson model by maximum likelihood
N = 100
n_vars = 3
Data = maker(N,n_vars,'count')    #make poisson data
X = Data[0]    #pull out the features
y = Data[1]    #pull out the labels

b = np.random.uniform(0,1,X.shape[1])*0.01    #starting values, one per column of X
model = minimize(poisson_mle, x0 = b, args = (X,y), method = 'Nelder-Mead')    #minimize the negative log-likelihood
coefficients = model.x    #estimates, used by the effects cell that follows

In [None]:
poisson_effects(model.x,X,0)    #effect of the first variable on the predicted mean, using the fit from the cell above

In [None]:
#simulate three-category ordered data, fit the ordered probit and score its predictions
N = 10000
n_vars = 3
Data = maker(N,n_vars,'ordered',n_cat=3)    #make ordered probit data
X,y = Data[0],Data[1]    #features and category labels

start_slopes = np.random.uniform(0,1,X.shape[1])*0.01    #small starting values for the coefficients
start_steps = np.random.uniform(0,1.0,len(set(y))-2)    #starting gaps between the cutoffs
b = np.concatenate([start_slopes,start_steps])    #full starting vector
fit = minimize(ordered_probit_mle, x0 = b, args = (X,y), method = 'Nelder-Mead')    #minimize the negative log-likelihood
coefficients = fit.x

hits = (OP_predicted_values(coefficients,X) - y)==0    #True where the predicted category matches
print('Percent correctly predicted is: ',sum(hits)/N)    #share of correct predictions


In [None]:
#simulate Weibull durations, fit the model and print estimates next to the truth
N = 1000
n_vars = 3
Data = maker(N,n_vars,'duration')    #make Weibull duration data
X = Data[0]    #pull out the features
durations = Data[1]    #pull out the durations; not called t, which would overwrite the scipy.stats t import

b = np.array(np.random.uniform(0,1,X.shape[1]+1)*0.01)    #starting values: coefficients plus log(alpha)
coefficients = minimize(weibull_mle, x0 = b, args = (X,durations), method = 'BFGS').x    #minimize the negative log-likelihood
print(coefficients[:-1],np.exp(coefficients[-1]))    #estimated effects and shape parameter
print(Data[2])    #true effects followed by the true shape parameter