# In [None]:
#si-exercise
import numpy as np
import pandas as pd
import scipy as sp
import scipy.linalg
import scipy.special
import scipy.stats

# Load the assignment data set from a local Windows path.
# The path must be a raw string: in an ordinary literal "\U" starts an
# 8-digit unicode escape, so "C:\Users..." fails to compile with a
# 'unicodeescape' SyntaxError before any code in the file runs.
# NOTE(review): the loaded frame is not bound to a name, and
# "tests\filesassignment8.csv" may be missing a path separator -- confirm.
pd.read_csv(r"C:\Users\User\Dropbox\PC\Documents\GitHub\econ8320-assignment-10\tests\filesassignment8.csv")

class RegressionModel:
    """Fit an OLS or logistic regression and tabulate per-coefficient results.

    Parameters
    ----------
    x : DataFrame-like
        Regressors, one column per variable. The data is copied, so the
        caller's object is never modified.
    y : array-like
        Dependent variable; stored wrapped in a ``pandas.DataFrame``.
    create_intercept : bool, default True
        When true, a constant column named ``'intercept'`` is appended to x.
    regression_type : {'ols', 'logit'}, default 'ols'
        Estimator that ``fit_model`` dispatches to.

    Attributes
    ----------
    results : dict
        Filled by fitting: maps each column name of ``x`` to a dict of
        statistics (``coefficient``/``log_odds_ratio``, ``standard_error``,
        ``t_stat``/``z_stat``, ``p_value``).
    """

    def __init__(self, x, y, create_intercept=True, regression_type='ols'):
        # Coerce to a DataFrame first so that a Series (or array) input gains
        # an intercept *column* rather than an extra row labelled 'intercept'.
        self.x = pd.DataFrame(x).copy()
        self.y = pd.DataFrame(y)
        self.create_intercept = create_intercept
        self.regression_type = regression_type
        self.results = {}

        if self.create_intercept:
            self.add_intercept()

    def add_intercept(self):
        """Append a constant column of ones named 'intercept' to ``self.x``."""
        self.x['intercept'] = 1

    def _arrays(self):
        """Return the regressors as a float (n, k) array and y as float (n, 1)."""
        X = np.asarray(self.x.values, dtype=float)
        Y = np.asarray(self.y.values, dtype=float).reshape(-1, 1)
        return X, Y

    @staticmethod
    def _as_column(beta):
        """Return ``beta`` as a float (k, 1) column so it lines up with Y."""
        return np.asarray(beta, dtype=float).reshape(-1, 1)

    def ols_regression(self):
        """Estimate OLS coefficients and store them with inference statistics.

        Raises
        ------
        ValueError
            If there are not more observations than regressors, because the
            residual variance would then divide by zero or a negative number.
        """
        X, Y = self._arrays()
        n, k = X.shape
        df = n - k
        if df <= 0:
            raise ValueError(
                "OLS needs more observations than regressors "
                "(got n={}, k={}).".format(n, k)
            )

        # Solve the normal equations (X'X) b = X'y.
        XtX = np.dot(X.T, X)
        XtY = np.dot(X.T, Y)
        beta_hat = sp.linalg.solve(XtX, XtY, assume_a='pos')

        # Residual variance uses the degrees-of-freedom correction n - k.
        residuals = Y - np.dot(X, beta_hat)
        residual_variance = np.sum(residuals ** 2) / df

        # Var(b) = s^2 (X'X)^-1; standard errors are the root of its diagonal.
        var_beta_hat = residual_variance * sp.linalg.inv(XtX)
        standard_errors = np.sqrt(np.diag(var_beta_hat))

        t_stats = beta_hat.flatten() / standard_errors
        # Two-sided p-value from the Student-t survival function.
        p_values = 2 * sp.stats.t.sf(np.abs(t_stats), df)

        for i, col in enumerate(self.x.columns):
            self.results[col] = {
                'coefficient': beta_hat[i][0],
                'standard_error': standard_errors[i],
                't_stat': t_stats[i],
                'p_value': p_values[i],
            }

    def log_likelihood(self, beta):
        """Return the *negative* logistic log-likelihood evaluated at ``beta``."""
        X, Y = self._arrays()
        logits = np.dot(X, self._as_column(beta))
        # logaddexp(0, z) equals log(1 + exp(z)) without overflowing for large z.
        likelihood = Y * logits - np.logaddexp(0, logits)
        return -np.sum(likelihood)

    def gradient(self, beta):
        """Return the (k, 1) gradient of the negative log-likelihood at ``beta``."""
        X, Y = self._arrays()
        logits = np.dot(X, self._as_column(beta))
        # expit is the logistic sigmoid, evaluated without exp() overflow.
        probs = sp.special.expit(logits)
        return np.dot(X.T, probs - Y)

    def logistic_regression(self, learning_rate=0.01, max_iter=1000, tol=1e-6):
        """Fit the logit model by fixed-step gradient descent and store results.

        Parameters
        ----------
        learning_rate : float
            Step size applied to the gradient on every iteration.
        max_iter : int
            Maximum number of gradient steps.
        tol : float
            The loop stops once the gradient norm falls below this value.

        NOTE(review): the step uses the summed (not averaged) gradient, so a
        workable ``learning_rate`` depends on sample size and feature scale,
        and reaching ``max_iter`` is not reported -- confirm convergence on
        real data.
        """
        X, _ = self._arrays()
        beta = np.zeros((X.shape[1], 1))

        for _ in range(max_iter):
            grad = self.gradient(beta)
            beta -= learning_rate * grad

            if np.linalg.norm(grad) < tol:
                break

        self.store_logit_results(beta)

    def store_logit_results(self, beta):
        """Compute standard errors, z statistics and p-values for ``beta``.

        The covariance matrix is the inverse of X' W X, where W holds the
        fitted p * (1 - p) weights.
        """
        X, _ = self._arrays()
        beta = self._as_column(beta)
        probs = sp.special.expit(np.dot(X, beta))
        weights = (probs * (1 - probs)).reshape(-1, 1)

        variance_cov_matrix = np.linalg.inv(np.dot(X.T, X * weights))
        standard_errors = np.sqrt(np.diag(variance_cov_matrix))
        z_stats = beta.flatten() / standard_errors
        # sf keeps tail precision; 1 - cdf rounds to exactly 0 for large |z|.
        p_values = 2 * sp.stats.norm.sf(np.abs(z_stats))

        for i, col in enumerate(self.x.columns):
            self.results[col] = {
                'log_odds_ratio': beta[i][0],
                'standard_error': standard_errors[i],
                'z_stat': z_stats[i],
                'p_value': p_values[i],
            }

    def fit_model(self):
        """Run the estimator selected by ``regression_type``.

        Raises
        ------
        ValueError
            If ``regression_type`` is neither 'ols' nor 'logit'.
        """
        if self.regression_type == 'ols':
            self.ols_regression()
        elif self.regression_type == 'logit':
            self.logistic_regression()
        else:
            raise ValueError(
                "Unknown regression_type {!r}; expected 'ols' or "
                "'logit'.".format(self.regression_type)
            )

    def summary(self):
        """Print the stored results as a table with one row per variable."""
        if not self.results:
            print("No results available. Please run fit_model first.")
            return

        # The inner dict keys already carry the right column names for either
        # estimator, so one code path serves both OLS and logit.
        summary_table = pd.DataFrame(self.results).T
        summary_table.index.name = 'Variable name'

        print(summary_table)


# [notebook output] SyntaxError: (unicode error) 'unicodeescape' codec can't decode bytes in position 2-3: truncated \UXXXXXXXX escape (<ipython-input-3-4fc1824b7e03>, line 6)