In [2]:
import pandas as pd
import numpy as np
import statsmodels.api as sm   
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

# Read Smarket.csv and append Direction_num, a 0/1 encoding of the Direction
# column (Down -> 0, Up -> 1) that serves as the response for the logit fits.
direction_codes = {'Down': 0, 'Up': 1}
Smarket = pd.read_csv("Smarket.csv").assign(
    Direction_num=lambda frame: frame['Direction'].map(direction_codes)
)
Smarket.head()



Unnamed: 0,Year,Lag1,Lag2,Lag3,Lag4,Lag5,Volume,Today,Direction,Direction_num
0,2001,0.381,-0.192,-2.624,-1.055,5.01,1.1913,0.959,Up,1
1,2001,0.959,0.381,-0.192,-2.624,-1.055,1.2965,1.032,Up,1
2,2001,1.032,0.959,0.381,-0.192,-2.624,1.4112,-0.623,Down,0
3,2001,-0.623,1.032,0.959,0.381,-0.192,1.276,0.614,Up,1
4,2001,0.614,-0.623,1.032,0.959,0.381,1.2057,0.213,Up,1


In [None]:
# Design matrix for the full sample: the five lagged returns plus Volume,
# with an explicit intercept column ('const') supplied by add_constant.
predictors = Smarket[['Lag1','Lag2','Lag3','Lag4','Lag5','Volume']]
X = sm.add_constant(predictors)
y = Smarket['Direction_num']

# Logit(...).fit() estimates the coefficients by maximum likelihood;
# disp=False suppresses the optimizer's convergence printout.
full_logit = sm.Logit(y, X)
result = full_logit.fit(disp=False)
print(result.summary())



Optimization terminated successfully.
         Current function value: 0.691034
         Iterations 4
                           Logit Regression Results                           
Dep. Variable:          Direction_num   No. Observations:                 1250
Model:                          Logit   Df Residuals:                     1243
Method:                           MLE   Df Model:                            6
Date:                Sun, 07 Dec 2025   Pseudo R-squ.:                0.002074
Time:                        17:32:28   Log-Likelihood:                -863.79
converged:                       True   LL-Null:                       -865.59
Covariance Type:            nonrobust   LLR p-value:                    0.7319
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.1260      0.241     -0.523      0.601      -0.598       0.346
Lag1          -0.0731      0.

In [6]:
# Estimated coefficients and their p-values from the full-sample fit.
for heading, values in (("Coefficients:\n", result.params),
                        ("\nP-values:\n", result.pvalues)):
    print(heading, values)

# Fitted probabilities for the rows the model was trained on; show the first ten.
print("\nPredicted probabilities (first 10 rows):")
fitted_probs = result.predict(X)
print(fitted_probs.round(4).head(10))


Coefficients:
 const    -0.126000
Lag1     -0.073074
Lag2     -0.042301
Lag3      0.011085
Lag4      0.009359
Lag5      0.010313
Volume    0.135441
dtype: float64

P-values:
 const     0.600700
Lag1      0.145232
Lag2      0.398352
Lag3      0.824334
Lag4      0.851445
Lag5      0.834998
Volume    0.392404
dtype: float64

Predicted probabilities (first 10 rows):
0    0.5071
1    0.4815
2    0.4811
3    0.5152
4    0.5108
5    0.5070
6    0.4927
7    0.5092
8    0.5176
9    0.4888
dtype: float64


In [None]:
# Label each in-sample observation 'Up' when the fitted probability from
# `result` exceeds 0.5, otherwise 'Down'.
in_sample_probs = result.predict(X)
pred_labels = np.where(in_sample_probs > 0.5, 'Up', 'Down')

# Boolean mask: rows with Year < 2005 are used for fitting, the rest are held out.
train = Smarket['Year'] < 2005
cols = ['Lag1','Lag2','Lag3','Lag4','Lag5','Volume']

# Slice the frame once per partition, then pull predictors and responses from it.
train_rows = Smarket.loc[train]
test_rows = Smarket.loc[~train]

X_train = sm.add_constant(train_rows[cols])
y_train = train_rows['Direction_num']

X_test = sm.add_constant(test_rows[cols])
y_test = test_rows['Direction_num']
# String labels ('Up'/'Down') for the held-out rows, for comparing against
# string-valued predictions.
y_test_labels = test_rows['Direction']

print("Train:", len(X_train), "Test:", len(X_test))


Train: 998 Test: 252


In [None]:
# Fit the six-predictor logit on the training rows only.
logit_train = sm.Logit(y_train, X_train)
result_train = logit_train.fit(disp=False)
print(result_train.summary())

# Score the held-out rows with the training-only fit and preview ten of them.
glm_probs_2005 = result_train.predict(X_test)
preview_2005 = glm_probs_2005.head(10).round(4)
print("First 10 predicted probabilities for 2005:\n", preview_2005)


                           Logit Regression Results                           
Dep. Variable:          Direction_num   No. Observations:                  998
Model:                          Logit   Df Residuals:                      991
Method:                           MLE   Df Model:                            6
Date:                Sun, 07 Dec 2025   Pseudo R-squ.:                0.001562
Time:                        18:54:50   Log-Likelihood:                -690.55
converged:                       True   LL-Null:                       -691.63
Covariance Type:            nonrobust   LLR p-value:                    0.9044
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.1912      0.334      0.573      0.567      -0.463       0.845
Lag1          -0.0542      0.052     -1.046      0.295      -0.156       0.047
Lag2          -0.0458      0.052     -0.884      0.3

In [14]:
# Reduced model: keep only the first two lags as predictors.
cols_r = ['Lag1', 'Lag2']

# The responses are the same objects used for the full model; only the
# design matrices change.
y_train_r, y_test_r_labels = y_train, y_test_labels

# Build the train and test design matrices (with intercept) from the same
# Year-based mask, in that order.
X_train_r, X_test_r = (
    sm.add_constant(Smarket.loc[mask, cols_r]) for mask in (train, ~train)
)

model_r = sm.Logit(y_train_r, X_train_r).fit(disp=False)
print(model_r.summary())

                           Logit Regression Results                           
Dep. Variable:          Direction_num   No. Observations:                  998
Model:                          Logit   Df Residuals:                      995
Method:                           MLE   Df Model:                            2
Date:                Sun, 07 Dec 2025   Pseudo R-squ.:                0.001347
Time:                        19:02:34   Log-Likelihood:                -690.70
converged:                       True   LL-Null:                       -691.63
Covariance Type:            nonrobust   LLR p-value:                    0.3939
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.0322      0.063      0.508      0.611      -0.092       0.156
Lag1          -0.0556      0.052     -1.076      0.282      -0.157       0.046
Lag2          -0.0445      0.052     -0.861      0.3

In [17]:
# Held-out probabilities from the reduced model, thresholded at 0.5:
# above the threshold is 'Up', everything else is 'Down'.
probs_r_2005 = model_r.predict(X_test_r)
preds_r_2005 = np.where(probs_r_2005 > 0.5, 'Up', 'Down')

# Rows are actual classes and columns are predicted classes, both in the
# order given by class_order.
class_order = ['Up', 'Down']
cm_r = confusion_matrix(y_test_r_labels, preds_r_2005, labels=class_order)
acc_r = accuracy_score(y_test_r_labels, preds_r_2005)

print("Reduced-model confusion matrix (actual x predicted):\n", cm_r)
print("Reduced-model accuracy:", acc_r)

# Score two hypothetical (Lag1, Lag2) observations with the reduced model.
newdata = pd.DataFrame({'Lag1': [1.2, 1.5], 'Lag2': [1.1, -0.8]})

# has_constant='add' always creates the intercept column. The default ('skip')
# returns the frame unchanged whenever an existing column is constant, which is
# the case for any single-row input or when all rows share a value. The design
# matrix would then silently lack the intercept the model was fitted with.
newdata = sm.add_constant(newdata, has_constant='add')

# Put the columns in the same order as the fitted coefficients so each value
# is multiplied by the right parameter.
newdata = newdata[model_r.params.index]

new_probs = model_r.predict(newdata)
print("Predicted classes:", np.where(new_probs > 0.5, 'Up', 'Down'))


Reduced-model confusion matrix (actual x predicted):
 [[106  35]
 [ 76  35]]
Reduced-model accuracy: 0.5595238095238095
Predicted classes: ['Down' 'Down']
