In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

# Load the Smarket table from the working directory.
Smarket = pd.read_csv("Smarket.csv")

# Split by calendar year: rows before 2005 are used for fitting,
# rows from 2005 are held out for evaluation.
year = Smarket['Year']
train = year < 2005
test = year == 2005

# Only the first two lagged columns are used as predictors.
features = ['Lag1', 'Lag2']
X_train, X_test = Smarket.loc[train, features], Smarket.loc[test, features]
y_train, y_test = Smarket.loc[train, 'Direction'], Smarket.loc[test, 'Direction']

# Encode the string labels as integers; the encoder is fitted on the
# training labels only and then reused for the held-out labels.
le = LabelEncoder()
y_train_enc = le.fit_transform(y_train)
y_test_enc = le.transform(y_test)

# Preview the first rows of the raw table.
Smarket.head()




Unnamed: 0,Year,Lag1,Lag2,Lag3,Lag4,Lag5,Volume,Today,Direction
0,2001,0.381,-0.192,-2.624,-1.055,5.01,1.1913,0.959,Up
1,2001,0.959,0.381,-0.192,-2.624,-1.055,1.2965,1.032,Up
2,2001,1.032,0.959,0.381,-0.192,-2.624,1.4112,-0.623,Down
3,2001,-0.623,1.032,0.959,0.381,-0.192,1.276,0.614,Up
4,2001,0.614,-0.623,1.032,0.959,0.381,1.2057,0.213,Up


In [7]:
# Summary statistics of Lag1 restricted to training rows labelled 'Down'.
down_code = le.transform(['Down'])[0]      # integer code the encoder assigned to 'Down'
mask_down = (y_train_enc == down_code)     # boolean selector over the training rows
lag1_down = X_train.loc[mask_down, 'Lag1']

print("Sample mean Lag1 for 'Down':", lag1_down.mean())
# ddof=1 -> sample (n - 1) standard deviation
print("Sample std  Lag1 for 'Down':", lag1_down.std(ddof=1))


Sample mean Lag1 for 'Down': 0.04279022403258655
Sample std  Lag1 for 'Down': 1.2274456282010824


In [10]:
# Fit Gaussian naive Bayes on the training predictors and encoded labels,
# then predict the encoded class for every held-out row.
gnb = GaussianNB().fit(X_train, y_train_enc)
y_pred = gnb.predict(X_test)

# Compare predictions against the encoded held-out labels
# (true labels are passed first, predictions second).
print("Confusion matrix:")
print(confusion_matrix(y_test_enc, y_pred))
print("Accuracy:", accuracy_score(y_test_enc, y_pred))


Confusion matrix:
[[ 29  82]
 [ 20 121]]
Accuracy: 0.5952380952380952


In [11]:
# Per-class probabilities for the held-out rows, one column per class,
# with each column labelled by the original (decoded) class name.
proba_cols = [f"P({label})" for label in le.classes_]
y_proba = gnb.predict_proba(X_test)
proba_df = pd.DataFrame(y_proba, columns=proba_cols)
proba_df.head()


Unnamed: 0,P(Down),P(Up)
0,0.487329,0.512671
1,0.476236,0.523764
2,0.465295,0.534705
3,0.474845,0.525155
4,0.490206,0.509794
