In [9]:
from sklearn.model_selection import cross_val_score
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
import warnings 
warnings.filterwarnings('ignore')

In [10]:
# K-Fold: For Regression 

In [11]:
X, y = load_iris(return_X_y=True)

In [12]:
model = LogisticRegression(max_iter=200)

In [13]:
scores = cross_val_score(model, X, y, cv=5)

In [14]:
print("Cross Validation Score:", scores)
print("Average accuracy:", scores.mean())

Cross Validation Score: [0.96666667 1.         0.93333333 0.96666667 1.        ]
Average accuracy: 0.9733333333333334


# Stratified K-Fold

In [15]:
# Stratified: For Classfication

In [16]:
from sklearn.model_selection import StratifiedKFold

In [17]:
skf = StratifiedKFold(
    n_splits = 7, shuffle = True, random_state= 42 
)

In [18]:
scores = cross_val_score(
    model, X, y, cv=skf
)

In [19]:
print("Stratified CV Accuracy:", scores.mean())

Stratified CV Accuracy: 0.9730983302411874


# Breast Cancer Dataset 

In [20]:
# k-fold

In [21]:
from sklearn.datasets import load_breast_cancer

In [22]:
X, y = load_breast_cancer(return_X_y=True)

model = LogisticRegression(max_iter=500)

scores = cross_val_score(model, X, y, cv=5)

print("Cross Validation Score:", scores)
print("Average accuracy:", scores.mean())

Cross Validation Score: [0.93859649 0.93859649 0.97368421 0.94736842 0.96460177]
Average accuracy: 0.9525694767893185


In [23]:
# stratified

In [24]:
skf = StratifiedKFold(
    n_splits = 5, shuffle = True, random_state= 42 
)

scores = cross_val_score(
    model, X, y, cv=skf
)

print("Stratified CV Accuracy:", scores.mean())

Stratified CV Accuracy: 0.9525694767893185


# Time Series

In [25]:
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
from sklearn.linear_model import LinearRegression

In [26]:
X = np.arange(100).reshape(-1,1)
y = X.flatten()*0.5 + np.random.rand(100) 

In [27]:
model = LinearRegression()

tscv = TimeSeriesSplit(n_splits =5)

scores = cross_val_score(
    model, X, y, cv = tscv, scoring = 'r2'
)

print("R2 Scores:", scores)
print("Average R2:", scores.mean())

R2 Scores: [0.98764346 0.9824722  0.98292631 0.98441045 0.98130947]
Average R2: 0.9837523773315485


# Time Series with Real Dataset

In [28]:
import pandas as pd
import yfinance as yf

In [44]:
data = yf.download("AAPL", start = '2010-01-01', end ='2025-01-01')

[*********************100%***********************]  1 of 1 completed


In [45]:
data

Price,Close,High,Low,Open,Volume
Ticker,AAPL,AAPL,AAPL,AAPL,AAPL
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
2010-01-04,6.418383,6.433078,6.369497,6.400988,493729600
2010-01-05,6.429481,6.465770,6.395590,6.436079,601904800
2010-01-06,6.327211,6.454973,6.320613,6.429480,552160000
2010-01-07,6.315515,6.358103,6.269629,6.350605,477131200
2010-01-08,6.357502,6.358102,6.269928,6.307117,447610800
...,...,...,...,...,...
2024-12-24,257.037476,257.047410,254.140559,254.339671,23234700
2024-12-26,257.853760,258.928914,256.470034,257.027510,27237100
2024-12-27,254.439209,257.535222,251.920601,256.669114,42355300
2024-12-30,251.064484,252.358634,249.621015,251.094347,35557500


In [46]:
df = data[['Close']].dropna()
print(df.head())

Price          Close
Ticker          AAPL
Date                
2010-01-04  6.418383
2010-01-05  6.429481
2010-01-06  6.327211
2010-01-07  6.315515
2010-01-08  6.357502


In [47]:
df['Target'] = df['Close'].shift(-1)
df.dropna()

X = X[:-1]
y = y[:-1]

In [48]:
tscv2 = TimeSeriesSplit(n_splits = 5)

model = LinearRegression()

scores = cross_val_score(
    model, X, y, cv = tscv2, scoring = 'r2'
)

print("R2 Scores:", scores)
print("Average R2 Mean:", scores.mean())
print("Average R2:", np.nanmean(scores))

R2 Scores: [0.99341566 0.98906517 0.99109924 0.99307595 0.99200881]
Average R2 Mean: 0.9917329654175221
Average R2: 0.9917329654175221


In [None]:
# H:w: Apple Stock Real time series crosss validation 