# Import Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
from sklearn.cluster import DBSCAN
from prophet import Prophet

# Data Preprocessing

In [2]:
# Read the raw price table from the Excel workbook (path is relative to the
# notebook's working directory).
source_path = 'yahoo_data.xlsx'
data = pd.read_excel(source_path)

In [3]:
# Preview the first rows (head() defaults to 5) to check the raw column layout.
data.head()

Unnamed: 0,Date,Open,High,Low,Close*,Adj Close**,Volume
0,"Apr 28, 2023",33797.43,34104.56,33728.4,34098.16,34098.16,354310000
1,"Apr 27, 2023",33381.66,33859.75,33374.65,33826.16,33826.16,343240000
2,"Apr 26, 2023",33596.34,33645.83,33235.85,33301.87,33301.87,321170000
3,"Apr 25, 2023",33828.34,33875.49,33525.39,33530.83,33530.83,297880000
4,"Apr 24, 2023",33805.04,33891.15,33726.09,33875.4,33875.4,252020000


In [4]:
# Print the index range, per-column non-null counts and dtypes of the raw frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1258 entries, 0 to 1257
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Date         1258 non-null   object 
 1   Open         1258 non-null   float64
 2   High         1258 non-null   float64
 3   Low          1258 non-null   float64
 4   Close*       1258 non-null   float64
 5   Adj Close**  1258 non-null   float64
 6   Volume       1258 non-null   int64  
dtypes: float64(5), int64(1), object(1)
memory usage: 68.9+ KB


In [6]:
# Count missing values per column (isna is the canonical name for isnull).
data.isna().sum()

Date           0
Open           0
High           0
Low            0
Close*         0
Adj Close**    0
Volume         0
dtype: int64

In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
data.describe()

Unnamed: 0,Open,High,Low,Close*,Adj Close**,Volume
count,1258.0,1258.0,1258.0,1258.0,1258.0,1258.0
mean,29595.823045,29776.945739,29402.432226,29599.361677,29599.361677,345063600.0
std,4006.078299,4009.007573,4004.949066,4007.468822,4007.468822,106914200.0
min,19028.36,19121.01,18213.65,18591.93,18591.93,86150000.0
25%,26041.2675,26163.155,25877.8725,26027.12,26027.12,277312500.0
50%,29201.41,29335.685,28996.5,29199.46,29199.46,324725000.0
75%,33604.0275,33825.445,33346.8275,33600.3425,33600.3425,387510000.0
max,36722.6,36952.65,36636.0,36799.65,36799.65,915990000.0


In [8]:
# Rename columns: replace the raw headers (e.g. 'Close*', 'Adj Close**') with
# plain identifiers. The assignment is positional, so the order of this list
# must match the column order of the loaded sheet.
clean_names = ['Date', 'open', 'High', 'Low', 'Close', 'Adj_Close', 'Volume']
data.columns = clean_names

In [10]:
# Parse the textual dates (e.g. "Apr 28, 2023") into datetime64 values
parsed_dates = pd.to_datetime(data['Date'])
data['Date'] = parsed_dates

In [12]:
# Order rows chronologically (oldest first) and renumber the index from 0,
# so that the rolling indicators computed later run forward in time.
data = data.sort_values(by="Date", ignore_index=True)

In [13]:
# Cast price/volume columns to float.
# Any thousands separators in string cells are stripped first; cells that are
# already numeric pass through the replace step unchanged.
numeric_cols = ['open', 'High', 'Low', 'Close', 'Adj_Close', 'Volume']
without_commas = data[numeric_cols].replace(',', '', regex=True)
data[numeric_cols] = without_commas.astype(float)

Unnamed: 0,Date,open,High,Low,Close,Adj_Close,Volume
0,2018-05-01,24117.29,24117.29,23808.19,24099.05,24099.05,380070000.0
1,2018-05-02,24097.63,24185.52,23886.3,23924.98,23924.98,385350000.0
2,2018-05-03,23836.23,23996.15,23531.31,23930.15,23930.15,389240000.0
3,2018-05-04,23865.22,24333.35,23778.87,24262.51,24262.51,329480000.0
4,2018-05-07,24317.66,24479.45,24263.42,24357.32,24357.32,307670000.0


# Financial Indicators

In [15]:
# Simple Moving Average (SMA): unweighted mean of the last 20 closes.
# The first 19 rows are NaN because the window is not yet full.
close_window_20 = data['Close'].rolling(window=20)
data['SMA_20'] = close_window_20.mean()

In [16]:
# Exponential Moving Average (EMA) with a 20-period span, computed
# recursively (adjust=False), so it is defined from the first row onward.
close_ewm_20 = data['Close'].ewm(span=20, adjust=False)
data['EMA_20'] = close_ewm_20.mean()

In [17]:
# Bollinger Bands: SMA_20 plus/minus two rolling standard deviations of Close.
# NOTE(review): the column name 'BB_uperr' looks like a typo of "upper"; it is
# left unchanged here because other cells may reference it.
band_width = 2 * data['Close'].rolling(window=20).std()
data['BB_uperr'] = data['SMA_20'] + band_width
data['BB_lower'] = data['SMA_20'] - band_width

In [18]:
# RSI (Relative Strength Index), simple-moving-average variant:
#   RS  = mean(gains over window) / mean(losses over window)
#   RSI = 100 - 100 / (1 + RS)
RSI_WINDOW = 14
delta = data['Close'].diff()
# clip() preserves the leading NaN produced by diff(). The previous
# where(..., 0) form replaced that NaN with a fabricated zero change, so the
# first RSI value was averaged over only 13 real price changes. With clip()
# the first RSI appears once a full window of real changes is available.
gain = delta.clip(lower=0).rolling(window=RSI_WINDOW).mean()
loss = (-delta).clip(lower=0).rolling(window=RSI_WINDOW).mean()
# A window with no losses gives rs = inf and therefore RSI = 100; a window
# with neither gains nor losses gives 0/0 = NaN.
rs = gain / loss
data['RSI'] = 100 - (100 / (1 + rs))

# Anomaly Detection (Isolation Forest or DBSCAN)

1. Using Isolation Forest: 

In [21]:
# Build the model input: the chosen indicator columns, keeping only rows where
# every indicator is defined (drops the rolling-window warm-up rows).
feature_cols = ['Close', 'SMA_20', 'EMA_20', 'RSI']
features = data[feature_cols].dropna()

In [23]:
# Fit Isolation Forest on the feature rows; contamination=0.01 sets the
# expected share of outliers and random_state fixes the seed for repeatability.
iso_forest = IsolationForest(contamination=0.01, random_state=42)
raw_iso_labels = iso_forest.fit_predict(features)  # 1 = inlier, -1 = outlier
# Write labels back only for rows that had complete features; others stay NaN.
data.loc[features.index, 'anomaly_iso'] = raw_iso_labels

In [24]:
# Label Anomalies: 1 = anomaly, 0 = normal, NaN = row had incomplete features.
# The labels are derived from the fitted model rather than by remapping the
# 'anomaly_iso' column in place. Remapping in place is not re-runnable: a
# second execution would turn every 1 into 0 and every 0 into NaN. For
# IsolationForest, predict() on the training rows returns the same 1 / -1
# labels as fit_predict(), so the first-run result is unchanged.
iso_flags = pd.Series(iso_forest.predict(features), index=features.index)
data['anomaly_iso'] = iso_flags.map({1: 0, -1: 1}).reindex(data.index)

2. Using DBSCAN: