# Exploratory Data Analysis of Stock Market Data

## Imports

In [1]:
!pip install statsmodels plotly pandas



In [2]:
from plotly.offline import plot, iplot, init_notebook_mode
from statsmodels.tsa.seasonal import seasonal_decompose
from plotly.subplots import make_subplots
import plotly.graph_objs as go
import plotly.express as px
import plotly.io as pio
import pandas as pd
import warnings

## Options

In [3]:
# Use the 'notebook' renderer as the default for every plotly figure.
pio.renderers.default='notebook'
# Initialise plotly's notebook integration; connected=True loads plotly.js from the CDN.
init_notebook_mode(connected=True)
# Silence DeprecationWarning messages so they do not clutter cell outputs.
warnings.filterwarnings("ignore", category=DeprecationWarning)

## Functions

In [4]:
def split_date(df, target_col, new_alias=None):
    """
    Expand a datetime column into separate calendar-component columns.

    The new columns are written onto ``df`` itself (in place) and are named
    ``<prefix>_year``, ``<prefix>_month``, ``<prefix>_day``, ``<prefix>_quarter``,
    ``<prefix>_is_month_start`` and ``<prefix>_is_month_end``. The last two are
    stored as 0/1 integers.

    Args:
        df (pd.DataFrame): Frame to extend
        target_col (string): Name of the column to break apart. Must be of datetime type
        new_alias (string): Prefix for the generated columns.
                            Falls back to target_col when no value is passed

    Returns:
        pd.DataFrame: The same ``df`` object, with the extra columns added
    """

    prefix = target_col if new_alias is None else new_alias

    # Plain calendar components are copied straight from the .dt accessor.
    for part in ('year', 'month', 'day', 'quarter'):
        df[f'{prefix}_{part}'] = getattr(df[target_col].dt, part)

    # Boolean month-boundary flags are stored as integers.
    for flag in ('is_month_start', 'is_month_end'):
        df[f'{prefix}_{flag}'] = getattr(df[target_col].dt, flag).astype(int)

    return df

## Load Data

In [5]:
# The path is relative to the kernel's working directory; the read raises
# FileNotFoundError when the CSV is not present at that location.
# parse_dates asks pandas to parse the 'Date' column as datetimes on load.
raw_df = pd.read_csv('../datasets/original/AAPL.csv', parse_dates=['Date'])

raw_df.head()

FileNotFoundError: [Errno 2] No such file or directory: '../datasets/original/AAPL.csv'

## Preprocess

### Column Names

In [None]:
# Work on a copy so raw_df keeps the original headers, then lower-case every
# column name and replace spaces with underscores.
df = raw_df.copy()
df.columns = [label.lower().replace(' ', '_') for label in df.columns]
df.head()

## Feature Engineering

### Split Dates

In [None]:
# Add the year/month/day/quarter and month start/end flag columns derived from 'date'.
df = split_date(df, 'date')

### Returns

In [None]:
# Open-to-close price change for each row. `close` is used instead of `adj_close`
# so both prices are on the same basis; `open` is not an adjusted price, so
# subtracting it from `adj_close` mixes two different price scales.
df['returns'] = df['close'] - df['open']
# Percentage change relative to the opening price, i.e. the starting value of the move.
df['returns_prc'] = 100 * df['returns'] / df['open']

## Exploratory Data Analysis

### Dataframe

This dataset contains 7 features: date, open, high, low, close, adj_close, and volume.

- Date - Date and time at the instant of the trade
- Open - Price at which the stock began the time period
- High - Maximum price during the given time period
- Low - Minimum price during the given time period
- Close - Price at which the stock ended the time period
- Adj Close - Closing price adjusted for corporate actions such as splits and dividends
- Volume - Total amount of trading activity during the time period

In the case of our data, the time period is 15 minutes.


In [None]:
# Preview the first rows, which now include the engineered date-part and returns columns.
df.head()

### Shape and Size

In [None]:
# shape is (rows, columns); size is the total number of cells (rows * columns).
print(
    f'Dataframe Shape: {df.shape}',
    f'Dataframe Size: {df.size}',
    sep='\n'
)

### Column datatypes

In [None]:
# Data type of every column, including the engineered ones.
df.dtypes

### Info and Description

In [None]:
# Summary of index, columns, non-null counts, dtypes and memory usage.
df.info()

In [None]:
# Descriptive statistics (count, mean, spread, quantiles) for the columns.
df.describe()

### Missing Values

In [None]:
# Number of missing values in each column.
df.isnull().sum()

### Columns skew

In [None]:
# Skewness per column; numeric_only=True restricts the computation to numeric columns.
df.skew(numeric_only=True)

### Correlation

In [None]:
# Pairwise correlation between the numeric columns; kept in a variable so the
# heatmap cell can reuse it.
correlations = df.corr(numeric_only=True)
correlations

In [None]:
# Round first so the in-cell annotations stay short and readable.
heatmap_values = correlations.round(2)

# Heatmap of the correlation matrix with each cell annotated by its value.
fig = px.imshow(
    heatmap_values,
    title="Feature Correlations",
    color_continuous_scale='RdYlBu',
    text_auto=True
)

# Rotate the x tick labels so the column names do not overlap, then render.
fig.update_xaxes(tickangle=90).show('notebook')

### Data subset for plot

In [None]:
# NOTE: Creating a subset of data to produce uncluttered plots
# The bound is inclusive so the subset really starts in `year_since`, matching
# the "since {year_since}" wording used in the price plot title.
year_since = 2012
plot_df = df.loc[df.date_year >= year_since, :]

### Volume over date

In [None]:
# Trading volume across the full history. Passing the frame plus column names
# (rather than bare Series) lets the axis labels be set explicitly, and the
# title lets the figure stand on its own.
fig = px.line(
    df,
    x='date',
    y='volume',
    title='Trading Volume over Time',
    labels={'date': 'Date', 'volume': 'Volume'}
)
fig.show('notebook')

### Price over volume

In [None]:
# Adjusted close against traded volume, one line per calendar year, drawn from
# the plotting subset. Title, axis labels and legend title are set explicitly
# so the figure can be read without the surrounding notebook.
fig = px.line(
    plot_df,
    x='volume',
    y='adj_close',
    color='date_year',
    title='Adjusted Close over Volume by Year',
    labels={'volume': 'Volume', 'adj_close': 'Adjusted Close', 'date_year': 'Year'}
)
fig.show('notebook')

### Price over time

In [None]:
# (subplot title, dataframe column) for each stacked panel, top to bottom
panels = [
    ('Open', 'open'),
    ('High', 'high'),
    ('Low', 'low'),
    ('Close', 'close'),
    ('Adjusted Close', 'adj_close'),
]

# One row per price series, all sharing a single column
fig = make_subplots(
    rows=len(panels),
    cols=1,
    subplot_titles=tuple(title for title, _ in panels)
)

# Draw each price series against the date in its own row
for row, (_, column) in enumerate(panels, start=1):
    fig.add_trace(go.Scatter(x=plot_df.date, y=plot_df[column]), row=row, col=1)

# Layout params
fig.update_layout(height=600, width=800, title_text=f'Apple Stock (since {year_since})')
fig.show('notebook')

### Seasonal Trends

In [None]:
# Decompose the last n adjusted-close rows into trend, seasonal and residual
# components using an additive model with a 30-row seasonal period.
n = 1000
result_data = seasonal_decompose(
    df.adj_close.tail(n),
    model='additive',
    period=30
)

# result_data.plot() builds and returns its own figure, so no plotly figure
# needs to be created beforehand; enlarge it so the stacked panels are readable.
fig = result_data.plot()
fig.set_size_inches(20, 19)

## Save dataframe

In [None]:
# Write the preprocessed frame (engineered columns included) to CSV; index=False
# leaves the row index out of the file. The path is relative to the kernel's
# working directory.
df.to_csv('../datasets/preprocessed_aapl.csv', index=False)
# Show the final (rows, columns) of what was written.
df.shape