# Stock Market Data

In [40]:
import pandas as pd
from datetime import datetime, timedelta, date

## Loading the data

In [41]:
# One CSV per ticker (C, GS, MS); the bare file names are resolved against
# the notebook's current working directory.
citi, goldman, morgan = map(
    pd.read_csv,
    ("C-intra.csv", "GS-intra.csv", "MS-intra.csv"),
)

---

## Overview of the dataframe

In [42]:
# Preview the first rows of the raw Citi frame; the column labels are still
# wrapped in angle brackets (e.g. <DATE>) at this point.
citi.head()

Unnamed: 0,<DATE>,<TIME>,<OPEN>,<HIGH>,<LOW>,<CLOSE>,<VOL>
0,20191008,100000,67.05,67.05,66.34,66.34,113224
1,20191008,110000,66.32,66.5,66.03,66.5,163117
2,20191008,120000,66.495,66.84,66.19,66.65,105768
3,20191008,130000,66.65,67.17,66.6,66.91,91339
4,20191008,140000,66.92,67.03,66.87,66.87,100957


In [43]:
# Preview the first rows of the raw Goldman frame.
goldman.head()

Unnamed: 0,<DATE>,<TIME>,<OPEN>,<HIGH>,<LOW>,<CLOSE>,<VOL>
0,20191008,100000,197.59,197.71,196.28,196.3,22894
1,20191008,110000,196.3,197.5,195.94,197.5,28411
2,20191008,120000,197.46,198.3,196.61,197.39,25527
3,20191008,130000,197.41,198.78,197.21,197.98,20972
4,20191008,140000,198.01,198.56,197.86,198.05,10451


In [44]:
# Preview the first rows of the raw Morgan frame. In the output shown here its
# first bar is at 11:00, one hour later than the first bar of the other two.
morgan.head()

Unnamed: 0,<DATE>,<TIME>,<OPEN>,<HIGH>,<LOW>,<CLOSE>,<VOL>
0,20191008,110000,39.93,40.025,39.78,40.01,33348
1,20191008,120000,40.0,40.15,39.75,40.0,34901
2,20191008,130000,39.96,40.15,39.92,40.02,21383
3,20191008,140000,39.99,40.19,39.99,40.08,13972
4,20191008,150000,40.1,40.15,39.94,39.98,29330


---

## Standardization of data

In [45]:
# The list stores references, not copies: helpers that modify its items in
# place therefore change citi, goldman and morgan themselves.
files = [citi , goldman, morgan]

#### Remove '<' and '>' from column names

In [46]:
def columns_names(files):
    """Strip every '<' and '>' character from the column labels of each frame.

    Each DataFrame in ``files`` is modified in place; nothing is returned.
    """
    for frame in files:
        labels = frame.columns
        # Remove the opening bracket first, then the closing one.
        for bracket in ('<', '>'):
            labels = labels.str.replace(bracket, '')
        frame.columns = labels
    

In [47]:
# Renames the columns of citi, goldman and morgan in place (returns None).
columns_names(files)

In [48]:
# Check that the angle brackets are gone from the column labels.
citi.head()

Unnamed: 0,DATE,TIME,OPEN,HIGH,LOW,CLOSE,VOL
0,20191008,100000,67.05,67.05,66.34,66.34,113224
1,20191008,110000,66.32,66.5,66.03,66.5,163117
2,20191008,120000,66.495,66.84,66.19,66.65,105768
3,20191008,130000,66.65,67.17,66.6,66.91,91339
4,20191008,140000,66.92,67.03,66.87,66.87,100957


#### Standardize date and time

In [49]:
def to_date(files):
    """Merge the DATE and TIME columns of each frame into one timestamp column.

    For every DataFrame in ``files`` the ``DATE`` column (``YYYYMMDD``) and
    the ``TIME`` column (``HHMMSS``) are concatenated, parsed with the format
    ``%Y%m%d%H%M%S`` and written back to ``DATE``; ``TIME`` is then dropped.
    The frames are modified in place and nothing is returned.

    ``TIME`` is left-padded with zeros to six digits before parsing, so a
    value read from CSV as the integer 93000 is interpreted as 09:30:00 and
    3000 as 00:30:00.

    A frame that has no ``TIME`` column and whose ``DATE`` is already a
    datetime column is skipped, which makes the function safe to re-run.

    Parameters
    ----------
    files : iterable of pandas.DataFrame
        Frames containing ``DATE`` and ``TIME`` columns.
    """
    for data in files:
        already_converted = (
            'TIME' not in data.columns
            and pd.api.types.is_datetime64_any_dtype(data['DATE'])
        )
        if already_converted:
            continue

        date_digits = data['DATE'].astype(str)
        # Integers lose their leading zeros; restore them so the string is
        # always exactly HHMMSS and the fixed-width format parses unambiguously.
        time_digits = data['TIME'].astype(str).str.zfill(6)

        data['DATE'] = pd.to_datetime(date_digits + time_digits, format="%Y%m%d%H%M%S")
        data.drop(columns=['TIME'], inplace=True)

In [50]:
# Rewrites DATE as a timestamp and removes TIME on citi, goldman and morgan
# in place.
to_date(files)

In [51]:
# Check that DATE now holds full timestamps and that TIME is gone.
citi.head()

Unnamed: 0,DATE,OPEN,HIGH,LOW,CLOSE,VOL
0,2019-10-08 10:00:00,67.05,67.05,66.34,66.34,113224
1,2019-10-08 11:00:00,66.32,66.5,66.03,66.5,163117
2,2019-10-08 12:00:00,66.495,66.84,66.19,66.65,105768
3,2019-10-08 13:00:00,66.65,67.17,66.6,66.91,91339
4,2019-10-08 14:00:00,66.92,67.03,66.87,66.87,100957


---

## Add stock change for future correlation

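The percentage change of each hourly bar is computed relative to the closing price of the preceding bar: $\%\text{CHANGE}_t = 100 \cdot (C_t - C_{t-1}) / C_{t-1}$, where $C_t$ is the closing price of bar $t$. For the first bar of a trading day the preceding bar is the last bar of the previous day, so the previous day's closing price is used. The very first row of each dataset has no preceding bar and is set to 0.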

In [52]:
def add_change(files):
    """Add a ``%CHANGE`` column to every frame in ``files``, in place.

    ``%CHANGE`` is the percentage change of ``CLOSE`` relative to the
    ``CLOSE`` of the previous row: ``(close - previous) / previous * 100``.
    The first row has no previous row and is set to 0.

    The previous row is taken by position, so the result does not depend on
    the frame's index labels.

    Parameters
    ----------
    files : iterable of pandas.DataFrame
        Frames containing a numeric ``CLOSE`` column.
    """
    for data in files:
        close = data['CLOSE']
        previous_close = close.shift(1)
        change = (close - previous_close) / previous_close * 100
        # Only the first position lacks a predecessor; the slice also keeps
        # this safe for an empty frame.
        change.iloc[:1] = 0.0
        data['%CHANGE'] = change

In [53]:
# Adds the %CHANGE column to citi, goldman and morgan in place.
add_change(files)

In [54]:
# Check the new %CHANGE column; the first row is 0 because it has no
# previous close to compare against.
citi.head()

Unnamed: 0,DATE,OPEN,HIGH,LOW,CLOSE,VOL,%CHANGE
0,2019-10-08 10:00:00,67.05,67.05,66.34,66.34,113224,0.0
1,2019-10-08 11:00:00,66.32,66.5,66.03,66.5,163117,0.241182
2,2019-10-08 12:00:00,66.495,66.84,66.19,66.65,105768,0.225564
3,2019-10-08 13:00:00,66.65,67.17,66.6,66.91,91339,0.390098
4,2019-10-08 14:00:00,66.92,67.03,66.87,66.87,100957,-0.059782


---

#### % change one hour after each day's opening

In [57]:
def add_change2(files, target_hours=None):
    """Add a ``%CHANGE_1H`` column to every frame in ``files``, in place.

    For each row whose ``DATE`` falls on the frame's target hour, the value is
    the percentage change of ``CLOSE`` relative to the ``CLOSE`` two rows
    earlier: ``(close - earlier) / earlier * 100``. Every other row gets 0.
    The first two rows always get 0 because no row exists two positions back.

    NOTE(review): with gap-free hourly bars the target row is the second bar
    of the day, which makes "two rows earlier" the last bar of the previous
    day -- confirm the data has no missing bars.

    Parameters
    ----------
    files : list of pandas.DataFrame
        Frames with a datetime ``DATE`` column and a numeric ``CLOSE`` column.
    target_hours : sequence of int, optional
        Hour of day to evaluate for each frame, aligned with ``files``.
        When omitted, the notebook's original rule applies: 12 for a frame
        equal to the notebook-level ``morgan`` frame, 11 for any other frame.

    Raises
    ------
    ValueError
        If ``target_hours`` is given and its length differs from ``files``.
    """
    if target_hours is not None and len(target_hours) != len(files):
        raise ValueError("target_hours must contain one hour per frame in files")

    for position, data in enumerate(files):
        if target_hours is not None:
            hour = target_hours[position]
        else:
            # Evaluated lazily, frame by frame, exactly like the original loop.
            hour = 12 if data.equals(morgan) else 11

        close = data['CLOSE']
        reference_close = close.shift(2)
        change = (close - reference_close) / reference_close * 100

        # Keep the value only on the target hour; everything else is 0.
        change = change.where(data['DATE'].dt.hour == hour, 0.0)
        # The first two positions have nothing two rows back.
        change.iloc[:2] = 0.0

        data['%CHANGE_1H'] = change

In [58]:
# Adds the %CHANGE_1H column to citi, goldman and morgan in place.
add_change2(files)

In [65]:
# .loc slices by label and includes the end label, so this requests index
# labels 0 through 10 -- enough rows to span the first day boundary.
citi.loc[:10]

Unnamed: 0,DATE,OPEN,HIGH,LOW,CLOSE,VOL,%CHANGE,%CHANGE_1H
0,2019-10-08 10:00:00,67.05,67.05,66.34,66.34,113224,0.0,0.0
1,2019-10-08 11:00:00,66.32,66.5,66.03,66.5,163117,0.241182,0.0
2,2019-10-08 12:00:00,66.495,66.84,66.19,66.65,105768,0.225564,0.0
3,2019-10-08 13:00:00,66.65,67.17,66.6,66.91,91339,0.390098,0.0
4,2019-10-08 14:00:00,66.92,67.03,66.87,66.87,100957,-0.059782,0.0
5,2019-10-08 15:00:00,66.89,67.11,66.63,66.775,103199,-0.142067,0.0
6,2019-10-08 16:00:00,66.79,66.97,66.32,66.425,166816,-0.524148,0.0
7,2019-10-09 10:00:00,66.9,67.18,66.79,67.035,50634,0.918329,0.0
8,2019-10-09 11:00:00,67.03,67.36,66.85,67.12,86032,0.126799,1.046293
9,2019-10-09 12:00:00,67.16,67.49,66.97,67.29,53317,0.253278,0.0
