# 1.0 Data preparation

In [1]:
import base64
import math
from pathlib import Path

import numpy as np
import pandas as pd

In [2]:
# Disable truncation so displayed frames show every row and every column.
pd.options.display.max_rows = None
pd.options.display.max_columns = None

In [3]:
# Load the raw stocks CSV into a DataFrame and preview its first rows
raw_csv_path = '../data/raw/stocks_data/stocks.csv'
df = pd.read_csv(raw_csv_path)
df.head()

Unnamed: 0.1,Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,symbol
0,0,2022-03-01 00:00:00-05:00,57.610421,57.817248,54.028574,54.667854,8964000,0.0,0.0,TFC
1,1,2022-03-02 00:00:00-05:00,55.523362,57.572819,55.325935,57.055752,6482900,0.0,0.0,TFC
2,2,2022-03-03 00:00:00-05:00,57.337791,57.601023,56.143841,56.924137,4755600,0.0,0.0,TFC
3,3,2022-03-04 00:00:00-05:00,55.325934,55.673781,54.846476,55.429348,8031300,0.0,0.0,TFC
4,4,2022-03-07 00:00:00-05:00,55.034501,55.325935,52.825222,52.862827,8460000,0.0,0.0,TFC


In [4]:
# List the column labels exactly as they were loaded from the CSV
df.columns

Index(['Unnamed: 0', 'Date', 'Open', 'High', 'Low', 'Close', 'Volume',
       'Dividends', 'Stock Splits', 'symbol'],
      dtype='object')

In [5]:
# Drop the leftover "Unnamed: 0" column that came in with the CSV.
# errors="ignore" keeps this cell safe to re-run: without it, a second
# execution raises KeyError because the column is already gone.
df = df.drop(columns=["Unnamed: 0"], errors="ignore")

In [6]:
# Tidy the two inconsistent headers: replace the space in "Stock Splits"
# with an underscore and capitalise "symbol"
columns_dict = {
    "Stock Splits": "Stock_Splits",
    "symbol": "Symbol",
}
df = df.rename(columns_dict, axis="columns")
df.columns

Index(['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Dividends',
       'Stock_Splits', 'Symbol'],
      dtype='object')

In [7]:
# Preview the first rows after the column drop and rename
df.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock_Splits,Symbol
0,2022-03-01 00:00:00-05:00,57.610421,57.817248,54.028574,54.667854,8964000,0.0,0.0,TFC
1,2022-03-02 00:00:00-05:00,55.523362,57.572819,55.325935,57.055752,6482900,0.0,0.0,TFC
2,2022-03-03 00:00:00-05:00,57.337791,57.601023,56.143841,56.924137,4755600,0.0,0.0,TFC
3,2022-03-04 00:00:00-05:00,55.325934,55.673781,54.846476,55.429348,8031300,0.0,0.0,TFC
4,2022-03-07 00:00:00-05:00,55.034501,55.325935,52.825222,52.862827,8460000,0.0,0.0,TFC


In [8]:
# Collect the names of the columns whose dtype is the generic 'object' type
strings = [name for name, dtype in df.dtypes.items() if dtype == 'object']
strings

['Date', 'Symbol']

In [9]:
# Normalise ticker symbols to upper case
df = df.assign(Symbol=df['Symbol'].str.upper())

In [10]:
# Preview the first rows after upper-casing the Symbol column
df.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock_Splits,Symbol
0,2022-03-01 00:00:00-05:00,57.610421,57.817248,54.028574,54.667854,8964000,0.0,0.0,TFC
1,2022-03-02 00:00:00-05:00,55.523362,57.572819,55.325935,57.055752,6482900,0.0,0.0,TFC
2,2022-03-03 00:00:00-05:00,57.337791,57.601023,56.143841,56.924137,4755600,0.0,0.0,TFC
3,2022-03-04 00:00:00-05:00,55.325934,55.673781,54.846476,55.429348,8031300,0.0,0.0,TFC
4,2022-03-07 00:00:00-05:00,55.034501,55.325935,52.825222,52.862827,8460000,0.0,0.0,TFC


In [11]:
# Count the missing values in each column
df.isna().sum()

Date            0
Open            0
High            0
Low             0
Close           0
Volume          0
Dividends       0
Stock_Splits    0
Symbol          0
dtype: int64

In [12]:
# Last look at the first rows before the frame is written to disk
df.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock_Splits,Symbol
0,2022-03-01 00:00:00-05:00,57.610421,57.817248,54.028574,54.667854,8964000,0.0,0.0,TFC
1,2022-03-02 00:00:00-05:00,55.523362,57.572819,55.325935,57.055752,6482900,0.0,0.0,TFC
2,2022-03-03 00:00:00-05:00,57.337791,57.601023,56.143841,56.924137,4755600,0.0,0.0,TFC
3,2022-03-04 00:00:00-05:00,55.325934,55.673781,54.846476,55.429348,8031300,0.0,0.0,TFC
4,2022-03-07 00:00:00-05:00,55.034501,55.325935,52.825222,52.862827,8460000,0.0,0.0,TFC


In [13]:
# Write the cleaned frame to disk. to_csv does not create missing parent
# directories, so make sure the target folder exists first; otherwise a
# fresh checkout without ../data/cleaned/stocks_data/ fails with OSError.
cleaned_csv_path = Path('../data/cleaned/stocks_data/stocks_data.csv')
cleaned_csv_path.parent.mkdir(parents=True, exist_ok=True)
# NOTE(review): the row index is still written as an unnamed first column,
# which presumably is how the "Unnamed: 0" column got into the raw file.
# Consider index=False once readers of this file are confirmed not to rely on it.
df.to_csv(cleaned_csv_path, encoding="utf-8")