# Categorical and Continuous Data

Categorical data: data contain a finite number of categories or distinct groups. Categorical data might not have a logical order. For example, categorical predictors include gender, material type, and payment method. On the other hand, categorical have strings or labels. 

Continuous data: are numeric variables that have an infinite number of values between any two values. A continuous variable can be numeric or date/time. For example, the length of a part or the date and time a payment is received. 

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

import warnings
warnings.filterwarnings("ignore")

# fix_yahoo_finance is used to fetch data 
import fix_yahoo_finance as yf
yf.pdr_override()

In [2]:
# input
symbol = 'AMD'
start = '2014-01-01'
end = '2019-01-01'

# Read data 
dataset = yf.download(symbol,start,end)

# View Columns
dataset.head()

[*********************100%***********************]  1 of 1 downloaded


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2014-01-02,3.85,3.98,3.84,3.95,3.95,20548400
2014-01-03,3.98,4.0,3.88,4.0,4.0,22887200
2014-01-06,4.01,4.18,3.99,4.13,4.13,42398300
2014-01-07,4.19,4.25,4.11,4.18,4.18,42932100
2014-01-08,4.23,4.26,4.14,4.18,4.18,30678700


In [3]:
# Create Data
dataset['Open_Close'] = (dataset['Open'] - dataset['Adj Close'])/dataset['Open']
dataset['High_Low'] = (dataset['High'] - dataset['Low'])/dataset['Low']
dataset['Increase_Decrease'] = np.where(dataset['Volume'].shift(-1) > dataset['Volume'],1,0)
dataset['Buy_Sell_on_Open'] = np.where(dataset['Open'].shift(-1) > dataset['Open'],1,0)
dataset['Buy_Sell'] = np.where(dataset['Adj Close'].shift(-1) > dataset['Adj Close'],1,0)
dataset['Returns'] = dataset['Adj Close'].pct_change()
dataset = dataset.dropna()
dataset.head()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,Open_Close,High_Low,Increase_Decrease,Buy_Sell_on_Open,Buy_Sell,Returns
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2014-01-03,3.98,4.0,3.88,4.0,4.0,22887200,-0.005025,0.030928,1,1,1,0.012658
2014-01-06,4.01,4.18,3.99,4.13,4.13,42398300,-0.029925,0.047619,1,1,1,0.0325
2014-01-07,4.19,4.25,4.11,4.18,4.18,42932100,0.002387,0.034063,0,1,0,0.012107
2014-01-08,4.23,4.26,4.14,4.18,4.18,30678700,0.01182,0.028986,0,0,0,0.0
2014-01-09,4.2,4.23,4.05,4.09,4.09,30667600,0.02619,0.044444,0,0,1,-0.021531


In [4]:
# Create Labels string
dataset['Increase_Decrease_L'] = np.where(dataset['Volume'].shift(-1) > dataset['Volume'],'Increase','Decrease')
dataset['Buy_Sell_on_Open_L'] = np.where(dataset['Open'].shift(-1) > dataset['Open'],'Buy','Sell')
dataset['Buy_Sell_L'] = np.where(dataset['Adj Close'].shift(-1) > dataset['Adj Close'],'Buy','Sell')
dataset.head()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,Open_Close,High_Low,Increase_Decrease,Buy_Sell_on_Open,Buy_Sell,Returns,Increase_Decrease_L,Buy_Sell_on_Open_L,Buy_Sell_L
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2014-01-03,3.98,4.0,3.88,4.0,4.0,22887200,-0.005025,0.030928,1,1,1,0.012658,Increase,Buy,Buy
2014-01-06,4.01,4.18,3.99,4.13,4.13,42398300,-0.029925,0.047619,1,1,1,0.0325,Increase,Buy,Buy
2014-01-07,4.19,4.25,4.11,4.18,4.18,42932100,0.002387,0.034063,0,1,0,0.012107,Decrease,Buy,Sell
2014-01-08,4.23,4.26,4.14,4.18,4.18,30678700,0.01182,0.028986,0,0,0,0.0,Decrease,Sell,Sell
2014-01-09,4.2,4.23,4.05,4.09,4.09,30667600,0.02619,0.044444,0,0,1,-0.021531,Decrease,Sell,Buy


In [5]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1257 entries, 2014-01-03 to 2018-12-31
Data columns (total 15 columns):
Open                   1257 non-null float64
High                   1257 non-null float64
Low                    1257 non-null float64
Close                  1257 non-null float64
Adj Close              1257 non-null float64
Volume                 1257 non-null int32
Open_Close             1257 non-null float64
High_Low               1257 non-null float64
Increase_Decrease      1257 non-null int32
Buy_Sell_on_Open       1257 non-null int32
Buy_Sell               1257 non-null int32
Returns                1257 non-null float64
Increase_Decrease_L    1257 non-null object
Buy_Sell_on_Open_L     1257 non-null object
Buy_Sell_L             1257 non-null object
dtypes: float64(8), int32(4), object(3)
memory usage: 137.5+ KB


In [8]:
# separating the features and labels

data_feat = dataset.drop(columns=["Buy_Sell"],axis=1)
data_label = dataset["Buy_Sell"]

In [9]:
# first just take a look at all the columns
list(data_feat.columns)

['Open',
 'High',
 'Low',
 'Close',
 'Adj Close',
 'Volume',
 'Open_Close',
 'High_Low',
 'Increase_Decrease',
 'Buy_Sell_on_Open',
 'Returns',
 'Increase_Decrease_L',
 'Buy_Sell_on_Open_L',
 'Buy_Sell_L']

In [10]:
categorical_columns = [col for col in data_feat.columns if len(data_feat[col].unique())==2 or data_feat[col].dtype=='O']

In [11]:
continuous_columns = [col for col in data_feat.columns if len(data_feat[col].unique())>2 and (data_feat[col].dtype=='int64' or data_feat[col].dtype=='float64')]

In [13]:
print("Categorical Variables:")
print("categorical columns : ", categorical_columns)
print('-'*50)
print("Continuous Variables:")
print("continuous columns : ", continuous_columns)

Categorical Variables:
categorical columns :  ['Increase_Decrease', 'Buy_Sell_on_Open', 'Increase_Decrease_L', 'Buy_Sell_on_Open_L', 'Buy_Sell_L']
--------------------------------------------------
Continuous Variables:
continuous columns :  ['Open', 'High', 'Low', 'Close', 'Adj Close', 'Open_Close', 'High_Low', 'Returns']
