# Array Selection Numpy

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="white", color_codes=True)

import warnings
warnings.filterwarnings("ignore")

# fix_yahoo_finance is used to fetch data 
import fix_yahoo_finance as yf
yf.pdr_override()

In [2]:
# input
symbol = 'AMD'
start = '2014-01-01'
end = '2019-01-01'

# Read data 
dataset = yf.download(symbol,start,end)

# View Columns
dataset.head()

[*********************100%***********************]  1 of 1 downloaded


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2014-01-02,3.85,3.98,3.84,3.95,3.95,20548400
2014-01-03,3.98,4.0,3.88,4.0,4.0,22887200
2014-01-06,4.01,4.18,3.99,4.13,4.13,42398300
2014-01-07,4.19,4.25,4.11,4.18,4.18,42932100
2014-01-08,4.23,4.26,4.14,4.18,4.18,30678700


In [3]:
# Create Data
dataset['Open_Close'] = (dataset['Open'] - dataset['Adj Close'])/dataset['Open']
dataset['High_Low'] = (dataset['High'] - dataset['Low'])/dataset['Low']
dataset['Increase_Decrease'] = np.where(dataset['Volume'].shift(-1) > dataset['Volume'],1,0)
dataset['Buy_Sell_on_Open'] = np.where(dataset['Open'].shift(-1) > dataset['Open'],1,0)
dataset['Buy_Sell'] = np.where(dataset['Adj Close'].shift(-1) > dataset['Adj Close'],1,0)
dataset['Returns'] = dataset['Adj Close'].pct_change()
dataset = dataset.dropna()

In [4]:
dataset.head()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,Open_Close,High_Low,Increase_Decrease,Buy_Sell_on_Open,Buy_Sell,Returns
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2014-01-03,3.98,4.0,3.88,4.0,4.0,22887200,-0.005025,0.030928,1,1,1,0.012658
2014-01-06,4.01,4.18,3.99,4.13,4.13,42398300,-0.029925,0.047619,1,1,1,0.0325
2014-01-07,4.19,4.25,4.11,4.18,4.18,42932100,0.002387,0.034063,0,1,0,0.012107
2014-01-08,4.23,4.26,4.14,4.18,4.18,30678700,0.01182,0.028986,0,0,0,0.0
2014-01-09,4.2,4.23,4.05,4.09,4.09,30667600,0.02619,0.044444,0,0,1,-0.021531


## Feature Selection in Array (Numpy)

In [5]:
feature_data = np.asarray(dataset)

In [6]:
feature_data

array([[  3.98000000e+00,   4.00000000e+00,   3.88000000e+00, ...,
          1.00000000e+00,   1.00000000e+00,   1.26582278e-02],
       [  4.01000000e+00,   4.18000000e+00,   3.99000000e+00, ...,
          1.00000000e+00,   1.00000000e+00,   3.25000000e-02],
       [  4.19000000e+00,   4.25000000e+00,   4.11000000e+00, ...,
          1.00000000e+00,   0.00000000e+00,   1.21065375e-02],
       ..., 
       [  1.74300000e+01,   1.77400000e+01,   1.64400010e+01, ...,
          1.00000000e+00,   1.00000000e+00,  -2.29050279e-02],
       [  1.75300010e+01,   1.83099990e+01,   1.71399990e+01, ...,
          1.00000000e+00,   1.00000000e+00,   1.88679245e-02],
       [  1.81500000e+01,   1.85100000e+01,   1.78500000e+01, ...,
          0.00000000e+00,   0.00000000e+00,   3.59146465e-02]])

In [7]:
type(feature_data)

numpy.ndarray

In [8]:
feature_data[0]

array([  3.98000000e+00,   4.00000000e+00,   3.88000000e+00,
         4.00000000e+00,   4.00000000e+00,   2.28872000e+07,
        -5.02512563e-03,   3.09278351e-02,   1.00000000e+00,
         1.00000000e+00,   1.00000000e+00,   1.26582278e-02])

In [9]:
feature_data[0][1]

4.0

In [10]:
# row 1, all columns
feature_data[1, :]

array([  4.01000000e+00,   4.18000000e+00,   3.99000000e+00,
         4.13000000e+00,   4.13000000e+00,   4.23983000e+07,
        -2.99251870e-02,   4.76190476e-02,   1.00000000e+00,
         1.00000000e+00,   1.00000000e+00,   3.25000000e-02])

In [11]:
# row 0, column 0 to 1
# numpy includes the first, and excludes the last
# use :2 to reach 1
feature_data[0, :2]

array([ 3.98,  4.  ])

In [12]:
# one column with one row in array
feature_data[:,0]

array([  3.98    ,   4.01    ,   4.19    , ...,  17.43    ,  17.530001,
        18.15    ])

In [13]:
# This show in rows of one column 
feature_data[:,[0]]

array([[  3.98    ],
       [  4.01    ],
       [  4.19    ],
       ..., 
       [ 17.43    ],
       [ 17.530001],
       [ 18.15    ]])

In [14]:
np.hstack((feature_data, feature_data[:,[0]]))

array([[  3.98000000e+00,   4.00000000e+00,   3.88000000e+00, ...,
          1.00000000e+00,   1.26582278e-02,   3.98000000e+00],
       [  4.01000000e+00,   4.18000000e+00,   3.99000000e+00, ...,
          1.00000000e+00,   3.25000000e-02,   4.01000000e+00],
       [  4.19000000e+00,   4.25000000e+00,   4.11000000e+00, ...,
          0.00000000e+00,   1.21065375e-02,   4.19000000e+00],
       ..., 
       [  1.74300000e+01,   1.77400000e+01,   1.64400010e+01, ...,
          1.00000000e+00,  -2.29050279e-02,   1.74300000e+01],
       [  1.75300010e+01,   1.83099990e+01,   1.71399990e+01, ...,
          1.00000000e+00,   1.88679245e-02,   1.75300010e+01],
       [  1.81500000e+01,   1.85100000e+01,   1.78500000e+01, ...,
          0.00000000e+00,   3.59146465e-02,   1.81500000e+01]])

In [15]:
# Use transpose of particular column
feature_data.T[0]

array([  3.98    ,   4.01    ,   4.19    , ...,  17.43    ,  17.530001,
        18.15    ])

In [16]:
feature_data.shape[0]

1257

In [17]:
# Get more than one column
feature_data[:,[0,2]]

array([[  3.98    ,   3.88    ],
       [  4.01    ,   3.99    ],
       [  4.19    ,   4.11    ],
       ..., 
       [ 17.43    ,  16.440001],
       [ 17.530001,  17.139999],
       [ 18.15    ,  17.85    ]])

In [18]:
# Remove elements - Delete the last element
r = feature_data[:-1]
r

array([[  3.98000000e+00,   4.00000000e+00,   3.88000000e+00, ...,
          1.00000000e+00,   1.00000000e+00,   1.26582278e-02],
       [  4.01000000e+00,   4.18000000e+00,   3.99000000e+00, ...,
          1.00000000e+00,   1.00000000e+00,   3.25000000e-02],
       [  4.19000000e+00,   4.25000000e+00,   4.11000000e+00, ...,
          1.00000000e+00,   0.00000000e+00,   1.21065375e-02],
       ..., 
       [  1.68799990e+01,   1.79100000e+01,   1.60300010e+01, ...,
          1.00000000e+00,   0.00000000e+00,   7.50750751e-02],
       [  1.74300000e+01,   1.77400000e+01,   1.64400010e+01, ...,
          1.00000000e+00,   1.00000000e+00,  -2.29050279e-02],
       [  1.75300010e+01,   1.83099990e+01,   1.71399990e+01, ...,
          1.00000000e+00,   1.00000000e+00,   1.88679245e-02]])

In [19]:
r =np.delete(r,0) # Delete the first element
r

array([ 4.        ,  3.88      ,  4.        , ...,  1.        ,
        1.        ,  0.01886792])

In [20]:
print("Compare 2 array data")
print("Original Data:")
print(feature_data)
print("-" * 50)
a=np.delete(feature_data, 0, axis=0) # Delete the first column
print("New array data:\n", a)

Compare 2 array data
Original Data:
[[  3.98000000e+00   4.00000000e+00   3.88000000e+00 ...,   1.00000000e+00
    1.00000000e+00   1.26582278e-02]
 [  4.01000000e+00   4.18000000e+00   3.99000000e+00 ...,   1.00000000e+00
    1.00000000e+00   3.25000000e-02]
 [  4.19000000e+00   4.25000000e+00   4.11000000e+00 ...,   1.00000000e+00
    0.00000000e+00   1.21065375e-02]
 ..., 
 [  1.74300000e+01   1.77400000e+01   1.64400010e+01 ...,   1.00000000e+00
    1.00000000e+00  -2.29050279e-02]
 [  1.75300010e+01   1.83099990e+01   1.71399990e+01 ...,   1.00000000e+00
    1.00000000e+00   1.88679245e-02]
 [  1.81500000e+01   1.85100000e+01   1.78500000e+01 ...,   0.00000000e+00
    0.00000000e+00   3.59146465e-02]]
--------------------------------------------------
New array data:
 [[  4.01000000e+00   4.18000000e+00   3.99000000e+00 ...,   1.00000000e+00
    1.00000000e+00   3.25000000e-02]
 [  4.19000000e+00   4.25000000e+00   4.11000000e+00 ...,   1.00000000e+00
    0.00000000e+00   1.210653

In [21]:
print('Example Stock with ll Features: ', '\n', feature_data[0], 'Stock Data Features')

Example Stock with ll Features:  
 [  3.98000000e+00   4.00000000e+00   3.88000000e+00   4.00000000e+00
   4.00000000e+00   2.28872000e+07  -5.02512563e-03   3.09278351e-02
   1.00000000e+00   1.00000000e+00   1.00000000e+00   1.26582278e-02] Stock Data Features


In [22]:
selected_features = feature_data[:, [1, 2, 3, 11]]

In [23]:
x_data_features = selected_features[1:, :]
x_data_features[x_data_features == ''] = 0.0
x_data = x_data_features
selected_feature_labels = selected_features[0, 1:]
print('Selected Feature Names: \n', selected_feature_labels)
print('First few Stocks with Features, no Lables: ', '\n', x_data[0:2, :], ' ...')
print(np.size(x_data[:, 0]), 'Stocks by', np.size(x_data[0, :]), 'Features')

Selected Feature Names: 
 [ 3.88        4.          0.01265823]
First few Stocks with Features, no Lables:  
 [[ 4.18        3.99        4.13        0.0325    ]
 [ 4.25        4.11        4.18        0.01210654]]  ...
1256 Stocks by 4 Features


In [24]:
# Place dataset into input (X) and output (Y) variables
x_strings = x_data_features[:, 1:]  # take off tickers, as they can't be tensor'd
raw_X = x_strings.astype(np.float)  # convert strings to float
# num_of_features = np.size(raw_X[0, :])
print('Features for Tensor: ', np.size(raw_X[0, :]))

Features for Tensor:  3
