# Class Seminar 2

If the seminar uses data files, put them in the same folder as your Jupyter Notebook files.

We learn coding by imitation. Therefore, we start by copying example code and running it. From the outputs, the comments, and the code itself, we work out what the code needs and what it produces. Then we can modify the code and apply it to new data to solve new problems.

### Data Munging

pandas provides fast and easy data loading and preprocessing.

In [3]:
import pandas as pd
import numpy as np

In [6]:
# File name of the iris CSV. It is a relative path, so the file must sit in the working directory.
iris_filename = "datasets-uci-iris.csv"

In [8]:
# Read the iris CSV into a DataFrame.
iris = pd.read_csv(
    iris_filename,
    sep=',',
    header=None,  # first line of the file is data, so no row is consumed as a header
    names=[       # column labels supplied in place of a header row
        'sepal_length',
        'sepal_width',
        'petal_length',
        'petal_width',
        'target',
    ],
)

In [9]:
# Preview the first five rows to check that the file was parsed as expected.
iris.head(n=5)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,target
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [10]:
# Show the column labels that were assigned through names= when the file was read.
iris.columns

Index(['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'target'], dtype='object')

In [11]:
# Pull out the class-label column as a Series (all rows, 'target' column).
y = iris.loc[:, 'target']

In [12]:
# Display the target Series.
y

0         Iris-setosa
1         Iris-setosa
2         Iris-setosa
3         Iris-setosa
4         Iris-setosa
            ...      
145    Iris-virginica
146    Iris-virginica
147    Iris-virginica
148    Iris-virginica
149    Iris-virginica
Name: target, Length: 150, dtype: object

In [13]:
# Passing a list of labels selects several columns at once and returns a DataFrame.
feature_columns = ['sepal_length', 'sepal_width']
X = iris[feature_columns]

In [14]:
# Display the two-column feature frame.
X

Unnamed: 0,sepal_length,sepal_width
0,5.1,3.5
1,4.9,3.0
2,4.7,3.2
3,4.6,3.1
4,5.0,3.6
...,...,...
145,6.7,3.0
146,6.3,2.5
147,6.5,3.0
148,6.2,3.4


### Dealing with problematic data

In [15]:
import pandas as pd

In [16]:
# Load a small comma-separated example table; blank fields are read in as missing values.
fake_dataset = pd.read_csv('a_loading_example_1.csv', sep=',')

In [17]:
# Display the table as loaded, before any missing values are filled.
fake_dataset

Unnamed: 0,Date,Temperature_city_1,Temperature_city_2,Temperature_city_3,Which_destination
0,20140910,80.0,32.0,40,1
1,20140911,100.0,50.0,36,2
2,20140912,102.0,55.0,46,1
3,20140913,60.0,20.0,35,3
4,20140914,60.0,,32,3
5,20140915,,57.0,42,2


In [18]:
# Fill every missing entry with the mean of its own column.
# fillna returns a new DataFrame; the result is only displayed here, so
# fake_dataset itself still contains the missing values afterwards.
fake_dataset.fillna(value=fake_dataset.mean(axis='index'))

Unnamed: 0,Date,Temperature_city_1,Temperature_city_2,Temperature_city_3,Which_destination
0,20140910,80.0,32.0,40,1
1,20140911,100.0,50.0,36,2
2,20140912,102.0,55.0,46,1
3,20140913,60.0,20.0,35,3
4,20140914,60.0,42.8,32,3
5,20140915,80.4,57.0,42,2


### Data preprocessing

In [19]:
# List the distinct class labels that occur in the target column.
iris['target'].unique()

array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype=object)

### Data selection

In [20]:
# Read with default settings: pandas creates its own integer row index
# and every column of the file stays an ordinary column.
dataset = pd.read_csv('a_selection_example_1.csv')

In [21]:
# Display the table with the automatically generated row index.
dataset

Unnamed: 0,n,val1,val2,val3
0,100,10,10,C
1,101,10,20,C
2,102,10,30,B
3,103,10,40,B
4,104,10,50,A


In [22]:
# Read the same file again, this time using its first column as the row index.
dataset = pd.read_csv('a_selection_example_1.csv', index_col=0)

In [23]:
# Display the table; the first column of the file now serves as the row labels.
dataset

Unnamed: 0_level_0,val1,val2,val3
n,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
100,10,10,C
101,10,20,C
102,10,30,B
103,10,40,B
104,10,50,A


In [24]:
# Chained lookup: select the 'val3' column first, then the row labelled 104.
# This is fine for reading a value; for assignment prefer a single .loc call.
dataset['val3'][104]

'A'

In [25]:
# Label-based lookup in one step: row label 104, column label 'val3'.
dataset.loc[104, 'val3']

'A'

In [26]:
# Position-based lookup: row position 4 and column position 2, both counted from 0.
dataset.iloc[4, 2]

'A'

In [27]:
# Select the columns 'val3' and 'val2' (in that order), then keep the first two rows.
dataset[['val3', 'val2']][0:2]

Unnamed: 0_level_0,val3,val2
n,Unnamed: 1_level_1,Unnamed: 2_level_1
100,C,10
101,C,20


# Trading Strategies

In [1]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split



In [3]:
# Turn off the pandas warning that is shown when writing into a copy of a slice
# rather than into the original data.
pd.set_option('mode.chained_assignment', None)

## Load Data

In [4]:
# Load the AAPL price table from CSV (the name `dataset` is reused from the earlier examples)
# and display it; pandas shortens the display of a long frame.
dataset = pd.read_csv('AAPL.csv') 
dataset

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume
0,0.936384,1.004464,0.907924,0.999442,0.850643,535796800
1,0.966518,0.987723,0.903460,0.915179,0.778926,512377600
2,0.926339,0.987165,0.919643,0.928571,0.790324,778321600
3,0.947545,0.955357,0.848214,0.848214,0.721930,767972800
4,0.861607,0.901786,0.852679,0.888393,0.756128,460734400
...,...,...,...,...,...,...
5782,130.919998,132.419998,129.639999,131.860001,131.658981,63814900
5783,131.380005,131.410004,128.720001,130.029999,129.831772,69007800
5784,129.669998,131.029999,125.870003,126.040001,125.847855,85438400
5785,127.989998,130.479996,127.730003,129.610001,129.412415,75703700


## Create features

In [5]:
# Build the model features and the binary target from the raw price table.
# NOTE: this cell overwrites `dataset`; re-running it without reloading the CSV
# drops further rows each time, so reload the data first.

# Keep only complete rows and the four price columns. `.copy()` makes `dataset`
# an independent frame, so the column assignments below never write into a slice.
dataset = dataset.dropna()
dataset = dataset[['Open', 'High', 'Low', 'Close']].copy()

# Features taken from the current row.
dataset['H-L'] = dataset['High'] - dataset['Low']  # high-low range
# NOTE: despite the 'O-C' label this is Close - Open, i.e. positive when the
# close is above the open. The label is kept so later cells still find the column.
dataset['O-C'] = dataset['Close'] - dataset['Open']

# Moving averages of the close over the previous 3 / 10 / 30 rows
# (presumably trading days); shift(1) leaves the current row out of its own window.
dataset['3day MA'] = dataset['Close'].shift(1).rolling(window=3).mean()
dataset['10day MA'] = dataset['Close'].shift(1).rolling(window=10).mean()
dataset['30day MA'] = dataset['Close'].shift(1).rolling(window=30).mean()

# Rolling standard deviation of the close over 5 rows, current row included.
dataset['Std_dev'] = dataset['Close'].rolling(5).std()

# Target: 1 when the next row's close is above this row's close, otherwise 0.
next_close = dataset['Close'].shift(-1)
dataset['Price_Rise'] = np.where(next_close > dataset['Close'], 1, 0)

# The final row has no next close. Because NaN > x evaluates to False it would be
# labelled 0 without any evidence, and dropna() cannot catch it (the label is 0, not NaN).
# Remove it explicitly, then drop the leading rows whose rolling windows are incomplete.
dataset = dataset[next_close.notna()].dropna()
dataset.head()

Unnamed: 0,Open,High,Low,Close,H-L,O-C,3day MA,10day MA,30day MA,Std_dev,Price_Rise
30,1.029018,1.070871,1.02846,1.0625,0.042411,0.033482,1.006138,0.973214,0.939639,0.03399,0
31,1.051339,1.054688,1.001116,1.018973,0.053572,-0.032366,1.022507,0.989955,0.941741,0.033357,1
32,1.02846,1.03125,1.010045,1.02567,0.021205,-0.00279,1.038504,1.003627,0.945201,0.033203,0
33,1.023438,1.030134,0.989955,0.993304,0.040179,-0.030134,1.035714,1.013951,0.948437,0.025057,1
34,0.983259,1.044085,0.952567,1.016183,0.091518,0.032924,1.012649,1.016853,0.953274,0.025057,1
