In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

### Load the original data

In [None]:
# Parse the raw comma-separated file in one pass.
# The earlier row-by-row DataFrame.append loop was quadratic, relied on an API
# that pandas 2.0 removed, and left the trailing "\n" attached to every value
# of the last column.
raw_path = "datasets/dow_jones_index/dow_jones_index.data"

# dtype=str + keep_default_na=False keep every field as the literal text found
# in the file (empty fields stay empty strings), matching the plain
# str.split(",") parsing this cell used before.
df = pd.read_csv(raw_path, dtype=str, keep_default_na=False)

# Column names taken from the header line; kept under its previous name.
feature = list(df.columns)

# Data rows were previously labelled 1..N (the header row held label 0 and was
# dropped), so keep that labelling in the exported index column.
df.index = pd.RangeIndex(start=1, stop=len(df) + 1)

# create csv file #
df.to_csv("datasets/dow_jones_index/dow_jones_index.csv")

In [None]:
# Reload the exported table; the first CSV column holds the saved row labels
# and is used as the index.
csv_path = "datasets/dow_jones_index/dow_jones_index.csv"
df = pd.read_csv(csv_path, index_col=0)
df

### Remove dollar sign

In [None]:
# Columns whose values carry a "$" that has to be removed.
price_type = ["open","high","low","close","next_weeks_open","next_weeks_close"]

# Vectorised replacement for the former nested loops, which collected the
# stripped values in a temporary frame through DataFrame.append (removed in
# pandas 2.0) and used `type` as the loop variable, shadowing the builtin.
# .str.strip("$") removes "$" from both ends of each string, exactly like
# str.strip("$") did per element, and passes missing values through as NaN
# instead of raising.
for column in price_type:
    df[column] = df[column].str.strip("$")


### Transform numerical values to float type

In [None]:
# Every column from position 3 onwards is converted to float32.
# The conversion is assigned back by column label rather than through
# `.iloc[:, 3:] = ...`: positional assignment writes into the existing columns
# (pandas >= 2.0), so columns may keep their previous dtype, whereas
# label-based assignment of a DataFrame replaces the columns and therefore
# reliably yields float32 columns.
numeric_columns = df.columns[3:]
df[numeric_columns] = df[numeric_columns].astype(np.float32)

### Drop instances with missing values

In [None]:
# Discard every row in which at least one of the two volume-history columns
# is missing; all other columns are not inspected.
required_columns = ["percent_change_volume_over_last_wk", "previous_weeks_volume"]
df_drop_NaN = df.dropna(axis=0, how="any", subset=required_columns)
df_drop_NaN

### Split train / test set

In [None]:
# Rows with quarter == 1 form the training set, rows with quarter == 2 the
# test set.
train_set, test_set = (
    df_drop_NaN.loc[df_drop_NaN["quarter"].eq(quarter_id)]
    for quarter_id in (1, 2)
)

### Extract features (X) and labels (y) for each set

In [None]:
# The label is the next week's percentage price change; the identifier
# columns and the label itself are excluded from the feature matrices.
label_column = "percent_change_next_weeks_price"
non_feature_columns = ["quarter", "stock", "date", label_column]

features_train = train_set.drop(columns=non_feature_columns)
label_train = train_set[label_column]

features_test = test_set.drop(columns=non_feature_columns)
label_test = test_set[label_column]

### Standardize

In [None]:
# Fit the scaler on the training features only, then apply the same fitted
# statistics to the test features. Re-fitting on the test set (fit_transform)
# would scale train and test with different means/variances and leak test-set
# statistics into the preprocessing.
std_scalar = StandardScaler()
features_train_prepared = std_scalar.fit_transform(features_train)
features_test_prepared = std_scalar.transform(features_test)

# Wrap the scaled arrays back into DataFrames with the feature names; the rows
# get a fresh default index.
features_train_prepared_df = pd.DataFrame(features_train_prepared,columns=features_train.columns)
features_test_prepared_df = pd.DataFrame(features_test_prepared,columns=features_train.columns)

### Save data

In [None]:
# Write the raw and standardized feature tables first, then the labels, each
# to its own CSV file (same files and same order as listed here).
csv_outputs = [
    ("datasets/dow_jones_index/features_train.csv", features_train),
    ("datasets/dow_jones_index/features_train_standardized.csv", features_train_prepared_df),
    ("datasets/dow_jones_index/features_test.csv", features_test),
    ("datasets/dow_jones_index/features_test_standardized.csv", features_test_prepared_df),
    ("datasets/dow_jones_index/label_train.csv", label_train),
    ("datasets/dow_jones_index/label_test.csv", label_test),
]
for csv_path, table in csv_outputs:
    table.to_csv(csv_path)

### Add group index to training data

In [None]:
# One group id per feature column, in the column order of features_train.
group_ls = [1,1,1,1,2,1,2,2,1,1,3,3]

# Single extra row labelled "group_index" that holds each column's group id.
group_row = pd.DataFrame(group_ls,columns=["group_index"],index=features_train.columns).T

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0; the
# group row is stacked below the standardized training rows with the existing
# row labels kept.
features_train_grouped = pd.concat([features_train_prepared_df, group_row])

# Reorder the columns by their value in the "group_index" row.
features_train_grouped = features_train_grouped.sort_values("group_index",axis=1)
features_train_grouped.to_csv("datasets/dow_jones_index/features_train_grouped.csv")