## Preprocessing the price data and merging it with the news data

In [11]:
import numpy as np
import pandas as pd

import os
import time
import random

* Some parameters:

In [15]:
# Input locations, relative to the notebook's working directory.
all_stocks = './Stocks'        # directory holding the per-company price files
news_data = './news_day.csv'   # CSV with the news headlines

# Only rows whose date string starts with one of these years are kept.
consider_years = ['2014', '2015', '2016', '2017']

# Relative open-to-close move at or beyond which a day is labelled UP / DOWN;
# anything strictly in between is labelled STAY.
threshold = 0.015
UP, DOWN, STAY = 1, 0, -1

# Number of labelled news rows written to each split.
train_size = 50000
test_size = 10000

# Seed the global RNG so the shuffle that decides the train/test split gives
# the same result on every Restart & Run All.
random_seed = 42
random.seed(random_seed)

### Price Data Preprocessing:

In [7]:
def get_company(relative_path):
    """Collect company names from every file found under ``relative_path``.

    The directory tree is walked recursively. A company name is the part of
    the file name before the first '.', lower-cased.

    Returns
    -------
    tuple
        ``(names, n_files)``: the names in sorted order and the number of
        files visited.
    """
    names = [
        file_name.split('.')[0].lower()
        for _, _, file_names in os.walk(relative_path)
        for file_name in file_names
    ]
    return sorted(names), len(names)

def build_path(company):
    """Return the path of ``company``'s price file inside ``all_stocks``."""
    file_name = company + '.us.txt'
    return '/'.join((all_stocks, file_name))

def get_label(rate):
    """Map a relative price change to UP, DOWN or STAY.

    ``rate >= threshold`` gives UP, ``rate <= -threshold`` gives DOWN, and
    everything else gives STAY.
    """
    if rate >= threshold:
        return UP
    if rate <= -threshold:
        return DOWN
    return STAY

def rate_diff(company, years):
    """Label each day of one company by its open-to-close price move.

    Parameters
    ----------
    company : str
        Company name; the price file is located through ``build_path``.
    years : container of str
        Year prefixes to keep, compared against the first four characters
        of each ``Date`` value.

    Returns
    -------
    dict
        ``{date: label}`` with labels produced by ``get_label``. The dict is
        empty when the price file is missing, empty or cannot be parsed.
    """
    com_path = build_path(company)

    try:
        df = pd.read_csv(com_path)
    except (OSError, pd.errors.EmptyDataError, pd.errors.ParserError, UnicodeDecodeError):
        # Best effort: an unreadable price file just contributes no labels.
        # Only file/parse errors are caught, so interrupts and genuine bugs
        # still surface.
        return {}

    res = {}

    for date, open_price, close_price in zip(df['Date'], df['Open'], df['Close']):
        # A missing or non-string date cannot be matched against a year.
        if not isinstance(date, str) or date[:4] not in years:
            continue
        # Skip missing prices (a NaN rate would silently fall through to
        # STAY) and a zero open (division by zero).
        if pd.isna(open_price) or pd.isna(close_price) or open_price == 0:
            continue
        rate = (close_price - open_price) / open_price
        res[date] = get_label(rate)

    return res

def extract_price_label(company_list, years):
    """Build ``{company: {date: label}}`` for every company that has labels.

    Companies for which ``rate_diff`` returns nothing are left out. A progress
    line with the elapsed time is printed after every 500 companies.
    """
    labels_by_company = {}
    started = time.time()

    for position, ticker in enumerate(company_list, start=1):
        day_labels = rate_diff(ticker, years)
        if day_labels:
            labels_by_company[ticker] = day_labels
        if position % 500 == 0:
            print("The %dth company processed; time till now is %.4fs." % (position, time.time()-started))

    return labels_by_company

In [5]:
# Sorted company names plus the number of price files found on disk.
company_list, company_number = get_company(relative_path=all_stocks)

print("There are %d companies in total." % company_number)

There are 7195 companies in total.


In [8]:
price_label_all_company = extract_price_label(company_list, consider_years)

# Total number of (company, date) labels across all companies.
total_number = sum(len(day_labels) for day_labels in price_label_all_company.values())

print("There are %d rows of record in this dictionary." % total_number)

The 500th company processed; time till now is 4.3642s.
The 1000th company processed; time till now is 8.3630s.
The 1500th company processed; time till now is 12.6893s.
The 2000th company processed; time till now is 16.6319s.
The 2500th company processed; time till now is 20.5600s.
The 3000th company processed; time till now is 24.2738s.
The 3500th company processed; time till now is 28.1657s.
The 4000th company processed; time till now is 32.0057s.
The 4500th company processed; time till now is 36.0976s.
The 5000th company processed; time till now is 39.9549s.
The 5500th company processed; time till now is 44.0088s.
The 6000th company processed; time till now is 48.8638s.
The 6500th company processed; time till now is 52.9304s.
The 7000th company processed; time till now is 57.0681s.
There are 5442556 rows of record in this dictionary.


### News Data Preprocessing:

In [16]:
def get_news_data(news_data_path, years, seed=None):
    """Load news rows for the requested years and return them shuffled.

    Parameters
    ----------
    news_data_path : str
        CSV file with at least the columns ``stock``, ``day`` and ``title``.
    years : container of str
        Year prefixes to keep, compared against the first four characters
        of each ``day`` value.
    seed : optional
        When given, the shuffle uses a private ``random.Random(seed)`` and is
        reproducible. When ``None`` (the default) the global ``random`` state
        is used, as before.

    Returns
    -------
    list
        ``[stock, day, title]`` lists in shuffled order.
    """
    df_news = pd.read_csv(news_data_path)

    # Iterate the three columns in lockstep instead of indexing each Series
    # by position. Rows whose day is missing or not a string are skipped
    # rather than raising on the slice.
    news_info = [
        [stock, day, title]
        for stock, day, title in zip(df_news['stock'], df_news['day'], df_news['title'])
        if isinstance(day, str) and day[:4] in years
    ]

    if seed is None:
        random.shuffle(news_info)
    else:
        random.Random(seed).shuffle(news_info)

    return news_info

def generate_train_test(price_label, news_info, train_size=train_size, test_size=test_size):
    """Join news rows with price labels and split them into train and test.

    ``news_info`` is consumed in order. A row is kept only when its
    lower-cased stock is a key of ``price_label`` and its date is a key of
    that stock's label mapping. The first ``train_size`` kept rows go to the
    training frame and the following ``test_size`` rows to the test frame.

    Returns
    -------
    tuple
        ``(train_df, test_df)``, both with the columns
        ``stock``, ``date``, ``title`` and ``label``.
    """
    header = ['stock', 'date', 'title', 'label']
    capacity = train_size + test_size

    train_rows, test_rows = [], []
    matched = 0

    for info in news_info:
        # Stop as soon as both splits are full.
        if matched >= capacity:
            break

        stock, date, title = info
        ticker = stock.lower()
        if ticker not in price_label or date not in price_label[ticker]:
            continue

        matched += 1
        row = [stock, date, title, price_label[ticker][date]]
        target = train_rows if matched <= train_size else test_rows
        target.append(row)

    train_df = pd.DataFrame(train_rows, columns=header)
    test_df = pd.DataFrame(test_rows, columns=header)

    return train_df, test_df

In [13]:
# Shuffled [stock, day, title] rows restricted to the configured years.
news_info = get_news_data(news_data_path=news_data, years=consider_years)

print("There are %d rows of data in news_info." % len(news_info))

There are 524439 rows of data in news_info.


In [17]:
# Attach a price label to each news row and split into train / test frames.
train_df, test_df = generate_train_test(
    price_label=price_label_all_company,
    news_info=news_info,
)

In [28]:
# Wrap each shape in a one-element tuple so the %-format receives it as a
# single argument.
print("The shape of training dataframe is %s." % (train_df.shape,))
print("The shape of test dataframe is %s." % (test_df.shape,))

The shape of training dataframe is (50000, 4).
The shape of test dataframe is (10000, 4).


In [30]:
# Persist both splits next to the notebook, without the index column.
train_df.to_csv(path_or_buf='./train.csv', index=False)
test_df.to_csv(path_or_buf='./test.csv', index=False)