## Preprocessing the price data and merging it with the news + 10-K data

In [1]:
import numpy as np
import pandas as pd

import os
import time
import random

from collections import defaultdict

* Configuration parameters (input paths, years considered, label thresholds, class ids, and dataset sizes):

In [2]:
# Input locations: directory of per-company price files and the news CSV.
all_stocks = './Stocks'
news_data = './news_day.csv'

# Only rows whose date starts with one of these year strings are used.
consider_years = ['2014', '2015', '2016', '2017']

# Relative price change (as a fraction) at or beyond which a day / a year
# is labelled UP or DOWN instead of STAY.
daily_threshold = 0.015
yearly_threshold = 0.10

# Integer class ids written into the generated label columns.
DOWN = 0
STAY = 1
UP = 2

# Number of labelled news rows placed in each output split.
train_size = 80000
test_size = 20000

### Price Data Preprocessing:

In [3]:
def get_company(relative_path):
    """List the company identifiers found under a directory tree.

    Every file found by walking `relative_path` (sub-directories included)
    contributes the part of its name before the first '.', lower-cased.

    Returns a tuple `(names, count)`: the identifiers in sorted order and
    the number of files seen.
    """
    names = [
        filename.split('.')[0].lower()
        for _root, _dirs, filenames in os.walk(relative_path)
        for filename in filenames
    ]
    names.sort()
    return names, len(names)

def build_path(company):
    """Return the path of `company`'s price file inside the `all_stocks` directory."""
    filename = company + '.us.txt'
    return '/'.join((all_stocks, filename))

def get_label(rate):
    """Map a daily rate of change to UP, DOWN or STAY using `daily_threshold`.

    A rate at or above the threshold is UP, a rate at or below the negated
    threshold is DOWN, anything in between is STAY.
    """
    if rate >= daily_threshold:
        return UP
    if rate <= -daily_threshold:
        return DOWN
    return STAY

def rate_diff(company, years):
    """Label every trading day of one company by its open-to-close change.

    Parameters
    ----------
    company : str
        Identifier passed to `build_path` to locate the price file.
    years : collection of str
        Four-character year strings; a row is kept only when the first four
        characters of its 'Date' value are in this collection.

    Returns
    -------
    dict
        Maps each kept 'Date' string to the label returned by `get_label`
        for `(Close - Open) / Open`. Rows with an Open of 0 are skipped.
        An empty dict is returned when the price file cannot be read, so
        the result is always a dict and is falsy when there is no data.
    """
    com_path = build_path(company)

    try:
        df = pd.read_csv(com_path)
    except (OSError, ValueError):
        # OSError: missing or unreadable file. ValueError: pandas'
        # EmptyDataError / ParserError (both ValueError subclasses) for
        # empty or malformed files, plus decode errors. Anything else
        # (e.g. KeyboardInterrupt) is deliberately allowed to propagate.
        return {}

    res = {}
    for date, open_price, close_price in zip(df['Date'], df['Open'], df['Close']):
        if date[:4] not in years:
            continue
        if open_price == 0:
            # The relative change is undefined for a zero opening price.
            continue
        rate = (close_price - open_price) / open_price
        res[date] = get_label(rate)

    return res

def extract_price_label(company_list, years):
    """Collect the daily labels of every company that has any.

    Calls `rate_diff` for each entry of `company_list` and keeps only the
    non-empty results, keyed by company. A progress line with the elapsed
    time is printed after every 500 companies.
    """
    started_at = time.time()
    labels_by_company = {}

    for position, ticker in enumerate(company_list, start=1):
        daily_labels = rate_diff(ticker, years)
        if daily_labels:
            labels_by_company[ticker] = daily_labels

        if position % 500 == 0:
            elapsed = time.time() - started_at
            print("The %dth company processed; time till now is %.4fs." % (position, elapsed))

    return labels_by_company

In [4]:
# Sorted company identifiers derived from the file names, plus the file count.
company_list, company_number = get_company(all_stocks)

print("There are %d companies in total." % company_number)

There are 7195 companies in total.


In [5]:
# company -> {date -> label} for every company with at least one labelled day.
price_label_all_company = extract_price_label(company_list, consider_years)

# Total number of (company, date) labels across all companies.
total_number = sum(len(day_labels) for day_labels in price_label_all_company.values())

print("There are %d rows of record in this dictionary." % total_number)

The 500th company processed; time till now is 5.1144s.
The 1000th company processed; time till now is 9.3844s.
The 1500th company processed; time till now is 13.3043s.
The 2000th company processed; time till now is 17.5232s.
The 2500th company processed; time till now is 21.4627s.
The 3000th company processed; time till now is 25.4160s.
The 3500th company processed; time till now is 29.2270s.
The 4000th company processed; time till now is 33.4277s.
The 4500th company processed; time till now is 38.0440s.
The 5000th company processed; time till now is 42.2219s.
The 5500th company processed; time till now is 46.5047s.
The 6000th company processed; time till now is 50.3297s.
The 6500th company processed; time till now is 54.3039s.
The 7000th company processed; time till now is 58.2257s.
There are 5442556 rows of record in this dictionary.


### News Data Preprocessing:

In [6]:
def get_news_data(news_data_path, years, seed=None):
    """Load news rows for the requested years and return them in random order.

    Parameters
    ----------
    news_data_path : str
        Path of a CSV file with at least the columns 'stock', 'day' and
        'title'.
    years : collection of str
        Four-character year strings; a row is kept only when the first four
        characters of its 'day' value are in this collection.
    seed : optional
        When given, the shuffle uses a private `random.Random(seed)` so the
        returned order is reproducible and the global random state is left
        untouched. When None (the default) the global `random.shuffle` is
        used, as before.

    Returns
    -------
    list of [stock, day, title] lists, shuffled.
    """
    df_news = pd.read_csv(news_data_path)

    news_info = []
    for stock, day, title in zip(df_news['stock'], df_news['day'], df_news['title']):
        # str() keeps rows with a missing 'day' (read back as a float NaN)
        # from raising on the slice; 'nan' never matches a year, so such
        # rows are simply dropped.
        if str(day)[:4] in years:
            news_info.append([stock, day, title])

    if seed is None:
        random.shuffle(news_info)
    else:
        random.Random(seed).shuffle(news_info)

    return news_info

def generate_train_test(price_label, news_info, train_size=train_size, test_size=test_size):
    """Attach price labels to news rows and split them into train and test frames.

    `news_info` is scanned in order. A `[stock, date, title]` row is kept
    when the lower-cased stock is a key of `price_label` and the date is a
    key of that company's label dict. The first `train_size` kept rows go
    to the training frame and the following rows to the test frame, and
    scanning stops once `train_size + test_size` rows have been kept.

    Returns `(train_df, test_df)`, both with the columns
    'stock', 'date', 'title', 'label'.
    """
    columns_name = ['stock', 'date', 'title', 'label']
    wanted = train_size + test_size

    train_set, test_set = [], []
    count = 0

    for record in news_info:
        if count >= wanted:
            break

        stock, date, title = record
        ticker = stock.lower()

        if ticker not in price_label:
            continue
        if date not in price_label[ticker]:
            continue

        count += 1
        row = [stock, date, title, price_label[ticker][date]]
        destination = train_set if count <= train_size else test_set
        destination.append(row)

    train_df = pd.DataFrame(train_set, columns=columns_name)
    test_df = pd.DataFrame(test_set, columns=columns_name)

    return train_df, test_df

In [7]:
# Shuffled [stock, day, title] rows restricted to the years under study.
news_info = get_news_data(news_data, consider_years)

print("There are %d rows of data in news_info." % len(news_info))

There are 524439 rows of data in news_info.


In [8]:
# Label the news rows and split them using the default train/test sizes.
train_df, test_df = generate_train_test(
    price_label=price_label_all_company,
    news_info=news_info,
)

In [9]:
# Sanity check: both frames should have the requested number of rows.
train_shape = str(train_df.shape)
test_shape = str(test_df.shape)
print("The shape of training dataframe is %s." % train_shape)
print("The shape of test dataframe is %s." % test_shape)

The shape of training dataframe is (80000, 4).
The shape of test dataframe is (20000, 4).


In [10]:
# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists before writing (a no-op when it already does).
os.makedirs('RandomComp', exist_ok=True)

train_df.to_csv('RandomComp/train.csv', index=False)
test_df.to_csv('RandomComp/test.csv', index=False)

### Distinct company list:

In [11]:
# Distinct stocks appearing in either split. Sorted because the iteration
# order of a set of strings changes between interpreter runs, which would
# otherwise make the row order of anything derived from this list differ
# from run to run.
all_company = sorted(set(train_df['stock']) | set(test_df['stock']))

In [12]:
# One-column table of the distinct stocks, written in sorted order.
col_name = ['Company']
distinct_company = pd.DataFrame(sorted(all_company), columns=col_name)
distinct_company.to_csv('RandomComp/dis_comp.csv', index=False)

### Generating 10-K labels:

In [13]:
def generate_10K_label(all_company, consider_years, price_label_all_company):
    """Label each (company, year) pair by the change in closing price over the year.

    Parameters
    ----------
    all_company : iterable of str
        Stock identifiers; each one's price file is located with
        `build_path` after lower-casing the identifier.
    consider_years : collection of str
        Four-character year strings to produce labels for.
    price_label_all_company : any
        Unused; kept so existing calls keep working.

    Returns
    -------
    pandas.DataFrame
        Columns 'stock', 'year', 'label', with one row per company and year
        that has at least one price row. The label compares the last and
        first 'Close' values of the year as they appear in the file
        (assumes rows are in chronological order - TODO confirm) against
        `yearly_threshold`. A year whose first close is 0 is skipped because
        the relative change is undefined.
    """
    column_name = ['stock', 'year', 'label']
    res = []

    start_t = time.time()

    for i, comp in enumerate(all_company):
        # The daily labels were built from lower-cased file-name prefixes, so
        # the lower-cased identifier is the spelling known to resolve to a
        # readable file; the original-case one may not on a case-sensitive
        # filesystem.
        df = pd.read_csv(build_path(comp.lower()))

        # year -> closing prices of that year, in file order
        closes_by_year = defaultdict(list)
        for date, close in zip(df['Date'], df['Close']):
            year = date[:4]
            if year in consider_years:
                closes_by_year[year].append(close)

        for year in consider_years:
            # .get() avoids inserting empty entries into the defaultdict.
            closes = closes_by_year.get(year)
            if not closes or closes[0] == 0:
                continue

            diff = (closes[-1] - closes[0]) / closes[0]
            if diff >= yearly_threshold:
                label = UP
            elif diff <= -yearly_threshold:
                label = DOWN
            else:
                label = STAY
            res.append([comp, year, label])

        if (i+1) % 300 == 0:
            print("The %dth company done, time till now: %.4fs." % (i+1, time.time()-start_t))

    TenK_data = pd.DataFrame(columns=column_name, data=res)
    return TenK_data

In [14]:
# Yearly labels for every stock that appears in the train or test split.
TenK_data = generate_10K_label(
    all_company,
    consider_years,
    price_label_all_company,
)

The 300th company done, time till now: 2.7885s.
The 600th company done, time till now: 6.1897s.
The 900th company done, time till now: 8.9142s.
The 1200th company done, time till now: 11.6203s.
The 1500th company done, time till now: 14.3261s.
The 1800th company done, time till now: 17.0483s.
The 2100th company done, time till now: 19.9115s.
The 2400th company done, time till now: 22.6122s.
The 2700th company done, time till now: 25.3724s.
The 3000th company done, time till now: 28.0650s.
The 3300th company done, time till now: 30.7666s.


In [15]:
# Persist the yearly labels next to the train/test splits.
TenK_data.to_csv(
    'RandomComp/10K_label.csv',
    index=False,
)