# Price Data Preprocessing with a Limited Number of Companies

In [72]:
import numpy as np
import pandas as pd

import os
import time
import random

from collections import defaultdict
from sklearn.model_selection import train_test_split

In [81]:
# --- Input locations ---------------------------------------------------------
# Directory holding the per-company price files (named `<ticker>.us.txt`).
all_stock_path = './Stocks'
# CSV of news headlines; read with the columns `stock`, `day` and `title`.
news_data = './news_day.csv'

# --- Selection ---------------------------------------------------------------
# Calendar years to keep, as four-character strings: '2010' through '2017'.
consider_years = [str(year) for year in range(2010, 2018)]
# Number of companies to sample.
company_number = 200

# --- Labelling ---------------------------------------------------------------
# Relative price change at or beyond which a move counts as UP / DOWN.
daily_threshold = 0.015
yearly_threshold = 0.10
# Integer class labels.
DOWN = 0
STAY = 1
UP = 2

# --- Train/test split --------------------------------------------------------
# Earlier absolute-size settings, kept for reference:
#train_size = 75000
#test_size = 15000
#total_size = 50000
test_size = 0.2

In [49]:
def get_news_data(news_data_path, years):
    """Group news titles by stock and by day for the requested years.

    Reads the CSV at ``news_data_path`` (columns ``stock``, ``day`` and
    ``title``) and keeps only the rows whose ``day`` begins with one of the
    four-character year strings in ``years``.

    Args:
        news_data_path: Path of the news CSV file.
        years: Collection of year strings, e.g. ``['2010', '2011']``.

    Returns:
        A ``defaultdict(dict)`` mapping stock -> {day -> [titles]}, with the
        titles of a day kept in file order. Stocks that have no row in the
        requested years get no entry.

    A progress message is printed after every 500,000 rows.
    """
    frame = pd.read_csv(news_data_path)
    grouped = defaultdict(dict)

    rows = zip(frame['stock'], frame['day'], frame['title'])
    for line_no, (ticker, day, headline) in enumerate(rows, start=1):
        if day[:4] in years:
            # Create the per-day list on first sight, then extend it.
            grouped[ticker].setdefault(day, []).append(headline)
        if line_no % 500000 == 0:
            print("The %dth line finished!" % line_no)
    return grouped

def choose_company(news_dic, stock_path, company_number):
    """Randomly pick up to ``company_number`` companies that have news.

    Walks ``stock_path`` (including sub-directories), shuffles the files of
    each directory, and takes the upper-cased part of each file name before
    the first ``.`` as the ticker. A ticker is selected when it is a key of
    ``news_dic``.

    Args:
        news_dic: Mapping keyed by upper-case ticker.
        stock_path: Root directory containing the price files.
        company_number: Maximum number of companies to return.

    Returns:
        A list of at most ``company_number`` distinct tickers. Fewer are
        returned when not enough price files have matching news.
    """
    company_name = []
    if company_number <= 0:
        return company_name

    seen = set()
    for _, _, files in os.walk(stock_path):
        random.shuffle(files)
        for file in files:
            file_name = file.split('.')[0].upper()
            if file_name in news_dic and file_name not in seen:
                seen.add(file_name)
                company_name.append(file_name)
                # Return (not just break) so that later directories of the
                # walk cannot push the selection past the requested size.
                if len(company_name) >= company_number:
                    return company_name
    return company_name

def build_path(company, stock_path=all_stock_path):
    """Return the price-file path ``<stock_path>/<company>.us.txt``."""
    file_name = company + '.us.txt'
    return '/'.join((stock_path, file_name))

def get_label(rate, thres):
    """Classify a relative price change against a symmetric threshold.

    Returns ``UP`` when ``rate >= thres``, ``DOWN`` when ``rate <= -thres``
    and ``STAY`` for everything in between.
    """
    if rate >= thres:
        return UP
    if rate <= -thres:
        return DOWN
    return STAY

def price_news(company_list, news_dic, data_size=None):
    """Build one labelled row per news title from each company's daily prices.

    For every company the price file located by ``build_path`` is read. Each
    price row whose date has news for that company and whose open price is
    non-zero is labelled with
    ``get_label((close - open) / open, daily_threshold)``; every title of that
    day then yields one output row.

    Args:
        company_list: Tickers as keyed in ``news_dic``; they are lower-cased
            to form the price-file name.
        news_dic: Mapping stock -> {date -> [titles]}.
        data_size: Unused; accepted only for backward compatibility. The
            default is ``None`` because the previous default referenced a
            module-level ``total_size`` that is not defined.

    Returns:
        A DataFrame with the columns ``stock``, ``date``, ``title``, ``label``.

    Companies whose price file cannot be read or parsed are skipped. A
    progress message is printed after every 50th company that was processed.
    """
    column_names = ['stock', 'date', 'title', 'label']
    info = []

    start_t = time.time()

    for i, comp in enumerate(company_list):
        try:
            df = pd.read_csv(build_path(comp.lower()))
        except (OSError, ValueError):
            # Best-effort: a missing, empty or malformed price file just drops
            # this company. pandas' EmptyDataError / ParserError derive from
            # ValueError; anything else (e.g. KeyboardInterrupt) propagates.
            continue

        # Look the company up once, without inserting into a defaultdict.
        comp_news = news_dic.get(comp, {})

        for dat, open_p, close_p in zip(df['Date'], df['Open'], df['Close']):
            # A zero open price would make the relative change undefined.
            if dat in comp_news and open_p != 0:
                rate = (close_p - open_p) / open_p
                label = get_label(rate, daily_threshold)
                info.extend([comp, dat, news, label] for news in comp_news[dat])

        if (i+1) % 50 == 0:
            print("The %dth company done, time till now is %.4fs." % (i+1, time.time()-start_t))

    return pd.DataFrame(columns=column_names, data=info)

In [56]:
# Load the headlines of the selected years, grouped by stock and then by day.
news_dic = get_news_data(news_data_path=news_data, years=consider_years)

The 500000th line finished!
The 1000000th line finished!


In [82]:
# Randomly select the companies to use, then attach a daily price-movement
# label to each of their news titles.
company_list = choose_company(
    news_dic=news_dic,
    stock_path=all_stock_path,
    company_number=company_number,
)
price_new_info = price_news(company_list=company_list, news_dic=news_dic)

The 50th company done, time till now is 0.4434s.
The 100th company done, time till now is 0.8856s.
The 150th company done, time till now is 1.3327s.
The 200th company done, time till now is 1.7798s.


In [83]:
# Shuffle the labelled rows and split them into train and test sets.
train_df, test_df = train_test_split(
    price_new_info,
    test_size=test_size,
    shuffle=True,
)

In [88]:
# to_csv does not create missing directories, so make sure the output folder
# exists before the first file is written into it.
os.makedirs('LimitedComp', exist_ok=True)
train_df.to_csv('LimitedComp/train.csv', index=False)
test_df.to_csv('LimitedComp/test.csv', index=False)

### Distinct company list:

In [89]:
# Save the selected tickers, alphabetically sorted, as a one-column table.
col_name = ['Company']
distinct_company = pd.DataFrame(data=sorted(company_list), columns=col_name)
distinct_company.to_csv(path_or_buf='LimitedComp/dis_comp.csv', index=False)

### Generating the 10K labels:

In [90]:
def generate_10K_label(company_list, years):
    """Label each company's price movement over each requested year.

    For every company the price file located by ``build_path`` is read and the
    ``Close`` values are collected per year, in file order. The label of a
    (company, year) pair is
    ``get_label((last - first) / first, yearly_threshold)``, where ``first``
    and ``last`` are the first and last collected closes of that year
    (presumably the earliest and latest trading day -- this relies on the
    price rows being in chronological order; confirm against the data).

    Args:
        company_list: Tickers; they are lower-cased to form the file name.
        years: Four-character year strings, e.g. ``['2010', '2011']``.

    Returns:
        A DataFrame with the columns ``stock``, ``year``, ``label``, ordered
        by ``company_list`` and then by ``years``. Pairs with no price rows,
        or whose first close is zero, are left out.

    Companies whose price file cannot be read or parsed are skipped, matching
    ``price_news``. A progress message is printed after every 50th company
    that was processed.
    """
    column_name = ['stock', 'year', 'label']
    closes_by_key = defaultdict(list)

    start_t = time.time()

    for i, comp in enumerate(sorted(company_list)):
        try:
            df = pd.read_csv(build_path(comp.lower()))
        except (OSError, ValueError):
            # Same best-effort policy as price_news: drop unreadable files.
            continue

        for date, close in zip(df['Date'], df['Close']):
            year = date[:4]
            if year in years:
                closes_by_key[comp + '-' + year].append(close)

        if (i+1) % 50 == 0:
            print("The %dth company done, time till now: %.4fs." % (i+1, time.time()-start_t))

    res = []
    for comp in company_list:
        for year in years:
            closes = closes_by_key.get(comp + '-' + year)
            # Skip years without data, and years whose starting price is zero
            # (the relative change would divide by zero).
            if not closes or closes[0] == 0:
                continue
            diff = (closes[-1] - closes[0]) / closes[0]
            res.append([comp, year, get_label(diff, yearly_threshold)])

    return pd.DataFrame(columns=column_name, data=res)

In [91]:
# Compute the yearly price-movement label for every selected company.
TenK_data = generate_10K_label(company_list=company_list, years=consider_years)

The 50th company done, time till now: 0.6345s.
The 100th company done, time till now: 1.1843s.
The 150th company done, time till now: 1.8857s.
The 200th company done, time till now: 2.5278s.


In [93]:
# Save the yearly labels without the DataFrame index column.
TenK_data.to_csv(path_or_buf='LimitedComp/10K_label.csv', index=False)