Label SEC files

In [1]:

import json
import logging
import os.path

import numpy as np
import pandas

# Configure the root logger so that messages of level INFO and above are emitted.
logger = logging.getLogger()
logger.setLevel(logging.INFO)

In [2]:
# Mount Google Drive into the Colab runtime; the data directory used below
# lives under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

MessageError: ignored

In [None]:
# Size (in weeks) of the window after a filing date in which calculate_return
# looks for the starting closing price.
date_offset_weeks = 1
# Minimum distance (in weeks) between the starting price and the later price
# that calculate_return uses to compute the return.
return_interval_weeks = 4

# Base directory for the market data CSV files, the CIK lookup JSON, the
# parsed filings and the exported dataset.
data_directory = "/content/drive/MyDrive/Histdata/"

In [None]:
def load_market_data():
    """Load both weekly market data CSV files and merge them into one frame.

    ``hist_weekly_data1.csv`` and ``hist_weekly_data2.csv`` are read from
    ``data_directory`` (``;`` separated, two header rows) and joined
    column-wise on their row index. The ``("data2", "date")`` column is
    dropped and ``("data1", "date")`` is parsed with the format ``%d.%m.%Y``.

    Returns:
        pandas.DataFrame: merged market data with a datetime
        ``("data1", "date")`` column.
    """
    csv_names = ("hist_weekly_data1.csv", "hist_weekly_data2.csv")
    frames = [
        pandas.read_csv(os.path.join(data_directory, csv_name), sep=";", header=[0, 1])
        for csv_name in csv_names
    ]
    merged = pandas.concat(frames, axis=1, join="inner")

    # the second date column is a duplicate of the first one, so remove it
    merged = merged.drop(("data2", "date"), axis=1)

    # parse the remaining date column into datetimes
    date_column = ("data1", "date")
    merged[date_column] = pandas.to_datetime(merged[date_column], format="%d.%m.%Y")

    return merged


def load_cik_lookup():
    """Read ``company_tickers.json`` from ``data_directory`` and return the parsed JSON."""
    lookup_path = os.path.join(data_directory, "company_tickers.json")
    with open(lookup_path) as cik_file:
        return json.load(cik_file)


def get_ticker_symbol(cik):
    """Look up the ticker symbol for a CIK in the global ``cik_lookup``.

    Args:
        cik: value compared against the ``"cik_str"`` field of every entry.

    Returns:
        The ``"ticker"`` of the first matching entry, or ``np.nan`` when no
        entry matches.
    """
    matching_tickers = (
        entry["ticker"]
        for entry in cik_lookup.values()
        if entry["cik_str"] == cik
    )
    # fall back to NaN when the generator yields nothing
    return next(matching_tickers, np.nan)


def calculate_return(row):
    """Compute the price return of a symbol following an SEC filing.

    The starting price is the first closing price in the global
    ``market_data`` whose date lies in ``[date, date + date_offset_weeks]``.
    The end price is the first closing price dated at least
    ``return_interval_weeks`` after the starting date.

    Args:
        row: mapping-like object (e.g. a DataFrame row) providing ``"date"``
            (the filing date) and ``"symbol"`` (the ticker symbol).

    Returns:
        ``end_close / start_close - 1``, or ``np.nan`` when the symbol has no
        closing prices in ``market_data`` or no suitable start/end entry
        exists.
    """
    date = row["date"]
    symbol = row["symbol"]

    # A symbol without market data cannot be priced. Report it as missing
    # (like the other "no data" cases below) instead of raising KeyError and
    # aborting the whole DataFrame.apply run.
    if (symbol, "close") not in market_data.columns:
        return np.nan

    # get date and closing price; dropping the top column level leaves the
    # plain column names "date" and "close"
    closure_data = market_data[[("data1", "date"), (symbol, "close")]].droplevel(axis=1, level=0)

    # find entries dated within x weeks after the sec filing
    date_offset = pandas.DateOffset(weeks=date_offset_weeks)
    closure_data_0 = closure_data.loc[(closure_data["date"] >= date) & (closure_data["date"] <= (date + date_offset))]
    if closure_data_0.empty:
        return np.nan

    # use first entry (relies on the row order of market_data)
    closure_data_0 = closure_data_0.iloc[0]

    # get second date y weeks after the first
    date_offset = pandas.DateOffset(weeks=return_interval_weeks)
    closure_data_1 = closure_data.loc[closure_data["date"] >= (closure_data_0["date"] + date_offset)]
    if closure_data_1.empty:
        return np.nan

    # use first entry (relies on the row order of market_data)
    closure_data_1 = closure_data_1.iloc[0]

    # calculate return
    return closure_data_1["close"] / closure_data_0["close"] - 1


def assign_label(return_value, quantile):
    """Map a return onto one of four sentiment labels.

    Args:
        return_value: the return to classify.
        quantile: indexable with three thresholds; the return is compared
            against ``quantile[0]``, ``quantile[1]`` and ``quantile[2]`` in
            that order.

    Returns:
        ``"very negative"``, ``"negative"`` or ``"positive"`` for the first
        threshold the return is strictly below, otherwise ``"very positive"``.
    """
    labels_below_threshold = ("very negative", "negative", "positive")
    for position, label in enumerate(labels_below_threshold):
        # thresholds are looked up lazily, one per comparison
        if return_value < quantile[position]:
            return label
    return "very positive"

In [None]:
# Build the labelled SEC dataset: list the parsed filings, attach ticker
# symbols and post-filing returns, label each filing by return quartile and
# export the result as JSON.
parsed_directory = os.path.join(data_directory, "parsed")

# globals read by get_ticker_symbol and calculate_return
cik_lookup = load_cik_lookup()
market_data = load_market_data()

# load parsed documents as dataframe
# each entry of parsed_directory is treated as a directory named after a CIK;
# file names are split as "<date>_<type>.<extension>"
files = []
for directory in os.listdir(parsed_directory):
    cik_directory = os.path.join(parsed_directory, directory)
    for file_name in os.listdir(cik_directory):
        date_type = file_name.split("_")
        files.append({
            "CIK": directory,
            "name": file_name,
            "date": date_type[0],
            "type": date_type[1].split(".")[0]
        })
df = pandas.DataFrame(files, columns=["CIK", "name", "date", "type"])

# convert columns from strings to datetime / numeric values
df["date"] = pandas.to_datetime(df["date"])
df["CIK"] = pandas.to_numeric(df["CIK"])

# assign ticker symbol (NaN when the CIK is not in the lookup)
df['symbol'] = df['CIK'].map(get_ticker_symbol)
# drop rows with no symbol
df = df.dropna(subset=["symbol"])

# sort by symbol and date, both ascending
df = df.sort_values(["symbol", "date"], ascending=[True, True])

# calculate returns (NaN when no matching market data entries exist)
df["return"] = df.apply(calculate_return, axis=1)
# drop rows with no return
df = df.dropna(subset=['return']).reset_index(drop=True)

# calculate the 25%, 50% and 75% quantiles of all returns
quartile = [df["return"].quantile(0.25), df["return"].quantile(0.5), df["return"].quantile(0.75)]
logging.info(f'quartile: {quartile}')

# assign labels based on the quartile each return falls into
df["label"] = df["return"].apply(lambda x: assign_label(x, quartile))

# export dataframe as JSON
print(df)
df.to_json(os.path.join(data_directory, "sec_dataset.json"))

# NOTE(review): disabled histogram plot below; it refers to a 'Return' column,
# a `quantiles` name and `plt`, none of which match names defined in the
# visible code ("return", `quartile`) -- update before re-enabling.
# plot histogram with all returns and quantiles
# plt.hist(df['Return'], bins=len(df.index))
# plt.plot(quantiles)
# plt.xlabel('Returns')
# plt.ylabel('Probability')
# plot the returns in row order
df["return"].plot()