In [1]:
import re
from bs4 import BeautifulSoup
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk

#def read_txt(file_name):
   # with open(file_name, "r", encoding='UTF8') as txt_file:
        #return txt_file.read()
def read_txt(file_name):
    with open(file_name, "r", encoding='ISO-8859-1') as txt_file:
        return txt_file.read()

def extract(text_initial):
    start_index = text_initial.find('<DOCUMENT>') + len('<DOCUMENT>')
    end_index = text_initial.find('</DOCUMENT>')
    return text_initial[start_index:end_index]

def BeautifulSoup_clean1(extracted_content):
    soup = BeautifulSoup(extracted_content, 'html.parser')
    return soup.get_text()

def further_clean(text):
    for a_sign in ['\\n', '\\t', '☐', '☒', '\xa0', '●', '“', '”']:
        text = text.replace(a_sign, " ")
    for a_punc in string.punctuation:
        text = text.replace(a_punc, " ")
    return re.sub('\s+', " ", text).lower().strip()

def word_count(text):
    word_list = word_tokenize(text)
    total_num = len(word_list)
    return total_num, word_list

def filter_words(word_list):
    stop_words = set(stopwords.words('english'))
    pattern = re.compile(r'[\d\W]')
    filtered_list = [word for word in word_list if word not in stop_words and not pattern.search(word) and len(word) > 1]
    return len(filtered_list), filtered_list

def process_file(file_name):
    text_initial = read_txt(file_name)
    extracted_content = extract(text_initial)
    cleaned_text = further_clean(BeautifulSoup_clean1(extracted_content))
    total_num, word_list = word_count(cleaned_text)
    word_count_filtered, filtered_list = filter_words(word_list)
    return word_count_filtered, filtered_list



### test code for 14 files


In [2]:
import os
import pandas as pd

def get_all_files(folder_name):
    """Extract all filenames with .txt extension from the given directory"""
    all_files = [f for f in os.listdir(folder_name) if os.path.isfile(os.path.join(folder_name, f)) and f.endswith('.txt')]
    return all_files


def process_directory(folder_name, files_list):
    # Initialize an empty dataframe
    df = pd.DataFrame(columns=["ticker", "file_number", "date", "word_count", "words"])
    data = []

    # Iterate over each file in the provided files list
    for file_name in files_list:
        # Full path to the file
        file_path = os.path.join(folder_name, file_name)
        
        # Process the file
        count, words = process_file(file_path)

        # Extract ticker, file_number, and date from the file_name
        ticker = file_name.split('_')[0]
        file_number = file_name.split('_')[1].split(' : ')[0]
        date_str = file_name.split(' : ')[1].split('.txt')[0]
        
        # Convert date string to actual date format
        date = pd.to_datetime(date_str, format='%Y%m%d')

        # Append results to the data list
        data.append({
            "ticker": ticker,
            "file_number": file_number,
            "date": date,
            "word_count": count,
            "words": words
        })

    # Convert data list to a DataFrame and concatenate with the original df
    df = pd.concat([df, pd.DataFrame(data)], ignore_index=True)

    return df




if __name__ == '__main__':
    # You need to have nltk's resources downloaded to run the program
    nltk.download('punkt')
    nltk.download('stopwords')

    folder_name = input("Enter the folder name: ")
    
    all_files = get_all_files(folder_name)
    print("Files to be processed:", all_files)
    
    confirmation = input("Continue processing? (yes/no): ").strip().lower()
    if confirmation == 'yes':
        results_df = process_directory(folder_name, all_files)
      
    else:
        print("Processing aborted.")


[nltk_data] Downloading package punkt to /Users/mitunl/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/mitunl/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Enter the folder name: test
Files to be processed: ['A_0001090872-20-000004.txt : 20200303.txt', 'A_0001090872-18-000004.txt : 20180306.txt', 'A_0001090872-19-000006.txt : 20190305.txt', 'A_0001090872-22-000007.txt : 20220303.txt', 'A_0001090872-19-000015.txt : 20190830.txt', 'A_0001090872-18-000015.txt : 20180830.txt', 'A_0001090872-21-000009.txt : 20210601.txt', 'A_0001090872-22-000012.txt : 20220531.txt', 'A_0001090872-19-000010.txt : 20190530.txt', 'A_0001090872-21-000015.txt : 20210901.txt', 'A_0001090872-18-000009.txt : 20180531.txt', 'A_0001090872-21-000004.txt : 20210302.txt', 'A_0001090872-20-000014.txt : 20200901.txt', 'A_0001090872-22-000017.txt : 20220901.txt', 'A_0001090872-20-000010.txt : 20200601.txt']
Continue processing? (yes/no): yes


In [3]:
results_df

Unnamed: 0,ticker,file_number,date,word_count,words
0,A,0001090872-20-000004.txt,2020-03-03,18292,"[htm, document, accelerated, us, gaap, product..."
1,A,0001090872-18-000004.txt,2018-03-06,16522,"[htm, document, table, contentsunited, statess..."
2,A,0001090872-19-000006.txt,2019-03-05,17730,"[htm, document, table, contentsunited, statess..."
3,A,0001090872-22-000007.txt,2022-03-03,18088,"[htm, gaap, gaap, gaap, gaap, usdxbrli, gaap, ..."
4,A,0001090872-19-000015.txt,2019-08-30,20449,"[htm, document, accelerated, us, gaap, product..."
5,A,0001090872-18-000015.txt,2018-08-30,18482,"[htm, document, table, contentsunited, statess..."
6,A,0001090872-21-000009.txt,2021-06-01,20499,"[htm, gaap, gaap, gaap, gaap, gaap, gaap, gaap..."
7,A,0001090872-22-000012.txt,2022-05-31,20064,"[htm, gaap, gaap, gaap, gaap, gaap, gaap, gaap..."
8,A,0001090872-19-000010.txt,2019-05-30,19174,"[htm, document, table, contentsunited, statess..."
9,A,0001090872-21-000015.txt,2021-09-01,20441,"[htm, gaap, gaap, gaap, gaap, gaap, gaap, gaap..."


### processing all the files in the form of small chunks

In [5]:
import time  # Import the time module

def chunks(lst, n):
    """Yield successive n-sized chunks from lst."""
    for i in range(0, len(lst), n):
        yield lst[i:i + n]

if __name__ == '__main__':
    import nltk  # You need to import the nltk package
    # ... [rest of the imports]

    # You need to have nltk's resources downloaded to run the program
    nltk.download('punkt')
    nltk.download('stopwords')

    folder_name = input("Enter the folder name: ")
    all_files = get_all_files(folder_name)
    print("Total files to be processed:", len(all_files))
    
    chunk_size = len(all_files) // 16  # Divide the files into 16 chunks
    file_chunks = list(chunks(all_files, chunk_size))

    all_dataframes = []  # This will store all the dataframes

    for index, file_chunk in enumerate(file_chunks):
        print(f"Processing chunk {index+1} out of 16...")
        
        start_time = time.time()  # Record the start time
        
        # Process the chunk of files
        results_df_chunk = process_directory(folder_name, file_chunk)
        all_dataframes.append(results_df_chunk)
        
        end_time = time.time()  # Record the end time
        elapsed_time = end_time - start_time  # Calculate the elapsed time
        
        # Print the number of files processed and the time taken
        print(f"Processed {len(file_chunk)} files in chunk {index+1}.")
        print(f"Time taken for chunk {index+1}: {elapsed_time} seconds")
        
        # Save the processed chunk to a CSV for safety
        results_df_chunk.to_csv(f"chunk_{index+1}.csv", index=False)
        print(f"Saved chunk {index+1} to chunk_{index+1}.csv")
        
        # If not the last chunk, ask for permission to continue
        if index < 15:
            confirmation = input("Continue processing the next chunk? (yes/no): ").strip().lower()
            if confirmation != 'yes':
                print("Processing aborted.")
                break

    # Combine all dataframes
    final_df = pd.concat(all_dataframes, ignore_index=True)
    
    print("Processing completed.")


[nltk_data] Downloading package punkt to /Users/mitunl/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/mitunl/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Enter the folder name: 10-Q
Total files to be processed: 7304
Processing chunk 1 out of 16...
Processed 456 files in chunk 1.
Time taken for chunk 1: 317.7752001285553 seconds
Saved chunk 1 to chunk_1.csv
Continue processing the next chunk? (yes/no): yes
Processing chunk 2 out of 16...
Processed 456 files in chunk 2.
Time taken for chunk 2: 353.67844581604004 seconds
Saved chunk 2 to chunk_2.csv
Continue processing the next chunk? (yes/no): yes
Processing chunk 3 out of 16...
Processed 456 files in chunk 3.
Time taken for chunk 3: 376.4825930595398 seconds
Saved chunk 3 to chunk_3.csv
Continue processing the next chunk? (yes/no): yes
Processing chunk 4 out of 16...
Processed 456 files in chunk 4.
Time taken for chunk 4: 396.8757908344269 seconds
Saved chunk 4 to chunk_4.csv
Continue processing the next chunk? (yes/no): yes
Processing chunk 5 out of 16...
Processed 456 files in chunk 5.
Time taken for chunk 5: 439.02630400657654 seconds
Saved chunk 5 to chunk_5.csv
Continue processing t

In [7]:
final_df
final_df.to_csv('cleaned_text.csv', index=False)

In [8]:
final_df

Unnamed: 0,ticker,file_number,date,word_count,words
0,SNA,0000091440-21-000011.txt,2021-04-22,13001,"[sna, htm, sna, gaap, gaap, propertyplantandeq..."
1,MCK,0000927653-20-000093.txt,2020-11-03,17989,"[mck, htm, mck, gaap, gaap, usdxbrli, gaap, ga..."
2,HSIC,0001000228-20-000055.txt,2020-08-04,20228,"[htm, quarterly, report, truetruemodified, var..."
3,CME,0001156375-21-000052.txt,2021-05-05,7184,"[cme, htm, cme, gaap, gaap, gaap, gaap, gaap, ..."
4,WTW,0001564590-18-026537.txt,2018-11-02,24493,"[wltw, htm, wltw, htm, united, states, securit..."
...,...,...,...,...,...
7299,MCK,0000927653-21-000065.txt,2021-08-05,14680,"[mck, htm, mck, gaap, usdxbrli, gaap, gaap, ga..."
7300,YUM,0001041061-19-000048.txt,2019-11-05,10695,"[yum, htm, document, false, yum, refranchising..."
7301,PARA,0000813828-19-000033.txt,2019-11-12,18320,"[htm, document, us, gaap, commonclassamember, ..."
7302,TMUS,0001283699-22-000117.txt,2022-07-29,14902,"[tmus, htm, tmus, form, tmus, usdxbrli, gaap, ..."


In [9]:
import yfinance as yf
import numpy as np
import pandas as pd
import statsmodels.api as sm
from datetime import datetime, timedelta

# (1) Gather the S&P 500 historical close prices
sp500 = yf.Ticker('^GSPC')
history = sp500.history(period='1d', start='2017-06-01', end='2023-02-01')['Close']
mk_returns = history / history.shift(1) - 1
mk_returns.dropna(inplace=True)
mk_returns.index = mk_returns.index.strftime('%Y-%m-%d')
mk_returns.name = 'market_returns'
def previous_bd(date):
    return (datetime.strptime(date, "%Y-%m-%d") + timedelta(days=-1)).strftime('%Y-%m-%d')

# (2) Define your function to gather market returns
def get_mk_returns(data):
    filing_date = data['date']
    while not filing_date in mk_returns.index:
        filing_date = (datetime.strptime(filing_date, "%Y-%m-%d") + timedelta(days=1)).strftime('%Y-%m-%d')
    n_row = mk_returns.index.get_loc(filing_date)
    start_date = mk_returns.index[n_row - 120]
    end_date = mk_returns.index[n_row]
    window = np.logical_and(mk_returns.index >= start_date, mk_returns.index < end_date)
    return mk_returns[window]

# (3) Define your function to gather past returns of a given ticker
def get_ticker_past_returns(data):
    filing_date = data['date']
    ticker = data['ticker']
    while not filing_date in mk_returns.index:
        filing_date = (datetime.strptime(filing_date, "%Y-%m-%d") + timedelta(days=1)).strftime('%Y-%m-%d')
    n_row = mk_returns.index.get_loc(filing_date)
    start_date = mk_returns.index[n_row - 121]
    end_date = mk_returns.index[n_row]
    history = yf.Ticker(ticker).history(period='1d', start=start_date, end=end_date)['Close']
    returns = history / history.shift(1) - 1
    returns.dropna(inplace=True)
    returns.index = returns.index.strftime('%Y-%m-%d')
    returns.name = ticker
    return returns

# (4) Define your function to compute beta
def get_beta(data):
    series1 = get_ticker_past_returns(data)
    series2 = get_mk_returns(data)
    ts1_reset = series1.reset_index()
    ts2_reset = series2.reset_index()
    merged = pd.merge(ts1_reset, ts2_reset, on='Date', how='inner')
    merged.set_index('Date', inplace=True)
    merged = sm.add_constant(merged)
    model = sm.OLS(merged.iloc[:, 2], merged.iloc[:, :2])
    results = model.fit()
    beta = results.params[data['ticker']]
    return beta

# (5) Define your function to compute excess returns
def excess_returns(data):
    filing_date = data['date']
    ticker = data['ticker']
    while not filing_date in mk_returns.index:
        filing_date = (datetime.strptime(filing_date, "%Y-%m-%d") + timedelta(days=1)).strftime('%Y-%m-%d')
    n_row = mk_returns.index.get_loc(filing_date)
    start_date = mk_returns.index[n_row + 1]
    end_date = mk_returns.index[n_row + 4]
    history = yf.Ticker(ticker).history(period='1d', start=previous_bd(start_date), end=end_date)['Close']
    if history.empty:
        return np.nan
    returns = history / history.shift(1) - 1
    returns.dropna(inplace=True)
    returns.index = returns.index.strftime('%Y-%m-%d')
    sum_returns = sum(returns)
    beta = get_beta(data)
    sum_mk_returns = sum(mk_returns.iloc[n_row + 1:n_row + 4])
    return sum_returns - beta * sum_mk_returns
result_df = pd.read_csv('cleaned_text.csv')

# Assuming result_df is already loaded and contains 'ticker' and 'filing_date' columns
result_df['excess_returns'] = result_df.apply(excess_returns, axis=1)


In [10]:
result_df

Unnamed: 0,ticker,file_number,date,word_count,words,excess_returns
0,SNA,0000091440-21-000011.txt,2021-04-22,13001,"['sna', 'htm', 'sna', 'gaap', 'gaap', 'propert...",-0.003843
1,MCK,0000927653-20-000093.txt,2020-11-03,17989,"['mck', 'htm', 'mck', 'gaap', 'gaap', 'usdxbrl...",0.052555
2,HSIC,0001000228-20-000055.txt,2020-08-04,20228,"['htm', 'quarterly', 'report', 'truetruemodifi...",-0.027407
3,CME,0001156375-21-000052.txt,2021-05-05,7184,"['cme', 'htm', 'cme', 'gaap', 'gaap', 'gaap', ...",0.023516
4,WTW,0001564590-18-026537.txt,2018-11-02,24493,"['wltw', 'htm', 'wltw', 'htm', 'united', 'stat...",0.015161
...,...,...,...,...,...,...
7299,MCK,0000927653-21-000065.txt,2021-08-05,14680,"['mck', 'htm', 'mck', 'gaap', 'usdxbrli', 'gaa...",-0.023552
7300,YUM,0001041061-19-000048.txt,2019-11-05,10695,"['yum', 'htm', 'document', 'false', 'yum', 're...",0.008845
7301,PARA,0000813828-19-000033.txt,2019-11-12,18320,"['htm', 'document', 'us', 'gaap', 'commonclass...",0.019004
7302,TMUS,0001283699-22-000117.txt,2022-07-29,14902,"['tmus', 'htm', 'tmus', 'form', 'tmus', 'usdxb...",-0.004576


In [12]:
result_df.to_csv('scores.csv', index=False)

In [13]:
print('done')

done
