In [12]:
!pip install yfinance



In [13]:
import pandas as pd
import yfinance as yf
from datetime import timedelta
import re

In [27]:
df = pd.read_csv("Data/flagged_filings.csv", parse_dates=["filingDate"])
print("Shape:", df.shape)       # how many rows × columns
display(df.head())              # peek at the first few rows
df.info()   

Shape: (61, 7)


Unnamed: 0,ticker,cik,form,filingDate,url,keywords,summary
0,STRD,1050446,8-K,2025-07-07,https://www.sec.gov/Archives/edgar/data/105044...,private placement,MicroStrategy Inc. filed an 8-K report on July...
1,STRD,1050446,8-K,2025-07-07,https://www.sec.gov/Archives/edgar/data/105044...,reverse stock split,MicroStrategy Inc. filed an 8-K reporting an a...
2,INO,1055726,8-K,2025-07-07,https://www.sec.gov/Archives/edgar/data/105572...,reverse stock split,Inovio Pharmaceuticals announced the entry int...
3,KFS,1072627,8-K,2025-07-07,https://www.sec.gov/Archives/edgar/data/107262...,acquired beneficial ownership,Kingsway Financial Services Inc. filed an 8-K ...
4,,1307275,4,2025-07-07,https://www.sec.gov/Archives/edgar/data/130727...,reverse stock split,"On July 3, 2025, Tsai Chen Lung, a director of..."


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61 entries, 0 to 60
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   ticker      45 non-null     object        
 1   cik         61 non-null     int64         
 2   form        61 non-null     object        
 3   filingDate  61 non-null     datetime64[ns]
 4   url         61 non-null     object        
 5   keywords    61 non-null     object        
 6   summary     61 non-null     object        
dtypes: datetime64[ns](1), int64(1), object(5)
memory usage: 3.5+ KB


In [38]:
def get_price_changes(ticker, filing_date):
    """
    Returns a Series of five features:
      pct_1d, pct_3d, pct_before, volatility_before, volume_change
    around the filing_date for the given ticker.
    """
    # 1) Download 7 days before through 5 days after
    start = filing_date - timedelta(days=7)
    end   = filing_date + timedelta(days=5)
    hist = yf.download(ticker, start=start, end=end, progress=False)

    # 2) Normalize index to dates for matching
    dates = [d.date() for d in hist.index]
    if filing_date.date() not in dates:
        # no data for that exact date
        return pd.Series([None]*5,
                         index=[
                           "pct_1d","pct_3d",
                           "pct_before","volatility_before","volume_change"
                         ])

    idx = dates.index(filing_date.date())

    # 3) Use .iloc for safe positional indexing
    close_0    = hist["Close"].iloc[idx]
    close_1d   = hist["Close"].iloc[idx+1] if idx+1 < len(hist) else None
    close_3d   = hist["Close"].iloc[idx+3] if idx+3 < len(hist) else None
    close_prev = hist["Close"].iloc[idx-1] if idx-1 >= 0 else None

    # 4) Compute returns
    pct_1d     = (close_1d - close_0) / close_0  if close_1d is not None else None
    pct_3d     = (close_3d - close_0) / close_0  if close_3d is not None else None
    pct_before = (close_0  - close_prev) / close_prev if close_prev is not None else None

    
    
    vol_before = hist["Close"].iloc[max(0, idx-5):idx].std()

    try:
        # Compute average volume over the 5 days before filing
        avg_vol_prior = hist["Volume"].iloc[max(0, idx-5):idx].mean()
        # If it’s zero or NaN, bail out
        if avg_vol_prior == 0 or pd.isna(avg_vol_prior):
            volume_change = None
        else:
            volume_change = hist["Volume"].iloc[idx] / avg_vol_prior
    except Exception:
        volume_change = None

    # then return as before
    return pd.Series(
        [pct_1d, pct_3d, pct_before, vol_before, volume_change],
        index=["pct_1d","pct_3d","pct_before","volatility_before","volume_change"]
    )

In [39]:
# Inspect your filings DF
print("Flagged filings shape:", df.shape)
print(df.head(3).to_dict(orient="records"))  # show first 3 rows as dicts

# Pull one history object so you can see it’s loading right
ticker, date = df.iloc[0]["ticker"], df.iloc[0]["filingDate"]
hist = yf.download(ticker, start=date - timedelta(days=7), end=date + timedelta(days=5), progress=False)
print("Price history sample:\n", hist.head(), "\n…\n", hist.tail())

Flagged filings shape: (61, 7)
[{'ticker': 'STRD', 'cik': 1050446, 'form': '8-K', 'filingDate': Timestamp('2025-07-07 00:00:00'), 'url': 'https://www.sec.gov/Archives/edgar/data/1050446/0000950170-25-094137.txt', 'keywords': 'private placement', 'summary': "MicroStrategy Inc. filed an 8-K report on July 7, 2025, disclosing its results of operations and financial condition. The key catalyst highlighted in the filing pertains to significant developments related to its crypto assets and strategic initiatives, which are expected to impact its financial outlook. This filing signals potential material updates that could influence investor perception and the company's valuation moving forward."}, {'ticker': 'STRD', 'cik': 1050446, 'form': '8-K', 'filingDate': Timestamp('2025-07-07 00:00:00'), 'url': 'https://www.sec.gov/Archives/edgar/data/1050446/0001193125-25-155918.txt', 'keywords': 'reverse stock split', 'summary': 'MicroStrategy Inc. filed an 8-K reporting an amendment to its Articles of

  hist = yf.download(ticker, start=date - timedelta(days=7), end=date + timedelta(days=5), progress=False)


Price history sample:
 Price           Close       High        Low       Open   Volume
Ticker           STRD       STRD       STRD       STRD     STRD
Date                                                           
2025-06-30  89.750000  89.750000  87.559998  87.860001  1872200
2025-07-01  89.779999  90.430000  89.250000  89.750000   420100
2025-07-02  93.510002  95.764999  90.110001  90.250000   761500
2025-07-03  95.250000  96.080002  94.169998  94.849998   218900
2025-07-07  93.974998  96.150002  93.760002  95.480003   355900 
…
 Price           Close       High        Low       Open  Volume
Ticker           STRD       STRD       STRD       STRD    STRD
Date                                                          
2025-07-02  93.510002  95.764999  90.110001  90.250000  761500
2025-07-03  95.250000  96.080002  94.169998  94.849998  218900
2025-07-07  93.974998  96.150002  93.760002  95.480003  355900
2025-07-08  92.800003  94.500000  92.750000  94.300003  378700
2025-07-09  94.12000

In [29]:
price_rows = []
for _, row in df.iterrows():
    price_rows.append(get_price_changes(row["ticker"], row["filingDate"]))

# Construct a DataFrame with the same index as df
price_feats = pd.DataFrame(price_rows, index=df.index)

# Merge back into df
df = pd.concat([df, price_feats], axis=1)

  hist = yf.download(ticker, start=start, end=end, progress=False)


ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [24]:
df["label_1d_up"] = df["pct_1d"] >= 0.05

KeyError: 'pct_1d'

In [None]:
df["summary_length"]       = df["summary"].astype(str).apply(len)
df["has_numbers_in_summary"] = df["summary"].astype(str).apply(lambda s: bool(re.search(r"\$\d|\d+%", s)))
df["num_keywords_matched"] = df["keywords"].astype(str).apply(lambda s: len(s.split(";")) if pd.notna(s) else 0)


In [None]:
df["backtest_date"] = pd.Timestamp.today().normalize()

In [None]:
df = df.dropna(subset=["pct_1d","pct_3d"])

In [None]:
df.to_csv("Data/backtest_dataset.csv", index=False)
print(f"✅ backtest_dataset.csv written with {len(df)} rows for backtest_date {df['backtest_date'].iloc[0].date()}")