In [232]:
#!pip install yfinance

In [259]:
import pandas as pd
import yfinance as yf
from datetime import timedelta
import re

In [273]:
df = pd.read_csv("flagged_filings.csv", parse_dates=["filingDate"])
# only keep rows where 'ticker' is a non-null string
df = df.dropna(subset=["ticker"]).copy()
# if you want, ensure it's a string type:
df["ticker"] = df["ticker"].astype(str)
print("Shape:", df.shape)       # how many rows × columns
display(df.head())              # peek at the first few rows
df.info()   

Shape: (38, 7)


Unnamed: 0,ticker,cik,form,filingDate,url,keywords,summary
1,PNBK,1098146,4,2025-07-08,https://www.sec.gov/Archives/edgar/data/109814...,private placement,"On July 3, 2025, Steven Sugarman, President an..."
2,MYSZ,1211805,DEF 14A,2025-07-08,https://www.sec.gov/Archives/edgar/data/121180...,reverse stock split,"My Size, Inc. filed a DEF 14A proxy statement ..."
4,EQBK,1227500,8-K,2025-07-08,https://www.sec.gov/Archives/edgar/data/122750...,securities purchase agreement,Equity BancShares Inc. filed an 8-K reporting ...
6,LIXTW,1335105,8-K,2025-07-08,https://www.sec.gov/Archives/edgar/data/133510...,private placement;securities purchase agreemen...,"Lixte Biotechnology Holdings, Inc. filed an 8-..."
8,CV,1378325,4,2025-07-08,https://www.sec.gov/Archives/edgar/data/137832...,reverse stock split,"On July 3, 2025, Eliyahu Harari and related pa..."


<class 'pandas.core.frame.DataFrame'>
Int64Index: 38 entries, 1 to 49
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   ticker      38 non-null     object        
 1   cik         38 non-null     int64         
 2   form        38 non-null     object        
 3   filingDate  38 non-null     datetime64[ns]
 4   url         38 non-null     object        
 5   keywords    38 non-null     object        
 6   summary     38 non-null     object        
dtypes: datetime64[ns](1), int64(1), object(5)
memory usage: 2.4+ KB


In [274]:
def get_price_changes(ticker, filing_date):
    """
    Returns a Series of five features:
      pct_1d, pct_3d, pct_before, volatility_before, volume_change
    around the filing_date for the given ticker.
    """
    # 1) Download 7 days before through 5 days after
    start = filing_date - timedelta(days=7)
    end   = filing_date + timedelta(days=5)
    hist = yf.download(ticker, start=start, end=end, progress=False)

    # 2) Normalize index to dates for matching
    dates = [d.date() for d in hist.index]
    if filing_date.date() not in dates:
        # no data for that exact date
        return pd.Series([None]*4,
                         index=[
                           "pct_1d","pct_3d",
                           "pct_before","volatility_before"])

    idx = dates.index(filing_date.date())

    # 3) Use .iloc for safe positional indexing
    close_0    = hist["Close"].iloc[idx]
    close_1d   = hist["Close"].iloc[idx+1] if idx+1 < len(hist) else None
    close_3d   = hist["Close"].iloc[idx+3] if idx+3 < len(hist) else None
    close_prev = hist["Close"].iloc[idx-1] if idx-1 >= 0 else None

    # 4) Compute returns
    pct_1d = (close_0 - close_prev) / close_prev if close_prev is not None else None
    pct_3d     = (close_3d - close_0) / close_0  if close_3d is not None else None
    pct_before = (close_0  - close_prev) / close_prev if close_prev is not None else None

    
    
    vol_before = hist["Close"].iloc[max(0, idx-5):idx].std()


    # then return as before
    return pd.Series(
        [pct_1d, pct_3d, pct_before, vol_before],
        index=["pct_1d","pct_3d","pct_before","volatility_before"]
    )

In [275]:
# Inspect your filings DF
print("Flagged filings shape:", df.shape)
print(df.head(3).to_dict(orient="records"))  # show first 3 rows as dicts

# Pull one history object so you can see it’s loading right
ticker, date = df.iloc[0]["ticker"], df.iloc[0]["filingDate"]
hist = yf.download(ticker, start=date - timedelta(days=7), end=date + timedelta(days=5), progress=False)
print("Price history sample:\n", hist.head(), "\n…\n", hist.tail())

Flagged filings shape: (38, 7)
[{'ticker': 'PNBK', 'cik': 1098146, 'form': '4', 'filingDate': Timestamp('2025-07-08 00:00:00'), 'url': 'https://www.sec.gov/Archives/edgar/data/1098146/0001437749-25-022374.txt', 'keywords': 'private placement', 'summary': "On July 3, 2025, Steven Sugarman, President and CEO of Patriot National Bancorp Inc., reported the disposition of 19,167 shares of Series A Preferred Stock through a Form 4 filing. The key catalyst is the sale of these preferred shares, which may impact the company's ownership structure and stockholder dynamics."}, {'ticker': 'MYSZ', 'cik': 1211805, 'form': 'DEF 14A', 'filingDate': Timestamp('2025-07-08 00:00:00'), 'url': 'https://www.sec.gov/Archives/edgar/data/1211805/0001641172-25-018202.txt', 'keywords': 'reverse stock split', 'summary': "My Size, Inc. filed a DEF 14A proxy statement indicating its fiscal year ends on December 31, 2024. The filing does not specify a particular catalyst or event; it primarily provides standard info

  hist = yf.download(ticker, start=date - timedelta(days=7), end=date + timedelta(days=5), progress=False)


Price history sample:
 Price      Close  High   Low  Open  Volume
Ticker      PNBK  PNBK  PNBK  PNBK    PNBK
Date                                      
2025-07-01  1.50  1.55  1.45  1.50  631200
2025-07-02  1.47  1.54  1.42  1.50  425800
2025-07-03  1.47  1.48  1.41  1.46  135800
2025-07-07  1.39  1.48  1.34  1.47  555900
2025-07-08  1.39  1.42  1.30  1.40  590300 
…
 Price       Close  High     Low  Open  Volume
Ticker       PNBK  PNBK    PNBK  PNBK    PNBK
Date                                         
2025-07-03  1.470  1.48  1.4100  1.46  135800
2025-07-07  1.390  1.48  1.3400  1.47  555900
2025-07-08  1.390  1.42  1.3000  1.40  590300
2025-07-09  1.390  1.40  1.3500  1.38  389000
2025-07-10  1.465  1.47  1.3501  1.38  114365


In [276]:
price_rows = []
for _, row in df.iterrows():
    price_rows.append(get_price_changes(row["ticker"], row["filingDate"]))

# Construct a DataFrame with the same index as df
pprice_feats = pd.DataFrame(price_rows, index=df.index)

# 2) flatten each column in price_feats
def unpack_one(x):
    if isinstance(x, pd.Series):
        return x.iat[0]    # pull out the single scalar
    else:
        return x

for col in price_feats.columns:
    price_feats[col] = price_feats[col].apply(unpack_one).astype(float)

# 3) now merge flattened features into your main df
df = pd.concat([df, price_feats], axis=1)

  hist = yf.download(ticker, start=start, end=end, progress=False)
  hist = yf.download(ticker, start=start, end=end, progress=False)
  hist = yf.download(ticker, start=start, end=end, progress=False)
  hist = yf.download(ticker, start=start, end=end, progress=False)
  hist = yf.download(ticker, start=start, end=end, progress=False)
  hist = yf.download(ticker, start=start, end=end, progress=False)
  hist = yf.download(ticker, start=start, end=end, progress=False)
  hist = yf.download(ticker, start=start, end=end, progress=False)
  hist = yf.download(ticker, start=start, end=end, progress=False)
  hist = yf.download(ticker, start=start, end=end, progress=False)
  hist = yf.download(ticker, start=start, end=end, progress=False)
  hist = yf.download(ticker, start=start, end=end, progress=False)
  hist = yf.download(ticker, start=start, end=end, progress=False)
  hist = yf.download(ticker, start=start, end=end, progress=False)
  hist = yf.download(ticker, start=start, end=end, progress=Fa

In [277]:
# Print all column names and how many times they appear
print(df.columns.value_counts())

# If you want to drop duplicate column names and keep only the first occurrence:
df = df.loc[:, ~df.columns.duplicated(keep='first')]
df = df.drop(columns=["url", "pct_before", "filingDate", "cik", "volume_change"])

ticker               1
cik                  1
form                 1
filingDate           1
url                  1
keywords             1
summary              1
pct_1d               1
pct_3d               1
pct_before           1
volatility_before    1
volume_change        1
dtype: int64


In [278]:
print(type(df["pct_1d"]))
df

<class 'pandas.core.series.Series'>


Unnamed: 0,ticker,form,keywords,summary,pct_1d,pct_3d,volatility_before
1,PNBK,4,private placement,"On July 3, 2025, Steven Sugarman, President an...",0.0,,0.04717
2,MYSZ,DEF 14A,reverse stock split,"My Size, Inc. filed a DEF 14A proxy statement ...",0.015625,,0.022174
4,EQBK,8-K,securities purchase agreement,Equity BancShares Inc. filed an 8-K reporting ...,0.005552,,0.597794
6,LIXTW,8-K,private placement;securities purchase agreemen...,"Lixte Biotechnology Holdings, Inc. filed an 8-...",-0.181208,,0.09196
8,CV,4,reverse stock split,"On July 3, 2025, Eliyahu Harari and related pa...",0.100239,,0.47085
11,XTNT,8-K,entered into agreement,"Xtant Medical Holdings, Inc. filed an 8-K repo...",-0.044776,,0.020616
12,MULN,DEF 14A,private placement;securities purchase agreemen...,Mullen Automotive Inc. filed a Schedule 14A pr...,-0.125,,0.039158
14,PGNY,8-K,private placement;acquired beneficial ownership,"On July 1, 2025, Progyny, Inc. entered into a ...",0.131469,,0.353211
15,BCTXZ,S-1,private placement;securities purchase agreemen...,BriaCell Therapeutics Corp. filed an S-1 regis...,,,
16,SLRX,8-K,securities purchase agreement;reverse stock split,"Salarius Pharmaceuticals, Inc. filed an 8-K re...",-0.045894,,0.031316


In [279]:
df["pct_1d"] = df["pct_1d"].apply(lambda x: 1 if x >= 0.05 else (-1 if x <= -0.05 else 0))

In [280]:
df["pct_3d"] = df["pct_3d"].apply(lambda x: 1 if x >= 0.10 else (-1 if x <= -0.10 else 0))

In [281]:
df["summary_length"]       = df["summary"].astype(str).apply(len)
df["has_numbers_in_summary"] = df["summary"].astype(str).apply(lambda s: bool(re.search(r"\$\d|\d+%", s)))
df["num_keywords_matched"] = df["keywords"].astype(str).apply(lambda s: len(s.split(";")) if pd.notna(s) else 0)
df["has_numbers_in_summary"] = df["has_numbers_in_summary"].astype(int)


In [282]:
market_caps = []
sectors = []

# Iterate through tickers
for ticker in df['ticker']:
    try:
        stock = yf.Ticker(ticker)
        info = stock.info

        market_caps.append(info.get("marketCap", None))
        sectors.append(info.get("sector", None))

    except Exception as e:
        market_caps.append(None)
        sectors.append(None)

# Add columns to DataFrame
df['market_cap'] = market_caps
df['sector'] = sectors

In [283]:
label_columns = ["pct_1d", "pct_3d"]
df = df[[col for col in df.columns if col not in label_columns] + label_columns]

In [284]:
#df = df.dropna(subset=["pct_1d","pct_3d"])
df.to_csv("backtest_dataset.csv", index=False)

rows = len(df)

In [285]:
df.to_csv("backtest_dataset.csv", index=False)
print("DF empty?", df.empty)
print("Columns:", df.columns.tolist())

DF empty? False
Columns: ['ticker', 'form', 'keywords', 'summary', 'volatility_before', 'summary_length', 'has_numbers_in_summary', 'num_keywords_matched', 'market_cap', 'sector', 'pct_1d', 'pct_3d']
