## Aufgabe 1
### Data Preprocessing

Import Libraries

In [4]:
from __future__ import annotations

from pathlib import Path
import time
import numpy as np
import pandas as pd
import yfinance as yf

Define Constants

In [7]:
# Data folders, resolved relative to the current working directory.
DATA_DIR = Path("..") / "data"
RAW = DATA_DIR / "raw"
OUT = DATA_DIR / "processed"
# Make sure the output folder exists before anything is written to it.
OUT.mkdir(parents=True, exist_ok=True)

# Requested sample window.
START = pd.Timestamp("2000-01-01")
END = pd.Timestamp("2025-12-31")
# Cap the window at today's date (midnight) when END lies in the future.
EFFECTIVE_END = min(END, pd.Timestamp.today().normalize())
TOL = 0.05  # split confirmation tolerance

Define Functions

In [43]:
def to_date(s: pd.Series) -> pd.Series:
    """Parse a Series as datetimes and drop any time zone.

    Values that cannot be parsed become ``NaT`` (``errors="coerce"``). The
    time zone is removed with ``tz_localize(None)``.
    """
    parsed = pd.to_datetime(s, errors="coerce")
    return parsed.dt.tz_localize(None)


def clean_cols(df: pd.DataFrame) -> pd.DataFrame:
    """Normalize the column labels of a DataFrame in place.

    String labels are stripped of surrounding whitespace and lower-cased.
    Non-string labels (e.g. the integer labels of a header-less frame) are
    left unchanged instead of raising ``AttributeError``.

    Args:
        df: Frame whose ``columns`` are rewritten; it is modified in place.

    Returns:
        The same ``df`` object, so the call can be chained.
    """

    def _normalize(label):
        # Only strings offer strip()/lower(); pass every other label through.
        return label.strip().lower() if isinstance(label, str) else label

    df.columns = df.columns.map(_normalize)
    return df


def batch(seq, size: int):
    """Yield consecutive slices of ``seq`` holding at most ``size`` items.

    ``seq`` must support ``len()`` and slicing. The final slice is shorter
    when ``len(seq)`` is not a multiple of ``size``.
    """
    starts = range(0, len(seq), size)
    for lo in starts:
        hi = lo + size
        yield seq[lo:hi]


def quality(df: pd.DataFrame, name: str) -> None:
    """Print a short data-quality report for ``df`` under the heading ``name``.

    Always prints the row count and the number of distinct tickers ("n/a"
    without a ``ticker`` column). Depending on which columns exist it also
    prints the ``date`` range, the share of missing ``open``/``close`` values
    and the number of duplicated ``(ticker, date)`` rows.
    """
    cols = df.columns
    has_ticker = "ticker" in cols
    has_date = "date" in cols

    print(f"\n=== {name} ===")
    n_tickers = df["ticker"].nunique() if has_ticker else "n/a"
    print("rows:", len(df), "| tickers:", n_tickers)

    if has_date:
        dates = df["date"]
        print("range:", dates.min(), "->", dates.max())

    if "open" in cols and "close" in cols:
        # Mean of the boolean mask is the fraction of missing values.
        open_missing = df["open"].isna().mean()
        close_missing = df["close"].isna().mean()
        print("missing open:", open_missing, "| missing close:", close_missing)

    if has_ticker and has_date:
        n_dupes = df.duplicated(["ticker", "date"]).sum()
        print("dup(ticker,date):", n_dupes)

Load the meta information (screener + company addresses) and merge the two tables on ticker

In [83]:
meta_file = RAW / "nasdaq_screener.csv"
addr_file = RAW / "nasdaq_company_addresses.csv"

# Meta has tickers called NA and NAN. pandas' default missing-value parsing would turn
# such symbols into NaN, so keep_default_na is switched off for the screener.
meta = clean_cols(pd.read_csv(meta_file, keep_default_na=False)).rename(columns={"symbol": "ticker"})
# meta has no ticker duplicates (not enforced here)

# Read the address file with the same protection, otherwise those tickers would be
# NaN on this side and could never match in the merge. na_values=[""] still maps
# genuinely empty fields to NaN.
addr = clean_cols(pd.read_csv(addr_file, keep_default_na=False, na_values=[""]))
# addr has no ticker duplicates (not enforced here)

df_nasdaq_meta = meta.merge(addr, on="ticker", how="left")

# Count on the merged frame rather than len(addr): address rows whose ticker is not in
# the screener, or whose address is empty, must not be reported as covered tickers.
n_with_address = int(df_nasdaq_meta["address"].notna().sum())
print(f"{n_with_address} of {meta.shape[0]} tickers have an address\n")

df_nasdaq_meta.head(5)




3308 of 7023 tickers have an address



Unnamed: 0,ticker,name,last sale,net change,% change,market cap,country,ipo year,volume,sector,industry,address
0,A,Agilent Technologies Inc. Common Stock,$146.59,-1.44,-0.973%,41558327594.0,United States,1999,1187952,Industrials,Biotechnology: Laboratory Analytical Instruments,
1,AA,Alcoa Corporation Common Stock,$39.90,0.48,1.218%,10330697448.0,United States,2016,7207004,Industrials,Aluminum,
2,AACB,Artius II Acquisition Inc. Class A Ordinary Sh...,$10.24,-0.01,-0.098%,0.0,United States,2025,10915,,,
3,AACBR,Artius II Acquisition Inc. Rights,$0.34,0.02,6.25%,0.0,United States,2025,379367,,,
4,AACBU,Artius II Acquisition Inc. Units,$10.70,0.0,0.00%,0.0,United States,2025,1102,Finance,Blank Checks,
