In [1]:
from pathlib import Path

import pandas as pd


# Load data
# Read the raw holdings and trades extracts; the paths are relative, so they
# resolve against the notebook's working directory. No parse_dates argument is
# passed, so date columns are left to read_csv's default type inference.
holdings = pd.read_csv('../data/raw/holdings.csv')
trades = pd.read_csv('../data/raw/trades.csv')


def analyze_data_quality(holdings, trades):
    """Collect human-readable descriptions of data quality problems.

    Parameters
    ----------
    holdings : pandas.DataFrame
        Must contain ``AsOfDate``, ``PortfolioName`` and ``Qty``. ``MV_Base``
        and ``PL_YTD`` are checked only when present.
    trades : pandas.DataFrame
        Must contain ``TradeDate`` and ``SettleDate``. ``PortfolioName``,
        ``Quantity`` and ``Price`` are checked only when present.

    Returns
    -------
    list of str
        One message per problem found, in the order the checks run. Empty
        when no check fires.

    Raises
    ------
    KeyError
        If one of the required columns is absent.

    Notes
    -----
    Neither input frame is modified. Two progress lines are printed to stdout.
    """

    issues = []

    # Holdings issues
    print("Analyzing Holdings...")

    # 1. Date consistency
    if holdings['AsOfDate'].isnull().any():
        issues.append("Missing AsOfDate values")

    # 2. Portfolio names
    if holdings['PortfolioName'].isnull().any():
        issues.append("Missing PortfolioName")

    # Flag names that collapse to the same value once surrounding whitespace
    # and letter case are ignored. str() keeps the check from crashing when a
    # label is not a string (e.g. a numeric portfolio id).
    unique_portfolios = holdings['PortfolioName'].unique()
    normalized = [str(p).strip().lower() for p in unique_portfolios if pd.notna(p)]
    if len(normalized) != len(set(normalized)):
        issues.append("Inconsistent portfolio names (case/spaces)")

    # 3. Numeric fields
    for col in ['Qty', 'MV_Base', 'PL_YTD']:
        if col in holdings.columns:
            if holdings[col].isnull().sum() > 0:
                issues.append(f"Missing values in {col}")
            if not pd.api.types.is_numeric_dtype(holdings[col]):
                issues.append(f"{col} is not numeric")

    # 4. Zero positions
    zero_qty = (holdings['Qty'] == 0).sum()
    if zero_qty > 0:
        issues.append(f"{zero_qty} holdings with zero quantity")

    # Trades issues
    print("\nAnalyzing Trades...")

    # 5. Date logic
    # Compare as datetimes rather than as raw values: when the columns hold
    # text, string ordering is chronological only for ISO-8601 dates, so e.g.
    # '12/20/2023' would not sort before '01/15/2024'. Values that are missing
    # or cannot be parsed become NaT; comparisons with NaT are False, so such
    # rows are not counted by this check.
    trade_dates = pd.to_datetime(trades['TradeDate'], errors='coerce')
    settle_dates = pd.to_datetime(trades['SettleDate'], errors='coerce')
    invalid_count = int((settle_dates < trade_dates).sum())
    if invalid_count > 0:
        issues.append(f"{invalid_count} trades with SettleDate before TradeDate")

    # 6. Missing critical fields
    critical_fields = ['TradeDate', 'PortfolioName', 'Quantity', 'Price']
    for field in critical_fields:
        if field in trades.columns:
            missing = trades[field].isnull().sum()
            if missing > 0:
                issues.append(f"{missing} missing {field} in trades")

    return issues

# Run analysis
issues = analyze_data_quality(holdings, trades)

# Print the findings as a numbered list under a banner.
print("\n" + "="*60)
print("DATA QUALITY ISSUES")
print("="*60)
i = 0
for issue in issues:
    i += 1
    print(f"{i}. {issue}")

Analyzing Holdings...

Analyzing Trades...

DATA QUALITY ISSUES
1. Missing values in MV_Base
2. 16 holdings with zero quantity


In [2]:
def clean_data(holdings, trades):
    """Return cleaned copies of the holdings and trades frames.

    Holdings: portfolio names are stripped of surrounding whitespace,
    ``AsOfDate`` is parsed to datetime (unparseable values become NaT), the
    numeric columns that are present are coerced to numbers with gaps filled
    by 0, and rows lacking ``AsOfDate`` or ``PortfolioName`` are dropped.

    Trades: portfolio names are stripped, ``TradeDate`` and ``SettleDate`` are
    parsed to datetime, a settle date earlier than its trade date is replaced
    by the trade date, and rows lacking ``TradeDate``, ``PortfolioName`` or
    ``Quantity`` are dropped.

    The inputs are copied first and are not modified. Before/after row counts
    are printed. Returns the tuple ``(holdings_cleaned, trades_cleaned)``.
    """
    print("Cleaning data...")

    # ---- Holdings ----
    holdings_out = holdings.copy()
    holdings_out['PortfolioName'] = holdings_out['PortfolioName'].str.strip()
    holdings_out['AsOfDate'] = pd.to_datetime(holdings_out['AsOfDate'], errors='coerce')

    # Non-numeric and missing amounts both end up as 0 (a business-logic
    # choice; dropping such rows would be the alternative).
    for name in ('Qty', 'MV_Base', 'PL_YTD', 'PL_MTD', 'PL_QTD'):
        if name not in holdings_out.columns:
            continue
        as_number = pd.to_numeric(holdings_out[name], errors='coerce')
        holdings_out[name] = as_number.fillna(0)

    # Zero-quantity holdings are intentionally kept (useful for audit).

    holdings_out = holdings_out.dropna(subset=['AsOfDate', 'PortfolioName'])

    # ---- Trades ----
    trades_out = trades.copy()
    trades_out['PortfolioName'] = trades_out['PortfolioName'].str.strip()
    for date_col in ('TradeDate', 'SettleDate'):
        trades_out[date_col] = pd.to_datetime(trades_out[date_col], errors='coerce')

    # A trade cannot settle before it was executed: clamp to the trade date.
    settles_early = trades_out['SettleDate'] < trades_out['TradeDate']
    trades_out.loc[settles_early, 'SettleDate'] = trades_out.loc[settles_early, 'TradeDate']

    trades_out = trades_out.dropna(subset=['TradeDate', 'PortfolioName', 'Quantity'])

    print(f"Holdings: {len(holdings)} → {len(holdings_out)} rows")
    print(f"Trades: {len(trades)} → {len(trades_out)} rows")

    return holdings_out, trades_out

# Clean data
holdings_clean, trades_clean = clean_data(holdings, trades)

# Save cleaned data
# to_csv does not create missing parent directories, so make sure the output
# folder exists first; otherwise a fresh checkout fails on this cell.
Path('../data/processed').mkdir(parents=True, exist_ok=True)
holdings_clean.to_csv('../data/processed/holdings_clean.csv', index=False)
trades_clean.to_csv('../data/processed/trades_clean.csv', index=False)

Cleaning data...
Holdings: 1022 → 1022 rows
Trades: 649 → 649 rows
