In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

# Read the raw holdings and trades extracts (paths are relative to the notebook).
holdings = pd.read_csv('../data/raw/holdings.csv')
trades = pd.read_csv('../data/raw/trades.csv')

banner = "=" * 60
print(banner)
print("HOLDINGS DATA OVERVIEW")
print(banner)

# Dimensions, column names, dtypes and in-memory footprint of the holdings frame.
holdings_bytes = holdings.memory_usage(deep=True).sum()
print(f"Shape: {holdings.shape}")
print(f"\nColumns: {holdings.columns.tolist()}")
print(f"\nData types:\n{holdings.dtypes}")
print(f"\nMemory usage: {holdings_bytes / 1024**2:.2f} MB")

# Per-column null counts and their share of all rows (in percent, 2 decimals),
# ordered from the most to the least incomplete column.
missing = holdings.isna().sum()
missing_pct = missing.div(len(holdings)).mul(100).round(2)
missing_df = pd.DataFrame(
    {'Missing_Count': missing, 'Missing_Percent': missing_pct}
).sort_values(by='Missing_Count', ascending=False)

# Only report columns that actually have at least one null.
has_missing = missing_df['Missing_Count'].gt(0)
print(f"\nMissing Values:\n{missing_df.loc[has_missing]}")

# Count of rows that are exact repeats of an earlier row.
duplicate_rows = holdings.duplicated().sum()
print(f"\nDuplicate rows: {duplicate_rows}")

# Peek at the first three records.
print(f"\nFirst 3 rows:\n{holdings.iloc[:3]}")

HOLDINGS DATA OVERVIEW
Shape: (1022, 25)

Columns: ['AsOfDate', 'OpenDate', 'CloseDate', 'ShortName', 'PortfolioName', 'StrategyRefShortName', 'Strategy1RefShortName', 'Strategy2RefShortName', 'CustodianName', 'DirectionName', 'SecurityId', 'SecurityTypeName', 'SecName', 'StartQty', 'Qty', 'StartPrice', 'Price', 'StartFXRate', 'FXRate', 'MV_Local', 'MV_Base', 'PL_DTD', 'PL_QTD', 'PL_MTD', 'PL_YTD']

Data types:
AsOfDate                  object
OpenDate                  object
CloseDate                 object
ShortName                 object
PortfolioName             object
StrategyRefShortName      object
Strategy1RefShortName     object
Strategy2RefShortName     object
CustodianName             object
DirectionName             object
SecurityId                 int64
SecurityTypeName          object
SecName                   object
StartQty                 float64
Qty                      float64
StartPrice               float64
Price                    float64
StartFXRate             

Key Columns Analysis

In [3]:
# Snapshot dates: parse AsOfDate to datetime, then report its span and cardinality.
holdings['AsOfDate'] = pd.to_datetime(holdings['AsOfDate'])
as_of_dates = holdings['AsOfDate']
print(f"\nDate Range: {as_of_dates.min()} to {as_of_dates.max()}")
print(f"Unique dates: {as_of_dates.nunique()}")

# Portfolios: how many distinct ones, and which hold the most rows.
portfolio_names = holdings['PortfolioName']
print(f"\nUnique Portfolios: {portfolio_names.nunique()}")
print("\nTop 10 Portfolios by holdings count:")
print(portfolio_names.value_counts().head(10))

# Securities: distinct ids and the row count per security type.
print(f"\nUnique Securities: {holdings['SecurityId'].nunique()}")
print("\nTop security types:")
security_type_counts = holdings['SecurityTypeName'].value_counts()
print(security_type_counts)

# Row count per position direction.
print("\nPosition directions:")
direction_counts = holdings['DirectionName'].value_counts()
print(direction_counts)


Date Range: 2023-01-08 00:00:00 to 2023-01-08 00:00:00
Unique dates: 1

Unique Portfolios: 19

Top 10 Portfolios by holdings count:
PortfolioName
MNC Investment Fund        243
Garfield                   221
Heather                    195
Opium Holdings Partners    131
Platpot                     61
Ytum                        51
Hi Yield                    19
NPSMF2                      17
NPSMF3                      17
NPSMF1                      17
Name: count, dtype: int64

Unique Securities: 243

Top security types:
SecurityTypeName
Bond                     225
Equity                   176
AssetBacked              176
Repo Contract             83
Loan                      72
Fund Holding              52
Option                    39
CDS Contract              35
Future                    29
IR Swap                   26
Swaption                  24
FX Forward                22
CDO Tranche               20
Credit Index Contract     14
Total Return Swap         11
Preferred           

Numerical Columns Analysis

In [4]:
# Financial metrics
numeric_cols = ['Qty', 'MV_Base', 'PL_YTD', 'PL_MTD', 'PL_QTD', 'Price']

# Width of the outlier fences, in multiples of the IQR. 3x is a wider fence than
# the common 1.5x rule, so only extreme values are counted.
IQR_MULTIPLIER = 3

# Resolve once which requested columns exist so that every step below agrees.
# (Indexing holdings with the full list raises KeyError when a column is absent,
# which would make a per-column existence check inside the loop unreachable.)
available_cols = [col for col in numeric_cols if col in holdings.columns]
missing_cols = [col for col in numeric_cols if col not in holdings.columns]
if missing_cols:
    print(f"\nSkipping columns not present in holdings: {missing_cols}")

print("\nNumerical Statistics:")
if available_cols:
    print(holdings[available_cols].describe())
else:
    # describe() refuses a frame with no columns, so report that instead.
    print("No numeric columns available.")

# Check for anomalies
print("\nPotential anomalies:")
for col in available_cols:
    q1 = holdings[col].quantile(0.25)
    q3 = holdings[col].quantile(0.75)
    iqr = q3 - q1
    lower_fence = q1 - IQR_MULTIPLIER * iqr
    upper_fence = q3 + IQR_MULTIPLIER * iqr
    outliers = ((holdings[col] < lower_fence) | (holdings[col] > upper_fence)).sum()
    # The percentage is relative to all rows, including rows where col is null.
    print(f"{col}: {outliers} outliers ({outliers/len(holdings)*100:.2f}%)")
    if iqr == 0:
        # Both fences collapse onto q1 (== q3), so every row that differs from
        # that value is counted; make the inflated figure visible to the reader.
        print(f"  (note: {col} has IQR = 0, so every value other than {q1} is counted as an outlier)")

# Zeros and negatives
print("\nZeros and negatives:")
sign_checks = [
    ("Zero Qty", 'Qty', lambda values: values == 0),
    ("Negative PL_YTD", 'PL_YTD', lambda values: values < 0),
    ("Negative MV_Base", 'MV_Base', lambda values: values < 0),
]
for label, col, predicate in sign_checks:
    # Same existence rule as above: silently skip checks on absent columns
    # (they were already reported in the "Skipping columns" line).
    if col in holdings.columns:
        print(f"{label}: {predicate(holdings[col]).sum()}")


Numerical Statistics:
                Qty       MV_Base        PL_YTD        PL_MTD        PL_QTD  \
count  1.022000e+03  1.006000e+03  1.022000e+03  1.022000e+03  1.022000e+03   
mean   2.564347e+06  2.360190e+06 -5.916734e+06 -1.381546e+03 -4.107637e+05   
std    6.764281e+06  7.141503e+06  1.567413e+08  3.583076e+04  6.160577e+06   
min   -6.000000e+06 -4.287250e+06 -5.000000e+09 -1.119863e+06 -1.612500e+08   
25%    1.000000e+04  0.000000e+00 -2.134909e+03  0.000000e+00  0.000000e+00   
50%    8.130000e+05  1.814674e+04  0.000000e+00  0.000000e+00  0.000000e+00   
75%    2.160000e+06  1.609462e+06  5.053261e+03  0.000000e+00  3.375000e+02   
max    4.410000e+07  4.410000e+07  4.392916e+06  2.560000e+04  3.993990e+05   

              Price  
count   1022.000000  
mean     365.492790  
std     3612.537662  
min      -83.981944  
25%        9.430000  
50%       79.000000  
75%       97.000000  
max    43840.930000  

Potential anomalies:
Qty: 73 outliers (7.14%)
MV_Base: 64 outliers

Trades Data Analysis

In [5]:
rule = "=" * 60
print("\n" + rule)
print("TRADES DATA OVERVIEW")
print(rule)

# Parse both date columns to datetime, trade date first.
for date_col in ('TradeDate', 'SettleDate'):
    trades[date_col] = pd.to_datetime(trades[date_col])

trade_dates = trades['TradeDate']
print(f"Shape: {trades.shape}")
print(f"\nDate Range: {trade_dates.min()} to {trade_dates.max()}")
print(f"\nUnique Portfolios: {trades['PortfolioName'].nunique()}")

# Row count per trade type.
print("\nTrade Types:")
trade_type_counts = trades['TradeTypeName'].value_counts()
print(trade_type_counts)

# Cash volume: signed total, then mean/median of the absolute per-trade amount.
total_cash = trades['TotalCash']
trade_sizes = total_cash.abs()
print(f"\nTotal Cash Flow: ${total_cash.sum():,.2f}")
print(f"Average Trade Size: ${trade_sizes.mean():,.2f}")
print(f"Median Trade Size: ${trade_sizes.median():,.2f}")

# Portfolios ranked by number of trade rows.
print("\nTop 10 most active portfolios:")
portfolio_activity = trades['PortfolioName'].value_counts()
print(portfolio_activity.head(10))


TRADES DATA OVERVIEW
Shape: (649, 31)

Date Range: 2026-01-13 00:00:00 to 2026-01-13 00:00:00

Unique Portfolios: 16

Trade Types:
TradeTypeName
Buy                        504
Sell Short                  62
Sell                        59
Buy Fixed/Floating Rate     10
Cover Short                  9
Buy Protection               5
Name: count, dtype: int64

Total Cash Flow: $13,691,504,586.75
Average Trade Size: $21,096,525.03
Median Trade Size: $1,597,500.00

Top 10 most active portfolios:
PortfolioName
Redfield Accu-Fund           143
UNC Investment Fund          142
Leatherwood Trust MA         132
Optimum Holdings Partners    101
HoldCo 1                      43
CampNou Holdings              30
ClientA                       24
Platpot Fund                   9
HoldCo 11                      6
Northpoint 401K                6
Name: count, dtype: int64
