In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import IsolationForest
from sklearn.covariance import EllipticEnvelope
from scipy import stats
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

def _prepare_numeric_frame(df):
    """Split *df* into its numeric part and a model-ready (finite, varying) part.

    Returns:
        A tuple ``(numeric_df, clean_df, constant_cols)`` where ``numeric_df``
        holds the numeric columns untouched, ``clean_df`` holds only complete,
        finite rows of the non-constant numeric columns cast to float, and
        ``constant_cols`` lists the numeric columns dropped for having a
        single distinct value.

    Raises:
        ValueError: If nothing usable is left for the analysis.
    """
    numeric_df = df.select_dtypes(include=[np.number])
    if numeric_df.shape[1] == 0:
        raise ValueError("comprehensive_analysis requires at least one numeric column")

    # The scaler, PCA, VIF and outlier models all reject NaN/inf, so they run
    # on complete cases only.
    finite_df = numeric_df.replace([np.inf, -np.inf], np.nan).dropna()
    if len(finite_df) < 2:
        raise ValueError(
            "comprehensive_analysis requires at least two complete rows "
            "(rows without NaN/inf) of numeric data"
        )

    # Constant columns have zero variance: their z-scores are NaN and their
    # VIF is undefined, so they are reported and excluded.
    varying = (finite_df.nunique() > 1).to_numpy()
    constant_cols = list(finite_df.columns[~varying])
    clean_df = finite_df.loc[:, varying].astype(float)
    if clean_df.shape[1] == 0:
        raise ValueError("comprehensive_analysis found only constant numeric columns")
    return numeric_df, clean_df, constant_cols


def _plot_per_component(values, ylabel, title):
    """Line-plot *values* against 1-based principal-component numbers."""
    plt.figure(figsize=(10, 6))
    # Component counts start at 1; plotting against the bare index would
    # shift the whole curve left by one component.
    plt.plot(np.arange(1, len(values) + 1), values)
    plt.xlabel('Number of Components' if 'Cumulative' in title else 'Principal Component')
    plt.ylabel(ylabel)
    plt.title(title)
    plt.tight_layout()
    plt.show()


def comprehensive_analysis(df, *, corr_threshold=0.8, contamination=0.1,
                           z_threshold=3.0, pairplot_sample_size=1000,
                           random_state=42):
    """Print and plot a multicollinearity / noisiness report for *df*.

    The report covers summary statistics, missing values, a correlation
    heatmap, highly correlated pairs, variance inflation factors, PCA
    explained variance, three outlier counts, skewness/kurtosis, a sampled
    pairplot and a PCA-based noise-to-signal approximation.

    Only numeric columns are analysed. Correlation, skewness and kurtosis use
    all rows of the numeric columns (pandas skips NaN); VIF, PCA, outlier
    detection and the pairplot use complete, finite rows of the non-constant
    numeric columns.

    Args:
        df: Input ``pandas.DataFrame``.
        corr_threshold: Absolute correlation above which a pair is listed.
        contamination: Expected outlier share passed to Isolation Forest and
            Elliptic Envelope.
        z_threshold: Absolute z-score above which a value counts as extreme.
        pairplot_sample_size: Maximum number of rows drawn for the pairplot.
        random_state: Seed for the outlier models and the pairplot sample.

    Returns:
        None. Results are printed and shown as matplotlib figures.

    Raises:
        ValueError: If *df* has no numeric column, fewer than two complete
            rows, or only constant numeric columns.
    """
    # Validate before producing any output so bad input fails fast.
    numeric_df, clean_df, constant_cols = _prepare_numeric_frame(df)

    # 1. Basic statistics and missing values
    print("Basic Statistics:")
    print(df.describe())
    print("\nMissing Values:")
    print(df.isnull().sum())

    non_numeric_cols = [col for col in df.columns if col not in numeric_df.columns]
    if non_numeric_cols:
        print(f"\nNon-numeric columns excluded from the analysis: {non_numeric_cols}")
    if constant_cols:
        print(f"Constant columns excluded from VIF/PCA/outlier steps: {constant_cols}")
    dropped_rows = len(df) - len(clean_df)
    if dropped_rows:
        print(f"Rows with NaN/inf excluded from VIF/PCA/outlier steps: {dropped_rows}")

    # 2. Correlation analysis
    corr_matrix = numeric_df.corr()
    plt.figure(figsize=(20, 16))
    sns.heatmap(corr_matrix, cmap='coolwarm', annot=False)
    plt.title('Correlation Heatmap')
    plt.tight_layout()
    plt.show()

    # 3. Identify highly correlated features (strict upper triangle, so each
    # unordered pair is reported once and the diagonal is skipped)
    corr_values = corr_matrix.to_numpy()
    upper_rows, upper_cols = np.triu_indices(len(corr_matrix), k=1)
    strong = np.abs(corr_values[upper_rows, upper_cols]) > corr_threshold
    print("\nHighly correlated feature pairs:")
    for i, j in zip(upper_rows[strong], upper_cols[strong]):
        print(f"{corr_matrix.index[i]} - {corr_matrix.columns[j]}: {corr_values[i, j]:.2f}")

    # 4. Variance Inflation Factor (VIF) for multicollinearity
    # The intercept column is needed for a correct VIF but its own VIF is not
    # interpretable, so it sits at position 0 and is left out of the table.
    design = add_constant(clean_df.to_numpy(), prepend=True, has_constant='add')
    vif_data = pd.DataFrame({
        "feature": clean_df.columns,
        "VIF": [variance_inflation_factor(design, i) for i in range(1, design.shape[1])],
    })
    print("\nVariance Inflation Factors:")
    print(vif_data.sort_values('VIF', ascending=False))

    # 5. PCA for dimensionality assessment
    scaled_data = StandardScaler().fit_transform(clean_df)
    pca = PCA()
    pca.fit(scaled_data)
    _plot_per_component(
        np.cumsum(pca.explained_variance_ratio_),
        'Cumulative Explained Variance Ratio',
        'PCA Cumulative Explained Variance',
    )

    # 6. Scree plot
    _plot_per_component(
        pca.explained_variance_ratio_,
        'Explained Variance Ratio',
        'Scree Plot',
    )

    # 7. Outlier detection using multiple methods
    # Isolation Forest
    iso_forest = IsolationForest(contamination=contamination, random_state=random_state)
    outliers_iso = iso_forest.fit_predict(scaled_data)
    print(f"\nNumber of potential outliers (Isolation Forest): {int(np.sum(outliers_iso == -1))}")

    # Elliptic Envelope: the robust covariance fit can reject (near-)singular
    # data, which is common when features are collinear; skip instead of
    # aborting the rest of the report.
    try:
        ee = EllipticEnvelope(contamination=contamination, random_state=random_state)
        outliers_ee = ee.fit_predict(scaled_data)
    except ValueError as exc:
        print(f"Number of potential outliers (Elliptic Envelope): skipped ({exc})")
    else:
        print(f"Number of potential outliers (Elliptic Envelope): {int(np.sum(outliers_ee == -1))}")

    # Z-score: count rows with at least one extreme value so the figure is
    # comparable with the row counts of the two model-based detectors.
    z_scores = np.abs(stats.zscore(clean_df.to_numpy(), axis=0))
    outlier_rows_z = int(np.sum((z_scores > z_threshold).any(axis=1)))
    print(f"Number of potential outliers (Z-score > {z_threshold:g}): {outlier_rows_z}")

    # 8. Skewness and Kurtosis analysis
    print("\nSkewness of features:")
    print(numeric_df.skew())
    print("\nKurtosis of features:")
    print(numeric_df.kurtosis())

    # 9. Feature-to-feature relationships (sample for efficiency)
    sample_df = clean_df.sample(
        n=min(pairplot_sample_size, len(clean_df)),
        random_state=random_state,
    )
    sns.pairplot(sample_df, diag_kind='kde', plot_kws={'alpha': 0.2})
    plt.tight_layout()
    plt.show()

    # 10. Noise-to-Signal Ratio (approximation using PCA): variance carried by
    # the last 5% of components relative to the variance of the rest.
    component_variances = pca.explained_variance_
    tail_start = int(0.95 * len(component_variances))
    noise_variance = np.sum(component_variances[tail_start:])
    signal_variance = np.sum(component_variances) - noise_variance
    if signal_variance > 0:
        nsr = noise_variance / signal_variance
        print(f"\nApproximate Noise-to-Signal Ratio: {nsr:.4f}")
    else:
        # With very few components the tail slice covers everything and the
        # ratio would divide by zero.
        print("\nApproximate Noise-to-Signal Ratio: undefined (too few components)")

# Path of the CSV file to analyse. Replace with your actual dataset location;
# the placeholder below does not exist and pd.read_csv will raise
# FileNotFoundError until it is changed.
DATA_PATH = 'your_dataset.csv'

# Run the comprehensive analysis only when executed as a script or notebook
# cell (where __name__ is "__main__"), not when this code is imported.
if __name__ == "__main__":
    df = pd.read_csv(DATA_PATH)
    comprehensive_analysis(df)

FileNotFoundError: [Errno 2] No such file or directory: 'your_dataset.csv'

To interpret the results:

Multicollinearity:

Look for high correlations in the heatmap and the list of highly correlated pairs.
Check VIF values. VIF > 5 suggests moderate multicollinearity, while VIF > 10 indicates severe multicollinearity.
In the PCA results, if a small number of components explain most of the variance, it suggests high multicollinearity.


Noisiness:

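Check the number of outliers detected by different methods. Note that Isolation Forest and Elliptic Envelope are run with contamination=0.1, so each flags roughly 10% of the rows by construction; their counts are not evidence of noise on their own. Compare which rows the methods agree on, and treat the Z-score count, which depends on the data itself, as the more direct indicator: a high number there might indicate noisy or heavy-tailed data.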
Look for features with high skewness or kurtosis, which might indicate noise or the need for transformation.
In the PCA results, if many components are needed to explain most of the variance, it might indicate noisy data.
A high Noise-to-Signal Ratio suggests noisier data.


Overall data quality:

Examine the pairplots for unexpected patterns or inconsistencies.
Check if the cumulative explained variance in PCA reaches a high level (e.g., 95%) with a reasonable number of components.



This comprehensive analysis will give you a thorough understanding of the multicollinearity and noisiness in your high-dimensional financial dataset. Based on the results, you can make informed decisions about feature selection, dimensionality reduction, or data transformation techniques to address any issues identified.