# In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import IsolationForest
from scipy import stats

# Read the dataset to analyse into a DataFrame.
# Placeholder path: replace it with your actual file or data loading method.
dataset_path = 'your_dataset.csv'
df = pd.read_csv(dataset_path)

def assess_noise(df, *, sample_size=1000, contamination=0.1, random_state=42):
    """Print and plot a quick data-quality ("noise") overview of ``df``.

    The report consists of, in order: summary statistics, per-column missing
    value counts, a correlation heatmap, per-feature histograms, a PCA
    cumulative explained-variance curve, an Isolation Forest outlier count,
    per-feature skewness, and a pairplot of a random row sample.

    Correlation, PCA, outlier detection, skewness and the pairplot use only
    the numeric columns of ``df``. PCA and outlier detection additionally use
    only rows whose numeric values are all finite, because the scikit-learn
    estimators used here reject NaN/inf input.

    Args:
        df: pandas DataFrame to inspect. It is not modified.
        sample_size: Maximum number of rows drawn (without a fixed seed) for
            the pairplot; capped at the number of available rows.
        contamination: Expected outlier fraction passed to IsolationForest.
        random_state: Seed passed to IsolationForest.

    Returns:
        None. All results are printed or shown as matplotlib figures.
    """
    # 1. Basic statistics and missing values
    print("Basic Statistics:")
    print(df.describe())
    print("\nMissing Values:")
    print(df.isnull().sum())

    # Everything below is numeric-only; string/object columns would make
    # corr(), skew() and the scikit-learn estimators raise.
    numeric = df.select_dtypes(include="number")
    if numeric.empty:
        print("\nNo numeric data available; skipping numeric analyses.")
        return

    # 2. Correlation analysis
    corr_matrix = numeric.corr()
    plt.figure(figsize=(12, 10))
    sns.heatmap(corr_matrix, cmap='coolwarm', annot=False)
    plt.title('Correlation Heatmap')
    plt.show()

    # 3. Distribution of each feature
    df.hist(figsize=(20, 20), bins=50)
    plt.tight_layout()
    plt.show()

    # 4./5. PCA and Isolation Forest need complete, finite rows.
    finite_rows = numeric.replace([np.inf, -np.inf], np.nan).dropna()
    if len(finite_rows) < 2:
        print("\nFewer than two complete numeric rows; "
              "skipping PCA and outlier detection.")
    else:
        scaled_data = StandardScaler().fit_transform(finite_rows)

        # 4. PCA for dimensionality assessment
        pca = PCA()
        pca.fit(scaled_data)
        cumulative_variance_ratio = np.cumsum(pca.explained_variance_ratio_)
        # Start the x-axis at 1 so it really is the number of components.
        component_counts = np.arange(1, len(cumulative_variance_ratio) + 1)
        plt.plot(component_counts, cumulative_variance_ratio)
        plt.xlabel('Number of Components')
        plt.ylabel('Cumulative Explained Variance Ratio')
        plt.title('PCA Cumulative Explained Variance')
        plt.show()

        # 5. Outlier detection using Isolation Forest (-1 marks an outlier)
        iso_forest = IsolationForest(contamination=contamination,
                                     random_state=random_state)
        outliers = iso_forest.fit_predict(scaled_data)
        outlier_count = int((outliers == -1).sum())
        print(f"\nNumber of potential outliers detected: {outlier_count}")

    # 6. Skewness analysis
    skewness = numeric.skew()
    print("\nSkewness of features:")
    print(skewness)

    # 7. Feature-to-feature relationships
    # Cap the sample at the row count: DataFrame.sample raises when asked for
    # more rows than exist (without replacement).
    n_rows = min(sample_size, len(numeric))
    sns.pairplot(numeric.sample(n_rows), diag_kind='kde',
                 plot_kws={'alpha': 0.2})
    plt.show()

# Produce the full noise report for the DataFrame loaded above.
assess_noise(df=df)


# KeyboardInterrupt  (notebook output; presumably the cell run above was interrupted manually)

