In [None]:
# 1. Import Libraries
import scipy.io
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.manifold import TSNE
import warnings
warnings.filterwarnings('ignore')

# 2. Load Data
# loadmat returns a dict-like mapping of variable names to loaded values.
mat = scipy.io.loadmat('activeinspection_recipe1.mat')
print("Data keys:", mat.keys())

# Report type and shape of every entry whose name does not start with '__'.
# Example: Common keys ['X', 'y', 'feature_names']
for k, value in mat.items():
    if k.startswith('__'):
        continue
    print(f"Key: {k} / Type: {type(value)} / Shape: {np.shape(value)}")

# -- ADAPT TO YOUR ACTUAL KEY NAMES BELOW --
# Feature matrix as a DataFrame, labels flattened to a 1-D Series.
X = pd.DataFrame(mat['X'])
y = pd.Series(mat['y'].flatten())
# Each flattened entry is indexed at [0] and converted to str
# (presumably a MATLAB cell array of names -- confirm against the file).
raw_names = mat['feature_names'].flatten()
feature_names = [str(entry[0]) for entry in raw_names]
X.columns = feature_names

# 3. Basic Info
# Row count, column count and a preview of the first ten column labels.
print("\n=== BASIC DATA INFO ===")
print(f"Number of samples: {len(X.index)}")
print(f"Number of features: {len(X.columns)}")
print(f"Feature names (first 10): {X.columns[:10].tolist()}")

# 4. Missing Values
# Per-column count of null cells; only columns with at least one are listed.
print("\n=== MISSING VALUE CHECK ===")
missing = X.isna().sum()
if (missing > 0).any():
    print(missing.loc[missing > 0])
else:
    print("No missing values detected.")

# 5. Basic Descriptive Statistics
# Transposed so each feature is a row and each statistic a column.
# `display` is the IPython/Jupyter rich-output helper.
print("\n=== SUMMARY STATISTICS ===")
display(X.describe().transpose())

# 6. Imbalance Check (Binary Classification)
print("\n=== TARGET LABEL DISTRIBUTION (Imbalance) ===")
# Count of samples per label value; label 1 is treated as "faulty".
class_dist = y.value_counts()
print(class_dist)
# Share of samples labelled 1 (0 when the label never occurs).
faulty_rate = class_dist.get(1, 0) / len(y)
print(f"Faulty (1) rate: {faulty_rate:.5f}")

# Bar chart of the label counts.
plt.figure(figsize=(4, 3))
label_ax = sns.countplot(x=y)
label_ax.set_title("Label Distribution: Faulty (1) vs Normal (0)")
label_ax.set_xlabel("Fault Label")
label_ax.set_ylabel("Count")
plt.show()

# Flag the dataset when fewer than 10% of samples carry label 1.
if faulty_rate < 0.1:
    print("WARNING: This is a highly imbalanced dataset (rare faults, <10%).")

# 7. Feature Distribution Overview (Histograms)
# One histogram (with KDE overlay) per feature, arranged on a grid that is
# n_cols panels wide; n_rows is derived from the number of features.
print("\n=== FEATURE DISTRIBUTIONS ===")
n_cols = 4
n_rows = int(np.ceil(X.shape[1] / n_cols))
plt.figure(figsize=(18, 4 * n_rows))
for i, col in enumerate(X.columns):
    plt.subplot(n_rows, n_cols, i + 1)
    sns.histplot(X[col], bins=30, kde=True, color='steelblue')
    plt.title(col)
# Lay the grid out once, after every panel exists. Calling tight_layout()
# inside the loop re-solves the layout for all existing axes on every
# iteration, which gets slow with many features and yields the same result.
plt.tight_layout()
plt.suptitle('Feature Histograms', y=1.02, fontsize=16)
plt.show()

# 8. Feature Distribution by Class (KDE)
# Overlays the density of each feature for label 0 ("Normal") and label 1
# ("Faulty") on the same grid shape used for the histograms.
print("\n=== FEATURE DISTRIBUTION BY CLASS ===")
# The class masks do not depend on the feature, so build them once.
normal_mask = y == 0
faulty_mask = y == 1
plt.figure(figsize=(18, 3 * n_rows))
for i, col in enumerate(X.columns):
    plt.subplot(n_rows, n_cols, i + 1)
    # Title first, so a panel whose density cannot be drawn is still labelled.
    plt.title(col)
    try:
        sns.kdeplot(X.loc[normal_mask, col], label='Normal', fill=True, lw=1)
        sns.kdeplot(X.loc[faulty_mask, col], label='Faulty', fill=True, color='r', lw=1)
    except Exception as exc:
        # Best effort: keep plotting the remaining features, but say which
        # one was skipped and why instead of failing silently. Catching
        # Exception (not a bare except) lets Ctrl-C interrupt the loop.
        print(f"Skipping KDE for feature {col!r}: {type(exc).__name__}: {exc}")
        continue
    if i == 0:
        plt.legend()
plt.suptitle('Feature Distributions by Label', y=1.02, fontsize=16)
plt.tight_layout()
plt.show()

# 9. Correlation Matrix (Detect Multicollinearity)
# Pairwise feature-to-feature correlation drawn as a heatmap with a
# diverging palette centred on zero.
print("\n=== FEATURE CORRELATION MATRIX ===")
plt.figure(figsize=(12, 10))
corr = X.corr()
heatmap_ax = sns.heatmap(
    corr,
    cmap='coolwarm',
    center=0,
    square=True,
    cbar=True,
)
heatmap_ax.set_title("Correlation Matrix of Features")
plt.show()

# 10. Fault Ratio Highlight
# Fraction of samples labelled 1; a missing label counts as zero faults.
faulty_count = class_dist.get(1, 0)
fault_ratio = faulty_count / len(y)
print(f"\nProportion of Faulty samples: {fault_ratio:.5f}")
# Below 1% the imbalance is called out explicitly.
is_extreme_imbalance = fault_ratio < 0.01
if is_extreme_imbalance:
    print("Note: Extreme class imbalance. Requires special handling in modeling (e.g. sampling, loss adjustment, metrics).")

# 11. Top Features Most Correlated with Fault Label
# Ranks features by the absolute Pearson correlation between each column
# and the label, then prints and plots the ten strongest.
print("\n=== TOP FEATURES CORRELATED WITH FAULT LABEL ===")
# corrwith() correlates every column with y using only the rows where both
# values are present. np.corrcoef returns NaN for any column containing even
# one missing value, which silently removed such features from the ranking.
# NOTE: corrwith aligns on the index; X and y are expected to share it.
corr_with_y = X.corrwith(y)
# Sort by magnitude; undefined correlations (e.g. constant columns) are NaN
# and sort_values places them last.
abs_corr = corr_with_y.abs().sort_values(ascending=False)
top_corr = abs_corr.head(10)
print(top_corr)
plt.figure(figsize=(8, 4))
top_corr.plot(kind='bar')
plt.title("Top 10 Features Most Correlated with Fault")
plt.ylabel("Absolute Pearson Correlation")
plt.show()

# 12. t-SNE Projection (Visualizing High-dimensional Fault Patterns)
# Embeds a random subsample of the rows into 2-D and colours the points by
# label to show how well the two classes separate.
print("\n=== t-SNE VISUALIZATION (Class Separation) ===")
n_samples = min(5000, len(X))  # For speed/memory, sample if necessary
# Seeded generator: the subsample (and therefore the plot) is reproducible,
# consistent with the fixed random_state passed to TSNE below.
rng = np.random.default_rng(42)
idx = rng.choice(len(X), n_samples, replace=False)
# scikit-learn requires perplexity < n_samples; keep the default of 30 for
# normal-sized data and shrink it only when the dataset is tiny.
perplexity = min(30, max(n_samples - 1, 1))
tsne = TSNE(n_components=2, random_state=42, perplexity=perplexity)
X_2d = tsne.fit_transform(X.iloc[idx])
# Labels of the sampled rows as a plain array, so the boolean masks index
# the embedding positionally; computed once and reused for both axes.
sampled_labels = y.iloc[idx].to_numpy()
is_normal = sampled_labels == 0
is_faulty = sampled_labels == 1
plt.figure(figsize=(8, 6))
plt.scatter(X_2d[is_normal, 0], X_2d[is_normal, 1], alpha=0.5, label='Normal', s=7)
plt.scatter(X_2d[is_faulty, 0], X_2d[is_faulty, 1], alpha=0.7, label='Faulty', s=16, color='r')
plt.legend()
plt.title("t-SNE Projection: Faulty vs Normal")
plt.show()

# 13. Outlier/Extreme Value Check
# For every feature, count the values lying strictly below its own 1st
# percentile or strictly above its own 99th percentile.
# NOTE(review): because the cut-offs are each column's own percentiles, the
# count is bounded by roughly 2% of the rows for every feature, so the
# ranking mostly reflects ties at the cut-offs -- confirm this is intended.
print("\n=== OUTLIER CHECK (Extreme Values) ===")


def _count_tail_values(series):
    """Return how many values fall outside the series' 1%-99% quantile band."""
    low_cut = series.quantile(0.01)
    high_cut = series.quantile(0.99)
    in_tail = series.lt(low_cut) | series.gt(high_cut)
    return in_tail.sum()


outlier_summary = X.apply(_count_tail_values)
most_extreme = outlier_summary.sort_values(ascending=False).head(10)
print("Top 10 features with most extreme outliers:\n", most_extreme)

# 14. Additional Notes for Semiconductor Manufacturing Data
# Static reminder text printed at the end of the analysis; the three
# sentences are joined with newlines into a single message.
print("\n--- DOMAIN NOTE ---")
print("\n".join([
    "This dataset comes from semiconductor manufacturing, where fault rates are extremely low but costly.",
    "Data is high-dimensional and prone to noise, multicollinearity, and outlier contamination due to sensor variance and process complexity.",
    "Serious class imbalance and rare faults are expected; special strategies are necessary in modeling.",
]))

print("\n[EDA COMPLETED]")

ModuleNotFoundError: No module named 'scipy'