In [1]:
import pandas as pd
import os
import numpy as np
from sklearn.preprocessing import StandardScaler
from collections import defaultdict
import matplotlib.pyplot as plt
import seaborn as sns
import random
from sklearn.decomposition import PCA


In [2]:
ingredient_to_category = {
    # Nuts
    "peanuts": "Nuts",
    "cashew": "Nuts",
    "chestnuts": "Nuts",
    "pistachios": "Nuts",
    "almond": "Nuts",
    "hazelnut": "Nuts",
    "walnuts": "Nuts",
    "pecans": "Nuts",
    "brazil_nut": "Nuts",
    "pili_nut": "Nuts",
    
    # Spices
    "cumin": "Spices",
    "star_anise": "Spices",
    "nutmeg": "Spices",
    "cloves": "Spices",
    "ginger": "Spices",
    "allspice": "Spices",
    "chervil": "Spices",
    "mustard": "Spices",
    "cinnamon": "Spices",
    "saffron": "Spices",
    
    # Herbs
    "angelica": "Herbs",
    "garlic": "Herbs",
    "chives": "Herbs",
    "turnip": "Herbs",
    "dill": "Herbs",
    "mugwort": "Herbs",
    "chamomile": "Herbs",
    "coriander": "Herbs",
    "oregano": "Herbs",
    "mint": "Herbs",
    
    # Fruits
    "kiwi": "Fruits",
    "pineapple": "Fruits",
    "banana": "Fruits",
    "lemon": "Fruits",
    "mandarin_orange": "Fruits",
    "strawberry": "Fruits",
    "apple": "Fruits",
    "mango": "Fruits",
    "peach": "Fruits",
    "pear": "Fruits",
    
    # Vegetables
    "cauliflower": "Vegetables",
    "brussel_sprouts": "Vegetables",
    "broccoli": "Vegetables",
    "sweet_potato": "Vegetables",
    "asparagus": "Vegetables",
    "avocado": "Vegetables",
    "radish": "Vegetables",
    "tomato": "Vegetables",
    "potato": "Vegetables",
    "cabbage": "Vegetables",
}


In [4]:
training_data = defaultdict(list)
testing_data = defaultdict(list)
real_testing_data = defaultdict(list)

training_path = "/home/dewei/workspace/smell-net/training"
testing_path = "/home/dewei/workspace/smell-net/testing"
# real_testing_path = "/home/dewei/workspace/smell-net/processed_real_time_testing"
max_len = 0  # Track minimum length across all series
num_data = 0

# Walk through the training directory
for folder_name in os.listdir(training_path):
    folder_path = os.path.join(training_path, folder_name)
    
    if os.path.isdir(folder_path):  # Make sure it's a folder
        for filename in os.listdir(folder_path):
            if filename.endswith(".csv"):
                cur_path = os.path.join(folder_path, filename)
                df = pd.read_csv(cur_path)
                training_data[folder_name].append(df)
                max_len = max(max_len, df.shape[0])  # Update minimum length
                num_data += df.shape[0]

for folder_name in os.listdir(testing_path):
    folder_path = os.path.join(testing_path, folder_name)
    
    if os.path.isdir(folder_path):  # Make sure it's a folder
        for filename in os.listdir(folder_path):
            if filename.endswith(".csv"):
                cur_path = os.path.join(folder_path, filename)
                df = pd.read_csv(cur_path)
                testing_data[folder_name].append(df)
                max_len = max(max_len, df.shape[0])  # Update minimum length
                num_data += df.shape[0]

# for folder_name in os.listdir(real_testing_path):
#     folder_path = os.path.join(real_testing_path, folder_name)
    
#     if os.path.isdir(folder_path):  # Make sure it's a folder
#         for filename in os.listdir(folder_path):
#             if filename.endswith(".csv"):
#                 cur_path = os.path.join(folder_path, filename)
#                 df = pd.read_csv(cur_path)
#                 real_testing_data[folder_name].append(df)
#                 min_len = min(min_len, df.shape[0])  # Update minimum length

In [4]:
print(num_data)

180718


In [5]:
aggregated_training = []
aggregated_testing = []

# Aggregate training data
for ingredient, dfs in training_data.items():
    for i, df in enumerate(dfs):
        df = df.copy()  # Safe copy
        df['ingredient'] = ingredient
        df['file_id'] = f"{ingredient}_train_{i}"
        df['time_step'] = range(len(df))
        aggregated_training.append(df)

# Aggregate testing data
for ingredient, dfs in testing_data.items():
    for i, df in enumerate(dfs):
        df = df.copy()
        df['ingredient'] = ingredient
        df['file_id'] = f"{ingredient}_test_{i}"
        df['time_step'] = range(len(df))
        aggregated_testing.append(df)

# Concatenate into final DataFrames
aggregated_training = pd.concat(aggregated_training, ignore_index=True)
aggregated_testing = pd.concat(aggregated_testing, ignore_index=True)

# Map the ingredient to category
aggregated_training['category'] = aggregated_training['ingredient'].map(ingredient_to_category)
aggregated_testing['category'] = aggregated_testing['ingredient'].map(ingredient_to_category)

# Check a few examples
print(aggregated_training[['ingredient', 'category']].drop_duplicates().head())

       ingredient    category
0          ginger      Spices
2976   brazil_nut        Nuts
5983      walnuts        Nuts
9263      oregano       Herbs
12216     cabbage  Vegetables


In [7]:
# Select only the sensor columns (exclude ingredient, file_id, time_step)
sensor_columns = [
    'NO2', 'C2H5OH', 'VOC', 'CO', 'Alcohol', 'LPG', 'Benzene',
    'Temperature', 'Pressure', 'Humidity', 'Gas_Resistance', 'Altitude'
]

# 1. Overall summary (across all training samples)
training_summary = aggregated_training[sensor_columns].describe().round(3)
testing_summary = aggregated_testing[sensor_columns].describe().round(3)

print("Training Data Summary:")
print(training_summary)

print("\nTesting Data Summary:")
print(testing_summary)


Training Data Summary:
              NO2      C2H5OH         VOC          CO     Alcohol         LPG  \
count  150711.000  150711.000  150711.000  150711.000  150711.000  150711.000   
mean       97.795     138.328     195.943     792.938       3.407      30.333   
std       118.680     143.402     196.188      61.667       3.279      38.286   
min        13.000      39.000      26.000     705.000       0.000       2.000   
25%        35.000      65.000      73.000     750.000       1.000      14.000   
50%        46.000      77.000     106.000     776.000       2.000      23.000   
75%       105.000     140.000     232.500     820.000       5.000      32.000   
max       753.000     863.000     953.000    1006.000      42.000     507.000   

            Benzene  Temperature    Pressure    Humidity  Gas_Resistance  \
count  1.507110e+05   150711.000  150711.000  150711.000      150711.000   
mean   1.392784e+09       27.135     949.109      46.610         220.891   
std    2.010508e+09

In [None]:
# 2. Group by ingredient and compute mean and std for each feature
training_grouped_stats = aggregated_training.groupby('ingredient')[sensor_columns].agg(['mean', 'std']).round(3)
testing_grouped_stats = aggregated_testing.groupby('ingredient')[sensor_columns].agg(['mean', 'std']).round(3)

print("Training Data Grouped Statistics (per ingredient):")
print(training_grouped_stats)

print("\nTesting Data Grouped Statistics (per ingredient):")
print(testing_grouped_stats)

In [None]:
training_summary.to_csv("/home/dewei/workspace/SmellNet/data_stats/training_summary.csv")
testing_summary.to_csv("/home/dewei/workspace/SmellNet/data_stats/testing_summary.csv")
training_grouped_stats.to_csv("/home/dewei/workspace/SmellNet/data_stats/training_grouped_stats.csv")
testing_grouped_stats.to_csv("/home/dewei/workspace/SmellNet/data_stats/testing_grouped_stats.csv")

In [None]:
# Set a style for prettier plots
sns.set_theme(style="whitegrid")

# Plot distributions for training data
for feature in sensor_columns:
    plt.figure(figsize=(8, 5))
    sns.histplot(aggregated_training[feature], kde=True, bins=50, color='skyblue')
    plt.title(f"Training Data: Distribution of {feature}")
    plt.xlabel(feature)
    plt.ylabel("Frequency")
    plt.tight_layout()
    plt.show()

In [None]:
for feature in sensor_columns:
    plt.figure(figsize=(8, 5))
    sns.kdeplot(aggregated_training[feature], label="Training", fill=True, alpha=0.5)
    sns.kdeplot(aggregated_testing[feature], label="Testing", fill=True, alpha=0.5)
    plt.title(f"Distribution of {feature}: Training vs Testing")
    plt.xlabel(feature)
    plt.ylabel("Density")
    plt.legend()
    plt.tight_layout()
    plt.show()

In [None]:
for feature in sensor_columns:
    plt.figure(figsize=(8, 5))
    sns.kdeplot(aggregated_training[feature], label="Training", fill=True, alpha=0.5)
    sns.kdeplot(aggregated_testing[feature], label="Testing", fill=True, alpha=0.5)
    plt.title(f"Distribution of {feature}: Training vs Testing")
    plt.xlabel(feature)
    plt.ylabel("Density")
    plt.legend()
    plt.tight_layout()
    plt.savefig(f"/Users/derre/Documents/workspace/SmellNet/data_stats/feature_distribution_train_vs_test_{feature}.png", dpi=300)
    plt.close()

In [None]:
training_category_stats = aggregated_training.groupby('category')[sensor_columns].agg(['mean', 'std']).round(3)
testing_category_stats = aggregated_testing.groupby('category')[sensor_columns].agg(['mean', 'std']).round(3)

sns.set_theme(style="whitegrid")

for feature in sensor_columns:
    # Prepare data
    means = training_category_stats[feature]['mean']
    stds = training_category_stats[feature]['std']
    
    categories = means.index.tolist()
    mean_values = means.values
    std_values = stds.values
    
    # Create plot
    plt.figure(figsize=(10, 6))
    sns.barplot(
        x=categories, 
        y=mean_values, 
        palette="muted"
    )

    # Add error bars manually
    plt.errorbar(
        x=range(len(categories)), 
        y=mean_values, 
        yerr=std_values, 
        fmt='none', 
        c='black', 
        capsize=5
    )

    plt.title(f"Mean {feature} per Category (Training Data)", fontsize=16)
    plt.xlabel("Category", fontsize=14)
    plt.ylabel(f"Mean {feature} Value", fontsize=14)
    plt.xticks(rotation=45, ha="right")
    plt.tight_layout()
    plt.savefig(f"/Users/derre/Documents/workspace/smell-net/data_stats/feature_distributions_category/{feature}_mean_per_category.png", dpi=300)
    plt.close()

In [None]:
sns.set_theme(style="whitegrid")

for feature in sensor_columns:
    plt.figure(figsize=(10, 6))
    
    # Plot one KDE per category
    for category in aggregated_training['category'].dropna().unique():
        subset = aggregated_training[aggregated_training['category'] == category]
        sns.kdeplot(
            subset[feature], 
            label=category, 
            fill=True, 
            alpha=0.3
        )
    
    plt.title(f"Training Data: {feature} Distribution by Category", fontsize=16)
    plt.xlabel(feature, fontsize=14)
    plt.ylabel("Density", fontsize=14)
    plt.legend(title="Category", fontsize=10)
    plt.tight_layout()
    plt.savefig(f"/home/dewei/workspace/smell-net/data_stats/feature_distributions_category/{feature}_kde_by_category.png", dpi=300)
    plt.close()

In [None]:
sampled_files = ["angelica_test_0", "mint_test_0"]

In [42]:
sampled_files = ["angelica_test_0", "mint_test_0"]
plot_columns = ["NO2"]

for file_id in sampled_files:
    df = aggregated_testing[aggregated_testing['file_id'] == file_id].copy()

    for col in plot_columns:
        # Wider figure for better aspect ratio
        fig, ax = plt.subplots(figsize=(8, 4))

        ax.plot(df['time_step'], df[col], color='steelblue')

        # Titles and labels
        ax.set_title(f"NO2 over Time", fontsize=22, fontweight='bold')
        ax.set_xlabel("Time Step", fontsize=20)
        ax.set_ylabel(col, fontsize=20)
        ax.tick_params(axis='both', which='major', labelsize=16)
        ax.grid(True)

        plt.tight_layout()
        plt.savefig(f"/home/dewei/workspace/smell-net/data_stats/time_series_analysis/{file_id}_{col}_timeseries.png", dpi=300)
        plt.close()

In [12]:
# 1. Compute correlation matrix
correlation_matrix = aggregated_training[sensor_columns].corr()

# 2. Plot the correlation matrix (larger and bolder)
plt.figure(figsize=(16, 14))  # Larger figure

sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
    vmin=-1, vmax=1,
    square=True,
    cbar_kws={"shrink": 0.75},
    annot_kws={"size": 20}  # Larger annotation text
)

# Bold and large title
plt.title("Sensor Feature Correlation Matrix", fontsize=30, fontweight='bold')

# Bigger and readable tick labels
plt.xticks(rotation=45, ha='right', fontsize=25)
plt.yticks(rotation=45, fontsize=25)

plt.tight_layout(pad=2.0)
plt.savefig("/home/dewei/workspace/SmellNet/data_stats/feature_correlation.png", dpi=300)
plt.close()


In [8]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

# === Prepare Sensor Data ===
X_raw = aggregated_training[sensor_columns]
y_category = aggregated_training['category']

# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_raw)

# === PCA Transformation ===
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)
explained_var = pca.explained_variance_ratio_
print(f"Explained variance ratios: {explained_var}")

# === Create DataFrame for Visualization ===
pca_df = pd.DataFrame({
    'PC1': X_pca[:, 0],
    'PC2': X_pca[:, 1],
    'Category': y_category
})

# === Plotting ===
sns.set(style="whitegrid", context="notebook")  # 'notebook' = larger fonts
custom_palette = ["#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#9467bd"]

plt.figure(figsize=(12, 8))
plot = sns.scatterplot(
    data=pca_df,
    x='PC1', y='PC2',
    hue='Category',
    palette=custom_palette,
    s=8,             # larger dots
    alpha=0.6,
    linewidth=0
)

# Axis labels with explained variance
plt.xlabel(f"PC1 ({explained_var[0]*100:.1f}% variance)", fontsize=25)
plt.ylabel(f"PC2 ({explained_var[1]*100:.1f}% variance)", fontsize=25)
plt.title("PCA of Sensor Data", fontsize=33, fontweight='bold')

# Tick label fonts
plt.xticks(fontsize=18)
plt.yticks(fontsize=18)

# Legend styling
plt.legend(
    title="Ingredient Category",
    title_fontsize=25,
    fontsize=25,
    bbox_to_anchor=(1.05, 1),
    loc='upper left',
    handletextpad=0.4,
    borderaxespad=0.2,
    labelspacing=0.8,
    handlelength=2.5,
    markerscale=5   # Increase dot size in legend
)

# Save figure
plt.tight_layout()
plt.savefig("/home/dewei/workspace/smell-net/data_stats/PCA_sensor_data_category.png", dpi=300)
plt.close()


Explained variance ratios: [0.39145314 0.34491947]


In [10]:
# Get feature contributions (loadings) for PC1 and PC2
loadings = pd.DataFrame(pca.components_.T,  # transpose to get features as rows
                        columns=['PC1', 'PC2'],
                        index=sensor_columns)

# Compute magnitude of contribution (Euclidean norm)
loadings['Magnitude'] = (loadings[['PC1', 'PC2']]**2).sum(axis=1)**0.5

# Sort features by magnitude
top_features = loadings.sort_values('Magnitude', ascending=False)
print("Feature Contributions:")
print(top_features)


Feature Contributions:
                     PC1       PC2  Magnitude
C2H5OH         -0.001529  0.463920   0.463922
VOC             0.002555  0.461955   0.461962
NO2            -0.000228  0.457690   0.457690
Pressure        0.452413  0.009620   0.452515
Altitude       -0.452181 -0.010113   0.452294
Humidity       -0.452143  0.001751   0.452146
Temperature    -0.449991 -0.010153   0.450106
Gas_Resistance  0.333414 -0.241611   0.411753
CO              0.017569  0.396739   0.397128
Benzene         0.056469  0.339980   0.344638
Alcohol         0.229455  0.100311   0.250423
LPG             0.128875  0.142016   0.191774


In [22]:
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Seaborn style for large, clear visuals
sns.set(style="whitegrid", context="notebook", font_scale=1.4)

# Output directory
output_dir = "/home/dewei/workspace/smell-net/data_stats/pca_by_category_colored"
os.makedirs(output_dir, exist_ok=True)

# Iterate over each category
for category in aggregated_training['category'].unique():
    group_df = aggregated_training[aggregated_training['category'] == category]
    X = group_df[sensor_columns]

    # Standardize
    X_scaled = StandardScaler().fit_transform(X)

    # PCA
    pca = PCA(n_components=2)
    X_pca = pca.fit_transform(X_scaled)
    explained_var = pca.explained_variance_ratio_

    # Build PCA DataFrame
    pca_df = pd.DataFrame({
        'PC1': X_pca[:, 0],
        'PC2': X_pca[:, 1],
        'Ingredient': group_df['ingredient'].values
    })

    # Create figure
    fig, ax = plt.subplots(figsize=(12, 8))

    # Scatter plot with larger, semi-transparent dots
    sns.scatterplot(
        data=pca_df,
        x='PC1', y='PC2',
        hue='Ingredient',
        palette='tab20',
        s=8,
        alpha=0.7,
        linewidth=0,
        ax=ax
    )

    # Axis and title styling
    ax.set_title(f"{category} PCA", fontsize=33, fontweight='bold')
    ax.set_xlabel(f"PC1 ({explained_var[0]*100:.1f}% variance)", fontsize=25)
    ax.set_ylabel(f"PC2 ({explained_var[1]*100:.1f}% variance)", fontsize=25)
    ax.tick_params(axis='both', labelsize=16)

    # Adjust legend: bold text and bigger markers
    plt.legend(
        title="Ingredient",
        title_fontsize=25,
        fontsize=25,
        bbox_to_anchor=(1.05, 1),
        loc='upper left',
        handletextpad=0.4,
        borderaxespad=0.2,
        labelspacing=0.8,
        handlelength=2.5,
        markerscale=5   # Increase dot size in legend
    )

    # Save plot
    plt.tight_layout()
    plt.savefig(f"{output_dir}/PCA_{category.replace(' ', '_')}_by_ingredient.png", dpi=300)
    plt.close()


In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

# Set up the grid for subplots
categories = aggregated_training['category'].unique()
n_categories = len(categories)

n_cols = 5  # You can change this
n_rows = (n_categories + n_cols - 1) // n_cols  # Ceiling division

plt.figure(figsize=(4 * n_cols, 4 * n_rows))

for i, category in enumerate(categories):
    plt.subplot(n_rows, n_cols, i + 1)
    
    # Select data for this category
    group_df = aggregated_training[aggregated_training['category'] == category]
    X = group_df[sensor_columns]
    
    # Standardize
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    # PCA
    pca = PCA(n_components=2)
    X_pca = pca.fit_transform(X_scaled)
    explained = pca.explained_variance_ratio_
    
    # Scatter plot
    plt.scatter(X_pca[:, 0], X_pca[:, 1], s=5, alpha=0.6)
    
    # Title with explained variance
    plt.title(f"{category}\n({explained[0]*100:.1f}%, {explained[1]*100:.1f}%)", fontsize=10)
    
    # Remove ticks but keep axis labels
    plt.xticks([])
    plt.yticks([])
    plt.xlabel('PC1', fontsize=8)
    plt.ylabel('PC2', fontsize=8)

plt.tight_layout()
plt.show()


In [None]:
missing_category_rows = aggregated_training[aggregated_training['category'].isna()]

# Display them
print(missing_category_rows)

# Optional: check how many
print(f"Number of rows with missing category: {len(missing_category_rows)}")

In [12]:
labels = ['Cashew', 'Peanuts', 'Pecans', 'Walnuts', 'Pili Nut']
cos_similarities = [0.7176, 0.5056, 0.4912, 0.3945, 0.3466]
colors = ['midnightblue'] + ['royalblue'] * 4

plt.figure(figsize=(6, 4.5))  # Match aspect ratio to 4:3

bars = plt.bar(labels, cos_similarities, color=colors, edgecolor='black', linewidth=1.2)

for bar, sim in zip(bars, cos_similarities):
    plt.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.015, 
             f"{sim:.2f}", ha='center', va='bottom', fontsize=14)

plt.title("Top-5 Predicted Ingredients", fontsize=20, fontweight='bold')
plt.ylabel("Cosine Similarity", fontsize=20)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
plt.ylim(0, max(cos_similarities) + 0.1)
plt.grid(axis='y', linestyle='--', alpha=0.5)
plt.tight_layout()

plt.savefig("model_predictions.png", dpi=300)
plt.close()