In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from src.data.load_data import load_dataset
from src.data.preprocess import pad_dataset
from src.data.feature_engineering import extract_features_dataset

# Step 1: read the raw dataset. load_dataset returns two parallel collections
# (samples and their labels), unpacked here as data_list / labels_list.
data_list, labels_list = load_dataset('../data/raw')

# Step 2: run the project preprocessing and feature extraction.
# NOTE(review): pad_dataset presumably brings every sample to a common
# length -- confirm against src.data.preprocess.
padded_list = pad_dataset(data_list)
features_df = extract_features_dataset(padded_list)

# Step 3: wrap the labels in a two-column frame (ENR and CIP).
label_columns = ['ENR', 'CIP']
labels_df = pd.DataFrame(labels_list, columns=label_columns)

# Step 4: place features and labels side by side. Column-wise concat aligns
# rows on the index.
# NOTE(review): assumes features_df carries the same default integer index
# as labels_df -- confirm, otherwise rows will be misaligned or NaN-padded.
full_df = pd.concat([features_df, labels_df], axis='columns')
full_df.head()


In [None]:
# Visualise a single sample: the first entry of the padded dataset.
sample_df = padded_list[0]

# Draw the 'X' and 'Y' columns as two labelled lines on one Axes.
fig, ax = plt.subplots(figsize=(10, 4))
for column in ('X', 'Y'):
    ax.plot(sample_df[column], label=column)
ax.set_title('Sample Time Series')
ax.legend()
plt.show()


In [None]:
# Scatter the two label columns against each other to see which
# (ENR, CIP) combinations are present in the dataset.
fig, ax = plt.subplots(figsize=(6, 4))
sns.scatterplot(data=labels_df, x='ENR', y='CIP', ax=ax)
ax.set(
    title='Label Distribution (ENR vs CIP)',
    xlabel='ENR Concentration (uM)',
    ylabel='CIP Concentration (uM)',
)
plt.show()


In [None]:
# Pairwise correlation across every column of the combined frame
# (features and labels together), drawn as a heatmap.
fig, ax = plt.subplots(figsize=(12, 8))
corr = full_df.corr()

# center=0 pins the midpoint of the diverging 'coolwarm' palette to zero
# correlation, so positive and negative values get opposite hues.
sns.heatmap(corr, cmap='coolwarm', center=0, ax=ax)
ax.set_title('Feature & Label Correlation Heatmap')
plt.show()


In [None]:
import joblib
import numpy as np

# Load the previously trained model from disk.
# joblib.load unpickles the file, so only load model files you trust.
model = joblib.load('../outputs/models/random_forest_model.pkl')

# Original author's note (translated): the random forest yields one
# importance vector per target, so the two are averaged element-wise.
# NOTE(review): this averages over whatever model.estimators_ holds --
# confirm the saved model is a per-target wrapper rather than a single
# forest, whose estimators_ would be the individual trees.
importances = np.asarray(
    [est.feature_importances_ for est in model.estimators_]
).mean(axis=0)

# Pair each feature column name with its averaged importance, most
# important first.
feat_names = features_df.columns
feat_importance_df = (
    pd.DataFrame({'Feature': feat_names, 'Importance': importances})
    .sort_values(by='Importance', ascending=False)
)

# Horizontal bar chart: one bar per feature, ordered by importance.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x='Importance', y='Feature', data=feat_importance_df, ax=ax)
ax.set_title('Feature Importance (Random Forest)')
plt.show()
