In [None]:
!pip install numpy pandas matplotlib seaborn scikit-learn xgboost tensorflow


: 

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV, cross_val_score
from sklearn.utils.class_weight import compute_class_weight
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from tensorflow.keras.layers import Input, Conv1D, BatchNormalization, Dropout, LSTM, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.metrics import f1_score, classification_report, confusion_matrix
from sklearn.impute import KNNImputer
from sklearn.preprocessing import RobustScaler, PolynomialFeatures, PowerTransformer
from sklearn.feature_selection import SelectKBest, mutual_info_classif
import logging
import tensorflow as tf

# Configure logging: emit records at INFO level and above, each formatted as
# "<timestamp> - <level name> - <message>".
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

In [None]:
# Load the training and test data
train_series = pd.read_csv('converted_train_series.csv')
train_events = pd.read_csv('train_events.csv')
test_series = pd.read_csv('converted_test_series.csv')

# Attach event annotations to every step of every series; steps without an
# annotation get the placeholder event 'none'.
train_data = pd.merge(train_series, train_events, on=['series_id', 'step'], how='left')
# Assign the result back instead of `fillna(..., inplace=True)` on a column
# selection: the chained in-place form operates on a temporary and is not
# guaranteed to update the frame (it is a no-op under pandas copy-on-write).
train_data['event'] = train_data['event'].fillna('none')

# The step-to-step change in anglez has to be computed while all steps are
# still present, and separately per series. Computing it after the event
# filter would diff between unrelated event rows (and across series), which
# does not match how the same feature is derived for the test data.
train_data['angle_diff'] = train_data.groupby('series_id', sort=False)['anglez'].diff()

# Keep only annotated steps and encode the target: onset -> 1, wakeup -> 0.
# `.copy()` detaches the subset so the column assignments below write to a
# real frame rather than a view of the merged data.
train_data = train_data[train_data['event'].isin(['onset', 'wakeup'])].copy()
train_data['label'] = (train_data['event'] == 'onset').astype(int)

# Feature Engineering (row-wise features; safe to compute after filtering)
train_data['anglez_squared'] = train_data['anglez'] ** 2
train_data['log_enmo'] = np.log1p(train_data['enmo'])

# Feature Selection
features = ['anglez', 'enmo', 'angle_diff', 'anglez_squared', 'log_enmo']
X = train_data[features]
y = train_data['label']

# === VISUALIZATION BEFORE PREPROCESSING ===
import seaborn as sns
import matplotlib.pyplot as plt

# Missing values per feature column
print("Missing values in features:")
print(X.isnull().sum())

# Correlation heatmap of the raw (pre-imputation) features
plt.figure(figsize=(10, 8))
sns.heatmap(X.corr(), annot=True, cmap='coolwarm')
plt.title("Feature Correlation Heatmap")
plt.show()

# Feature distributions: one KDE figure per feature
for feature in features:
    plt.figure(figsize=(8, 6))
    # `fill=True` is the supported spelling; the older `shade=True` keyword is
    # deprecated in seaborn and scheduled for removal.
    sns.kdeplot(train_data[feature], fill=True, color='blue')
    plt.title(f"Distribution of {feature}")
    plt.show()

# Pairplot of all features, coloured by the binary label
sns.pairplot(train_data[features + ['label']], hue='label', palette='viridis')
plt.suptitle("Pairplot of Features", y=1.02)
plt.show()

# === PREPROCESSING ===
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler, RobustScaler, PolynomialFeatures, PowerTransformer
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.model_selection import train_test_split

# === DATA SPLIT ===
# Split BEFORE fitting any preprocessing step. Every transformer below is
# fitted on the training rows only and then applied to the validation rows, so
# no validation statistics (or validation labels, in the case of the supervised
# feature selection) leak into the fitted pipeline.
print("\n=== Step 1: Splitting Data ===")
X_train_raw, X_val_raw, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Imputation
print("\n=== Step 2: Imputation ===")
imputer = KNNImputer(n_neighbors=5)
X_train_imputed = imputer.fit_transform(X_train_raw)
X_val_imputed = imputer.transform(X_val_raw)

# Feature Scaling
print("\n=== Step 3: Scaling ===")
scaler = StandardScaler()  # You can experiment with other scalers like MinMaxScaler
X_train_scaled = scaler.fit_transform(X_train_imputed)
X_val_scaled = scaler.transform(X_val_imputed)

# Power Transformation
print("\n=== Step 4: Power Transformation ===")
transformer = PowerTransformer(method='yeo-johnson')
X_train_transformed = transformer.fit_transform(X_train_scaled)
X_val_transformed = transformer.transform(X_val_scaled)

# Polynomial Feature Expansion (pairwise interaction terms only, no bias column)
print("\n=== Step 5: Polynomial Feature Expansion ===")
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
X_train_poly = poly.fit_transform(X_train_transformed)
X_val_poly = poly.transform(X_val_transformed)

# Feature Selection using Mutual Information (scored against training labels only)
print("\n=== Step 6: Feature Selection ===")
selector = SelectKBest(score_func=mutual_info_classif, k=10)
X_train = selector.fit_transform(X_train_poly, y_train)
X_val = selector.transform(X_val_poly)

# Names of the columns that survived selection, in the column order of X_train
feature_names = poly.get_feature_names_out(features)
selected_features = feature_names[selector.get_support()]
print("Selected features after feature selection:")
print(selected_features)

# Check if any feature has NaN or Inf values after transformation
print("\n=== Step 7: Checking NaN or Inf values ===")
print("Any NaN in X_train:", np.any(np.isnan(X_train)))
print("Any Inf in X_train:", np.any(np.isinf(X_train)))
print("Any NaN in X_val:", np.any(np.isnan(X_val)))
print("Any Inf in X_val:", np.any(np.isinf(X_val)))

# === Further Processing ===
# Optional: Check for multicollinearity if you want to drop highly correlated features
corr_matrix = pd.DataFrame(X_train).corr()
print("\nCorrelation matrix of features:")
print(corr_matrix)


In [None]:
# Function to compute class weights
def compute_weights(y_train):
    """Compute 'balanced' class weights for an imbalanced label vector.

    Args:
        y_train: 1-D array-like of training labels.

    Returns:
        dict mapping each distinct label found in ``y_train`` to its weight.
        For labels {0, 1} this is ``{0: w0, 1: w1}``.
    """
    classes = np.unique(y_train)
    class_weights = compute_class_weight('balanced', classes=classes, y=y_train)
    # Pair every weight with the label it belongs to instead of assuming the
    # labels are exactly 0 and 1: positional indexing raised IndexError when
    # only one class was present and mislabelled the weights for any other
    # label set. `tolist()` yields plain Python scalars as keys.
    return dict(zip(classes.tolist(), class_weights))

# Compute class weights
class_weight_dict = compute_weights(y_train)
print(f"Class weights: {class_weight_dict}")

In [None]:
from sklearn.ensemble import ExtraTreesClassifier


In [None]:
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.metrics import make_scorer, f1_score, accuracy_score
import numpy as np

# Blend of macro-F1 and accuracy used as the model-selection metric
def combined_scorer(y_true, y_pred):
    """Return 0.7 * macro-averaged F1 + 0.3 * accuracy for the given predictions.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.

    Returns:
        The weighted combination as a float; F1 carries the larger weight.
    """
    macro_f1 = f1_score(y_true, y_pred, average='macro')
    accuracy = accuracy_score(y_true, y_pred)
    return 0.7 * macro_f1 + 0.3 * accuracy

# Enhanced function to define models with expanded parameter grids
def define_models(class_weight_dict):
    """Define the candidate classifiers and their hyperparameter search spaces.

    Args:
        class_weight_dict: Mapping with keys 0 and 1 giving the weight of each
            class. It is offered as a ``class_weight`` candidate to the
            scikit-learn estimators, and its 1-to-0 ratio is used as
            XGBoost's ``scale_pos_weight``.

    Returns:
        dict mapping a display name to an ``(estimator, param_distributions)``
        tuple, in the form consumed by ``RandomizedSearchCV``.
    """
    return {
        # NOTE(review): 'gpu_hist' needs a CUDA-enabled XGBoost build and is, as
        # far as I know, deprecated from XGBoost 2.0 in favour of
        # tree_method='hist' with device='cuda' -- confirm against the installed version.
        'XGBoost': (XGBClassifier(random_state=42, eval_metric='logloss', tree_method='gpu_hist'), {
            'n_estimators': [100, 200, 300, 400],
            'max_depth': [3, 5, 7, 9, 11],
            'learning_rate': [0.001, 0.01, 0.05, 0.1, 0.2],
            'subsample': [0.6, 0.7, 0.8, 0.9, 1.0],
            'colsample_bytree': [0.6, 0.7, 0.8, 0.9, 1.0],
            'gamma': [0, 0.1, 0.2, 0.3, 0.4],
            'min_child_weight': [1, 3, 5],
            # Single candidate: weight of class 1 relative to class 0.
            'scale_pos_weight': [class_weight_dict[1]/class_weight_dict[0]]
        }),
        'Logistic Regression': (LogisticRegression(random_state=42, max_iter=5000), {
            'C': np.logspace(-4, 4, 20),
            'penalty': ['l1', 'l2', 'elasticnet'],
            'solver': ['saga'],
            # The original guarded this list with a condition that tested a
            # literal list against itself and was therefore always true; the
            # dead `[None]` alternative has been removed. The values are
            # sampled for every penalty, exactly as before.
            'l1_ratio': [0, 0.25, 0.5, 0.75, 1],
            'class_weight': [class_weight_dict, 'balanced']
        }),
        'Random Forest': (RandomForestClassifier(random_state=42), {
            'n_estimators': [100, 200, 300, 400],
            'max_depth': [None, 10, 20, 30, 50],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4],
            'max_features': ['sqrt', 'log2', 0.5, 0.8],
            'bootstrap': [True, False],
            'class_weight': ['balanced', 'balanced_subsample', class_weight_dict]
        }),
        # probability=True so the fitted SVC exposes predict_proba.
        'SVM': (SVC(probability=True, random_state=42), {
            'C': np.logspace(-2, 3, 20),
            'kernel': ['rbf', 'poly', 'sigmoid'],
            'gamma': ['scale', 'auto'] + list(np.logspace(-3, 1, 10)),
            'class_weight': [class_weight_dict, 'balanced']
        }),
        'ExtraTrees': (ExtraTreesClassifier(random_state=42), {
            'n_estimators': [100, 200, 300],
            'max_depth': [None, 10, 20, 30],
            'min_samples_split': [2, 5, 10],
            'max_features': ['sqrt', 'log2', 0.5, 0.8],
            'class_weight': ['balanced', class_weight_dict]
        })
    }

# Training function: randomized hyperparameter search followed by a 10-fold re-score
def train_models(models, X_train, y_train, cv_splits=15, scoring=make_scorer(combined_scorer), feature_names=None):
    """Tune every candidate model with RandomizedSearchCV and return the winners.

    Args:
        models: dict mapping a display name to ``(estimator, param_distributions)``.
        X_train: Training feature matrix.
        y_train: Training labels.
        cv_splits: Number of stratified folds used inside the randomized search.
        scoring: Scorer passed to both the search and the follow-up
            ``cross_val_score``.
        feature_names: Optional sequence naming the columns of ``X_train``,
            used only when logging feature importances. When omitted, or when
            its length does not match the number of importances, generic
            ``feature_<index>`` names are logged instead.

    Returns:
        dict mapping each display name to the refitted best estimator.
    """
    cv = StratifiedKFold(n_splits=cv_splits, shuffle=True, random_state=42)
    best_models = {}

    for model_name, (model, param_grid) in models.items():
        logging.info(f"\n=== Training {model_name} with {cv_splits}-fold CV ===")

        # Sample 50 hyperparameter settings per model
        search = RandomizedSearchCV(
            model, param_grid,
            n_iter=50,  # Increased from 20
            cv=cv,
            scoring=scoring,
            n_jobs=-1,
            random_state=42,
            verbose=2
        )

        search.fit(X_train, y_train)
        best_estimator = search.best_estimator_
        best_models[model_name] = best_estimator

        # Re-score the winning configuration with a separate 10-fold split
        cv_scores = cross_val_score(
            best_estimator,
            X_train, y_train,
            cv=StratifiedKFold(n_splits=10, shuffle=True, random_state=42),
            scoring=scoring,
            n_jobs=-1
        )

        logging.info(f"Best {model_name} params: {search.best_params_}")
        logging.info(f"10-fold CV Score: {np.mean(cv_scores):.4f} ± {np.std(cv_scores):.4f}")

        # Feature importance for tree-based models
        if hasattr(best_estimator, 'feature_importances_'):
            importances = best_estimator.feature_importances_
            # Importances must be labelled with the columns the model was
            # actually trained on. The previous code zipped them with the
            # global list of raw input features, which has a different length
            # and order than the selected columns, so the logged names were wrong.
            if feature_names is not None and len(feature_names) == len(importances):
                importance_labels = list(feature_names)
            else:
                importance_labels = [f"feature_{idx}" for idx in range(len(importances))]
            top_features = sorted(zip(importance_labels, importances), key=lambda x: x[1], reverse=True)[:5]
            logging.info(f"Top 5 features: {top_features}")

    return best_models

# Usage
class_weight_dict = compute_weights(y_train)

models = define_models(class_weight_dict)
best_models = train_models(models, X_train, y_train, cv_splits=15, feature_names=selected_features)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Score a fitted model on held-out data
def evaluate_model(model, X_val, y_val, is_keras=False):
    """
    Compute accuracy and macro-averaged precision, recall and F1 for a model.

    Args:
        model: Trained model exposing ``predict``.
        X_val: Validation features.
        y_val: Validation labels.
        is_keras: When True, the output of ``predict`` is thresholded at 0.5,
            cast to int and flattened before scoring.

    Returns:
        Dictionary with keys 'Accuracy', 'Precision', 'Recall' and 'F1-Score'.
    """
    y_pred = model.predict(X_val)
    if is_keras:
        # Turn predicted probabilities into hard 0/1 labels.
        y_pred = (y_pred > 0.5).astype(int).flatten()

    macro = {'average': 'macro'}
    return {
        'Accuracy': accuracy_score(y_val, y_pred),
        'Precision': precision_score(y_val, y_pred, **macro),
        'Recall': recall_score(y_val, y_pred, **macro),
        'F1-Score': f1_score(y_val, y_pred, **macro),
    }

# Evaluate every tuned model on the validation split and keep the one with the
# highest macro F1-score.
if not best_models:
    logging.warning("No models were successfully trained.")
else:
    # Per-model validation metrics, keyed by display name
    evaluation_results = {}

    for name, model in best_models.items():
        is_keras = isinstance(model, tf.keras.Model)
        if is_keras:
            # NOTE(review): X_val_seq / y_val_seq are not defined in the visible
            # cells; this path is only reached for Keras models.
            val_features, val_labels = X_val_seq, y_val_seq.flatten()
        else:
            val_features, val_labels = X_val, y_val
        metrics = evaluate_model(model, val_features, val_labels, is_keras=is_keras)

        evaluation_results[name] = metrics

        # Log the four metrics for the current model
        logging.info(f"\n=== Evaluation Results for {name} ===")
        for metric_name in ('Accuracy', 'Precision', 'Recall', 'F1-Score'):
            logging.info(f"{metric_name}: {metrics[metric_name]:.4f}")

    # Highest F1-score wins; on ties the first model in iteration order is kept
    final_model_name = max(evaluation_results, key=lambda candidate: evaluation_results[candidate]['F1-Score'])
    best_metrics = evaluation_results[final_model_name]
    final_model = best_models[final_model_name]

    # Log the chosen model and its metrics
    logging.info("\n=== Selected Best Model ===")
    logging.info(f"Model: {final_model.__class__.__name__}")
    for metric_name in ('Accuracy', 'Precision', 'Recall', 'F1-Score'):
        logging.info(f"{metric_name}: {best_metrics[metric_name]:.4f}")

In [None]:
# Prepare predictions for submission
# Rebuild the engineered features on the test series.
test_data = test_series.copy()
# Diff within each series so the first step of a series is not compared with
# the last step of the previous one (the resulting NaN is imputed below).
test_data['angle_diff'] = test_data.groupby('series_id', sort=False)['anglez'].diff()
test_data['anglez_squared'] = test_data['anglez'] ** 2
test_data['log_enmo'] = np.log1p(test_data['enmo'])
X_test = test_data[features]

# Apply the fitted preprocessing steps in the same order used for training:
# impute -> scale -> power-transform -> polynomial expansion -> selection.
# The power transform was previously skipped here, so the model received test
# features in a different space from the one it was trained on.
X_test_imputed = imputer.transform(X_test)
X_test_scaled = scaler.transform(X_test_imputed)
X_test_transformed = transformer.transform(X_test_scaled)
X_test_poly = poly.transform(X_test_transformed)
X_test_selected = selector.transform(X_test_poly)

# Predict using the best model
if isinstance(final_model, tf.keras.Model):
    # NOTE(review): create_sequences and sequence_length are not defined in the
    # visible cells; this branch is only reached when a Keras model was selected.
    X_test_seq, _ = create_sequences(X_test_selected, np.zeros(len(X_test_selected)), sequence_length)
    test_predictions = (final_model.predict(X_test_seq) > 0.5).astype(int).flatten()
else:
    test_predictions = final_model.predict(X_test_selected)

# Confidence score: highest class probability per row when the model exposes
# predict_proba, otherwise a constant 0.5. The row-wise maximum is taken with a
# vectorised call instead of a Python-level loop over every test row.
if hasattr(final_model, 'predict_proba'):
    scores = final_model.predict_proba(X_test_selected).max(axis=1)
else:
    scores = [0.5] * len(test_series)

# Prepare the submission DataFrame
submission = pd.DataFrame({
    'row_id': range(len(test_series)),
    'series_id': test_series['series_id'],
    'step': test_series['step'],
    'event': ['onset' if pred == 1 else 'wakeup' if pred == 0 else 'none' for pred in test_predictions],
    'score': scores
})

# Save the submission file
submission.to_csv('submission_final.csv', index=False)
print("Submission file created: submission_final.csv")

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# === Step 1: Load Submission File ===
# Reads the file written by the submission cell ('submission_final.csv').
submission_df = pd.read_csv('submission_final.csv')

# Keep only rows predicted as an actual event
is_event_row = submission_df['event'].isin(['onset', 'wakeup'])
submission_filtered = submission_df[is_event_row]

# === Step 2: Evaluate Confidence Scores ===
print("\n=== Confidence Score Summary ===")

# Summary statistics of the score column, one row per event type
summary_stats = ['mean', 'median', 'min', 'max', 'std']
confidence_summary = submission_filtered.groupby('event')['score'].agg(summary_stats)
print(confidence_summary)

# === Step 3: Visualize Confidence Score Distributions ===
print("\n=== Visualizing Confidence Score Distributions ===")

plt.figure(figsize=(12, 6))

# One histogram panel per event type, side by side: (event, colour, title)
histogram_panels = [
    ('onset', 'blue', 'Confidence Scores for Onset Events'),
    ('wakeup', 'orange', 'Confidence Scores for Wakeup Events'),
]
for panel_position, (event_name, bar_color, panel_title) in enumerate(histogram_panels, start=1):
    plt.subplot(1, 2, panel_position)
    event_scores = submission_filtered.loc[submission_filtered['event'] == event_name, 'score']
    sns.histplot(event_scores, bins=30, kde=True, color=bar_color)
    plt.title(panel_title)
    plt.xlabel('Confidence Score')
    plt.ylabel('Frequency')

plt.tight_layout()
plt.show()

In [None]:
import joblib
import tensorflow as tf

# Write the submission frame to disk (same file name as the submission cell)
submission.to_csv('submission_final.csv', index=False)
print("Submission file created: submission_final.csv")

# Persist the selected model with the serializer that matches its framework
if not isinstance(final_model, tf.keras.Model):
    # Any non-Keras model is dumped with joblib
    joblib.dump(final_model, 'final_sklearn_model.pkl')
    print("scikit-learn model saved as 'final_sklearn_model.pkl'")
else:
    # Keras model: saved to a .h5 file
    final_model.save('final_keras_model.h5')
    print("Keras model saved as 'final_keras_model.h5'")
