# 1. Requirements

1. Install necessary Python packages:
   - pandas
   - matplotlib
   - numpy
   - opencv-python (cv2)
   - tqdm
   - detectron2 (for object detection)
   - torch (for deep learning models)
   - torchvision (for data transformations)
   - efficientnet_pytorch (for feature extraction using EfficientNet)
   - scikit-learn (for data splitting and evaluation)
   
2. Command to install dependencies:

```bash
pip install pandas matplotlib numpy opencv-python tqdm detectron2 torch torchvision efficientnet_pytorch scikit-learn


# 2. Dataset Preparation

```
dataset/
├── arm_flapping/
│   ├── video1.mp4
│   ├── video2.mp4
│   └── ...
├── headbanging/
│   ├── video1.mp4
│   ├── video2.mp4
│   └── ...
└── spinning/
    ├── video1.mp4
    ├── video2.mp4
    └── ...
```


In [None]:
import os
import pandas as pd

def create_csv(dataset_dir, output_csv):
    classes = ['arm_flapping', 'headbanging', 'spinning', 'handaction']
    data = []
    for label, class_name in enumerate(classes):
        class_dir = os.path.join(dataset_dir, class_name)
        for filename in os.listdir(class_dir):
            if filename.endswith('.mp4'):
                filepath = os.path.join(class_dir, filename)
                data.append({'video_path': filepath, 'label': label})
    df = pd.DataFrame(data)
    df.to_csv(output_csv, index=False)

# Usage
dataset_dir = 'ESPD_trimmed_videos'
output_csv = 'dataset_labels.csv'
create_csv(dataset_dir, output_csv)

## Visualizations

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Load the dataset
df = pd.read_csv('dataset_labels.csv')

# Count the number of videos in each class
class_counts = df['label'].value_counts()

# Create a bar plot
plt.figure(figsize=(8, 6))
plt.bar(class_counts.index, class_counts.values, tick_label=['Arm Flapping', 'Headbanging', 'Spinning', 'Hand Action'])
plt.xlabel('Classes')
plt.ylabel('Number of Videos')
plt.title('Number of Videos per Class')
plt.show()


## Pie Chart

In [None]:
# Pie chart for class distribution
plt.figure(figsize=(8, 8))
plt.pie(class_counts, labels=['Arm Flapping', 'Headbanging', 'Spinning', 'Hand Action'], autopct='%1.1f%%', startangle=90, colors=['#ff9999','#66b3ff','#99ff99','#ff0334'])
plt.title('Class Distribution')
plt.show()

## Frames Per Video Distribution

In [None]:
import numpy as np
import cv2

# Get the number of frames per video
df['num_frames'] = df['video_path'].apply(lambda x: int(cv2.VideoCapture(x).get(cv2.CAP_PROP_FRAME_COUNT)))

# Create histogram for number of frames per video
plt.figure(figsize=(10, 6))
plt.hist(df['num_frames'], bins=20, color='skyblue', edgecolor='black')
plt.xlabel('Number of Frames')
plt.ylabel('Number of Videos')
plt.title('Distribution of Number of Frames per Video')
plt.show()


## Average Frames per Class

In [None]:
# Calculate average number of frames per class
avg_frames_per_class = df.groupby('label')['num_frames'].mean()

# Bar plot for average frames per class
plt.figure(figsize=(8, 6))
plt.bar(avg_frames_per_class.index, avg_frames_per_class.values, tick_label=['Arm Flapping', 'Headbanging', 'Spinning','Hand Action'], color=['#ff9999','#66b3ff','#99ff99','#ff0334'])
plt.xlabel('Classes')
plt.ylabel('Average Number of Frames')
plt.title('Average Number of Frames per Class')
plt.show()


## Video Duration Distribution

In [None]:
# Assuming a frame rate of 30 FPS
frame_rate = 30
df['duration'] = df['num_frames'] / frame_rate

# Create histogram for video durations
plt.figure(figsize=(10, 6))
plt.hist(df['duration'], bins=20, color='lightcoral', edgecolor='black')
plt.xlabel('Video Duration (seconds)')
plt.ylabel('Number of Videos')
plt.title('Distribution of Video Durations')
plt.show()


# 3. Video to Frames Conversion

In [None]:
import cv2
import os
import pandas as pd
from tqdm import tqdm

def video_to_frames(video_path, num_frames=20, resize=(300, 300)):
    cap = cv2.VideoCapture(video_path)
    frames = []
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    frame_indices = np.linspace(0, total_frames - 1, num_frames, dtype=int)
    for idx in frame_indices:
        cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
        ret, frame = cap.read()
        if ret:
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frame = cv2.resize(frame, resize)
            frames.append(frame)
        else:
            # If frame reading fails, append the last successful frame
            if frames:
                frames.append(frames[-1])
            else:
                # If no frames have been read yet, append a black image
                frames.append(np.zeros((resize[1], resize[0], 3), dtype=np.uint8))
    cap.release()
    return frames


In [None]:
import numpy as np

df = pd.read_csv('dataset_labels.csv')
df['frames'] = df['video_path'].apply(lambda x: video_to_frames(x))
df.to_pickle('dataset_frames.pkl')  # Save for later use

## Sample Frames For Each Class

In [None]:
import cv2
import numpy as np

def show_sample_frames(df, num_frames=5):
    # Get the unique labels (classes)
    unique_labels = df['label'].unique()

    for label in unique_labels:
        # Get a sample video from the class
        sample_video_path = df[df['label'] == label]['video_path'].values[0]
        print(f"Showing frames for class: {label}")
        
        # Convert video to frames
        frames = video_to_frames(sample_video_path, num_frames=num_frames)
        
        # Display sample frames
        fig, axes = plt.subplots(1, num_frames, figsize=(15, 5))
        for i, frame in enumerate(frames):
            # Convert frame to RGB (if needed) for displaying with matplotlib
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            axes[i].imshow(frame_rgb)
            axes[i].axis('off')
        plt.show()

# Show sample frames for each class
show_sample_frames(df)

## Sample Videos Grid

In [None]:
def show_sample_grid(df, num_samples=3):
    classes = df['label'].unique()
    fig, axes = plt.subplots(len(classes), num_samples, figsize=(15, 5 * len(classes)))

    for i, class_label in enumerate(classes):
        sample_videos = df[df['label'] == class_label].sample(num_samples)
        for j, video_path in enumerate(sample_videos['video_path']):
            frames = video_to_frames(video_path, num_frames=1)  # Extract just 1 frame
            frame_rgb = cv2.cvtColor(frames[0], cv2.COLOR_BGR2RGB)
            axes[i, j].imshow(frame_rgb)
            axes[i, j].axis('off')
            axes[i, j].set_title(f'Class: {class_label} - Video: {j+1}')

    plt.tight_layout()
    plt.show()

# Show sample grid of frames
show_sample_grid(df, num_samples=5)


# 4. Child Detection and Cropping

In [3]:
import torch
from detectron2.engine import DefaultPredictor
from detectron2.config import get_cfg
from detectron2 import model_zoo
import pandas as pd
from tqdm import tqdm
import cv2
import os
import numpy as np

def setup_detectron2():
    cfg = get_cfg()
    # add project-specific config (e.g., TensorMask) here if you're not running a model in detectron2's core library
    cfg.merge_from_file("D:/dataset/detectron2/configs/COCO-Detection/faster_rcnn_R_50_FPN_3x.yaml")
    cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.5  # set threshold for this model
    # Find a model from detectron2's model zoo. You can use the https://dl.fbaipublicfiles... url as well
    cfg.MODEL.WEIGHTS = "detectron2://COCO-Detection/faster_rcnn_R_50_FPN_3x/137849458/model_final_280758.pkl"
    predictor = DefaultPredictor(cfg)
    return predictor


def crop_child(predictor, frame):
    outputs = predictor(frame)
    instances = outputs["instances"]
    if len(instances) == 0:
        return frame  # If no detections, return original frame
    # Assuming the first detected person is the target child
    boxes = instances.pred_boxes.tensor.cpu().numpy()
    scores = instances.scores.cpu().numpy()
    # Select the box with the highest score
    best_idx = np.argmax(scores)
    box = boxes[best_idx]
    x1, y1, x2, y2 = box.astype(int)
    # Ensure the coordinates are within frame boundaries
    x1, y1 = max(x1, 0), max(y1, 0)
    x2, y2 = min(x2, frame.shape[1]), min(y2, frame.shape[0])
    cropped = frame[y1:y2, x1:x2]
    print("2.Child cropped successfully")
    return cropped

# Apply cropping to all frames
def crop_all_frames(df, predictor):
    cropped_frames = []
    for frames in tqdm(df['frames'], desc="Cropping frames"):
        cropped_video = [crop_child(predictor, frame) for frame in frames]
        cropped_frames.append(cropped_video)
    df['cropped_frames'] = cropped_frames
    return df


## Create directory to save cropped images


In [None]:
def create_dir_structure(base_dir):
    dirs = ['train', 'test', 'val']
    for d in dirs:
        os.makedirs(os.path.join(base_dir, d), exist_ok=True)

## Save cropped frames into respective folders

In [None]:
def save_cropped_frames(df, predictor, base_dir='data/image_split/'):
    create_dir_structure(base_dir)
    cropped_frames = []
    for idx, frames in tqdm(enumerate(df['frames']), desc="Cropping and saving frames"):
        cropped_video = []
        for i, frame in enumerate(frames):
            cropped_frame = crop_child(predictor, frame)
            # Save the cropped frame
            img_name = f"img_{idx}_frame_{i}.jpg"
            folder = os.path.join(base_dir, 'all')  # Temporary folder for all data before splitting
            os.makedirs(folder, exist_ok=True)
            img_path = os.path.join(folder, img_name)
            cv2.imwrite(img_path, cropped_frame)
            cropped_video.append(cropped_frame)
        cropped_frames.append(cropped_video)
    df['cropped_frames'] = cropped_frames
    return df

## Split data into train, test, and validation

In [None]:
from sklearn.model_selection import train_test_split 
# Split data into train, test, and validation
def split_data(base_dir='data/image_split/', test_size=0.2, val_size=0.1):
    images = os.listdir(os.path.join(base_dir, 'all'))
    train_val_images, test_images = train_test_split(images, test_size=test_size, random_state=42)
    train_images, val_images = train_test_split(train_val_images, test_size=val_size/(1-test_size), random_state=42)

    def move_images(image_list, split_type):
        dest_dir = os.path.join(base_dir, split_type)
        for image in image_list:
            src = os.path.join(base_dir, 'all', image)
            dst = os.path.join(dest_dir, image)
            os.rename(src, dst)

    move_images(train_images, 'train')
    move_images(test_images, 'test')
    move_images(val_images, 'val')

    print(f"Data split: {len(train_images)} train, {len(test_images)} test, {len(val_images)} val")


### Usage

In [None]:
print("Predictor setup started")
predictor = setup_detectron2()
print("Predictor setup successfully passed")

df = pd.read_pickle('dataset_frames.pkl')
print("Cropping started")
df = save_cropped_frames(df, predictor)
print("Cropping and saving passed")

# Save dataframe with cropped frames
df.to_pickle('dataset_cropped_frames2.pkl')

In [None]:
from sklearn.model_selection import train_test_split

split_data()
print("Data split into train, test, and val successfully")

## Comparison of Cropped vs. Original Frames

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import cv2
import os
import pandas as pd
from tqdm import tqdm
import numpy as np

df = pd.read_pickle('dataset_cropped_frames2.pkl')

def video_to_frames(video_path, num_frames=20, resize=(300, 300)):
    cap = cv2.VideoCapture(video_path)
    frames = []
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    frame_indices = np.linspace(0, total_frames - 1, num_frames, dtype=int)
    for idx in frame_indices:
        cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
        ret, frame = cap.read()
        if ret:
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frame = cv2.resize(frame, resize)
            frames.append(frame)
        else:
            if frames:
                frames.append(frames[-1])
            else:
                frames.append(np.zeros((resize[1], resize[0], 3), dtype=np.uint8))
    cap.release()
    return frames

In [None]:
def compare_cropped_vs_original(df, sample_idx=0, num_frames=5):
    original_frames = video_to_frames(df['video_path'].iloc[sample_idx], num_frames=num_frames)
    cropped_frames = df['cropped_frames'].iloc[sample_idx][:num_frames]

    fig, axes = plt.subplots(2, num_frames, figsize=(15, 6))
    for i in range(num_frames):
        # Original frame
        frame_rgb = cv2.cvtColor(original_frames[i], cv2.COLOR_BGR2RGB)
        axes[0, i].imshow(frame_rgb)
        axes[0, i].axis('off')
        axes[0, i].set_title(f'Original Frame {i+1}')
        
        # Cropped frame
        if cropped_frames:
            cropped_rgb = cv2.cvtColor(cropped_frames[i], cv2.COLOR_BGR2RGB)
            axes[1, i].imshow(cropped_rgb)
            axes[1, i].axis('off')
            axes[1, i].set_title(f'Cropped Frame {i+1}')
    
    plt.tight_layout()
    plt.show()

compare_cropped_vs_original(df, sample_idx=0, num_frames=5)

# 5. Feature Extraction with EfficientNet-B3

In [None]:
# frame_dataset.py

import torch
from torch.utils.data import Dataset
from torchvision import transforms
class FrameDataset(Dataset):
    def __init__(self, df):
        self.df = df
        self.transform = transforms.Compose([
            transforms.ToPILImage(),
            transforms.Resize((224, 224)),  # Resize all frames to 224x224
            transforms.ToTensor(),
            transforms.Normalize(
                mean=[0.485, 0.456, 0.406],  
                std=[0.229, 0.224, 0.225]    
            )
        ])
    
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        frames = self.df.iloc[idx]['cropped_frames']
        label = self.df.iloc[idx]['label']
        frames = [self.transform(frame) for frame in frames]
        frames = torch.stack(frames)
        return frames, label

In [None]:
import cv2
import numpy as np
import pandas as pd
from efficientnet_pytorch import EfficientNet
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm

# Function to extract features
def extract_features(effnet, dataloader, device):
    effnet.eval()
    all_features = []
    all_labels = []
    with torch.no_grad():
        for batch_frames, batch_labels in tqdm(dataloader, desc="Extracting features"):
            batch_frames = batch_frames.to(device)
            batch_size, num_frames, C, H, W = batch_frames.size()
            batch_frames = batch_frames.view(-1, C, H, W)
            features = effnet(batch_frames)
            feature_dim = features.size(1)
            features = features.view(batch_size, num_frames, feature_dim)
            all_features.append(features.cpu())
            all_labels.append(batch_labels.cpu())
    
    all_features = torch.cat(all_features, dim=0)
    all_labels = torch.cat(all_labels, dim=0)
    return all_features, all_labels

# EfficientNet-B3
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
effnet_b3 = EfficientNet.from_pretrained('efficientnet-b3', num_classes= 500)
effnet_b3._fc = torch.nn.Identity()
effnet_b3.to(device)


df = pd.read_pickle('dataset_cropped_frames2.pkl')
dataset = FrameDataset(df)
dataloader = DataLoader(dataset, batch_size=8, shuffle=False, num_workers=0, pin_memory=True)


features, labels = extract_features(effnet_b3, dataloader, device)

# Save features and labels
torch.save({'features': features, 'labels': labels}, 'extracted_features2.pth')

# 6. Preparing Data for Temporal Modeling

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score

class ActionDataset(Dataset):
    def __init__(self, features, labels):
        self.features = features  
        self.labels = labels      
    
    def __len__(self):
        return self.features.size(0)
    
    def __getitem__(self, idx):
        return self.features[idx], self.labels[idx]


data = torch.load('extracted_features2.pth') 

features = data['features']  
labels = data['labels']      

dataset = ActionDataset(features, labels)

In [None]:
import torch
from torch.utils.data import Dataset

class CustomDataset(Dataset):
    def __init__(self, features, labels):
        self.features = features
        self.labels = labels

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        # Return a tuple of (feature, label)
        return self.features[idx], self.labels[idx]


# Define the MS-TCN Model 

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from sklearn.metrics import f1_score
import numpy as np

# Define a single-stage TCN block
class TCNBlock(nn.Module):
    def __init__(self, in_channels, out_channels, num_layers, kernel_size=5):
        super(TCNBlock, self).__init__()
        layers = []
        dilation = 1
        for _ in range(num_layers):
            padding = (kernel_size - 1) * dilation // 2  # Calculate appropriate padding
            layers.append(
                nn.Conv1d(in_channels, out_channels, kernel_size, padding=padding, dilation=dilation)
            )
            layers.append(nn.ReLU())
            in_channels = out_channels
            dilation *= 2  # Double the dilation at each layer
        self.tcn = nn.Sequential(*layers)
    
    def forward(self, x):
        return self.tcn(x)

# Define the Multi-Stage TCN model
class MSTCN(nn.Module):
    def __init__(self, num_stages=3, in_channels=20, out_channels=256, num_classes=4):
        super(MSTCN, self).__init__()
        # Adjust to handle the input channels mismatch (1x1 conv to match input channels)
        self.conv1x1 = nn.Conv1d(in_channels, out_channels, kernel_size=1)  # in_channels=20
        
        # Create multiple stages of TCN blocks
        self.stages = nn.ModuleList([TCNBlock(out_channels, out_channels, num_layers=5) for _ in range(num_stages)])
        
        # Fully connected layers for classification
        self.fc1 = nn.Linear(out_channels, 256)
        self.fc2 = nn.Linear(256, num_classes)

    def forward(self, x):
        # x shape: (batch_size, num_features, num_frames)
        out = self.conv1x1(x)  # Shape: (batch_size, out_channels, num_frames)
        for stage in self.stages:
            out = stage(out) + out  # Residual connection
        out = out.mean(dim=-1)  # Global average pooling over time (temporal dimension)
        out = nn.ReLU()(self.fc1(out))
        out = self.fc2(out)
        return out

# Dataset class to handle the features and labels
class ActionDataset(torch.utils.data.Dataset):
    def __init__(self, features, labels):
        self.features = features
        self.labels = labels

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        return self.features[idx], self.labels[idx]


# Load extracted features (adjust this to load your dataset appropriately)
data = torch.load('extracted_features.pth')
features = data['features']  # Shape: (num_samples, num_frames, feature_dim)
labels = data['labels']  # Shape: (num_samples,)

# Create dataset and dataloader
dataset = ActionDataset(features, labels)
dataloader = DataLoader(dataset, batch_size=16, shuffle=True)

# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Initialize model, optimizer, and loss function
print(features.shape)  # Check the shape of features before passing to model
model = MSTCN(num_stages=3, in_channels=features.size(1), out_channels=256, num_classes=4).to(device)  # Match in_channels to features.size(1)

optimizer = optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()
print("Optimizer and loss function initialized")

In [None]:
# Training loop
def train_model(model, dataloader, optimizer, criterion, num_epochs=50):
    model.train()
    for epoch in range(num_epochs):
        running_loss = 0.0
        print(f"Training Epoch {epoch + 1}/{num_epochs}")
        all_labels = []
        all_preds = []
        for features, labels in dataloader:
            features = features.to(device)
            labels = labels.to(device)
            
            # Forward pass
            # print(f"Input shape: {features.shape}")
            outputs = model(features)
            
            # Calculate loss
            loss = criterion(outputs, labels)
            
            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            # Update running loss
            running_loss += loss.item()
            
            # Store predictions and labels for F1-score calculation
            all_labels.extend(labels.cpu().numpy())
            all_preds.extend(torch.argmax(outputs, dim=1).cpu().numpy())
        
        # Calculate F1-score for the epoch
        f1 = f1_score(all_labels, all_preds, average='macro')
        print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {running_loss / len(dataloader):.4f}, F1-Score: {f1:.4f}')

# Train the model
train_model(model, dataloader, optimizer, criterion, num_epochs=50)

# 7. Defining the MS-TCN++ Model

In [None]:
import torch.nn as nn

# Define Dual Dilated Conv1d block
class DualDilatedConv1d(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size=5, dilation=1):
        super(DualDilatedConv1d, self).__init__()
        self.conv1 = nn.Conv1d(in_channels, out_channels, kernel_size=kernel_size, padding=(kernel_size//2)*dilation, dilation=dilation)
        self.conv2 = nn.Conv1d(out_channels, out_channels, kernel_size=kernel_size, padding=(kernel_size//2)*dilation, dilation=dilation)
        self.relu = nn.ReLU()
    
    def forward(self, x):
        out = self.relu(self.conv1(x))
        out = self.relu(self.conv2(out))
        return out

# Define Single Stage MS-TCN++ block
class SingleStageMS_TCNpp(nn.Module):
    def __init__(self, in_channels, hidden_channels, num_classes, num_layers=10):
        super(SingleStageMS_TCNpp, self).__init__()
        layers = []
        dilation = 1
        for _ in range(num_layers):
            layers.append(DualDilatedConv1d(hidden_channels, hidden_channels, dilation=dilation))
            dilation *= 2  # Exponential dilation
        self.network = nn.Sequential(*layers)
        self.conv1x1 = nn.Conv1d(hidden_channels, num_classes, kernel_size=1)
    
    def forward(self, x):
        # x: (batch, features, time)
        out = self.network(x)
        out = self.conv1x1(out)
        return out

class MSTCNPlusPlus(nn.Module):
    def __init__(self, num_stages=4, num_layers=10, hidden_channels=64, num_classes=4, in_channels=20):
        super(MSTCNPlusPlus, self).__init__()
        self.stages = nn.ModuleList()
        self.in_channels = in_channels

        # Initial 1x1 conv to match input channels to hidden channels
        self.conv1x1_in = nn.Conv1d(in_channels, hidden_channels, kernel_size=1)
        
        for _ in range(num_stages):
            self.stages.append(SingleStageMS_TCNpp(hidden_channels, hidden_channels, hidden_channels, num_layers))

        # Final 1x1 conv to map to num_classes
        self.conv1x1_out = nn.Conv1d(hidden_channels, num_classes, kernel_size=1)

    def forward(self, x):
        x = x.transpose(1, 2)  # Transpose to (batch, features, frames)
        x = self.conv1x1_in(x)  # Match input channels to hidden_channels
        
        for stage in self.stages:
            x = stage(x)

        x = self.conv1x1_out(x)  # Match final hidden channels to num_classes
        return x



# Instantiate the model
num_classes = 4
in_channels = features.size(2)  # Adjust based on your input
hidden_channels = 64  # You can adjust this
model = MSTCNPlusPlus(num_stages=4, num_layers=10, hidden_channels=hidden_channels, num_classes=num_classes, in_channels=in_channels)
model = model.to(device)

# 8. Training the Model

In [None]:
from torch.utils.data import Subset, DataLoader
from sklearn.model_selection import KFold
from torch.optim import Adam
from sklearn.metrics import f1_score
import torch.nn as nn
import numpy as np
from tqdm import tqdm
import torch

# Load all features and labels (assuming they are PyTorch tensors)
# features: shape (num_samples, num_frames, feature_dim)
# labels: shape (num_samples,)
features = features
labels = labels

# Define 5-fold cross-validation
kf = KFold(n_splits=5, shuffle=False, random_state=None)
# kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Define training parameters
epochs = 200
batch_size = 64
learning_rate = 1e-3
num_classes = 4  # Assuming 4 output classes

# To store results
f1_scores = []

# Loop through each fold
for fold, (train_idx, val_idx) in enumerate(kf.split(features)):
    print(f"\nStarting Fold {fold + 1}")
    
    # Create subsets for training and validation
    train_features = features[train_idx]
    train_labels = labels[train_idx]
    val_features = features[val_idx]
    val_labels = labels[val_idx]
    
    # Create Dataloaders
    train_dataset = torch.utils.data.TensorDataset(train_features, train_labels)
    val_dataset = torch.utils.data.TensorDataset(val_features, val_labels)
    
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=0)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=0)
    
    # Instantiate a new model for each fold
    in_channels = features.size(2)  # Number of input features (feature_dim)
    model = MSTCNPlusPlus(num_stages=4, num_layers=10, hidden_channels=64, num_classes=num_classes, in_channels=in_channels)
    model = model.to(device)
    
    # Define optimizer and loss function
    optimizer = Adam(model.parameters(), lr=learning_rate)
    criterion = nn.CrossEntropyLoss()  # Modify weights if needed for imbalanced classes
    
    # Training loop
    for epoch in range(epochs):
        model.train()
        running_loss = 0.0
        for batch_features, batch_labels in train_loader:
            batch_features = batch_features.to(device)  # (batch_size, num_frames, feature_dim)
            batch_labels = batch_labels.to(device)      # (batch_size,)
            
            optimizer.zero_grad()
            outputs = model(batch_features)             # (batch_size, num_classes, num_frames)
            outputs = outputs.mean(dim=2)               # Average over time (batch_size, num_classes)
            loss = criterion(outputs, batch_labels)     # Compute loss
            loss.backward()                             # Backpropagation
            optimizer.step()                            # Update weights
            running_loss += loss.item() * batch_features.size(0)
        
        epoch_loss = running_loss / len(train_loader.dataset)
        
        # Validation step
        model.eval()
        all_preds = []
        all_targets = []
        with torch.no_grad():
            for batch_features, batch_labels in val_loader:
                batch_features = batch_features.to(device)
                batch_labels = batch_labels.to(device)
                
                outputs = model(batch_features)           # (batch_size, num_classes, num_frames)
                outputs = outputs.mean(dim=2)             # Average over time (batch_size, num_classes)
                _, preds = torch.max(outputs, 1)          # Get predictions
                all_preds.extend(preds.cpu().numpy())     # Store predictions
                all_targets.extend(batch_labels.cpu().numpy())  # Store true labels
        
        # Calculate weighted F1-score for the validation set
        f1 = f1_score(all_targets, all_preds, average='weighted')
        
        if (epoch + 1) % 10 == 0 or epoch == 0:
            print(f"Epoch {epoch + 1}/{epochs}, Loss: {epoch_loss:.4f}, Weighted F1-Score: {f1:.4f}")
    
    # After training, evaluate on validation set
    f1 = f1_score(all_targets, all_preds, average='weighted')
    f1_scores.append(f1)
    print(f"Fold {fold + 1} Weighted F1-Score: {f1:.4f}")

# Final results
print("\nCross-Validation Weighted F1-Scores:", f1_scores)
print("Average Weighted F1-Score:", np.mean(f1_scores))

# 9. Evaluation

In [None]:
print("\nFinal Evaluation:")
for fold, score in enumerate(f1_scores):
    print(f"Fold {fold + 1}: Weighted F1-Score = {score:.4f}")
print(f"Average Weighted F1-Score across all folds: {np.mean(f1_scores):.4f}")

Average Weighted F1-Score across all folds: 0.7802