In [2]:
import pandas as pd

# Full path to your CSV file
csv_path = r"C:\Users\User\OneDrive\RD_Project\RD_Project\EPIC_100_04.csv"

# Define column names based on your description
columns = [
    "narration_id",       # unique segment ID
    "participant_id",     # participant ID
    "video_id",           # video ID
    "narration_timestamp",# timestamp of narration
    "start_timestamp",    # start time of action
    "stop_timestamp",     # stop time of action
    "start_frame",        # start frame
    "stop_frame",         # stop frame
    "narration",          # full narration text
    "verb",               # parsed verb
    "verb_class",         # numeric verb class
    "noun",               # first noun
    "noun_class",         # numeric noun class
    "all_nouns",          # list of all nouns
    "all_noun_classes"    # list of all noun classes
]

# Read the CSV with the column names
df = pd.read_csv(csv_path, header=None, names=columns, on_bad_lines='skip', encoding='utf-8')

# Example: access frames easily
starting_frames = df["start_frame"]
ending_frames = df["stop_frame"]
nouns = df["noun"]

# Example: access time columns easily
start_times = df["start_timestamp"]
stop_times = df["stop_timestamp"]


# how many rows to print 
pd.set_option('display.max_rows', 10)
# Print examples
print(df[["start_frame", "stop_frame", "start_timestamp", "stop_timestamp", "noun"]].head(20))




    start_frame  stop_frame start_timestamp stop_timestamp               noun
0             6         182     00:00:00.11    00:00:03.04                cup
1           172         306     00:00:02.87    00:00:05.10                cup
2          1845        2102     00:00:30.75    00:00:35.04         tablecloth
3          2239        2328     00:00:37.33    00:00:38.81         tablecloth
4          2481        2527     00:00:41.36    00:00:42.13  liquid:washing:up
..          ...         ...             ...            ...                ...
15         4284        4324     00:01:11.40    00:01:12.08                tap
16         4461        4766     00:01:14.35    00:01:19.44               hand
17         4787        4922     00:01:19.79    00:01:22.04             bottle
18         4928        5108     00:01:22.14    00:01:25.14             bottle
19         5122        5167     00:01:25.37    00:01:26.12              glass

[20 rows x 5 columns]


c:\Users\USER\AppData\Local\Programs\Python\Python312\python.exe


In [6]:
# Pre processs the video to frames for the machine learning
import cv2

video_path = "P01_04.mp4"  
cap = cv2.VideoCapture(video_path)
fps = cap.get(cv2.CAP_PROP_FPS)
# has 60 fps
print(fps)

frames = []
while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break
    # sample every 10th frames/ turn to RGB to minimize the memory usage, simpler training
    
    frame_number = int(cap.get(cv2.CAP_PROP_POS_FRAMES))
    # elapsed_time = frame_number / fps
    # print(f"Frame: {frame_number}, Elapsed Time: {elapsed_time:.2f} seconds")
    if frame_number % 10 == 0:
        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        frames.append(frame_rgb)

    #this funtion shows the video frame by frame
    #cv2.imshow("Video", frame)


cap.release()
cv2.destroyAllWindows()

0.0


In [4]:
import numpy as np
import cv2

# X has the video segments for training and y is the labels, outputs, for verbs and nouns
X = []         
y_verb = []   
y_noun = []   

# segmenting the frames bassed on the actions in the dataframe
def segment_frames_based_on_actions(df, frames):

    for i in range(len(df)):

        # Get one row of data at a time
        row = df.iloc[i]

        start_frame = int(row["start_frame"])
        stop_frame = int(row["stop_frame"])

        verb_label = int(row["verb_class"])
        noun_label = int(row["noun_class"])
        # print(f"Processing segment {i+1}/{len(df)}: Frames {start_frame} to {stop_frame}, Verb: {verb_label}, Noun: {noun_label}")
        # segment each frames based on a certain action
        segment_frames = []
        for frame_index in range(len(frames)):
            # actual frame number in the video since the frames are sampled in the factor of 10
            actual_frame_number = frame_index * 10

            if actual_frame_number >= start_frame and actual_frame_number <= stop_frame:
                segment_frames.append(frames[frame_index])

        if len(segment_frames) == 0:
            continue


        resized_frames = []
        for frame in segment_frames:
            small_frame = cv2.resize(frame, (112, 112))  
            resized_frames.append(small_frame)

        # Convert list to numpy array
        segment_array = np.array(resized_frames)
        X.append(segment_array)
        y_verb.append(verb_label)
        y_noun.append(noun_label)
    
    # Normalize pixel values to [0, 1]


segment_frames_based_on_actions(df, frames)
X = np.array(X, dtype=object)

y_verb = np.array(y_verb)
y_noun = np.array(y_noun)
print(y_noun)
# shows how many segments are created for training
print("X shape:", X.shape)
print("y_verb shape:", y_verb.shape)
print("y_noun shape:", y_noun.shape)

[]
X shape: (0,)
y_verb shape: (0,)
y_noun shape: (0,)


In [None]:


from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.layers import TimeDistributed, Conv2D, MaxPooling2D, Flatten, LSTM, Dense, Dropout
import matplotlib.pyplot as plt

# normalize the data, RGB has range between 0 to 255
X = [x.astype("float32") / 255.0 for x in X]

MAX_FRAMES = 16

# need to pad since the training model expects fixedd number of frames
def pad_video_sequence(video, MAX_FRAMES):
    maxlen = MAX_FRAMES
    num_frames = len(video)
    if num_frames >= maxlen:
        return video[:maxlen]
    pad_length = maxlen - num_frames
    padding = np.zeros((pad_length, 112, 112, 3), dtype=np.float32)
    return np.concatenate([video, padding], axis=0)

X_padded = []

for x in X:
    X_padded.append(pad_video_sequence(x, MAX_FRAMES))
X_padded = np.array(X_padded, dtype=np.float32)


# Encode labels
verb_encoder = LabelEncoder()
noun_encoder = LabelEncoder()

#scans all the unique verbs and nouns and assigns a number to each eg run to 0, walk to 1
y_verb_encoded = verb_encoder.fit_transform(y_verb)
y_noun_encoded = noun_encoder.fit_transform(y_noun)

# print("Unique verbs:", len(np.unique(y_verb_encoded)))
# print("Unique nouns:", len(np.unique(y_noun_encoded)))

X_train, X_val, yv_train, yv_val, yn_train, yn_val = train_test_split(
    X_padded, y_verb_encoded, y_noun_encoded, test_size=0.2, random_state=42
)

# print("Train samples:", X_train.shape[0])
# print("Validation samples:", X_val.shape[0])

#count how many unique verb has
num_verb_classes = len(np.unique(y_verb_encoded))


model_verb = Sequential([
    #2D convolution to extract shapes edges from each froame
    TimeDistributed(Conv2D(32, (3,3), activation='relu'), input_shape=(MAX_FRAMES, 112, 112, 3)),
    # down sampling to reduce size
    TimeDistributed(MaxPooling2D(2,2)),
    TimeDistributed(Conv2D(64, (3,3), activation='relu')),
    # uses larer filter 64
    TimeDistributed(MaxPooling2D(2,2)),
    TimeDistributed(Flatten()),
    LSTM(128),
    #Randomly turns off 50% of the neurons during training. to prevent overfitting
    Dropout(0.5),
    Dense(num_verb_classes, activation='softmax')
])

#updatae weights, optimizer to adam
model_verb.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model_verb.summary()


# train the model
history = model_verb.fit(
    X_train, yv_train,
    validation_data=(X_val, yv_val),
    epochs=10,
    batch_size=4
)

plt.plot(history.history['accuracy'], label='train acc')
plt.plot(history.history['val_accuracy'], label='val acc')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

# save model
model_verb.save("verb_classifier.h5")



ValueError: With n_samples=0, test_size=0.2 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.