# Data_ingestion

## Dataset download from Kaggle

### Linux kaggle setup

In [None]:
# Install the Kaggle CLI into the kernel's environment.
%pip install -q kaggle
# -p makes the cell re-runnable: no error when ~/.kaggle already exists.
!mkdir -p ~/.kaggle
# Copy the API token from Google Drive into ~/.kaggle.
!cp "/content/drive/MyDrive/Colab Notebooks/kaggle.json" ~/.kaggle/
# Restrict the token file to the current user.
!chmod 600 ~/.kaggle/kaggle.json

### Windows Kaggle setup

#### CMD

In [None]:
# Windows (cmd.exe syntax): install the Kaggle CLI and copy kaggle.json into %USERPROFILE%\.kaggle.
!pip install kaggle
!mkdir %USERPROFILE%\.kaggle
# NOTE(review): the source folder is spelled "cridentical" — confirm it matches the folder name on disk.
!copy "cridentical\kaggle.json" "%USERPROFILE%\.kaggle\kaggle.json"
# Mark the token file read-only.
!attrib +r %USERPROFILE%\.kaggle\kaggle.json

#### PowerShell

In [None]:
# PowerShell syntax. NOTE(review): IPython's "!" normally hands the line to the system
# shell (cmd.exe on Windows), where $env: is not expanded — confirm how this cell is run.
# The Kaggle CLI is a Python package, so it is installed with pip, not Install-Module.
! pip install kaggle
# -Force avoids an error when the folder already exists.
! mkdir -Force $env:USERPROFILE\.kaggle
# NOTE(review): the source folder is spelled "cridentical" — confirm it matches the folder name on disk.
! Copy-Item "cridentical/kaggle.json" "$env:USERPROFILE\.kaggle\kaggle.json"
# Mark the token file read-only.
! attrib +r $env:USERPROFILE\.kaggle\kaggle.json

In [None]:
# Download the dataset archive with the Kaggle CLI (the recorded run saved it to /content).
! kaggle datasets download -d sharjeelmazhar/human-activity-recognition-video-dataset

Dataset URL: https://www.kaggle.com/datasets/sharjeelmazhar/human-activity-recognition-video-dataset
License(s): CC-BY-NC-SA-4.0
Downloading human-activity-recognition-video-dataset.zip to /content
100% 14.8G/14.8G [02:46<00:00, 112MB/s]
100% 14.8G/14.8G [02:46<00:00, 95.3MB/s]


In [None]:
# Alternative download via curl (left disabled).
# NOTE(review): the -o file name below differs from the archive name the unzip cell reads
# (human-activity-recognition-video-dataset.zip) — reconcile before enabling.
# ! curl -L -o /content/two-person-interaction-kinect-dataset.zip\
# https://www.kaggle.com/api/v1/datasets/download/sharjeelmazhar/human-activity-recognition-video-dataset

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100 14.8G  100 14.8G    0     0  81.3M      0  0:03:06  0:03:06 --:--:--  126M


In [None]:
import subprocess
import tqdm
import requests

def download_with_progress(url, filename):
  """Stream ``url`` into ``filename`` while showing a tqdm progress bar.

  Args:
    url: HTTP(S) address of the file to fetch.
    filename: Local path the response body is written to (overwritten if present).

  Raises:
    requests.HTTPError: If the server answers with a 4xx/5xx status, so an
      error page is never saved as if it were the dataset.
  """
  block_size = 1024 * 1024  # 1 MiB chunks; 1 KiB is needlessly slow for multi-GB files
  # Context managers close the connection, the file and the bar even on failure.
  with requests.get(url, stream=True, timeout=30) as response:
    response.raise_for_status()
    total_size_in_bytes = int(response.headers.get('content-length', 0))
    with open(filename, 'wb') as file, tqdm.tqdm(
        total=total_size_in_bytes, unit='iB', unit_scale=True) as progress_bar:
      for data in response.iter_content(block_size):
        file.write(data)
        progress_bar.update(len(data))
      bytes_received = progress_bar.n
  # Only meaningful when the server announced a length.
  if total_size_in_bytes != 0 and bytes_received != total_size_in_bytes:
    print("ERROR, something went wrong")

dataset_url = "https://www.kaggle.com/api/v1/datasets/download/sharjeelmazhar/human-activity-recognition-video-dataset"
# Save under the archive name that the unzip step reads; the previous
# "two-person-interaction-kinect-dataset.zip" name was never picked up by it.
destination_path = "/content/human-activity-recognition-video-dataset.zip"

download_with_progress(dataset_url, destination_path)

## Unzipping the dataset

In [None]:
# Extract the downloaded archive quietly into the current working directory.
# NOTE(review): the resize cell expects the videos under
# "/content/Human Activity Recognition - Video Dataset" — confirm the working directory is /content.
! unzip -q /content/human-activity-recognition-video-dataset.zip

## Resize & frame rate

In [None]:
# NOTE(review): the cells in this notebook call the `ffmpeg` executable through subprocess and
# never import ffmpeg-python — confirm whether this install is still needed.
! pip install ffmpeg-python

In [None]:
import os
import subprocess
from glob import glob
from tqdm.auto import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
# Where the raw videos live and where the re-encoded copies are written
root_dir = '/content/'
video_source_dir = os.path.join(root_dir, 'Human Activity Recognition - Video Dataset')
output_dir = os.path.join(root_dir, 'human_activity_recognition_video_dataset_resized')

# Re-encoding targets
target_width, target_height = 640, 480
frame_rate = 10

# Size of the thread pool used for the ffmpeg jobs
max_workers = 4

# Create the output root up front
os.makedirs(output_dir, exist_ok=True)

# One entry per file located two levels below the source directory
video_glob_pattern = f"{video_source_dir}/*/*"
source_videos = glob(video_glob_pattern)
print(len(source_videos))

def resize_and_reframe_video(input_file, output_file, width, height, fps):
    """Re-encode one video to a fixed resolution and frame rate with ffmpeg.

    Args:
        input_file: Path of the source video.
        output_file: Path the re-encoded video is written to.
        width: Target frame width passed to ffmpeg's scale filter.
        height: Target frame height passed to ffmpeg's scale filter.
        fps: Target frame rate passed to ffmpeg's -r option.

    Returns:
        A status string: "Skipping ..." when output_file already exists,
        "Processed ..." on success, or "Error processing ..." when ffmpeg
        fails or cannot be started.
    """
    # Skip work a previous run already finished (the original check was inverted
    # and skipped every file that had NOT been produced yet).
    if os.path.exists(output_file):
        return f"Skipping {input_file} as it already exists."
    command = [
        'ffmpeg', '-nostdin',  # never wait on stdin when run from a worker thread
        '-i', input_file,  # Input video
        '-vf', f'scale={width}:{height}',  # Resize video
        '-r', str(fps),  # Change frame rate
        output_file  # Output file
    ]
    try:
        subprocess.run(command, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    except (subprocess.CalledProcessError, OSError) as e:
        # Remove a partially written file, otherwise the existence check above
        # would treat it as finished on the next run.
        try:
            os.remove(output_file)
        except OSError:
            pass
        return f"Error processing {input_file}: {e}"
    return f"Processed {input_file} successfully."

def process_videos_concurrently(source_videos):
    """Re-encode every video in ``source_videos`` on a thread pool.

    Each output path mirrors the input's location relative to
    ``video_source_dir`` underneath ``output_dir``. The status string returned
    by each job is printed as it completes, and a progress bar is advanced.
    """
    def _destination_for(video_path):
        # Mirror the source layout and make sure the parent folder exists.
        relative = os.path.relpath(video_path, video_source_dir)
        target = os.path.join(output_dir, relative)
        os.makedirs(os.path.dirname(target), exist_ok=True)
        return target

    with ThreadPoolExecutor(max_workers=max_workers) as pool, tqdm(total=len(source_videos)) as bar:
        # Queue one job per video.
        pending = [
            pool.submit(
                resize_and_reframe_video,
                video_path,
                _destination_for(video_path),
                target_width,
                target_height,
                frame_rate,
            )
            for video_path in source_videos
        ]

        # Report in completion order, not submission order.
        for finished in as_completed(pending):
            print(finished.result())
            bar.update(1)

# Re-encode every file collected in source_videos; each job's status line is
# printed as it finishes while the progress bar advances.
process_videos_concurrently(source_videos)


1113


  0%|          | 0/1113 [00:00<?, ?it/s]

Processed /content/Human Activity Recognition - Video Dataset/Walking/Walking (100).mp4 successfully.
Processed /content/Human Activity Recognition - Video Dataset/Walking/Walking (68).mp4 successfully.
Processed /content/Human Activity Recognition - Video Dataset/Walking/Walking (67).mp4 successfully.
Processed /content/Human Activity Recognition - Video Dataset/Walking/Walking (75).mp4 successfully.
Processed /content/Human Activity Recognition - Video Dataset/Walking/Walking (149).mp4 successfully.
Processed /content/Human Activity Recognition - Video Dataset/Walking/Walking (98).mp4 successfully.
Processed /content/Human Activity Recognition - Video Dataset/Walking/Walking (142).mp4 successfully.
Processed /content/Human Activity Recognition - Video Dataset/Walking/Walking (99).mp4 successfully.
Processed /content/Human Activity Recognition - Video Dataset/Walking/Walking (48).mp4 successfully.
Processed /content/Human Activity Recognition - Video Dataset/Walking/Walking (29).mp4 s

In [None]:
# Zip with paths relative to /content. The recorded run stored a "content/" prefix on every
# entry, so extracting from /content would have produced /content/content/...
# -q keeps the per-file log out of the notebook output.
! cd /content && zip -r -q human_activity_recognition_video_dataset_resized.zip human_activity_recognition_video_dataset_resized

  adding: content/human_activity_recognition_video_dataset_resized/ (stored 0%)
  adding: content/human_activity_recognition_video_dataset_resized/Walking/ (stored 0%)
  adding: content/human_activity_recognition_video_dataset_resized/Walking/Walking (68).mp4 (deflated 0%)
  adding: content/human_activity_recognition_video_dataset_resized/Walking/Walking (75).mp4 (deflated 0%)
  adding: content/human_activity_recognition_video_dataset_resized/Walking/Walking (67).mp4 (deflated 0%)
  adding: content/human_activity_recognition_video_dataset_resized/Walking/Walking (100).mp4 (deflated 0%)
  adding: content/human_activity_recognition_video_dataset_resized/Walking/Walking (142).mp4 (deflated 0%)
  adding: content/human_activity_recognition_video_dataset_resized/Walking/Walking (98).mp4 (deflated 0%)
  adding: content/human_activity_recognition_video_dataset_resized/Walking/Walking (149).mp4 (deflated 0%)
  adding: content/human_activity_recognition_video_dataset_resized/Walking/Walking (99)

In [None]:
# Create the Drive folder first so the copy does not fail on a fresh Drive.
! mkdir -p "/content/drive/MyDrive/Colab Notebooks/Human_Activity_Recognition/"
! cp /content/human_activity_recognition_video_dataset_resized.zip "/content/drive/MyDrive/Colab Notebooks/Human_Activity_Recognition/"

# Extract images from video

In [None]:
# Extract the resized-video archive from Drive into the current working directory.
# NOTE(review): later cells read /content/human_activity_recognition_video_dataset_resized —
# verify that the archive's internal paths and the working directory produce that layout.
! unzip -q "/content/drive/MyDrive/Colab Notebooks/Human_Activity_Recognition/human_activity_recognition_video_dataset_resized.zip"

In [None]:
# Upper bound on the number of frames frames_extraction samples from each video.
SEQUENCE_LENGTH = 20
# IMAGE_HEIGHT / IMAGE_WIDTH are not defined: the resize step inside frames_extraction is commented out.

In [None]:
import cv2
def frames_extraction(video_path):
    '''
    Sample up to SEQUENCE_LENGTH evenly spaced frames from a video.

    The frames are returned exactly as decoded; no resizing or normalisation
    is applied here.
    Args:
        video_path: The path of the video in the disk, whose frames are to be extracted.
    Returns:
        frames_list: A list with the sampled frames. It holds fewer than
        SEQUENCE_LENGTH entries (possibly none) when a read fails.
    '''

    # Declare a list to store video frames.
    frames_list = []

    # Read the Video File using the VideoCapture object.
    video_reader = cv2.VideoCapture(video_path)

    try:
        # Get the total number of frames in the video.
        video_frames_count = int(video_reader.get(cv2.CAP_PROP_FRAME_COUNT))

        # Interval between sampled frames; at least 1 so short videos still advance.
        skip_frames_window = max(video_frames_count // SEQUENCE_LENGTH, 1)

        for frame_counter in range(SEQUENCE_LENGTH):

            # Jump to the next sampling position.
            video_reader.set(cv2.CAP_PROP_POS_FRAMES, frame_counter * skip_frames_window)

            success, frame = video_reader.read()

            # Stop at the first frame that cannot be read.
            if not success:
                break

            frames_list.append(frame)
    finally:
        # Release the capture even if decoding raised.
        video_reader.release()

    return frames_list

In [None]:
from glob import glob
from tqdm.auto import tqdm
import os

# Every entry directly under the resized dataset folder; the extraction cell treats each as one class.
class_dirs = glob("/content/human_activity_recognition_video_dataset_resized/*")
# Display how many entries were found.
len(class_dirs)

7

In [None]:
# Write the sampled frames of every video to imageDataset/<class>/<video index>/<frame>.jpg.
imageDataset_dest_dir = "imageDataset"
os.makedirs(imageDataset_dest_dir, exist_ok=True)
for class_dir in class_dirs:
    # The glob results carry no trailing slash, so the class name is the last
    # path component ([-2] returned the dataset folder name for every class).
    class_name = os.path.basename(os.path.normpath(class_dir))
    # Sorted so a video keeps the same index on every run; glob order is arbitrary.
    video_paths = sorted(glob(f"{class_dir}/*"))
    for video_index, video_path in enumerate(video_paths):
        dest_dir = os.path.join(imageDataset_dest_dir, class_name,f"{video_index:0>5}")
        # The last frame being present marks the video as already extracted.
        if os.path.exists(os.path.join(dest_dir, f"{SEQUENCE_LENGTH-1:0>2}.jpg")):
            continue
        # cv2.imwrite cannot write into a folder that does not exist.
        os.makedirs(dest_dir, exist_ok=True)
        frames = frames_extraction(video_path)
        for i, frame in enumerate(frames):
            cv2.imwrite(os.path.join(dest_dir,f"{i:0>2}.jpg"), frame)


In [None]:
import cv2
from glob import glob
from tqdm.auto import tqdm
import os
import concurrent.futures

SEQUENCE_LENGTH = 20

def frames_extraction(video_path):
    '''
    Sample up to SEQUENCE_LENGTH evenly spaced frames from a video.

    Frames are returned as decoded, without resizing or normalisation.
    Args:
        video_path: The path of the video in the disk, whose frames are to be extracted.
    Returns:
        frames_list: A list with the sampled frames; shorter than
        SEQUENCE_LENGTH (possibly empty) when a frame cannot be read.
    '''
    frames_list = []

    # Open the video file.
    video_reader = cv2.VideoCapture(video_path)

    try:
        # Total number of frames reported for the video.
        video_frames_count = int(video_reader.get(cv2.CAP_PROP_FRAME_COUNT))

        # Distance between sampled frames; never below 1.
        skip_frames_window = max(video_frames_count // SEQUENCE_LENGTH, 1)

        for frame_counter in range(SEQUENCE_LENGTH):
            # Seek to the sampling position, then decode one frame.
            video_reader.set(cv2.CAP_PROP_POS_FRAMES, frame_counter * skip_frames_window)
            success, frame = video_reader.read()

            # A failed read ends the sampling early.
            if not success:
                break

            frames_list.append(frame)
    finally:
        # Always free the capture, including when decoding raises.
        video_reader.release()

    return frames_list


def process_video(video_path, class_name, video_index, imageDataset_dest_dir):
    '''
    Extract the sampled frames of one video and store them as numbered JPEGs.

    Frames are written to <imageDataset_dest_dir>/<class_name>/<video_index, 5 digits>/.
    Nothing is done when the file for the last expected frame already exists.
    '''
    target_dir = os.path.join(imageDataset_dest_dir, class_name, f"{video_index:0>5}")
    final_frame_path = os.path.join(target_dir, f"{SEQUENCE_LENGTH-1:0>2}.jpg")

    if not os.path.exists(final_frame_path):
        # Create the folder, then decode and write each frame.
        os.makedirs(target_dir, exist_ok=True)
        for frame_number, image in enumerate(frames_extraction(video_path)):
            cv2.imwrite(os.path.join(target_dir, f"{frame_number:0>2}.jpg"), image)


def process_all_videos():
    '''
    Extract frames for every video of every class using a thread pool.

    Each entry under the resized dataset folder is treated as one class, and
    each file inside it as one video. Frames are written below "imageDataset"
    by process_video.
    '''
    # glob returns paths in arbitrary order; sorting keeps class and video
    # indices stable between runs so the "already extracted" check in
    # process_video refers to the same video every time.
    class_dirs = sorted(glob("/content/human_activity_recognition_video_dataset_resized/*"))
    imageDataset_dest_dir = "imageDataset"

    # One (video_path, class_name, video_index, dest_root) tuple per video.
    tasks = []
    for class_dir in class_dirs:
        class_name = os.path.basename(class_dir)
        video_paths = sorted(glob(f"{class_dir}/*"))

        for video_index, video_path in enumerate(video_paths):
            tasks.append((video_path, class_name, video_index, imageDataset_dest_dir))

    # Threads are used here; list() drains the lazy map so that every task
    # finishes and any exception surfaces.
    with concurrent.futures.ThreadPoolExecutor() as executor:
        list(tqdm(executor.map(lambda p: process_video(*p), tasks), total=len(tasks), desc="Processing Videos"))

# Run the extraction when this cell (or an exported script) is executed directly.
if __name__ == "__main__":
    process_all_videos()


Processing Videos:   0%|          | 0/1113 [00:00<?, ?it/s]

In [None]:
# -q suppresses the per-file log, which was long enough to be truncated in the recorded output.
! zip -r -q imageDataset.zip imageDataset

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  adding: imageDataset/Meet and Split/00043/02.jpg (deflated 0%)
  adding: imageDataset/Meet and Split/00056/ (stored 0%)
  adding: imageDataset/Meet and Split/00056/08.jpg (deflated 0%)
  adding: imageDataset/Meet and Split/00056/07.jpg (deflated 0%)
  adding: imageDataset/Meet and Split/00056/17.jpg (deflated 0%)
  adding: imageDataset/Meet and Split/00056/04.jpg (deflated 0%)
  adding: imageDataset/Meet and Split/00056/09.jpg (deflated 0%)
  adding: imageDataset/Meet and Split/00056/15.jpg (deflated 0%)
  adding: imageDataset/Meet and Split/00056/16.jpg (deflated 0%)
  adding: imageDataset/Meet and Split/00056/18.jpg (deflated 0%)
  adding: imageDataset/Meet and Split/00056/11.jpg (deflated 0%)
  adding: imageDataset/Meet and Split/00056/19.jpg (deflated 0%)
  adding: imageDataset/Meet and Split/00056/12.jpg (deflated 0%)
  adding: imageDataset/Meet and Split/00056/03.jpg (deflated 0%)
  adding: imageDataset/Meet and S

In [None]:
# Create the Drive folder first so the copy does not fail on a fresh Drive.
! mkdir -p "/content/drive/MyDrive/Colab Notebooks/Human_Activity_Recognition/"
! cp "/content/imageDataset.zip" "/content/drive/MyDrive/Colab Notebooks/Human_Activity_Recognition/"