In [54]:
# Delete the segment output directory left over from a previous run so that
# stale "*_part_N" files do not mix with freshly generated ones.
# NOTE(review): this cell carries execution count 54 although it sits at the
# top of the notebook, so it was presumably run right before the splitting
# loop (In [55]); confirm the intended position for a Restart & Run All.
!rm -rf /content/cut_audios_by_emotions

In [5]:
# Mount Google Drive into the Colab filesystem; the input archive is read
# from, and the final zip is moved to, /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [17]:
!unzip "/content/drive/MyDrive/continuous_speeches.zip" -d "/content/continuous_speeches"

Archive:  /content/drive/MyDrive/continuous_speeches.zip
  inflating: /content/continuous_speeches/combined_speech_Ses05M_script01_1.wav  
  inflating: /content/continuous_speeches/combined_speech_Ses04M_impro06.wav  
  inflating: /content/continuous_speeches/combined_speech_Ses05F_impro01.wav  
  inflating: /content/continuous_speeches/combined_speech_Ses02M_script01_1.wav  
  inflating: /content/continuous_speeches/combined_speech_Ses04F_script02_2.wav  
  inflating: /content/continuous_speeches/combined_speech_Ses02F_impro06.wav  
  inflating: /content/continuous_speeches/combined_speech_Ses02M_script01_2.wav  
  inflating: /content/continuous_speeches/combined_speech_Ses05F_impro02.wav  
  inflating: /content/continuous_speeches/combined_speech_Ses02F_script03_1.wav  
  inflating: /content/continuous_speeches/combined_speech_Ses01F_impro04.wav  
  inflating: /content/continuous_speeches/combined_speech_Ses05M_script02_2.wav  
  inflating: /content/continuous_speeches/combined_speec

In [18]:
!pip install pydub
!pip install ruptures
!pip install librosa

Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Installing collected packages: pydub
Successfully installed pydub-0.25.1
Collecting ruptures
  Downloading ruptures-1.1.9-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.2 kB)
Downloading ruptures-1.1.9-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: ruptures
Successfully installed ruptures-1.1.9


In [19]:
import os
import shutil
from pydub import AudioSegment
import numpy as np
import librosa
import ruptures as rpt
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

In [22]:
import pandas as pd
import os

# Ground-truth annotations: each row holds the path of a continuous recording
# ('conti_path') and its boundary times ('time').
csv_file_path = '/content/merged_emotions.csv'  # Replace with the actual path to your CSV file

try:
    df = pd.read_csv(csv_file_path)

    # Ensure the columns used by the following cells exist
    if not all(col in df.columns for col in ['conti_path', 'time']):
        raise ValueError("CSV file must contain 'conti_path' and 'time' columns.")

    # Sort alphabetically by 'conti_path' and drop rows with any missing value
    df_sorted = df.sort_values('conti_path').dropna()

# Each handler prints a readable message and then re-raises: continuing after
# a failed load would either hit a NameError on `df_sorted` or, worse, silently
# reuse a stale `df_sorted` left in the kernel by an earlier run.
except FileNotFoundError:
    print(f"Error: CSV file '{csv_file_path}' not found.")
    raise
except ValueError as e:
    print(f"Error: {e}")
    raise
except Exception as e:
    print(f"An unexpected error occurred: {e}")
    raise

In [23]:
def _parse_time_list(raw):
    """
    Parse a stringified list of times such as "[1.5, 3.0, 7.25]" into floats.

    Surrounding whitespace and the enclosing brackets are stripped, and blank
    tokens (e.g. from a trailing comma, "[1.5, 3.0,]") are ignored. This
    replaces a fixed `[1:-2]` slice, which silently cut the last digit of the
    final timestamp whenever the string had no trailing character to absorb.

    Args:
        raw (str): Content of one 'time' cell.

    Returns:
        list: The times as floats, in their original order.
    """
    tokens = raw.strip().strip("[]").split(",")
    return [float(token) for token in tokens if token.strip()]


# Ground-truth boundary times, one list per row of df_sorted (same order).
gt = [_parse_time_list(raw) for raw in df_sorted["time"]]

In [50]:
def group_and_average_times(times, threshold=1.0):
    """
    Groups consecutive numbers in a list if their difference is less than the threshold
    and calculates their mean.

    Every element of `times` is considered, including the last one; a time
    joins the current group when it is closer than `threshold` to the time
    immediately before it.

    Args:
        times (list): A list of float numbers representing times, in ascending order.
        threshold (float): The maximum difference between consecutive numbers to be
                           considered in the same group. Defaults to 1.0.

    Returns:
        list: A new list with the grouped and averaged times.
    """
    if not times:  # Handle empty list
        return []

    grouped_times = []
    current_group = [times[0]]

    for previous, current in zip(times, times[1:]):
        if current - previous < threshold:
            current_group.append(current)
        else:
            grouped_times.append(sum(current_group) / len(current_group))  # Close the group with its mean
            current_group = [current]  # Start a new group

    # Append the mean of the last group
    grouped_times.append(sum(current_group) / len(current_group))

    return grouped_times


def detect_audio_change_points(audio_file_path, Verbose=True, pen=10):  # Adjusted pen for less sensitivity
    """
    Detects change points in an audio file from its frame-wise mean MFCC.

    The 13 MFCCs of each frame are averaged into one value per frame, the
    resulting series is standardized, and PELT (l2 cost) is run on it. The
    detected frame indices are converted to seconds and boundaries closer than
    one second to each other are merged by `group_and_average_times`.

    Args:
        audio_file_path (str): Path to the audio file to analyse.
        Verbose (bool): When True, plot the scaled series with the boundaries.
        pen (float): PELT penalty; larger values yield fewer change points.

    Returns:
        list: Interior change-point times in seconds. The end of the signal is
              not included; the list is empty when no change is detected.
    """
    # Load the audio file at its native sampling rate
    y, sr = librosa.load(audio_file_path, sr=None)

    # Extract MFCC features, shape: (n_mfcc, n_frames)
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13).T  # Transpose to (n_frames, n_mfcc)

    # Collapse the coefficients into one value per frame
    mfccs_mean = np.mean(mfccs, axis=1)  # Shape: (n_frames,)

    # Scale the series (the scaler expects a 2-D array)
    scaler = StandardScaler()
    mfccs_scaled = scaler.fit_transform(mfccs_mean.reshape(-1, 1))

    # Change point detection
    algo = rpt.Pelt(model="l2").fit(mfccs_scaled)
    change_points = algo.predict(pen=pen)

    # ruptures always reports the end of the signal as the final breakpoint.
    # It is not a real change, so it is dropped here explicitly instead of
    # relying on the grouping step to skip the last element.
    n_frames = len(mfccs_scaled)
    interior_change_points = [cp for cp in change_points if cp < n_frames]

    # Convert frame indices to seconds: the frames span the whole recording
    duration = len(y) / sr
    seconds_per_frame = duration / n_frames
    timesegs = [cp * seconds_per_frame for cp in interior_change_points]
    finaltimesegs = group_and_average_times(timesegs)

    if (Verbose):
        # Plot the scaled series against real time so the axis matches its label
        frame_times = np.arange(n_frames) * seconds_per_frame
        plt.figure(figsize=(12, 6))
        plt.plot(frame_times, mfccs_scaled, label='Scaled MFCCs')
        # Mark the change points on the plot (label only once for the legend)
        for idx, boundary in enumerate(finaltimesegs):
            plt.axvline(x=boundary, color='b', linestyle='--',
                        label='Change Point' if idx == 0 else None)
        plt.title('Scaled MFCC Features with Change Points')
        plt.xlabel('Time (seconds)')
        plt.ylabel('Scaled MFCC Coefficients')
        plt.legend()
        plt.grid()
        plt.show()

    return finaltimesegs


In [51]:
def split_audio_by_timestamps(audio_file_path, timestamps, output_dir):
    """
    Cuts one audio file at the given times and writes each piece to disk.

    The pieces are named "<original name>_part_<k><original extension>", with
    k counting from 1, and are exported in the format given by the original
    file extension.

    Args:
        audio_file_path (str): Path to the input audio file.
        timestamps (list): Cut positions in seconds.
        output_dir (str): Directory receiving the pieces; created if missing.
    """
    audio = AudioSegment.from_file(audio_file_path)
    filename, ext = os.path.splitext(os.path.basename(audio_file_path))

    # Cut positions in milliseconds, framed by the start and end of the audio
    boundaries_ms = [0]
    boundaries_ms.extend(int(seconds * 1000) for seconds in timestamps)
    boundaries_ms.append(len(audio))

    os.makedirs(output_dir, exist_ok=True)

    # Walk over consecutive (start, end) pairs; part numbers are 1-based
    consecutive_pairs = zip(boundaries_ms, boundaries_ms[1:])
    for part_number, (start_ms, end_ms) in enumerate(consecutive_pairs, start=1):
        piece = audio[start_ms:end_ms]
        piece_path = os.path.join(output_dir, f"{filename}_part_{part_number}{ext}")
        piece.export(piece_path, format=ext[1:])  # Use original file extension

    print(f"Audio file '{filename}{ext}' split into {len(boundaries_ms) - 1} segments and saved to '{output_dir}'")




In [55]:
audio_dir = "/content/continuous_speeches"  # Directory containing the audio files
output_dir= "/content/cut_audios_by_emotions"
predicted_values=[]

# Detect boundaries for every recording listed in the ground-truth table,
# cut the recording at those boundaries and remember the boundaries.
# NOTE(review): rows whose path is not .mp3/.wav are skipped here, which would
# shift predicted_values relative to the row-indexed `gt` list; confirm that
# every 'conti_path' is an audio file.
for filename in df_sorted["conti_path"]:
    if not filename.endswith((".mp3", ".wav")):  # Add other audio extensions if needed
        continue

    audio_file_path = filename  # 'conti_path' is used directly as the file path
    print(f"Processing audio file: {audio_file_path}")

    timesegs = detect_audio_change_points(audio_file_path, Verbose=False)
    split_audio_by_timestamps(audio_file_path, timesegs, output_dir)
    predicted_values.append(timesegs)

Processing audio file: /content/continuous_speeches/combined_speech_Ses01F_impro01.wav
Audio file 'combined_speech_Ses01F_impro01.wav' split into 33 segments and saved to '/content/cut_audios_by_emotions'
Processing audio file: /content/continuous_speeches/combined_speech_Ses01F_impro02.wav
Audio file 'combined_speech_Ses01F_impro02.wav' split into 44 segments and saved to '/content/cut_audios_by_emotions'
Processing audio file: /content/continuous_speeches/combined_speech_Ses01F_impro03.wav
Audio file 'combined_speech_Ses01F_impro03.wav' split into 40 segments and saved to '/content/cut_audios_by_emotions'
Processing audio file: /content/continuous_speeches/combined_speech_Ses01F_impro04.wav
Audio file 'combined_speech_Ses01F_impro04.wav' split into 60 segments and saved to '/content/cut_audios_by_emotions'
Processing audio file: /content/continuous_speeches/combined_speech_Ses01F_impro05.wav
Audio file 'combined_speech_Ses01F_impro05.wav' split into 53 segments and saved to '/content

In [64]:
def evaluate_boundaries(ground_truth, predicted, tolerance):
    """
    Score predicted boundaries against ground-truth boundaries.

    A ground-truth boundary counts as matched when its nearest predicted
    boundary lies within `tolerance`. Predicted boundaries that match nothing
    are not penalized (false positives are ignored).

    Args:
        ground_truth (list or array-like): Times of true emotion-change boundaries.
        predicted (list or array-like): Times of predicted emotion-change boundaries.
        tolerance (float): Largest allowed distance (same unit as the times)
                           between a true boundary and its nearest prediction.

    Returns:
        coverage (float): Fraction of ground-truth boundaries that were matched.
                          1.0 when there are no ground-truth boundaries, 0.0
                          when there are no predictions.
        avg_offset (float): Mean absolute distance over the matched boundaries,
                            0.0 when nothing matched.
    """
    truth = np.array(ground_truth)
    guesses = np.array(predicted)

    # Degenerate inputs: nothing to find, or nothing to find it with
    if len(truth) == 0:
        return 1.0, 0.0
    if len(guesses) == 0:
        return 0.0, 0.0

    hits = 0
    offset_sum = 0.0
    for boundary in truth:
        # Distance from this true boundary to the nearest prediction
        nearest = np.abs(guesses - boundary).min()
        if not (nearest <= tolerance):
            continue
        hits += 1
        offset_sum += nearest

    coverage = hits / len(truth)
    avg_offset = offset_sum / hits if hits > 0 else 0.0
    return coverage, avg_offset


# Evaluate on the first 29 recordings only
ground_truth_values = gt[0:29]
predicted_Values = predicted_values[0:29]

# Tolerances in seconds; the very large last value matches every boundary
possible_tolerances = [0.25,0.5,0.75,1,1.5,2,10000]

for tol in possible_tolerances:
  Hit_rates = []
  offsets = []
  # One (hit rate, average offset) pair per recording at this tolerance
  for i, predicted_boundaries in enumerate(predicted_Values):
    Hit_rate, avg_offset = evaluate_boundaries(ground_truth_values[i],
                                               predicted_boundaries,
                                               tolerance=tol)
    Hit_rates.append(Hit_rate)
    offsets.append(avg_offset)

  # Report the plain mean over recordings
  mean_hit_rate_percent = sum(Hit_rates)/len(Hit_rates)*100
  mean_offset = sum(offsets)/len(offsets)
  print("Hit rate with tolerance:", tol, "sec is: ", mean_hit_rate_percent,"% , and offset is: ",mean_offset," sec.")

Hit rate with tolerance: 0.25 sec is:  19.395525251338082 % , and offset is:  0.12137459060653036  sec.
Hit rate with tolerance: 0.5 sec is:  37.32274414091873 % , and offset is:  0.23956157292945213  sec.
Hit rate with tolerance: 0.75 sec is:  53.49961700149593 % , and offset is:  0.3585196960154593  sec.
Hit rate with tolerance: 1 sec is:  65.4925966427859 % , and offset is:  0.45337949615437634  sec.
Hit rate with tolerance: 1.5 sec is:  81.4537776513945 % , and offset is:  0.6016580941102793  sec.
Hit rate with tolerance: 2 sec is:  89.42502052174461 % , and offset is:  0.7022455400650227  sec.
Hit rate with tolerance: 10000 sec is:  100.0 % , and offset is:  0.9495607383295213  sec.


In [58]:
import csv
import os
from pydub import AudioSegment

def create_audio_csv(cut_audio_dir, continuous_speeches_dir, csv_file_path):
    """
    Creates a CSV file with information about cut audio files.

    One row is written per ".wav" file in `cut_audio_dir`, in sorted filename
    order, with the columns 'file_path', 'root_file' and 'length'.

    Args:
        cut_audio_dir (str): Directory holding the "<root>_part_<k>.wav" segments.
        continuous_speeches_dir (str): Directory holding the "<root>.wav" recordings
                                       the segments were cut from.
        csv_file_path (str): Path of the CSV file to write (overwritten).

    Row contents:
        - 'root_file' is the path of the source recording, or 'Not Found' when
          that recording does not exist in `continuous_speeches_dir`.
        - 'length' is the segment duration in seconds.
        - When the segment itself cannot be read, 'root_file' is 'Error' and
          'length' is 0.
    """

    with open(csv_file_path, 'w', newline='') as csvfile:
        fieldnames = ['file_path', 'root_file', 'length']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        # Sorted so the CSV row order does not depend on the filesystem
        for filename in sorted(os.listdir(cut_audio_dir)):
            if not filename.endswith('.wav'):  # Adjust file extension if needed
                continue

            cut_audio_path = os.path.join(cut_audio_dir, filename)

            # Root name is everything before the LAST "_part_", so a root that
            # itself contains "_part_" is still recovered correctly.
            root_filename = filename.rsplit('_part_', 1)[0] + '.wav'  # Adjust if needed
            root_file_path = os.path.join(continuous_speeches_dir, root_filename)

            # Actually verify the root recording; previously the "not found"
            # branch could never be reached for a missing root file.
            if not os.path.isfile(root_file_path):
                print(f"Warning: Root file not found for {filename}")
                root_file_path = 'Not Found'

            try:
                audio = AudioSegment.from_file(cut_audio_path)
                length = len(audio) / 1000  # Length in seconds
            except Exception as e:
                print(f"Error processing {filename}: {e}")
                writer.writerow({'file_path': cut_audio_path, 'root_file': 'Error', 'length': 0})
                continue

            writer.writerow({'file_path': cut_audio_path, 'root_file': root_file_path, 'length': length})

# Index every exported segment (path, source recording, duration) in one CSV.
# Replace the paths below with your actual locations if they differ.
cut_audio_dir = "/content/cut_audios_by_emotions"
continuous_speeches_dir = "/content/continuous_speeches"
csv_file_path = "/content/cut_audio_info.csv"

create_audio_csv(
    cut_audio_dir=cut_audio_dir,
    continuous_speeches_dir=continuous_speeches_dir,
    csv_file_path=csv_file_path,
)

In [62]:
import csv

def create_csv_with_predictions(df_sorted, predicted_values, ground_truth_values, csv_file_path):
    """
    Writes one CSV row per recording with its path, predicted boundaries,
    ground-truth boundaries and ground-truth emotions.

    Rows are paired by position; the number of rows is the length of the
    shortest of the four inputs (the 'conti_path' column, `predicted_values`,
    `ground_truth_values`, the 'emotions' column). Any exception is caught and
    reported with a printed message instead of being raised.
    """
    columns = ['directory', 'predicted_values', 'ground_truth_values',"ground_truth_emotions"]
    try:
        with open(csv_file_path, 'w', newline='') as out_file:
            writer = csv.DictWriter(out_file, fieldnames=columns)
            writer.writeheader()

            # Stop at the shortest input so every row is complete
            row_count = min(
                len(df_sorted["conti_path"]),
                len(predicted_values),
                len(ground_truth_values),
                len(df_sorted["emotions"]),
            )
            paths, emotions = df_sorted["conti_path"], df_sorted["emotions"]

            for position in range(row_count):
                row_values = (
                    paths.iloc[position],
                    predicted_values[position],
                    ground_truth_values[position],
                    emotions.iloc[position],
                )
                writer.writerow(dict(zip(columns, row_values)))
        print(f"CSV file '{csv_file_path}' created successfully.")
    except Exception as error:
        print(f"An error occurred while creating the CSV file: {error}")


# Save the evaluated subset (predicted_Values / ground_truth_values) next to
# the recording paths and emotions taken from df_sorted.
csv_file_path = "/content/segmentationPredictions.csv"  # Replace with your desired file path
create_csv_with_predictions(
    df_sorted=df_sorted,
    predicted_values=predicted_Values,
    ground_truth_values=ground_truth_values,
    csv_file_path=csv_file_path,
)

CSV file '/content/segmentationPredictions.csv' created successfully.


In [60]:
!zip -r /content/cut_audios_by_emotions.zip /content/cut_audios_by_emotions

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  adding: content/cut_audios_by_emotions/combined_speech_Ses04F_script01_3_part_53.wav (deflated 17%)
  adding: content/cut_audios_by_emotions/combined_speech_Ses01M_impro05_part_5.wav (deflated 19%)
  adding: content/cut_audios_by_emotions/combined_speech_Ses04M_script01_2_part_31.wav (deflated 20%)
  adding: content/cut_audios_by_emotions/combined_speech_Ses05F_impro06_part_46.wav (deflated 42%)
  adding: content/cut_audios_by_emotions/combined_speech_Ses03F_impro05_part_15.wav (deflated 16%)
  adding: content/cut_audios_by_emotions/combined_speech_Ses01M_script03_1_part_18.wav (deflated 30%)
  adding: content/cut_audios_by_emotions/combined_speech_Ses04M_impro08_part_35.wav (deflated 31%)
  adding: content/cut_audios_by_emotions/combined_speech_Ses04M_impro06_part_59.wav (deflated 38%)
  adding: content/cut_audios_by_emotions/combined_speech_Ses05M_script01_3_part_19.wav (deflated 22%)
  adding: content/cut_audios_by_e

In [61]:
# Move the archive of segments into the mounted Google Drive folder.
!mv /content/cut_audios_by_emotions.zip /content/drive/MyDrive