In [1]:
# Note:

# The following files are the outputs of the Kaggle notebook titled "MELD: Removing Corrupted Records".
# They provide cleaned datasets by excluding corrupted records and include information about the removed entries.



# Corrupted Files Information:

# Corrupted videos: 

# {'train': [{'125_3': '/kaggle/input/meld-emotion-recognition/MELD.Raw/MELD.Raw/train/train_splits/dia125_utt3.mp4', 'y': 0, 'label': 'neutral'}], 
#  'dev': [{'110_7': '/kaggle/input/meld-emotion-recognition/MELD.Raw/MELD.Raw/dev/dev_splits_complete/dia110_utt7.mp4', 'y': 0, 'label': 'neutral'}], 
#  'test': []}



# Corrupted audios: 

# {'train': [{'125_3': '/kaggle/input/meld-audio/audio_train/dia125_utt3.wav', 'y': 0, 'label': 'neutral'}], 
#  'dev': [{'110_7': '/kaggle/input/meld-audio/audio_dev/dia110_utt7.wav', 'y': 0, 'label': 'neutral'}], 
#  'test': []}



# File Details:

# 1. "/kaggle/input/meld-emotion-recognition/JSON files/JSON files/Updated CSV/dev_sent_emo_cleaned.csv"

#    This file contains the cleaned version of dev_sent_emo.csv. Rows corresponding to corrupted video or audio
#    files were removed. Specifically, records with Dialogue_ID and Utterance_ID forming the key "110_7" were
#    excluded based on the corrupted file information in:
#    - Corrupted videos: /kaggle/input/meld-emotion-recognition/JSON files/JSON files/MELD Data Format/MELD_corrupted_video_data.json
#    - Corrupted audios: /kaggle/input/meld-emotion-recognition/JSON files/JSON files/MELD Data Format/MELD_corrupted_audio_data.json



# 2. "/kaggle/input/meld-emotion-recognition/JSON files/JSON files/Updated CSV/train_sent_emo_cleaned.csv"

#    This file contains the cleaned version of train_sent_emo.csv. Rows with Dialogue_ID and Utterance_ID
#    forming the key "125_3" were removed based on corrupted file data from the same sources as above.



# 3. "/kaggle/input/meld-emotion-recognition/JSON files/JSON files/Updated CSV/test_sent_emo_cleaned.csv"

#    This file is identical to test_sent_emo.csv, as no corrupted files (video or audio) were found in the "test"
#    split. Consequently, no records were removed.



# 4. "/kaggle/input/meld-emotion-recognition/JSON files/JSON files/Updated CSV/dev_sent_emo_removed.csv"

#    This file lists the records removed from dev_sent_emo.csv. The removed records correspond to Dialogue_ID and
#    Utterance_ID values of "110" and "7", respectively.



# 5. "/kaggle/input/meld-emotion-recognition/JSON files/JSON files/Updated CSV/train_sent_emo_removed.csv"

#    This file contains the records removed from train_sent_emo.csv. The excluded records correspond to Dialogue_ID
#    and Utterance_ID values of "125" and "3", respectively.



# 6. "/kaggle/input/meld-emotion-recognition/JSON files/JSON files/Updated CSV/test_sent_emo_removed.csv"

#    This file is empty, as no corrupted video or audio files were present in the "test" split. Thus, no records
#    were removed from test_sent_emo.csv.

In [2]:
# THE KAGGLE NOTEBOOK `MELD : Updated MELD Data JSON Files` IS USED TO UPDATE THE JSON FORMAT FILES FOR THE MELD DATASET 
# SO THAT THE ENTRIES CORRESPONDING TO THE RECORDS IN MELD_corrupted_video_data.json OR MELD_corrupted_audio_data.json, 
# WHICH REPRESENT THE CORRUPTED VIDEO OR AUDIO FILES IN THE MELD DATASET, ARE REMOVED.

# JSON FORMAT FILES FOR THE MELD DATASET THAT ARE TO BE UPDATED:

# 1. MELD_Data.json
# 2. MELD_Textual_Data.json
# 3. MELD_Video_Data.json
# 4. MELD_audio_data_updated.json

# THE JSON FILES USED FOR UPDATING THE ABOVE FILES:

# 1. MELD_corrupted_video_data.json
# 2. MELD_corrupted_audio_data.json

# THE OUTPUT FILES OBTAINED AFTER REMOVING CORRUPTED ENTRIES:

# 1. MELD_Data_Cleaned.json
# 2. MELD_Textual_Data_Cleaned.json
# 3. MELD_Video_Data_Cleaned.json
# 4. MELD_Audio_Data_Updated_Cleaned.json

---

# Functions to Save and Read Data in JSON Format Using Python

In [3]:
import json

In [4]:
def save_to_json(data, file_path):
    """
    Write a Python object to disk as pretty-printed JSON.

    The outcome is reported on stdout. Any exception raised while opening
    or writing the file is caught and printed instead of being propagated,
    so the function returns None in both the success and the failure case.

    Args:
    - data: Python object (e.g., dict or list) to serialize.
    - file_path: Destination path of the JSON file.
    """
    try:
        with open(file_path, "w") as out_handle:
            # indent=4 keeps the file human-readable
            json.dump(data, fp=out_handle, indent=4)
        print(f"Data successfully saved to {file_path}")
    except Exception as err:
        print(f"Error saving data to JSON: {err}")

In [5]:
def read_from_json(file_path):
    """
    Load and return the contents of a JSON file.

    The outcome is reported on stdout. Any exception raised while opening
    or parsing the file is caught and printed instead of being propagated.

    Args:
    - file_path: Path to the JSON file.

    Returns:
    - The decoded Python object (e.g., dict or list), or None if reading
      or parsing failed.
    """
    try:
        with open(file_path, "r") as in_handle:
            loaded = json.load(in_handle)
        print(f"Data successfully loaded from {file_path}")
        return loaded
    except Exception as err:
        print(f"Error reading data from JSON: {err}")
        return None

---

# Removing Corrupted Records from `MELD_Data.json` Based on `MELD_corrupted_video_data.json`

In [6]:
import json

In [7]:
# Function to remove inconsistent records from MELD_data
def remove_inconsistent_from_meld(meld_data_file, corrupted_data_file):
    """
    Load a MELD data JSON file and drop the records flagged as corrupted.

    Args:
    - meld_data_file: Path to a JSON file containing a dict with a "data"
      list. Each record in that list must provide the keys 'dialog',
      'utterance' and 'split'.
    - corrupted_data_file: Path to a JSON file containing a dict that maps a
      split name to a list of dicts. The FIRST key of each dict is taken as
      the "<dialog>_<utterance>" identifier of a corrupted record.

    Returns:
    - The dict loaded from meld_data_file, with its "data" list replaced by
      the records whose "<dialog>_<utterance>" key is not listed as corrupted
      for the record's own split. All other top-level keys are untouched.

    Raises:
    - OSError / json.JSONDecodeError if either file cannot be read or parsed.
    - KeyError if "data" is missing, or a record lacks 'dialog', 'utterance'
      or 'split'.
    """
    # JSON is read as UTF-8 explicitly so the result does not depend on the
    # platform's default locale encoding.
    with open(meld_data_file, 'r', encoding='utf-8') as file:
        meld_data = json.load(file)

    with open(corrupted_data_file, 'r', encoding='utf-8') as file:
        corrupted_data = json.load(file)

    def filter_meld_data(data, corrupted_entries):
        """Return the records of data["data"] that are not marked corrupted."""
        # Build the per-split lookup sets once, instead of once per record.
        # Empty dicts carry no identifier and are skipped rather than crashing.
        corrupted_keys_by_split = {
            split: {next(iter(item)) for item in items if item}
            for split, items in corrupted_entries.items()
        }

        filtered_data = []
        for entry in data["data"]:
            dia_utt_key = f"{entry['dialog']}_{entry['utterance']}"
            # A key is only corrupted within its own split; the same
            # dialog/utterance pair in another split is kept.
            if dia_utt_key in corrupted_keys_by_split.get(entry['split'], ()):
                continue
            filtered_data.append(entry)
        return filtered_data

    meld_data["data"] = filter_meld_data(meld_data, corrupted_data)

    return meld_data

In [8]:
# Example usage: clean MELD_Data.json against the list of corrupted videos.
# Locations of the two input files
MELD_Data = "/kaggle/input/meld-emotion-recognition/JSON files/JSON files/MELD Data Format/MELD_Data.json"
MELD_corrupted_video_data = "/kaggle/input/meld-emotion-recognition/JSON files/JSON files/MELD Data Format/MELD_corrupted_video_data.json"

# Load both files and drop the records flagged as corrupted
MELD_Data_Cleaned = remove_inconsistent_from_meld(
    meld_data_file=MELD_Data,
    corrupted_data_file=MELD_corrupted_video_data,
)

# Write the cleaned structure to a new JSON file (relative to the working directory)
with open("MELD_Data_Cleaned.json", mode='w') as file:
    json.dump(MELD_Data_Cleaned, fp=file, indent=4)

print("Inconsistent records removed. Cleaned MELD_data saved to 'MELD_Data_Cleaned.json'.")

Inconsistent records removed. Cleaned MELD_data saved to 'MELD_Data_Cleaned.json'.


---

# Read Cleaned MELD Data in JSON Format

In [9]:
# Example Usage: read the cleaned file back to confirm it was written correctly
if __name__ == "__main__":
    # Filepath of the cleaned file.
    # NOTE(review): the file is saved with the relative name
    # "MELD_Data_Cleaned.json"; this absolute path assumes the notebook's
    # working directory is /kaggle/working - confirm when running elsewhere.
    MELD_Data_Cleaned_path = "/kaggle/working/MELD_Data_Cleaned.json"

    # Read data from JSON (read_from_json returns None if loading failed)
    MELD_Data_Cleaned = read_from_json(MELD_Data_Cleaned_path)

    # Print a short summary instead of the full object: dumping every record
    # floods the cell output and bloats the saved notebook.
    if MELD_Data_Cleaned is None:
        loaded_summary = None
    else:
        loaded_summary = f"{len(MELD_Data_Cleaned['data'])} records"
    print("Loaded Data:", loaded_summary)

Data successfully loaded from /kaggle/working/MELD_Data_Cleaned.json


---

# Counting Dictionaries by Split Value in MELD_Data_Cleaned

In [10]:
if __name__ == "__main__":
    # Tally the records of the cleaned data by the value of their 'split' field
    cleaned_records = MELD_Data_Cleaned['data']
    train_count = len([rec for rec in cleaned_records if rec.get('split') == 'train'])
    dev_count = len([rec for rec in cleaned_records if rec.get('split') == 'dev'])
    test_count = len([rec for rec in cleaned_records if rec.get('split') == 'test'])

    # Report one line per split
    print("Number of dictionaries with split = 'train':", train_count)
    print("Number of dictionaries with split = 'dev':", dev_count)
    print("Number of dictionaries with split = 'test':", test_count)

Number of dictionaries with split = 'train': 9988
Number of dictionaries with split = 'dev': 1108
Number of dictionaries with split = 'test': 2610


---

# Removing Corrupted Records from `MELD_Textual_Data.json` Based on `MELD_corrupted_video_data.json`

In [11]:
# Function to remove corrupted video entries
def remove_corrupted_entries(corrupted_data, video_data):
    """
    Return a copy of split-organised data without the corrupted entries.

    Both arguments are dicts keyed by split name whose values are lists of
    dicts. The FIRST key of each inner dict is treated as its
    "<dialog>_<utterance>" identifier.

    Args:
    - corrupted_data: Mapping split -> list of corrupted entries.
    - video_data: Mapping split -> list of data entries to filter. Despite
      the parameter name, the notebook also passes textual and audio data.

    Returns:
    - A new dict with the same splits as video_data. For every split that
      lists at least one corrupted identifier, the value is a new list
      without the entries whose identifier is corrupted in that split.
      Splits with nothing to remove keep the original list object.

    Notes:
    - The input video_data is not modified, so the raw data stays available
      to the caller and re-running the calling cell gives the same result.
    - Splits named in corrupted_data but absent from video_data are ignored.
    - Empty dicts (no identifier) are never treated as matches.
    """
    # Shallow copy: only the top-level mapping is rebound below, never the
    # caller's own dict or its lists.
    cleaned_data = dict(video_data)

    for split, corrupted_entries in corrupted_data.items():
        if split not in cleaned_data:
            continue

        # Collect all corrupted identifiers of this split once, then filter
        # the split in a single pass.
        corrupted_keys = {next(iter(item)) for item in corrupted_entries if item}
        if not corrupted_keys:
            continue

        cleaned_data[split] = [
            entry
            for entry in cleaned_data[split]
            if next(iter(entry), None) not in corrupted_keys
        ]

    return cleaned_data

In [12]:
# Read the list of corrupted videos (split -> corrupted entries)
with open('/kaggle/input/meld-emotion-recognition/JSON files/JSON files/MELD Data Format/MELD_corrupted_video_data.json', mode='r') as f:
    MELD_corrupted_video_data = json.load(fp=f)

# Read the textual data that is going to be cleaned
with open('/kaggle/input/meld-emotion-recognition/JSON files/JSON files/MELD Data Format/MELD_Textual_Data.json', mode='r') as f:
    MELD_Textual_Data = json.load(fp=f)

In [13]:
# Drop every textual entry whose ID is listed as a corrupted video
MELD_Textual_Data_Cleaned = remove_corrupted_entries(
    corrupted_data=MELD_corrupted_video_data,
    video_data=MELD_Textual_Data,
)

# Write the cleaned textual data to its own JSON file
with open('MELD_Textual_Data_Cleaned.json', mode='w') as f:
    json.dump(MELD_Textual_Data_Cleaned, fp=f, indent=4)

print("Corrupted entries removed and updated data saved to MELD_Textual_Data_Cleaned.json")

Corrupted entries removed and updated data saved to MELD_Textual_Data_Cleaned.json


In [14]:
if __name__ == "__main__":
    # Number of entries left in each split of the cleaned textual data
    train_count = len(MELD_Textual_Data_Cleaned['train'])
    dev_count = len(MELD_Textual_Data_Cleaned['dev'])
    test_count = len(MELD_Textual_Data_Cleaned['test'])

    # Report one line per split
    print("Number of dictionaries in 'train':", train_count)
    print("Number of dictionaries in 'dev':", dev_count)
    print("Number of dictionaries in 'test':", test_count)

Number of dictionaries in 'train': 9988
Number of dictionaries in 'dev': 1108
Number of dictionaries in 'test': 2610


---

# Removing Corrupted Records from `MELD_Video_Data.json` Based on `MELD_corrupted_video_data.json`

In [15]:
# Read the list of corrupted videos (split -> corrupted entries)
with open('/kaggle/input/meld-emotion-recognition/JSON files/JSON files/MELD Data Format/MELD_corrupted_video_data.json', mode='r') as f:
    MELD_corrupted_video_data = json.load(fp=f)

# Read the video data that is going to be cleaned
with open('/kaggle/input/meld-emotion-recognition/JSON files/JSON files/MELD Data Format/MELD_Video_Data.json', mode='r') as f:
    MELD_Video_Data = json.load(fp=f)

In [16]:
# Drop every video entry whose ID is listed as a corrupted video
MELD_Video_Data_Cleaned = remove_corrupted_entries(
    corrupted_data=MELD_corrupted_video_data,
    video_data=MELD_Video_Data,
)

# Write the cleaned video data to its own JSON file
with open('MELD_Video_Data_Cleaned.json', mode='w') as f:
    json.dump(MELD_Video_Data_Cleaned, fp=f, indent=4)

print("Corrupted entries removed and updated data saved to MELD_Video_Data_Cleaned.json")

Corrupted entries removed and updated data saved to MELD_Video_Data_Cleaned.json


In [17]:
if __name__ == "__main__":
    # Number of entries left in each split of the cleaned video data
    train_count = len(MELD_Video_Data_Cleaned['train'])
    dev_count = len(MELD_Video_Data_Cleaned['dev'])
    test_count = len(MELD_Video_Data_Cleaned['test'])

    # Report one line per split
    print("Number of dictionaries in 'train':", train_count)
    print("Number of dictionaries in 'dev':", dev_count)
    print("Number of dictionaries in 'test':", test_count)

Number of dictionaries in 'train': 9988
Number of dictionaries in 'dev': 1108
Number of dictionaries in 'test': 2610


---

# Removing Corrupted Records from `MELD_audio_data_updated.json` Based on `MELD_corrupted_video_data.json`

In [18]:
# Read the list of corrupted videos (split -> corrupted entries); the audio
# data is filtered against this same list
with open('/kaggle/input/meld-emotion-recognition/JSON files/JSON files/MELD Data Format/MELD_corrupted_video_data.json', mode='r') as f:
    MELD_corrupted_video_data = json.load(fp=f)

# Read the audio data that is going to be cleaned
with open('/kaggle/input/meld-emotion-recognition/JSON files/JSON files/MELD Data Format/MELD_audio_data_updated.json', mode='r') as f:
    MELD_audio_data_updated = json.load(fp=f)

In [19]:
# Drop every audio entry whose ID is listed as a corrupted video
MELD_Audio_Data_Updated_Cleaned = remove_corrupted_entries(
    corrupted_data=MELD_corrupted_video_data,
    video_data=MELD_audio_data_updated,
)

# Write the cleaned audio data to its own JSON file
with open('MELD_Audio_Data_Updated_Cleaned.json', mode='w') as f:
    json.dump(MELD_Audio_Data_Updated_Cleaned, fp=f, indent=4)

print("Corrupted entries removed and updated data saved to MELD_Audio_Data_Updated_Cleaned.json")

Corrupted entries removed and updated data saved to MELD_Audio_Data_Updated_Cleaned.json


In [20]:
if __name__ == "__main__":
    # Number of entries left in each split of the cleaned audio data
    train_count = len(MELD_Audio_Data_Updated_Cleaned['train'])
    dev_count = len(MELD_Audio_Data_Updated_Cleaned['dev'])
    test_count = len(MELD_Audio_Data_Updated_Cleaned['test'])

    # Report one line per split
    print("Number of dictionaries in 'train':", train_count)
    print("Number of dictionaries in 'dev':", dev_count)
    print("Number of dictionaries in 'test':", test_count)

Number of dictionaries in 'train': 9988
Number of dictionaries in 'dev': 1108
Number of dictionaries in 'test': 2610
