<a href="https://colab.research.google.com/github/GemmaGorey/Dissertation/blob/main/Dissertation_GG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Initial Colab setup below – run the top two cells only once per session.**


In [None]:
# Install condacolab so that conda/mamba can be used for package management instead of pip.
# NOTE(review): condacolab's documentation says install() restarts the kernel, so this
# presumably needs to stay in its own cell and be run once per session - confirm.
!pip install -q condacolab
import condacolab
condacolab.install()

In [None]:
# Create the config file and build the environment.
# The conda environment spec is kept inline as a string and written to
# environment.yml, so the notebook is self-contained.
# NOTE(review): the spec has no entry for transformers, matplotlib or tqdm, which a
# later cell imports - presumably those come from the Colab runtime; confirm.
# NOTE(review): this builds a separate 'dissertation' environment. It looks like the
# notebook kernel keeps running in its own environment, so the versions pinned here
# may not be the ones the cells below import - verify (the condacolab docs suggest
# updating the `base` environment instead).
yaml_content = """
name: dissertation
channels:
  - pytorch
  - conda-forge
dependencies:
  - python=3.11
  - pytorch=2.2.2
  - torchvision=0.17.2
  - torchaudio
  - librosa
  - numpy<2
  - pandas
  - jupyter
  - wandb
"""

# Write the string content to a file -  'environment.yml'.
with open('environment.yml', 'w') as f:
    f.write(yaml_content)

print("environment.yml file created successfully.")

# create the environment using mamba from the yml file.
print("\n Creating environment")

# The "ready to use" message is chained with && so it is only echoed when mamba exits successfully.
!mamba env create -f environment.yml --quiet && echo -e "\n 'dissertation' environment is ready to use."

In [None]:
# imports and setting up of GitHub and W&B

# clone project repository from GitHub
print("⏳ Cloning GitHub repository...")
!git clone https://github.com/GemmaGorey/Dissertation.git
print("Repository cloned.")

#Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

#imports
import pandas as pd
import librosa
import os
import numpy as np
import matplotlib.pyplot as plt
import librosa.display
from transformers import AutoTokenizer
from tqdm.auto import tqdm
import torch

print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased') #loading the tokenizer for lyrics processing
print("Tokenizer loaded.")

In [None]:
# Load the MERGE bimodal "complete" dataset tables (the variant with both lyrics and music).

# Folder on Google Drive where the dataset lives
data_path = '/content/drive/MyDrive/dissertation/MERGE_Bimodal_Complete/'

# Song metadata table
print("Loading MERGE Metadata ")
metadata_csv = os.path.join(data_path, 'merge_bimodal_complete_metadata.csv')
merge_df = pd.read_csv(metadata_csv)

# Valence / arousal annotation table
print("\n Loading Valence-Arousal values")
av_values_csv = os.path.join(data_path, 'merge_bimodal_complete_av_values.csv')
av_df = pd.read_csv(av_values_csv)

print("\n Datasets loaded successfully.")

# Preview the first rows of each table
print("\n First 5 rows of the data")
metadata_preview = merge_df.head()
display(metadata_preview)

print("\n First 5 rows of the MERGE AV Values")
av_preview = av_df.head()
display(av_preview)

In [None]:
# Join the metadata and valence/arousal tables on the song-id column they share.
# Later cells read 'Lyric_Song_x' from the result, i.e. they rely on pandas'
# default '_x'/'_y' suffixes for non-key columns present in both tables.
join_key = 'Audio_Song'
final_df = merge_df.merge(av_df, left_on=join_key, right_on=join_key)

print(" DataFrames merged")

print("\n First 5 rows  MASTER DataFrame")
master_preview = final_df.head()
display(master_preview)

In [None]:
# Sanity check: number of songs in each emotion quadrant
quadrant_counts = final_df['Quadrant'].value_counts()
print(quadrant_counts)

In [None]:
# Sanity check: summary statistics of the two regression targets
va_columns = ['Valence', 'Arousal']
va_summary = final_df[va_columns].describe()
print(va_summary)

In [None]:
# Check there are no blank entries: info() lists the non-null count per column.
# info() prints its report itself and returns None, so wrapping it in print()
# added a stray "None" line after the report - call it directly instead.
final_df[['Quadrant', 'Valence', 'Arousal']].info()

In [None]:
def load_song_data(song_id, lyric_id, quadrant, *,
                   base_path='/content/drive/MyDrive/dissertation/MERGE_Bimodal_Complete',
                   sample_rate=None, max_length=512, verbose=True):
    """
    Loads the audio and lyrics for a given song ID and turns them into model inputs.

    The audio is read from ``<base_path>/audio/<quadrant>/<song_id>.mp3`` and
    converted to a Mel spectrogram in decibels, scaled relative to that
    spectrogram's own maximum. The lyrics are read from
    ``<base_path>/lyrics/<quadrant>/<lyric_id>.txt`` and tokenised with the
    module-level ``tokenizer``.

    Parameters
    ----------
    song_id : audio file name without the ``.mp3`` extension.
    lyric_id : lyrics file name without the ``.txt`` extension.
    quadrant : name of the quadrant sub-folder that holds both files.
    base_path : root folder of the dataset (keyword-only; defaults to the
        Google Drive location used throughout this notebook).
    sample_rate : sample rate passed to ``librosa.load``. ``None`` (default)
        preserves each file's original rate; pass a number to resample every
        song to the same rate.
    max_length : number of tokens the lyrics are padded / truncated to.
    verbose : when False the per-song progress messages are suppressed.
        Errors are always printed.

    Returns
    -------
    (db_spectrogram, encoded_lyrics) on success, or (None, None) if anything
    went wrong while loading or processing (the error is printed, not raised).
    """
    if verbose:
        print(f"Attempting to load song: {song_id}")
    try:
        # Audio files are in subfolders for each emotion quadrant
        audio_path = os.path.join(base_path, 'audio', quadrant, f"{song_id}.mp3")

        # Lyric files are in a separate main folder, also split by quadrant
        lyrics_path = os.path.join(base_path, 'lyrics', quadrant, f"{lyric_id}.txt")

        # Read the lyrics first: it is cheap, so a missing lyrics file is
        # reported before any time is spent decoding the audio.
        with open(lyrics_path, 'r', encoding='utf-8') as f:
            lyrics_text = f.read()

        # load the audio file (sample_rate=None preserves the original sample rate)
        audio_waveform, loaded_sample_rate = librosa.load(audio_path, sr=sample_rate)

        # Process audio into a Mel spectrogram
        mel_spectrogram = librosa.feature.melspectrogram(y=audio_waveform, sr=loaded_sample_rate)

        # Convert to decibels
        db_spectrogram = librosa.power_to_db(mel_spectrogram, ref=np.max)

        # TODO: check all these parameters and see if these are standard or if anything needs to change

        # Tokenise raw text; every song is padded / truncated to max_length tokens
        encoded_lyrics = tokenizer(
            lyrics_text,
            padding='max_length',
            truncation=True,
            max_length=max_length,
            return_tensors='pt'
        )

        if verbose:
            print(f"Successfully loaded and processed {song_id}")
        # returns the spectrogram and the tokenised lyrics
        return db_spectrogram, encoded_lyrics

    except Exception as e:
        # Deliberately best-effort: callers skip songs that come back as (None, None).
        print(f"An error occurred for {song_id}: {e}")
        return None, None


In [None]:

# Smoke test: run load_song_data on one song and inspect both outputs.

# row position to try (pick a song between 0 and 2215)
test_song_index = 1111

# look the row up once and read the three identifiers from it
test_row = final_df.iloc[test_song_index]
test_audio_id = test_row['Audio_Song']
test_lyric_id = test_row['Lyric_Song_x']
test_quadrant = test_row['Quadrant']

print(f"Audio ID: {test_audio_id}")
print(f"Lyric ID: {test_lyric_id}")
print(f"Quadrant: {test_quadrant}")

# spectrogram and tokenised lyrics for the chosen song (both None if loading failed)
spectrogram, encoded_lyrics = load_song_data(test_audio_id, test_lyric_id, test_quadrant)


# --- lyrics output ---
if encoded_lyrics is not None:
    print("\n--- Tokenized Lyrics Output ---")

    # the tokenizer result: a dictionary of tensors
    print("Encoded Tensors:")
    display(encoded_lyrics)

    # map the ids back to readable tokens as a sanity check
    first_sequence_ids = encoded_lyrics['input_ids'][0]
    tokens = tokenizer.convert_ids_to_tokens(first_sequence_ids)

    print("\nSample of Tokens Produced:")
    print(tokens[:50])

# --- audio output ---
if spectrogram is not None:
    print("\n--- Spectrogram Data ---")
    print(f"Shape: {spectrogram.shape}")

    print("\nDisplaying Spectrogram:")
    fig, ax = plt.subplots(figsize=(12, 4))

    # load_song_data does not return the sample rate, so read it from the
    # audio file again; it is only needed to label the plot axes.
    audio_file_path = os.path.join('/content/drive/MyDrive/dissertation/MERGE_Bimodal_Complete/audio', test_quadrant, f'{test_audio_id}.mp3')
    sr = librosa.get_samplerate(path=audio_file_path)

    mesh = librosa.display.specshow(spectrogram, sr=sr, x_axis='time', y_axis='mel', ax=ax)

    fig.colorbar(mesh, ax=ax, format='%+2.0f dB')
    ax.set_title(f'Mel Spectrogram for {test_audio_id}')
    fig.tight_layout()
    plt.show()

In [None]:

# Process every song in final_df, save the model inputs to Google Drive and
# write a master CSV that records where each file went plus its labels.

#define where to save the files
base_path = '/content/drive/MyDrive/dissertation/output_from_code/'
spectrogram_save_dir = os.path.join(base_path, 'processed_spectrograms/')
lyrics_save_dir = os.path.join(base_path, 'processed_lyrics/')

# Create the directories if they don't exist
os.makedirs(spectrogram_save_dir, exist_ok=True)
os.makedirs(lyrics_save_dir, exist_ok=True)

print(f"Spectrograms will be saved in: {spectrogram_save_dir}")
print(f"Lyric tensors will be saved in: {lyrics_save_dir}")

# one record per successfully processed song
processed_records = []
# ids of songs that could not be loaded or saved, so dropped rows are reported
# at the end instead of being buried in the per-song output
failed_song_ids = []

# Column order of the master file. Passing it explicitly means the CSV still gets
# a header row when no song was processed, so it can always be read back.
record_columns = ['song_id', 'spectrogram_path', 'lyrics_path', 'valence', 'arousal', 'quadrant']

#iterate over all rows with a progress bar
for _index, row in tqdm(final_df.iterrows(), total=final_df.shape[0], desc="Processing Songs"):

    #get song info from each row
    song_id = row['Audio_Song']
    lyric_id = row['Lyric_Song_x']
    quadrant = row['Quadrant']

    #call function to transform
    spectrogram, encoded_lyrics = load_song_data(song_id, lyric_id, quadrant)

    #load_song_data returns (None, None) on failure and has already printed the error
    if spectrogram is None or encoded_lyrics is None:
        failed_song_ids.append(song_id)
        continue

    try:
        #save the spectrogram
        spectrogram_path = os.path.join(spectrogram_save_dir, f"{song_id}.npy")
        np.save(spectrogram_path, spectrogram)

        #save tokenised lyric tensors
        lyrics_path = os.path.join(lyrics_save_dir, f"{song_id}.pt")
        torch.save(encoded_lyrics, lyrics_path)

        #record file paths and labels; only reached when both files were written
        processed_records.append({
            'song_id': song_id,
            'spectrogram_path': spectrogram_path,
            'lyrics_path': lyrics_path,
            'valence': row['Valence'],
            'arousal': row['Arousal'],
            'quadrant': quadrant
        })
    except Exception as e:
        print(f"Error saving files for {song_id}: {e}")
        failed_song_ids.append(song_id)


#save new dataframe for using in experiments
processed_df = pd.DataFrame(processed_records, columns=record_columns)
master_file_path = os.path.join(base_path, 'master_processed_file_list.csv')
processed_df.to_csv(master_file_path, index=False)

print("\n PROCESSING COMPLETE!")
print(f"Saved {len(processed_df)} records.")
if failed_song_ids:
    print(f"Skipped {len(failed_song_ids)} songs that could not be loaded or saved: {failed_song_ids}")
print(f"Master file list saved to: {master_file_path}")
display(processed_df.head())

In [None]:

# Reload the master file list and spot-check the saved data for one song.

# location the processing cell wrote its outputs to
base_path = '/content/drive/MyDrive/dissertation/output_from_code/'
master_file_path = os.path.join(base_path, 'master_processed_file_list.csv')

# master csv: one row per processed song with its file paths and VA values
master_df = pd.read_csv(master_file_path)

print("Master csv loaded")
master_preview = master_df.head()
display(master_preview)

# row position to inspect (pick a song between 0 and 2215)
test_final_song_index = 1111
song_info = master_df.iloc[test_final_song_index]

print(f"\n--- Loading data for song: {song_info['song_id']} ---")

# spectrogram array saved with np.save
saved_spectrogram_path = song_info['spectrogram_path']
spectrogram = np.load(saved_spectrogram_path)

# tokenised lyrics saved with torch.save
# NOTE: weights_only=False unpickles arbitrary Python objects; only use it on
# files this notebook wrote itself, never on files from an untrusted source.
saved_lyrics_path = song_info['lyrics_path']
encoded_lyrics = torch.load(saved_lyrics_path, weights_only=False)

# regression labels for this song
valence, arousal = song_info['valence'], song_info['arousal']

# show what was loaded
print("Spectrogram Shape:", spectrogram.shape)
print("Encoded Lyrics Tensors:", encoded_lyrics)
print(f"Labels - Valence: {valence}, Arousal: {arousal}")
