# Fine-tuning Notebook

### Clone the repositories

Clone the app repository (JasmiApp) and the fine-tuning repository (TracheoSpeech_ASR) into the current working directory. Make sure the Colab secrets `GITHUB_USERNAME` and `JasmiApp_TOKEN` are set to the corresponding values.


In [None]:
# Clone the TracheoSpeech_ASR repository (the fine-tuning code) into the current
# working directory; no credentials are passed for this clone.
!git clone https://github.com/Hobit2002/TracheoSpeech_ASR

In [None]:
from google.colab import userdata
try:
  git_branch = "main"
except (userdata.SecretNotFoundError, userdata.NotebookAccessError) as e:
  git_branch = "main"
try:
  github_username = userdata.get('GITHUB_USERNAME')
  github_token = userdata.get('JasmiApp_TOKEN')
  print("Loaded Github credentials from secrets.")
  print("Cloning repository...")
  !git clone -b "{git_branch}" "https://{github_username}:{github_token}@github.com/Hobit2002/JasmiApp.git"
  print("Clone done")
  del github_token
except (userdata.SecretNotFoundError, userdata.NotebookAccessError) as e:
  print("Could not Github credentials from secrets! Please enter them below:")
  !read -p "Your Github username: " github_username_sh; read -p "Your Github access token: " -s github_token_sh; echo "Cloneing repository"; git clone -b main "https://${github_username_sh}:${github_token_sh}@github.com/Hobit2002/JasmiApp.git"
  print("Clone done")
  github_username = ""
  github_token = ""

### Prepare the environment

In [None]:

! pip install -r JasmiApp/requirements.txt

### Update the config in TracheoSpeech_ASR

In [None]:
# Move JasmiApp's fine_tune_config.py to TracheoSpeech_ASR/asr/whisper_config.py,
# overwriting any file already at that path. Because the file is moved (not
# copied), a second run of this cell will not find the source file any more.
# Both paths are relative to the directory that holds the two clones.
!mv JasmiApp/fine_tuning/fine_tune_config.py TracheoSpeech_ASR/asr/whisper_config.py

### Download and extract the data

Upload your fine-tuning data collection to Google Drive, enable link sharing, and place the file ID extracted from the sharing link into the `gdown` command below.

In [None]:
%cd TracheoSpeech_ASR
!mkdir data
!gdown --id 1AVUwcq5vA81I5U5SuQJaj0xD-7dk8IVi -O data/TracheoSpeech.zip
!unzip data/TracheoSpeech.zip -d data/TracheoSpeech
!mv data/TracheoSpeech/public_dataset/* data/TracheoSpeech/

### Download the model
As with the dataset above, upload your current model checkpoint to Google Drive, enable link sharing, and place the file ID extracted from the sharing link into the `gdown` command below.

In [None]:
!mkdir artifacts
!mkdir artifacts/checkpoint
!gdown --id 1nGVpZwJW9cvPtjUNCts3Tu1B0ibIG6Q8 -O artifacts/checkpoint/base_adapted_patient.ckpt

### Download the MLM model

In [None]:
# Run the repository's download_data.py with the `mlm_model` argument.
# The relative script path relies on the working directory being
# TracheoSpeech_ASR (set by the earlier %cd).
!python download_data.py mlm_model

### Fine-tune

In [None]:
# Launch fine-tuning through asr/train_whisper.py, passing `base_adapted_patient`.
# NOTE(review): the argument equals the stem of the checkpoint downloaded to
# artifacts/checkpoint/ - presumably it selects that checkpoint; confirm against
# train_whisper.py.
!python asr/train_whisper.py base_adapted_patient

### Save trained model onto your Google Drive

In [None]:
import os
from google.colab import drive
from google.colab import auth
from googleapiclient.discovery import build
from googleapiclient.http import MediaFileUpload

# Authenticate this Colab session (so that `build('drive', 'v3')` below can pick
# up credentials) and mount Google Drive at /content/drive.
auth.authenticate_user()
drive.mount('/content/drive')


local_directory = '/content/TracheoSpeech_ASR/artifacts/checkpoint'
drive_folder_name = 'TracheoSpeech_ASR_Checkpoints'
DRIVE_FOLDER_MIME_TYPE = 'application/vnd.google-apps.folder'


def _find_newest_file(directory):
    """Return the name of the most recently modified regular file in `directory`.

    Sub-directories are ignored. Returns None when the directory contains no
    regular files.

    Raises:
        FileNotFoundError: if `directory` does not exist.
    """
    candidates = [
        name for name in os.listdir(directory)
        if os.path.isfile(os.path.join(directory, name))
    ]
    if not candidates:
        return None
    return max(candidates, key=lambda name: os.path.getmtime(os.path.join(directory, name)))


def _get_or_create_drive_folder(service, folder_name):
    """Return the ID of the Drive folder named `folder_name`, creating it if absent.

    When several non-trashed folders share the name, the first one returned by
    the Drive API is used.
    """
    # Backslashes and single quotes must be escaped inside a Drive query literal,
    # otherwise a folder name containing them produces a malformed query.
    escaped_name = folder_name.replace('\\', '\\\\').replace("'", "\\'")
    query = f"name='{escaped_name}' and mimeType='{DRIVE_FOLDER_MIME_TYPE}' and trashed=false"
    results = service.files().list(q=query, fields="files(id, name)").execute()
    items = results.get('files', [])

    if items:
        folder_id = items[0]['id']
        print(f"Found folder '{folder_name}' with ID: {folder_id}")
        return folder_id

    # Create the folder instead of giving up, so a finished training run is not
    # left un-saved just because the destination does not exist yet.
    print(f"Folder '{folder_name}' not found in Google Drive.")
    folder_metadata = {'name': folder_name, 'mimeType': DRIVE_FOLDER_MIME_TYPE}
    created = service.files().create(body=folder_metadata, fields='id').execute()
    folder_id = created.get('id')
    print(f"Created folder '{folder_name}' with ID: {folder_id}")
    return folder_id


# Upload the newest file from the checkpoint directory to the Drive folder.
try:
    newest_file = _find_newest_file(local_directory)
    if newest_file is None:
        print(f"No files found in {local_directory}")
    else:
        newest_file_path = os.path.join(local_directory, newest_file)
        print(f"Newest file found: {newest_file}")

        # Build the Drive API service
        drive_service = build('drive', 'v3')

        # Find (or create) the destination folder in Google Drive
        drive_folder_id = _get_or_create_drive_folder(drive_service, drive_folder_name)

        # Upload the file to Google Drive
        file_metadata = {'name': newest_file, 'parents': [drive_folder_id]}
        media = MediaFileUpload(newest_file_path, resumable=True)
        file = drive_service.files().create(body=file_metadata, media_body=media, fields='id').execute()
        print(f"File ID: {file.get('id')}")
        print(f"File '{newest_file}' uploaded to Google Drive folder '{drive_folder_name}'.")

except FileNotFoundError:
    print(f"Local directory not found: {local_directory}")
except Exception as e:
    # Any other failure (e.g. a Drive API error) is reported rather than raised.
    print(f"An error occurred: {e}")