## No need to run again

In [None]:
import io
import os
import tarfile
import tempfile
import torch
import re
import json
import boto3
import matplotlib.pyplot as plt
import requests
import torchaudio
from botocore import UNSIGNED
from botocore.config import Config
from IPython.display import Audio
from torchaudio.utils import download_asset
import IPython
from urllib import request
import IPython.display as ipd
import numpy as np
import random
from datasets import Dataset
import pandas as pd



from transformers import Wav2Vec2CTCTokenizer
from transformers import Wav2Vec2FeatureExtractor
from transformers import Wav2Vec2Processor


from transformers import TrainingArguments,Trainer
from IPython.display import clear_output


In [None]:
# Useful functions

def plot_waveform(waveform, sample_rate):
    """Plot every channel of ``waveform`` against time in seconds.

    Args:
        waveform: Object exposing ``.numpy()`` that yields a 2-D array
            unpacking as ``(num_channels, num_frames)``.
        sample_rate: Sampling rate used to convert frame indices to seconds.
    """
    samples = waveform.numpy()
    channel_count, frame_count = samples.shape
    seconds = torch.arange(0, frame_count) / sample_rate

    fig, panels = plt.subplots(channel_count, 1)
    # plt.subplots returns a bare Axes (not a sequence) for a single row.
    if channel_count == 1:
        panels = [panels]
    for c, (panel, channel) in enumerate(zip(panels, samples)):
        panel.plot(seconds, channel, linewidth=1)
        panel.grid(True)
        # Channel labels are only useful when there is more than one panel.
        if channel_count > 1:
            panel.set_ylabel(f"Channel {c+1}")
    fig.suptitle("waveform")
    plt.show(block=False)


def plot_specgram(waveform, sample_rate, title="Spectrogram"):
    """Draw one spectrogram panel per channel of ``waveform``.

    Args:
        waveform: Object exposing ``.numpy()`` that yields a 2-D array
            unpacking as ``(num_channels, num_frames)``.
        sample_rate: Sampling rate passed to ``Axes.specgram`` as ``Fs``.
        title: Figure title.
    """
    samples = waveform.numpy()
    channel_count, _ = samples.shape

    fig, panels = plt.subplots(channel_count, 1)
    # plt.subplots returns a bare Axes (not a sequence) for a single row.
    if channel_count == 1:
        panels = [panels]
    for c, (panel, channel) in enumerate(zip(panels, samples)):
        panel.specgram(channel, Fs=sample_rate)
        if channel_count > 1:
            panel.set_ylabel(f"Channel {c+1}")
    fig.suptitle(title)
    plt.show(block=False)



# Punctuation and stray symbols stripped from transcripts before the
# vocabulary is built.  Kept as a plain character list so nothing has to be
# escaped by hand.
_CHARS_TO_REMOVE = ",()/?.!-;:\"“%‘”�'’ː"

# Character class matching any single character listed above.  re.escape takes
# care of "-" and friends, which avoids the invalid "\," style escape
# sequences a hand-written non-raw pattern would need.
chars_to_remove_regex = "[" + re.escape(_CHARS_TO_REMOVE) + "]"


def remove_special_characters(batch):
    """Return ``batch`` lower-cased with all listed special characters removed.

    Args:
        batch: A single transcript string (despite the name, not a batch).

    Returns:
        The cleaned, lower-cased string.
    """
    return re.sub(chars_to_remove_regex, '', batch).lower()

def replace_hatted_characters(batch):
    """Replace accented vowels (and the mu sign) in ``batch`` with plain letters.

    Args:
        batch: A single transcript string.

    Returns:
        The string with each listed character replaced by its plain letter.
    """
    # (pattern, replacement) pairs, applied in this order.
    replacements = (
        ('[â]', 'a'),
        ('[î]', 'i'),
        ('[ô]', 'o'),
        ('[û]', 'u'),
        ('[μ]', 'u'),
        ('[è]', 'e'),
        ('[ú]', 'u'),
        ('[ó]', 'o'),
        ('[í]', 'i'),
        ('[é]', 'e'),
    )
    for pattern, plain in replacements:
        batch = re.sub(pattern, plain, batch)
    return batch


## tokenizer
def extract_all_chars(batch):
  """Collect the distinct characters used across ``batch["sentence"]``.

  Returns a dict with two single-element lists: ``"vocab"`` wraps the list of
  distinct characters (in arbitrary order) and ``"all_text"`` wraps the
  sentences joined by single spaces.
  """
  joined = " ".join(batch["sentence"])
  distinct_chars = list(set(joined))
  result = {}
  result["vocab"] = [distinct_chars]
  result["all_text"] = [joined]
  return result



### obtain audio array
def audio_array(data, audio_dir="/content/train", target_rate=16000):
  """Load every clip named in ``data.path`` and resample it to ``target_rate``.

  Args:
    data: Table-like object exposing a ``path`` attribute that iterates over
      clip file names and supporting ``len()`` (a pandas DataFrame in this
      notebook).
    audio_dir: Directory the file names in ``data.path`` are relative to.
    target_rate: Sampling rate, in Hz, every waveform is resampled to.

  Returns:
    A list of dicts with the keys ``"array"`` (the waveform returned by
    ``torchaudio.load``, resampled when needed), ``"path"`` (full clip path)
    and ``"sampling_rate"``.  Clips that cannot be loaded are skipped, so the
    list may be shorter than ``data``; use ``"path"`` to match entries back to
    rows.
  """
  audio = []
  failed = []
  total = len(data)

  for index, value in enumerate(data.path):
    clip_path = f"{audio_dir}/{value}"
    print(f"total:{total}")
    print(index)
    try:
      # Read the local file directly.  torchaudio's download_asset is meant
      # for fetching tutorial assets and tries an HTTP download when the file
      # is missing, which is not wanted for local data.
      waveform, sample_rate = torchaudio.load(clip_path)
      if sample_rate != target_rate:
        waveform = torchaudio.functional.resample(waveform, sample_rate, target_rate)
    except Exception as err:
      # Best effort: one unreadable clip must not abort the whole pass.
      failed.append((clip_path, err))
      continue
    audio.append({"array": waveform, "path": clip_path, "sampling_rate": target_rate})
    clear_output(wait=True)

  # Report failures once at the end: anything printed inside the loop is wiped
  # by the next clear_output call.
  for clip_path, err in failed:
    print(f"failed to load {clip_path}: {err}")

  return audio

#testing random data audio
def test_data_audio(data,position=0):
  rand_int = random.randint(0, len(data)-1) if position == 0 else position
  print(data.sentence[rand_int])
  return data.audio[rand_int]["array"].cpu()



def train_test_generator(data):
  """Turn a frame of loaded clips and transcripts into a model-ready ``Dataset``.

  Args:
    data: pandas DataFrame with an ``audio`` column (dicts holding ``"array"``
      and ``"sampling_rate"``) and a ``sentence`` column.  It is not modified.

  Returns:
    A ``datasets.Dataset`` with the columns ``input_values``, ``input_length``
    and ``labels``, one row per row of ``data``.

  Relies on the module-level ``processor``.
  """
  # Work on a re-indexed copy: resetting the index in place would mutate the
  # caller's frame and add an extra "index" column to it on every call.
  data = data.reset_index(drop=True)
  total = len(data)

  input_values = []
  input_length = []
  labels = []
  for i, (audio, sentence) in enumerate(zip(data["audio"], data["sentence"])):
      print(f"total:{total}")
      print(i)

      # batched output is "un-batched"
      features = processor(audio["array"].cpu().numpy(), sampling_rate=audio["sampling_rate"])
      input_values.append(features.input_values[0])
      input_length.append(len(input_values[-1]))
      # Inside this context the processor is used for the target text.
      with processor.as_target_processor():
          labels.append(processor(sentence).input_ids)

      clear_output(wait=True)

  train = pd.DataFrame({})
  train["input_values"] = input_values
  train["input_length"] = input_length
  train["labels"] = labels

  print(train.head())
  # Convert pandas DataFrame to datasets.Dataset
  train = Dataset.from_pandas(train)

  return train



In [None]:
# Environment check: report the library versions, fix the torch RNG seed and
# select the device used later in the notebook.
for _lib in (torch, torchaudio):
    print(_lib.__version__)

torch.random.manual_seed(0)

# Use the GPU when one is visible to torch, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

print(device)

In [None]:

# Direct-download link for the training archive.
# NOTE(review): the query string carries what look like session-specific
# tokens (uuid / at) — the link may stop working; confirm before relying on it.
FIRST_URL="https://drive.usercontent.google.com/download?id=1fZEwGM7kbQb-jU2jHiMSTQVjR0s91n_L&export=download&authuser=0&confirm=t&uuid=44d4a06f-30f3-4247-91cb-9df1f09233d6&at=APZUnTWQcri3LUD1pbohPW7-szwV:1699200614451"

# Local name of the archive.  It cannot be derived from the URL, whose last
# path component is just "download".
FIRST_FILENAME = "train.tar.gz"

if os.path.isfile(FIRST_FILENAME):
    print("Found", FIRST_FILENAME, ", skipping download.")
else:
    print("Downloading...")
    # Download under a temporary name and rename on success, so an interrupted
    # download is never mistaken for a complete archive on the next run.
    partial_filename = FIRST_FILENAME + ".part"
    request.urlretrieve(FIRST_URL, partial_filename)
    os.replace(partial_filename, FIRST_FILENAME)



In [None]:
import os

In [None]:
os.listdir("/content/train")


In [None]:
data = pd.read_csv("mod_train.csv")

In [None]:
# Drop every manifest row whose clip is not present under /content/train.
# A plain file-existence test is used: torchaudio's download_asset would try
# to fetch each missing file over HTTP before failing.
clip_present = data["path"].map(
    lambda clip_name: os.path.isfile(f"/content/train/{clip_name}")
)
missing_rows = data.index[~clip_present.to_numpy(dtype=bool)]
for i in missing_rows:
  print(f"missing clip: /content/train/{data.at[i, 'path']}")
  print(i)
# One drop for all missing rows instead of one in-place drop per row.
data.drop(missing_rows, axis=0, inplace=True)

In [None]:
data.to_csv("mod_train.csv",index=False)

In [None]:
data.shape

In [None]:
! tar -xf "/content/drive/MyDrive/Copy of train0.tar.gz"

In [None]:
# Load a 1000-row slice of the cleaned manifest.  skiprows=14000 also skips
# the header line, so pandas takes the first remaining line as the header;
# the real column names are therefore assigned by hand right after.
# (An earlier version read /content/corpus/sw/train.tsv directly, and an
# invalidated.tsv "test_data" frame was cleaned the same way.)
data = pd.read_csv("mod_train.csv", skiprows=14000, nrows=1000)
data.columns = ['client_id', 'path', 'sentence', 'up_votes', 'down_votes', 'age','gender', 'accents', 'variant', 'locale', 'segment']

# Only the clip path and its transcript are needed from here on.
unused_columns = ["client_id","up_votes","down_votes","age","gender","accents","locale","segment","variant"]
data = data.drop(columns=unused_columns)

print(data.head())


# Clean the transcripts: strip unwanted characters (which also lower-cases),
# then replace hatted characters with plain letters.
data["sentence"] = data["sentence"].map(
    lambda text: replace_hatted_characters(str(remove_special_characters(str(text))))
)





In [None]:
 #Identifying all characters in the sentence
# Gather every distinct character that appears in the cleaned transcripts.
distinct_chars = set()
for sentence in data["sentence"]:
  distinct_chars.update(sentence)
vocab = list(distinct_chars)

print(vocab)

# Give each character an id following sorted order.
ordered_chars = sorted(vocab)
vocab_dict = dict(zip(ordered_chars, range(len(ordered_chars))))
print(vocab_dict)

# The space is replaced by the visible word delimiter "|", which keeps the
# id the space had.
vocab_dict["|"] = vocab_dict.pop(" ")

print(vocab_dict)

# Append the special tokens, each taking the next free id.
for special_token in ("[UNK]", "[PAD]"):
  vocab_dict[special_token] = len(vocab_dict)
print(len(vocab_dict))
print(vocab_dict)

# Persist the vocabulary to disk.
with open('vocab.json', 'w') as vocab_file:
    json.dump(vocab_dict, vocab_file)

In [None]:
# Build the CTC tokenizer from the files in the current directory (presumably
# the vocab.json written earlier in this notebook — confirm), declaring the
# same unknown, padding and word-delimiter tokens that were put in the vocabulary.
tokenizer = Wav2Vec2CTCTokenizer.from_pretrained("./", unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|")

In [None]:
# Feature extractor for raw 16 kHz audio: one feature per frame, padding with
# 0.0, normalization enabled and an attention mask returned.
feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1, sampling_rate=16000, padding_value=0.0, do_normalize=True, return_attention_mask=True)

In [None]:
# Bundle the feature extractor and the tokenizer into the single processor
# object used by the rest of the notebook.
processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)

In [None]:
# audio_array() skips clips it cannot load, so its result can be shorter than
# `data`; assigning it straight to a column would then fail with a length
# mismatch.  Match entries to rows through the clip path instead and keep only
# the rows whose clip was loaded.
loaded_audio = {entry["path"]: entry for entry in audio_array(data)}
clip_loaded = pd.Series(
    [f"/content/train/{clip_name}" in loaded_audio for clip_name in data["path"]],
    index=data.index,
    dtype=bool,
)
data = data[clip_loaded].reset_index(drop=True)
data["audio"] = [loaded_audio[f"/content/train/{clip_name}"] for clip_name in data["path"]]
print(data.head())
# test_data["audio"] = audio_array(test_data)

# print(test_data.head())

In [None]:
from sklearn.model_selection import train_test_split as tts


In [None]:
# Hold out 10% of the rows for validation.  No random_state is passed, so the
# split is expected to differ from run to run.
train_data,valid_data=  tts(data,test_size=0.1)

In [None]:
# Convert both splits into Dataset objects of model inputs and label ids.
# NOTE: `test` holds the validation split here; the name is re-bound to a list
# of test clip paths later in the notebook.
train = train_test_generator(train_data)
test = train_test_generator(valid_data)

In [None]:
import torch

from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union

@dataclass
class DataCollatorCTCWithPadding:
    """
    Collator that pads audio inputs and label ids of a batch on the fly.

    Args:
        processor (:class:`~transformers.Wav2Vec2Processor`)
            The processor used for processing the data.
        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
            Padding strategy handed to ``processor.pad``:

            * :obj:`True` or :obj:`'longest'`: pad to the longest sequence in the batch (no padding for a
              single sequence).
            * :obj:`'max_length'`: pad to the length given by :obj:`max_length`, or to the model's maximum
              acceptable input length when it is not given.
            * :obj:`False` or :obj:`'do_not_pad'`: no padding; sequences in the batch may differ in length.
    """

    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # Audio inputs and label ids differ in length and are padded by
        # different components, so they are separated first.
        audio_inputs = []
        label_inputs = []
        for example in features:
            audio_inputs.append({"input_values": example["input_values"]})
            label_inputs.append({"input_ids": example["labels"]})

        batch = self.processor.pad(audio_inputs, padding=self.padding, return_tensors="pt")
        with self.processor.as_target_processor():
            padded_labels = self.processor.pad(label_inputs, padding=self.padding, return_tensors="pt")

        # Positions added by padding (attention mask != 1) are set to -100 so
        # that they are ignored by the loss.
        is_padding = padded_labels.attention_mask.ne(1)
        batch["labels"] = padded_labels["input_ids"].masked_fill(is_padding, -100)

        return batch


In [None]:

data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

In [None]:
# Word-error-rate metric used by compute_metrics.
# NOTE(review): load_metric is reported as deprecated in recent `datasets`
# releases in favor of the separate `evaluate` package — confirm against the
# installed version.
from datasets import  load_metric
wer_metric = load_metric("wer")

In [None]:
def compute_metrics(pred):
    """Compute the word error rate of a batch of predictions.

    Args:
        pred: Object exposing ``predictions`` (logits) and ``label_ids``, where
            the value -100 marks label positions to be ignored.

    Returns:
        ``{"wer": <word error rate>}``.

    Relies on the module-level ``processor`` and ``wer_metric``.
    """
    pred_ids = np.argmax(pred.predictions, axis=-1)

    # Map the ignore marker back to the pad token so the labels can be
    # decoded.  np.where builds a new array, leaving pred.label_ids untouched.
    label_ids = np.where(
        pred.label_ids == -100, processor.tokenizer.pad_token_id, pred.label_ids
    )

    pred_str = processor.batch_decode(pred_ids)
    # we do not want to group tokens when computing the metrics
    label_str = processor.batch_decode(label_ids, group_tokens=False)

    wer = wer_metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}

In [None]:
from transformers import Wav2Vec2ForCTC


# Load the CTC model from the current directory with every dropout-style
# option (attention, hidden, feature projection, time masking, layerdrop)
# switched off, mean-reduced CTC loss, and the output layer sized to the
# tokenizer's vocabulary.
model = Wav2Vec2ForCTC.from_pretrained(
  "./",
  attention_dropout=0.0,
  hidden_dropout=0.0,
  feat_proj_dropout=0.0,
  mask_time_prob=0.0,
  layerdrop=0.0,
  ctc_loss_reduction="mean",
  # Keep the model's padding id and vocabulary size in step with the tokenizer.
  pad_token_id=processor.tokenizer.pad_token_id,
  vocab_size=len(processor.tokenizer))

In [None]:
# Freeze the convolutional feature encoder.  freeze_feature_extractor() is the
# older, deprecated name of the same operation; prefer the new name when the
# installed transformers version provides it.
if hasattr(model, "freeze_feature_encoder"):
    model.freeze_feature_encoder()
else:
    model.freeze_feature_extractor()

In [None]:
training_args = TrainingArguments(
  # Output location, checkpointing and Hub upload.
  output_dir="./",
  push_to_hub=True,
  save_steps=100,
  save_total_limit=2,
  # Batching.
  per_device_train_batch_size=32,
  gradient_accumulation_steps=2,
  group_by_length=True,
  # Optimisation schedule.
  num_train_epochs=2,
  learning_rate=3e-4,
  warmup_steps=100,
  # Memory / speed trade-offs.
  gradient_checkpointing=True,
  fp16=True,
  # Evaluation and logging cadence.
  evaluation_strategy="steps",
  eval_steps=50,
  logging_steps=50,
)

In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train,
    eval_dataset=test,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    # The feature extractor (not the text tokenizer) is what is handed over
    # in the `tokenizer` slot.
    tokenizer=processor.feature_extractor,
)

In [None]:
trainer.train()

In [None]:
trainer.push_to_hub()

# Testing

In [None]:
model = trainer.model

In [None]:
! tar -xf "test"

In [None]:
import os

In [None]:
# Bare file names found in the test/ directory, in listing order.
test_names = [f"{value}" for value in os.listdir("test")]
for name in test_names:
  print(name)

In [None]:
# Absolute paths of the test clips.  They are built from the directory that
# is actually listed, so they stay valid whatever the working directory is
# (a hard-coded "/test/" prefix only matches when the notebook runs from "/").
test_dir = os.path.abspath("test")
test = []
for value in os.listdir(test_dir):
  name = os.path.join(test_dir, value)
  print(name)
  test.append(name)

In [None]:
len(test)

In [None]:
# Load every test clip and resample it to 16 kHz.
audio = []
total = len(test)
for index, value in enumerate(test):
    print(f"total:{total}")
    print(index)
    # Read the local file directly.  torchaudio's download_asset is meant for
    # fetching tutorial assets and tries an HTTP download when the file is
    # missing, which is not wanted for local data.
    waveform, sample_rate = torchaudio.load(value)
    if sample_rate != 16000:
        waveform = torchaudio.functional.resample(waveform, sample_rate, 16000)
    audio.append({"array": waveform, "name": f"{value}", "sampling_rate": 16000})
    clear_output(wait=True)

In [None]:
def test_generator(data):
  """Build a ``Dataset`` of model inputs from a list of loaded test clips.

  Args:
    data: Sequence of dicts holding the waveform under ``"array"`` and its
      rate under ``"sampling_rate"``.

  Returns:
    A ``datasets.Dataset`` with the columns ``input_values`` and
    ``input_length``.

  Relies on the module-level ``processor``.
  """
  input_values = []
  for i, clip in enumerate(data):
      print(f"total:{len(data)}")
      print(i)
      # The processor returns a batch; keep its first entry.
      processed = processor(clip["array"].cpu().numpy(), sampling_rate=clip["sampling_rate"])
      input_values.append(processed.input_values[0])
      clear_output(wait=True)

  frame = pd.DataFrame({})
  frame["input_values"] = input_values
  frame["input_length"] = [len(values) for values in input_values]

  print(frame.head())
  # Hand the frame over as a datasets.Dataset.
  return Dataset.from_pandas(frame)


In [None]:
  test_values = test_generator(audio)

In [None]:
# Move the model to the device selected at the top of the notebook instead of
# assuming a GPU is present.
model.to(device)

In [None]:
import time

In [None]:
# Transcribe every test clip, one at a time.
text_output = []
total = len(test_values)
# Inference only: put the model in evaluation mode.
model.eval()
for index in range(total):
    print(f"total:{total}")
    print(index)
    start = time.time()
    input_dict = processor(test_values[index]["input_values"], sampling_rate=16000, return_tensors="pt", padding=True)

    # No gradients are needed for prediction; disabling them avoids building
    # an autograd graph for every clip.  Inputs go to whichever device the
    # model currently lives on.
    with torch.no_grad():
        logits = model(input_dict.input_values.to(model.device)).logits

    pred_ids = torch.argmax(logits, dim=-1)[0]
    text_output.append(processor.decode(pred_ids))
    end = time.time()
    clear_output(wait=True)
    # Rough estimate: duration of this clip times the number of clips left.
    print(f"time remaining:{((end-start)/60)*(total-index)}m")



In [None]:
text_output


In [None]:
! rm /home/ASR/train.tar.gz

In [None]:
import pandas as pd

In [None]:
# Pair each test file name with its transcription by position; this assumes
# test_names and text_output are in the same order — TODO confirm.
submission = pd.DataFrame({"path":test_names,"sentence":text_output})

In [None]:
# Write the submission file.  No `index` argument is given, so the row index
# is written as an extra first column — NOTE(review): confirm the expected
# submission format allows it.
submission.to_csv("submission1.csv")