In [None]:
%%capture
!pip install datasets transformers evaluate

In [None]:
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import torch

from collections import defaultdict
from dataclasses import dataclass
from typing import Any, Dict, List, Union

from scipy.special import softmax
from evaluate import load

from datasets import load_dataset, Audio
from transformers import DataCollatorWithPadding
from transformers import pipeline
from transformers import AutoFeatureExtractor
from transformers import AutoModelForAudioClassification, TrainingArguments, Trainer

# Exploration

In [None]:
# Collect the metadata columns (no audio) into a DataFrame for quick inspection.
df = pd.DataFrame({column: dataset[column] for column in ('id', 'language', 'is_tts')})
df.head()

Unnamed: 0,id,language,is_tts
0,MAR_F_INDIC_00197,Marathi,0
1,SAN_M_DISGUST_00159,Sanskrit,0
2,TAM_F_DISGUST_00243,Tamil,1
3,BRX_F_DISGUST_00208,Bodo,0
4,DOI_M_SAD_00304,Dogri,0


In [None]:
# Class balance: number of rows per is_tts value.
df['is_tts'].value_counts()

Unnamed: 0_level_0,count
is_tts,Unnamed: 1_level_1
1,51
0,49


In [None]:
# Tally how many clips each language contributes, in first-seen order.
language_dist = defaultdict(int)
for lang in dataset['language']:
    language_dist[lang] = language_dist[lang] + 1
dict(language_dist)

{'Marathi': 8,
 'Sanskrit': 5,
 'Tamil': 5,
 'Bodo': 12,
 'Dogri': 3,
 'English': 5,
 'Hindi': 5,
 'Gujarati': 8,
 'Odia': 6,
 'Manipuri': 8,
 'Nepali': 9,
 'Malayalam': 10,
 'Assamese': 4,
 'Kannada': 6,
 'Telugu': 3,
 'Bengali': 3}

In [None]:
# Per-language counts of each is_tts value, with readable column headers.
column_labels = ["Human (0)", "AI-Generated (1)"]
class_counts = pd.crosstab(index=df["language"], columns=df["is_tts"])
class_counts.columns = column_labels
print(class_counts)

           Human (0)  AI-Generated (1)
language                              
Assamese           2                 2
Bengali            1                 2
Bodo               4                 8
Dogri              2                 1
English            2                 3
Gujarati           1                 7
Hindi              3                 2
Kannada            4                 2
Malayalam          5                 5
Manipuri           5                 3
Marathi            5                 3
Nepali             6                 3
Odia               4                 2
Sanskrit           3                 2
Tamil              2                 3
Telugu             0                 3


In [None]:
# Same cross-tabulation restricted to Sanskrit, keyed by the id with its
# trailing 6 characters removed.
sanskrit_prefixes = df.loc[df['language'] == 'Sanskrit', 'id'].str[:-6]
class_counts = pd.crosstab(sanskrit_prefixes, df["is_tts"])
class_counts.columns = ["Human (0)", "AI-Generated (1)"]
print(class_counts)

               Human (0)  AI-Generated (1)
id                                        
SAN_M_BOOK             1                 0
SAN_M_CONV             0                 1
SAN_M_DISGUST          1                 0
SAN_M_HAPPY            1                 0
SAN_M_NAMES            0                 1


In [None]:
# For every language except English, print how many distinct id prefixes
# (id minus its trailing 6 characters) it has, followed by the prefixes.
for lang in language_dist:
    if lang != 'English':
        id_prefixes = df.loc[df['language'] == lang, 'id'].str[:-6]
        print("=" * 10)
        print(lang)
        print(id_prefixes.nunique())
        print(id_prefixes.unique())

Assamese
32
['ASM_F_ANGER' 'ASM_F_UMANG' 'ASM_F_DIGI' 'ASM_F_BB' 'ASM_F_ALEXA'
 'ASM_F_CONV' 'ASM_F_DISGUST' 'ASM_F_FEAR' 'ASM_F_HAPPY' 'ASM_F_INDIC'
 'ASM_F_BOOK' 'ASM_F_WIKI' 'ASM_F_NEWS' 'ASM_F_NAMES' 'ASM_F_SAD'
 'ASM_F_SURPRISE' 'ASM_M_ANGER' 'ASM_M_BB' 'ASM_M_ALEXA' 'ASM_M_UMANG'
 'ASM_M_DIGI' 'ASM_M_CONV' 'ASM_M_DISGUST' 'ASM_M_FEAR' 'ASM_M_HAPPY'
 'ASM_M_INDIC' 'ASM_M_BOOK' 'ASM_M_WIKI' 'ASM_M_NEWS' 'ASM_M_NAMES'
 'ASM_M_SAD' 'ASM_M_SURPRISE']
Bengali
32
['BEN_F_ANGER' 'BEN_F_BB' 'BEN_F_ALEXA' 'BEN_F_DIGI' 'BEN_F_UMANG'
 'BEN_F_CONV' 'BEN_F_DISGUST' 'BEN_F_FEAR' 'BEN_F_HAPPY' 'BEN_F_INDIC'
 'BEN_F_BOOK' 'BEN_F_WIKI' 'BEN_F_NEWS' 'BEN_F_NAMES' 'BEN_F_SAD'
 'BEN_F_SURPRISE' 'BEN_M_ANGER' 'BEN_M_ALEXA' 'BEN_M_UMANG' 'BEN_M_BB'
 'BEN_M_DIGI' 'BEN_M_CONV' 'BEN_M_DISGUST' 'BEN_M_FEAR' 'BEN_M_HAPPY'
 'BEN_M_INDIC' 'BEN_M_BOOK' 'BEN_M_WIKI' 'BEN_M_NEWS' 'BEN_M_SAD'
 'BEN_M_SANGRAH' 'BEN_M_SURPRISE']
Bodo
32
['BRX_F_ANGER' 'BRX_F_BB' 'BRX_F_ALEXA' 'BRX_F_DIGI' 'BRX_F_UMANG'
 'BRX_F_CONV

In [None]:
# Number of Sanskrit clips per id prefix (id minus its trailing 6 characters).
df.loc[df['language'] == 'Sanskrit', 'id'].str[:-6].value_counts()

Unnamed: 0_level_0,count
id,Unnamed: 1_level_1
SAN_M_ANGER,167
SAN_M_CONV,167
SAN_M_DISGUST,167
SAN_M_FEAR,167
SAN_M_HAPPY,167
SAN_M_BOOK,167
SAN_M_WIKI,167
SAN_M_NEWS,166
SAN_M_NAMES,166
SAN_M_SAD,166


In [None]:
# First 100 rows whose id prefix is SAN_M_SURPRISE.
df.loc[df['id'].str[:-6] == 'SAN_M_SURPRISE'].head(100)

Unnamed: 0,id,language,is_tts
15635,SAN_M_SURPRISE_00046,Sanskrit,0
15636,SAN_M_SURPRISE_00008,Sanskrit,1
15637,SAN_M_SURPRISE_00305,Sanskrit,1
15638,SAN_M_SURPRISE_00135,Sanskrit,0
15639,SAN_M_SURPRISE_00255,Sanskrit,1
...,...,...,...
15730,SAN_M_SURPRISE_00148,Sanskrit,1
15731,SAN_M_SURPRISE_00323,Sanskrit,1
15732,SAN_M_SURPRISE_00161,Sanskrit,0
15733,SAN_M_SURPRISE_00084,Sanskrit,1


In [None]:
# Row index of the clip inspected by the following cells.
sample = 15638
# NOTE(review): `dataset` is indexed by split name here (dataset['train']),
# while other cells index it directly by column name — confirm which object is
# bound to `dataset` when this cell runs.
dataset['train'][sample]

{'text': 'चायपोटिकाः एवं विन्यासेन सङ्कलयितुं शक्यन्ते इति मया कदापि न कल्पितम्। महार्घं उपहारम् इव दृश्यते इदम्।',
 'id': 'SAN_M_SURPRISE_00135',
 'language': 'Sanskrit',
 'is_tts': 0,
 'audio': {'path': 'SAN_M_SURPRISE_00135.wav',
  'array': array([-8.79964591e-06,  1.16017764e-05, -1.38342566e-05, ...,
         -3.19491664e-05, -3.09406605e-05,  0.00000000e+00]),
  'sampling_rate': 16000}}

In [None]:
# Import the player under an alias so it does not rebind the name `Audio`,
# which the imports cell already binds to `datasets.Audio`.
from IPython.display import Audio as AudioPlayer

# Look the row up once and reuse it for the label, the rate and the waveform.
clip = dataset['train'][sample]
rate = clip['audio']['sampling_rate']
print('Fake' if clip['is_tts'] == 1 else 'Real')
print(sample)
AudioPlayer(clip['audio']['array'], rate=rate)

Real
15638


In [None]:
sample

15638

In [None]:
# Score the selected clip with `pipe`; the displayed result is a list of
# {'score', 'label'} dicts, one per label.
# NOTE(review): `pipe` is not defined in any visible cell — presumably a
# transformers pipeline created elsewhere; confirm before running on a fresh kernel.
results = pipe(dataset['train'][sample]['audio']['array'])
results

[{'score': 0.7755188941955566, 'label': 'fake'},
 {'score': 0.22448107600212097, 'label': 'real'}]

In [None]:
# 100 consecutive training rows, indices 15635 through 15734.
subset = dataset['train'].select(range(15635, 15735))

In [None]:
subset[0]

{'text': 'अस्माकं क्षेत्रे पुनर्नवीकरणोर्जोपक्रमस्य माध्यमेन कार्यान्विताभिः विशालाभिः सौरपटलैः अहं विस्मितः अभवम्।',
 'id': 'SAN_M_SURPRISE_00046',
 'language': 'Sanskrit',
 'is_tts': 0,
 'audio': {'path': 'SAN_M_SURPRISE_00046.wav',
  'array': array([ 2.00646095e-08,  5.71782444e-09,  1.02252082e-08, ...,
         -6.98903602e-09,  5.99771033e-09,  0.00000000e+00]),
  'sampling_rate': 16000}}

In [None]:
def classify(audio_array):
    """Return 1 if `pipe` scores the clip highest as 'fake', otherwise 0.

    `pipe` is called with the raw audio array and is expected to yield
    dicts carrying a 'label' and a 'score'.
    """
    scores_by_label = {entry['label']: entry['score'] for entry in pipe(audio_array)}
    top_label = max(scores_by_label, key=scores_by_label.get)
    if top_label == 'fake':
        return 1
    return 0

In [None]:
classify(dataset['train'][15638]['audio']['array'])

0

In [None]:
# Count how many rows of `subset` the classifier labels correctly.
# The loop variable is called `row` (and is scoped to the generator) so that it
# does not overwrite the integer index `sample` that other cells rely on.
total_correct = sum(
    classify(row['audio']['array']) == row['is_tts']
    for row in tqdm(subset)
)
total_correct

  0%|          | 0/100 [00:00<?, ?it/s]

50

In [None]:
total_correct

77

# Code

In [None]:
# Load every split of the challenge dataset.
full = load_dataset("SherryT997/IndicTTS-Deepfake-Challenge-Data")

# Split used for training and validation below.
dataset = full["train"]
# Split iterated by the submission cell.
test_data = full["test"]

Resolving data files:   0%|          | 0/35 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/35 [00:00<?, ?it/s]

Downloading data:   0%|          | 0/35 [00:00<?, ?files/s]

train-00000-of-00035.parquet:   0%|          | 0.00/453M [00:00<?, ?B/s]

train-00001-of-00035.parquet:   0%|          | 0.00/461M [00:00<?, ?B/s]

train-00002-of-00035.parquet:   0%|          | 0.00/464M [00:00<?, ?B/s]

train-00003-of-00035.parquet:   0%|          | 0.00/443M [00:00<?, ?B/s]

train-00004-of-00035.parquet:   0%|          | 0.00/470M [00:00<?, ?B/s]

train-00005-of-00035.parquet:   0%|          | 0.00/475M [00:00<?, ?B/s]

train-00006-of-00035.parquet:   0%|          | 0.00/447M [00:00<?, ?B/s]

train-00007-of-00035.parquet:   0%|          | 0.00/516M [00:00<?, ?B/s]

train-00008-of-00035.parquet:   0%|          | 0.00/557M [00:00<?, ?B/s]

train-00009-of-00035.parquet:   0%|          | 0.00/521M [00:00<?, ?B/s]

train-00010-of-00035.parquet:   0%|          | 0.00/491M [00:00<?, ?B/s]

train-00011-of-00035.parquet:   0%|          | 0.00/426M [00:00<?, ?B/s]

train-00012-of-00035.parquet:   0%|          | 0.00/414M [00:00<?, ?B/s]

train-00013-of-00035.parquet:   0%|          | 0.00/473M [00:00<?, ?B/s]

train-00014-of-00035.parquet:   0%|          | 0.00/481M [00:00<?, ?B/s]

train-00015-of-00035.parquet:   0%|          | 0.00/467M [00:00<?, ?B/s]

train-00016-of-00035.parquet:   0%|          | 0.00/532M [00:00<?, ?B/s]

train-00017-of-00035.parquet:   0%|          | 0.00/510M [00:00<?, ?B/s]

train-00018-of-00035.parquet:   0%|          | 0.00/471M [00:00<?, ?B/s]

train-00019-of-00035.parquet:   0%|          | 0.00/501M [00:00<?, ?B/s]

train-00020-of-00035.parquet:   0%|          | 0.00/559M [00:00<?, ?B/s]

train-00021-of-00035.parquet:   0%|          | 0.00/541M [00:00<?, ?B/s]

train-00022-of-00035.parquet:   0%|          | 0.00/558M [00:00<?, ?B/s]

train-00023-of-00035.parquet:   0%|          | 0.00/599M [00:00<?, ?B/s]

train-00024-of-00035.parquet:   0%|          | 0.00/576M [00:00<?, ?B/s]

train-00025-of-00035.parquet:   0%|          | 0.00/547M [00:00<?, ?B/s]

train-00026-of-00035.parquet:   0%|          | 0.00/537M [00:00<?, ?B/s]

train-00027-of-00035.parquet:   0%|          | 0.00/421M [00:00<?, ?B/s]

train-00028-of-00035.parquet:   0%|          | 0.00/382M [00:00<?, ?B/s]

train-00029-of-00035.parquet:   0%|          | 0.00/287M [00:00<?, ?B/s]

train-00030-of-00035.parquet:   0%|          | 0.00/282M [00:00<?, ?B/s]

train-00031-of-00035.parquet:   0%|          | 0.00/688M [00:00<?, ?B/s]

train-00032-of-00035.parquet:   0%|          | 0.00/613M [00:00<?, ?B/s]

train-00033-of-00035.parquet:   0%|          | 0.00/309M [00:00<?, ?B/s]

train-00034-of-00035.parquet:   0%|          | 0.00/424M [00:00<?, ?B/s]

test-00000-of-00004.parquet:   0%|          | 0.00/356M [00:00<?, ?B/s]

test-00001-of-00004.parquet:   0%|          | 0.00/364M [00:00<?, ?B/s]

test-00002-of-00004.parquet:   0%|          | 0.00/410M [00:00<?, ?B/s]

test-00003-of-00004.parquet:   0%|          | 0.00/291M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/31102 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2635 [00:00<?, ? examples/s]

Loading dataset shards:   0%|          | 0/35 [00:00<?, ?it/s]

In [None]:
# Keep a fixed random sample of 10,000 examples (shuffle seed 37).
shuffled = dataset.shuffle(seed=37)
dataset = shuffled.select(range(10000))

In [None]:
# Metadata-only frame for the sampled examples, followed by the class balance.
df = pd.DataFrame({column: dataset[column] for column in ('id', 'language', 'is_tts')})
df['is_tts'].value_counts()

Unnamed: 0_level_0,count
is_tts,Unnamed: 1_level_1
0,5028
1,4972


In [None]:
# Tally how many sampled clips each language contributes, in first-seen order.
language_dist = defaultdict(int)
for lang in dataset['language']:
    language_dist[lang] = language_dist[lang] + 1
dict(language_dist)

{'English': 663,
 'Odia': 572,
 'Marathi': 604,
 'Tamil': 643,
 'Nepali': 664,
 'Telugu': 653,
 'Kannada': 638,
 'Malayalam': 669,
 'Manipuri': 603,
 'Hindi': 681,
 'Sanskrit': 624,
 'Gujarati': 512,
 'Bodo': 635,
 'Bengali': 582,
 'Dogri': 642,
 'Assamese': 615}

In [None]:
# Per-language counts of each is_tts value in the sampled data.
column_labels = ["Real (0)", "TTS (1)"]
class_counts = pd.crosstab(index=df["language"], columns=df["is_tts"])
class_counts.columns = column_labels
print(class_counts)

           Real (0)  TTS (1)
language                    
Assamese        301      314
Bengali         272      310
Bodo            337      298
Dogri           325      317
English         344      319
Gujarati        259      253
Hindi           354      327
Kannada         318      320
Malayalam       335      334
Manipuri        296      307
Marathi         300      304
Nepali          325      339
Odia            290      282
Sanskrit        301      323
Tamil           330      313
Telugu          341      312


In [None]:
# Checkpoint shared by the feature extractor and the classification model.
model_id = "facebook/wav2vec2-base-960h"

# NOTE(review): the Wav2Vec2 documentation advises against passing an
# attention_mask to checkpoints trained with feat_extract_norm == "group"
# (wav2vec2-base among them) — confirm return_attention_mask=True is intended.
extractor_options = dict(do_normalize=True, return_attention_mask=True)
feature_extractor = AutoFeatureExtractor.from_pretrained(model_id, **extractor_options)

# Two-label classification model built from the same checkpoint.
model = AutoModelForAudioClassification.from_pretrained(
    model_id,
    num_labels=2,
)

preprocessor_config.json:   0%|          | 0.00/159 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.60k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/378M [00:00<?, ?B/s]

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight', 'wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
def prepare_sample(batch):
    """Convert one raw dataset example into model inputs.

    Reads ``batch["audio"]["array"]`` and ``batch["is_tts"]``, then adds two
    keys to the same mapping and returns it:

    * ``input_values`` - the waveform processed by the module-level
      ``feature_extractor``, padded or truncated to 80000 samples
      (5 seconds at the declared 16 kHz rate).
    * ``labels`` - the class index as a plain ``int``.
    """
    audio = batch["audio"]["array"]
    # Every clip is forced to the same fixed length so batches are rectangular.
    inputs = feature_extractor(audio,
                               sampling_rate=16000,
                               max_length=80000,
                               padding="max_length",
                               truncation=True,
                               return_tensors="pt")

    # The extractor returns a batch of one; keep that single row.
    batch["input_values"] = inputs.input_values[0]
    # Store the label as an integer class index (not a float tensor): it is a
    # class id, and the collator builds an integer label tensor from it.
    batch["labels"] = int(batch["is_tts"])
    return batch

In [None]:
# applying preprocessing
# Every original column is removed, so only the keys added by prepare_sample remain.
dataset = dataset.map(prepare_sample, remove_columns=dataset.column_names)

# splitting dataset into train and validation sets
# 10% of the rows are held out; the seed makes the split repeatable.
# NOTE(review): both statements rebind `dataset`, so re-running this cell would
# operate on the already-split object — presumably it is meant to run once.
dataset = dataset.train_test_split(test_size=0.1, shuffle=True, seed=37)

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [None]:
dataset

DatasetDict({
    train: Dataset({
        features: ['input_values', 'labels'],
        num_rows: 9000
    })
    test: Dataset({
        features: ['input_values', 'labels'],
        num_rows: 1000
    })
})

In [None]:
@dataclass
class DataCollatorWithPadding_:
    """
    Data collator that pads the inputs and batches the labels.

    Attributes:
        processor: object exposing ``pad(features, padding=..., return_tensors=...)``,
            e.g. the feature extractor used for preprocessing.
        padding: padding strategy forwarded unchanged to ``processor.pad``.

    Calling the collator with a list of features (each holding ``input_values``
    and ``labels``) returns the padded batch from ``processor.pad`` with an
    extra ``labels`` entry: a ``torch.long`` tensor of the per-feature labels.
    """
    # Annotated with a type (Any) rather than a concrete object instance.
    processor: Any
    padding: Union[bool, str] = True

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # Only the audio inputs go through the processor; labels are handled below.
        input_features = [{"input_values": feature["input_values"]} for feature in features]

        # applying padding using processor
        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            return_tensors="pt",
        )

        # Labels become integer class indices regardless of how they were stored.
        batch["labels"] = torch.tensor([feature["labels"] for feature in features], dtype=torch.long)
        return batch

# Collator instance passed to the Trainer; it pads batches with the shared feature extractor.
data_collator = DataCollatorWithPadding_(feature_extractor, padding=True)

In [None]:
# ROC-AUC metric object from the `evaluate` library, used by compute_metrics.
roc_auc_metric = load("roc_auc")

def compute_metrics(pred):
    """Compute ROC-AUC for an evaluation pass.

    `pred.predictions` holds the model logits and `pred.label_ids` the true
    labels. The logits are turned into probabilities with a softmax over the
    last axis, and column 1 (the TTS class) is used as the score.

    Returns a dict with the single key "roc_auc".
    """
    tts_probability = softmax(pred.predictions, axis=-1)[:, 1]
    metric_output = roc_auc_metric.compute(
        prediction_scores=tts_probability,
        references=pred.label_ids,
    )
    return {"roc_auc": metric_output["roc_auc"]}

Downloading builder script:   0%|          | 0.00/9.54k [00:00<?, ?B/s]

In [None]:
# Use the device selected earlier instead of a hard-coded "cuda", so this cell
# also runs on machines without a GPU.
model.to(device)
# Freezes only the convolutional feature encoder; the transformer layers and the
# classification head stay trainable.
model.freeze_feature_encoder()

In [None]:
# Training configuration. Evaluation runs every 500 steps and checkpoints are
# written every 1000 steps; the checkpoint with the best "roc_auc" is reloaded
# when training ends.
training_args = TrainingArguments(
    output_dir="audioclassification",
    group_by_length=True,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    eval_strategy="steps",
    num_train_epochs=10,
    # Mixed precision needs a CUDA device; fall back to full precision on CPU
    # instead of failing when the arguments are validated.
    fp16=torch.cuda.is_available(),
    metric_for_best_model="roc_auc",
    save_steps=1000,
    eval_steps=500,
    logging_steps=500,
    learning_rate=3e-5,
    weight_decay=0.005,
    warmup_ratio=0.1,
    save_total_limit=2,
    load_best_model_at_end=True,
    save_strategy="steps",
    label_names=["labels"],
    report_to="none"
)

In [None]:
# Trainer wiring: the 90% split is used for training and the held-out 10%
# (stored under the "test" key) for evaluation.
trainer = Trainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    # `processing_class` replaces the deprecated `tokenizer` argument, which
    # triggers a deprecation warning on this Trainer call.
    processing_class=feature_extractor,
)

  trainer = Trainer(


In [None]:
trainer.train()

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ensures model is on the same device as the inputs and in evaluation mode
model.to(device)
model.eval()

# NOTE(review): the extractor is configured with return_attention_mask=True, so
# `inputs` here may carry an attention_mask, whereas prepare_sample keeps only
# input_values for training — confirm train and inference inputs are meant to differ.
results = []
for test_sample in tqdm(test_data):
    audio = test_sample["audio"]["array"]

    # extracting features with the same fixed-length settings used for training
    inputs = feature_extractor(audio,
                               sampling_rate=16000,
                               max_length=80000,
                               padding="max_length",
                               truncation=True,
                               return_tensors="pt")

    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        logits = model(**inputs).logits  # raw model outputs (logits)
    probabilities = softmax(logits.cpu().numpy(), axis=-1)

    # probability of being TTS; converted to a Python float before rounding so
    # the stored value is the 3-decimal number itself rather than a NumPy
    # float32 scalar
    is_tts = round(float(probabilities[0, 1]), 3)
    results.append([test_sample["id"], is_tts])

# One row per test clip: its id and the predicted TTS probability.
submission = pd.DataFrame(results, columns=["id", "is_tts"])
submission.to_csv("./submission.csv", index=False)

  0%|          | 0/2635 [00:00<?, ?it/s]

In [None]:
submission["is_tts"].value_counts().head(10)