# **Imports**

In [1]:
import numpy as np
import librosa
import soundfile as sf
from pathlib import Path
from tqdm import tqdm

# **Configuration**

In [5]:
# Root folder of the dataset; the traversal cell expects one sub-folder per
# LABEL_MAP key, each holding per-language folders of audio clips.
DATA_ROOT = Path("../data")

# Waveform settings: every clip is clipped/padded to MAX_DURATION seconds.
SAMPLE_RATE = 16_000
MAX_DURATION = 5.0  # seconds
MAX_SAMPLES = int(MAX_DURATION * SAMPLE_RATE)

# MFCC extraction settings passed to librosa.feature.mfcc.
N_MFCC, N_FFT, HOP_LENGTH = 40, 1024, 512

# Class-folder name -> integer target used in y_labels.
LABEL_MAP = dict(AI=0, Human=1)

# **Audio Loading with Fixed Duration**

In [6]:
def load_audio(path, target_sr=16000):
    """Load an audio file as a mono, normalized, fixed-length waveform.

    Parameters
    ----------
    path : str or path-like
        Audio file to read with ``soundfile.read``.
    target_sr : int, default 16000
        Sampling rate of the returned signal. The audio is resampled with
        ``librosa.resample`` when the file's native rate differs.

    Returns
    -------
    numpy.ndarray
        1-D waveform of exactly ``int(target_sr * MAX_DURATION)`` samples:
        longer clips are truncated, shorter ones are zero-padded at the end.

    Notes
    -----
    Normalization is applied to the whole clip *before* truncation, so the
    scale is set by the full recording rather than by the kept segment.
    """
    y, sr = sf.read(path)

    # Multi-channel audio arrives as a 2-D array; average along axis 1 to get mono.
    if y.ndim > 1:
        y = y.mean(axis=1)

    # Resample only when the native rate differs from the requested one.
    if sr != target_sr:
        y = librosa.resample(y, orig_sr=sr, target_sr=target_sr)

    # Normalize amplitude (librosa.util.normalize with its default settings).
    y = librosa.util.normalize(y)

    # Derive the fixed length from the rate actually in use, so the clip is
    # always MAX_DURATION seconds long even when target_sr is not the default.
    max_samples = int(target_sr * MAX_DURATION)

    # Clip or zero-pad to exactly max_samples.
    if len(y) > max_samples:
        y = y[:max_samples]
    else:
        y = np.pad(y, (0, max_samples - len(y)))

    return y

# **Feature Extraction**

In [7]:
def extract_mfcc(y):
    """Return the MFCC matrix of waveform ``y``, standardized per clip.

    The coefficients are computed with the notebook-level settings
    (SAMPLE_RATE, N_MFCC, N_FFT, HOP_LENGTH) and then shifted and scaled by the
    mean and standard deviation taken over the whole matrix.
    """
    coeffs = librosa.feature.mfcc(
        y=y,
        sr=SAMPLE_RATE,
        n_mfcc=N_MFCC,
        n_fft=N_FFT,
        hop_length=HOP_LENGTH,
    )

    # Single global mean/std for this clip; the epsilon avoids division by zero.
    center = np.mean(coeffs)
    spread = np.std(coeffs)
    return (coeffs - center) / (spread + 1e-9)

# **Dataset Traversal & Feature Collection**

In [8]:
# Parallel lists: one entry per successfully processed clip.
X = []
y_labels = []
languages = []

for label_name, label_value in LABEL_MAP.items():
    label_dir = DATA_ROOT / label_name

    # iterdir() and glob() yield entries in arbitrary (filesystem-dependent)
    # order; sort both so the row order of the saved arrays is reproducible.
    for lang_dir in sorted(label_dir.iterdir()):
        if not lang_dir.is_dir():
            continue

        language = lang_dir.name
        audio_files = sorted(lang_dir.glob("*.mp3"))

        for audio_file in tqdm(audio_files, desc=f"{label_name}-{language}"):

            try:
                audio = load_audio(audio_file)
                mfcc_feat = extract_mfcc(audio)

                # Append only after both steps succeeded so the three lists
                # stay aligned index-for-index.
                X.append(mfcc_feat)
                y_labels.append(label_value)
                languages.append(language)

            except Exception as e:
                # Best-effort: report the unreadable clip and keep going.
                print(f"Failed: {audio_file} -> {e}")

  from .autonotebook import tqdm as notebook_tqdm
AI-English: 100%|██████████| 200/200 [00:04<00:00, 44.34it/s] 
AI-Hindi: 100%|██████████| 200/200 [00:01<00:00, 111.85it/s]
AI-Malayalam: 100%|██████████| 200/200 [00:01<00:00, 121.02it/s]
AI-Tamil: 100%|██████████| 200/200 [00:01<00:00, 109.07it/s]
AI-Telugu: 100%|██████████| 200/200 [00:01<00:00, 113.27it/s]
Human-English: 100%|██████████| 200/200 [00:05<00:00, 38.38it/s]
Human-Hindi: 100%|██████████| 200/200 [00:03<00:00, 52.68it/s]
Human-Malayalam: 100%|██████████| 200/200 [00:03<00:00, 57.38it/s]
Human-Tamil: 100%|██████████| 200/200 [00:03<00:00, 55.66it/s]
Human-Telugu: 100%|██████████| 200/200 [00:04<00:00, 48.93it/s]


In [11]:
# Stack the per-clip results into arrays (float32 for the feature tensor).
X = np.array(X, dtype=np.float32)
y_labels = np.array(y_labels)
languages = np.array(languages)

# The three arrays are indexed in parallel; fail loudly if they ever diverge.
assert len(X) == len(y_labels) == len(languages), "features/labels/languages are misaligned"

print("Feature Matrix Shape:", X.shape)
print("Labels Shape:", y_labels.shape)
print("Languages Shape:", languages.shape)

# Count samples per class from LABEL_MAP instead of hard-coded 0/1 so the
# report stays correct if the mapping changes.
for label_name, label_value in LABEL_MAP.items():
    print(f"{label_name} Samples:", np.sum(y_labels == label_value))

Feature Matrix Shape: (2000, 40, 157)
Labels Shape: (2000,)
Languages Shape: (2000,)
AI Samples: 1000
Human Samples: 1000


In [12]:
# Output folder for the deep-learning feature set; created on demand.
FEATURE_DIR = Path("../artifacts/features/DL")
FEATURE_DIR.mkdir(parents=True, exist_ok=True)

# Persist each array under its own .npy file, in the same order as before.
for _file_name, _array in (
    ("X_features.npy", X),
    ("y_labels.npy", y_labels),
    ("languages.npy", languages),
):
    np.save(FEATURE_DIR / _file_name, _array)

print("Features saved to:", FEATURE_DIR)

Features saved to: ..\artifacts\features\DL
