## Imports

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from google.colab import drive
# Suppress every Python warning for the rest of the session.
# NOTE(review): this is a blanket filter, so deprecation and numerical warnings
# are hidden as well — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

In [None]:
# Install the audio-analysis dependencies into the runtime (versions are not pinned).
!pip install librosa numpy praat-parselmouth

Collecting praat-parselmouth
  Downloading praat_parselmouth-0.4.5-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl.metadata (2.9 kB)
Downloading praat_parselmouth-0.4.5-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (10.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.7/10.7 MB[0m [31m63.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: praat-parselmouth
Successfully installed praat-parselmouth-0.4.5


In [None]:
# Mount Google Drive so the dataset paths under /content/drive/MyDrive resolve.
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
import librosa
import numpy as np
import parselmouth
from parselmouth.praat import call

In [None]:
# Load the table of clips. Later cells read the 'index', 'hate', 'audio' and
# 'augment' columns from it.
# NOTE(review): the preview shows 'Unnamed: ...' columns, which look like
# DataFrame indexes written out by earlier to_csv calls — confirm and drop if unused.
df = pd.read_csv("/content/drive/MyDrive/IPD/code/audio/augmented_audio_paths_modified.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,index,text,hate,audio,augment
0,0,48.0,@odysseuslahori @IqShoaib Hang till death all ...,1.0,/content/drive/MyDrive/IPD/code/audio/claude_t...,/content/drive/MyDrive/IPD/code/audio/augmente...
1,1,84.0,"""where they ultimately need to be"" is back in ...",1.0,/content/drive/MyDrive/IPD/code/audio/claude_t...,/content/drive/MyDrive/IPD/code/audio/augmente...
2,2,126.0,"""I'm trying to take you out"" to ""you don't loo...",1.0,/content/drive/MyDrive/IPD/code/audio/claude_t...,/content/drive/MyDrive/IPD/code/audio/augmente...
3,3,298.0,Facts . That's why I cut these hoes off,1.0,/content/drive/MyDrive/IPD/code/audio/claude_t...,/content/drive/MyDrive/IPD/code/audio/augmente...
4,4,454.0,"""I'm a whore! Yay! Glorify being a whore! This...",1.0,/content/drive/MyDrive/IPD/code/audio/claude_t...,/content/drive/MyDrive/IPD/code/audio/augmente...


## Feature Extraction

In [None]:
def extract_audio_features(audio_path):
    """Compute summary acoustic features for one audio file.

    The file is decoded with librosa at its native sample rate for the
    spectral/energy features, and opened a second time with Praat (through
    parselmouth) for the harmonicity analysis.

    Parameters
    ----------
    audio_path : str
        Path to an audio file readable by both ``librosa.load`` and
        ``parselmouth.Sound``.

    Returns
    -------
    dict
        Feature name -> scalar, with these keys (in this order):

        * ``pitch_mean`` / ``pitch_std`` - statistics over every non-zero
          entry returned by ``librosa.piptrack``.
        * ``spectral_centroid_mean`` / ``spectral_centroid_std``
        * ``zcr_mean`` / ``zcr_std`` - zero-crossing rate.
        * ``rms_mean`` / ``rms_std`` - root-mean-square energy.
        * ``spectral_rolloff_mean`` / ``spectral_rolloff_std`` - 85 % roll-off.
        * ``speaking_rate`` - number of non-silent intervals (20 dB threshold)
          per second of audio; 0 for zero-length audio.
        * ``hnr_mean`` / ``hnr_std`` - harmonics-to-noise ratio in dB over the
          analysed frames, ignoring frames Praat marks as silent/unvoiced.

    Notes
    -----
    Any failure in the Praat-based HNR step is printed and reported as
    ``hnr_mean = hnr_std = 0`` instead of being raised; errors from the
    librosa-based steps propagate to the caller.
    """
    y, sr = librosa.load(audio_path, sr=None)  # sr=None keeps the file's native sample rate

    features = {}

    # 1. Pitch
    # NOTE(review): piptrack reports every spectral-peak candidate per frame, so
    # these statistics are presumably not limited to the fundamental frequency —
    # confirm this is the intended "pitch" feature.
    pitches, _ = librosa.piptrack(y=y, sr=sr)
    pitches = pitches[pitches > 0]  # zeros mean "no peak in this bin"
    has_pitch = len(pitches) > 0
    features['pitch_mean'] = np.mean(pitches) if has_pitch else 0
    features['pitch_std'] = np.std(pitches) if has_pitch else 0

    # 2. Spectral Centroid
    spectral_centroid = librosa.feature.spectral_centroid(y=y, sr=sr)
    features['spectral_centroid_mean'] = np.mean(spectral_centroid)
    features['spectral_centroid_std'] = np.std(spectral_centroid)

    # 3. Zero Crossing Rate
    zcr = librosa.feature.zero_crossing_rate(y)
    features['zcr_mean'] = np.mean(zcr)
    features['zcr_std'] = np.std(zcr)

    # 4. Root Mean Square Energy (RMS)
    rms = librosa.feature.rms(y=y)
    features['rms_mean'] = np.mean(rms)
    features['rms_std'] = np.std(rms)

    # 5. Spectral Rolloff
    spectral_rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr, roll_percent=0.85)
    features['spectral_rolloff_mean'] = np.mean(spectral_rolloff)
    features['spectral_rolloff_std'] = np.std(spectral_rolloff)

    # 6. Speaking Rate: count of non-silent intervals per second of audio.
    duration = librosa.get_duration(y=y, sr=sr)
    voiced_intervals = librosa.effects.split(y, top_db=20)  # split on silence
    features['speaking_rate'] = len(voiced_intervals) / duration if duration > 0 else 0

    # 7. Harmonic-to-Noise Ratio
    try:
        sound = parselmouth.Sound(audio_path)
        harmonicity = call(sound, "To Harmonicity (cc)", 0.01, 75, 0.1, 1.0)

        # Praat stores -200 dB as a placeholder for silent/unvoiced frames.
        # Averaging those placeholders with real values drags the mean far
        # below zero and inflates the std, so read the frames directly (no
        # interpolation that could blend a placeholder into its neighbours)
        # and keep only genuine measurements.
        silent_db = -200.0
        hnr_values = np.asarray(harmonicity.values, dtype=float).ravel()
        hnr_values = hnr_values[np.isfinite(hnr_values) & (hnr_values > silent_db)]

        features["hnr_mean"] = float(np.mean(hnr_values)) if hnr_values.size > 0 else 0
        features["hnr_std"] = float(np.std(hnr_values)) if hnr_values.size > 1 else 0

    except Exception as e:
        # Best effort: one unreadable file should not abort a batch run.
        print(f"Error calculating HNR: {e}")
        features["hnr_mean"] = 0
        features["hnr_std"] = 0
    return features

In [None]:
# Sanity-check the extractor on the first row's 'audio' clip.
# Note: the batch loop that follows reads the 'augment' column instead.
extract_audio_features(df['audio'][0])

{'pitch_mean': 1458.9489,
 'pitch_std': 1122.4113,
 'spectral_centroid_mean': 2528.35677489655,
 'spectral_centroid_std': 2069.900019920371,
 'zcr_mean': 0.12556875218226257,
 'zcr_std': 0.16939099511939446,
 'rms_mean': 0.08379578,
 'rms_std': 0.06489106,
 'spectral_rolloff_mean': 4404.384165502794,
 'spectral_rolloff_std': 3283.588469842132,
 'speaking_rate': 1.310272536687631,
 'hnr_mean': -59.82541865716342,
 'hnr_std': 100.61475483803599}

In [None]:
# Column order of the output table: sample id, the acoustic features, then the label.
columns = ['index','pitch_mean', 'pitch_std', 'spectral_centroid_mean', 'spectral_centroid_std',
           'zcr_mean', 'zcr_std', 'rms_mean', 'rms_std', 'spectral_rolloff_mean',
           'spectral_rolloff_std', 'speaking_rate', 'hnr_mean', 'hnr_std','hate']

# A row with a missing 'index' takes the id of the row this many positions
# earlier, plus 0.1 so the two ids stay distinct.
# NOTE(review): this assumes such rows only occur at position >= 1000 — confirm
# against how the CSV was assembled; an earlier row raises KeyError.
AUGMENT_ROW_OFFSET = 1000

# Collect one record per clip and build the DataFrame once at the end;
# appending row by row re-allocates the frame on every iteration.
records = []
for row_idx, row in df.iterrows():
    if pd.isna(row['index']):
        sample_id = df['index'][row_idx - AUGMENT_ROW_OFFSET] + 0.1
    else:
        sample_id = row['index']
    audio_path = row['augment']
    features = extract_audio_features(audio_path)
    features['index'] = sample_id  # named sample_id so the builtin id() is not shadowed
    features['hate'] = row['hate']
    records.append(features)

# `columns=` fixes the column order regardless of the dict key order.
feature_df = pd.DataFrame(records, columns=columns)

In [None]:
# Persist the extracted features to Drive.
# NOTE(review): to_csv writes the DataFrame index by default, so reading this
# file back yields an extra unnamed first column; pass index=False if no
# downstream reader relies on it.
feature_df.to_csv("/content/drive/MyDrive/IPD/code/audio/feature_extracted.csv")

In [None]:
# Preview the first rows of the extracted feature table.
feature_df.head()

Unnamed: 0,index,pitch_mean,pitch_std,spectral_centroid_mean,spectral_centroid_std,zcr_mean,zcr_std,rms_mean,rms_std,spectral_rolloff_mean,spectral_rolloff_std,speaking_rate,hnr_mean,hnr_std,hate
0,48.0,1947.130737,1143.748535,4378.746982,1410.851612,0.257575,0.187818,0.088702,0.060642,8825.233502,1323.673214,1.4413,-62.506859,98.441387,1.0
1,84.0,1591.703857,1099.045532,3034.581428,1378.520063,0.138716,0.139052,0.068526,0.042545,6412.378772,2409.543056,0.810636,-36.677607,88.585121,1.0
2,126.0,1188.561035,950.071716,1893.726573,1256.04165,0.072548,0.074697,0.037494,0.026806,3655.643439,2759.555598,0.971251,-61.680579,97.244589,1.0
3,298.0,1519.516846,994.942383,3255.370323,2274.918026,0.166019,0.177654,0.0666,0.068555,5721.10948,3701.30765,1.893939,-107.198889,106.638173,1.0
4,454.0,1457.022949,1060.380127,2268.857176,1897.60337,0.117746,0.140863,0.07651,0.058947,4075.81865,3445.456428,2.245509,-77.738639,107.04361,1.0


In [None]:
# (rows, columns) of the feature table.
feature_df.shape

(2000, 15)

## Applying Classification Models on Features

In [None]:
import torch
from transformers import Wav2Vec2Processor, Wav2Vec2Model
import torchaudio
import numpy as np

# Load the pretrained wav2vec 2.0 processor and model from one shared
# checkpoint name so the two can never drift apart.
WAV2VEC2_CHECKPOINT = "facebook/wav2vec2-base-960h"

processor = Wav2Vec2Processor.from_pretrained(WAV2VEC2_CHECKPOINT)
model = Wav2Vec2Model.from_pretrained(WAV2VEC2_CHECKPOINT)

preprocessor_config.json:   0%|          | 0.00/159 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/163 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.60k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/291 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/378M [00:00<?, ?B/s]

Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Embed a single clip with wav2vec 2.0 and mean-pool over time to obtain one
# fixed-size vector for the clip.
TARGET_SAMPLE_RATE = 16000  # rate the audio is resampled to before it reaches the processor

audio_path = df['augment'][0]
waveform, sample_rate = librosa.load(audio_path, sr=None)  # sr=None keeps the original sampling rate

# Resample the audio to the target rate if it's not already at that rate
if sample_rate != TARGET_SAMPLE_RATE:
    waveform = librosa.resample(waveform, orig_sr=sample_rate, target_sr=TARGET_SAMPLE_RATE)
    sample_rate = TARGET_SAMPLE_RATE

# Preprocess the audio to the required format (PyTorch tensors)
inputs = processor(waveform, return_tensors="pt", sampling_rate=sample_rate)

# Forward pass through the model; no gradients are needed for feature extraction
with torch.no_grad():
    outputs = model(**inputs)

# Hidden states from the last layer, shaped (batch, time, hidden)
embeddings = outputs.last_hidden_state

# Drop only the batch axis. A bare squeeze() would also remove the time axis
# whenever it has length 1, and the mean below would then reduce over the
# hidden units instead of over time.
embeddings_np = embeddings.squeeze(0).numpy()

# Aggregate over time steps (axis 0) to get one vector per clip
embedding_mean = np.mean(embeddings_np, axis=0)

print(embedding_mean)  # This will give you the aggregated embedding vector

[-4.18219678e-02  2.92005530e-03 -4.47658189e-02 -5.72414435e-02
  1.01601198e-01 -4.87774163e-02  4.06140089e-02 -4.08355072e-02
  5.93224689e-02 -2.16457620e-01 -1.40266158e-02 -9.77613591e-03
  6.03549629e-02  4.85540032e-02 -1.14686400e-01 -5.29419295e-02
 -4.94702995e-01  2.83650696e-01  1.76958684e-02  6.94755614e-02
 -1.60323784e-01  1.56713892e-02  4.25618559e-01  3.18456907e-03
  9.99013335e-02  1.62466578e-02 -4.66524094e-01  4.21501733e-02
 -7.39853922e-03 -2.08267018e-01  2.20335618e-01 -1.00966813e-02
 -1.19996302e-01 -9.55860615e-02 -2.78904438e-01 -9.87050124e-03
 -7.97200128e-02 -1.57317743e-01 -1.62566707e-01  5.13353981e-02
 -7.88570791e-02 -2.08332330e-01 -1.52300239e-01  2.23618239e-01
 -2.48069167e-01  9.68412757e-02 -2.59894598e-02 -9.82940849e-03
  1.24906795e-02  2.79037096e-02 -7.69215599e-02 -2.45796759e-02
 -9.93796736e-02  4.13622931e-02  8.30868073e-03 -2.97545455e-02
  1.02410316e-02 -4.86629188e-01 -8.91128480e-02 -1.36182234e-01
 -1.44184232e-01 -1.08772