In [None]:
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
import torch
import torchaudio
from torch.utils.data import Dataset, DataLoader
from transformers import Wav2Vec2Processor, Wav2Vec2Model
from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from scipy.stats import pearsonr
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the training table and the test table from the competition dataset.
traindf, testdf = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/shl-intern-hiring-assessment/Dataset/train.csv',
        '/kaggle/input/shl-intern-hiring-assessment/Dataset/test.csv',
    )
)

In [None]:
# Directories that hold the audio clips referenced by the train / test tables.
trainaud, testaud = (
    '/kaggle/input/shl-intern-hiring-assessment/Dataset/audios/train',
    '/kaggle/input/shl-intern-hiring-assessment/Dataset/audios/test',
)

In [None]:
# Pretrained wav2vec2 processor and encoder; the encoder is only used for
# inference, so it is switched to eval mode right away.
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base-960h").eval()
model

In [None]:
# Run the encoder on the GPU whenever one is present; otherwise leave it as is.
model = model.cuda() if torch.cuda.is_available() else model

In [None]:
def extract_embedding(file_path):
    """Return a fixed-size wav2vec2 embedding for one audio file.

    The clip is loaded with torchaudio, reduced to its first channel,
    resampled to the rate the processor's feature extractor is configured
    for, and passed through the module-level ``model``. The final hidden
    states of the single clip are then averaged over the time axis.

    Args:
        file_path: Path to an audio file readable by ``torchaudio.load``.

    Returns:
        1-D ``numpy.ndarray`` with the time-averaged ``last_hidden_state``.
    """
    speecharr, sampling_rate = torchaudio.load(file_path)

    # The wav2vec2 feature extractor raises if the rate it is given differs
    # from the rate it was configured with, so bring every clip to that rate
    # instead of forwarding the file's native rate.
    target_rate = processor.feature_extractor.sampling_rate
    if sampling_rate != target_rate:
        speecharr = torchaudio.functional.resample(speecharr, sampling_rate, target_rate)

    speech = speecharr[0].numpy()  # only the first channel is used
    inputs = processor(speech, sampling_rate=target_rate, return_tensors="pt", padding=True)

    # Follow the device the model actually lives on rather than re-checking
    # CUDA availability: the two disagree when a GPU exists but the model was
    # never moved to it.
    device = next(model.parameters()).device
    with torch.no_grad():
        inputs = {k: v.to(device) for k, v in inputs.items()}
        outputs = model(**inputs)
    return outputs.last_hidden_state[0].mean(dim=0).cpu().numpy()
train_embeddings=[]

In [None]:
# Build the list from scratch in this cell. Appending to a list created in a
# different cell meant that re-running the cell added a second copy of every
# embedding, leaving X_train longer than (and misaligned with) y_train.
train_embeddings = [
    extract_embedding(os.path.join(trainaud, file))
    for file in tqdm(traindf['filename'])
]

In [None]:
# Regression targets from the training table, and the feature matrix formed
# by stacking the per-clip embeddings row-wise.
y_train = traindf['label'].values
X_train = np.stack(train_embeddings)

In [None]:
# Fit a ridge regressor on all training embeddings (fit() returns the fitted
# estimator itself) and keep its in-sample predictions for the diagnostics.
regressor = Ridge(alpha=1.0).fit(X_train, y_train)
train_preds = regressor.predict(X_train)

In [None]:
# In-sample fit quality. RMSE is computed as sqrt(MSE) because the
# `squared=False` argument of mean_squared_error was deprecated in
# scikit-learn 1.4 and later removed, where passing it raises a TypeError.
rmse = np.sqrt(mean_squared_error(y_train, train_preds))
pearson = pearsonr(y_train, train_preds)[0]
print(f"Train RMSE: {rmse:.4f}")
print(f"Train Pearson Correlation: {pearson:.4f}")

In [None]:
# Actual vs. predicted scores with a fitted regression line (no confidence band).
fig_fit, ax_fit = plt.subplots(figsize=(6, 6))
sns.regplot(x=y_train, y=train_preds, ci=None, ax=ax_fit)
ax_fit.set_xlabel("Actual Score")
ax_fit.set_ylabel("Predicted Score")
ax_fit.set_title("Actual vs Predicted Grammar Score")
ax_fit.grid(True)
plt.show()

# Histogram (with KDE overlay) of the in-sample prediction errors.
residuals = y_train - train_preds
fig_res, ax_res = plt.subplots(figsize=(6, 4))
sns.histplot(residuals, kde=True, ax=ax_res)
ax_res.set_title("Residual Distribution")
ax_res.set_xlabel("Error")
plt.show()

In [None]:
# Per-fold score accumulators and a shuffled, seeded 5-way splitter.
rmse_scores = []
pearson_scores = []
kf = KFold(random_state=42, shuffle=True, n_splits=5)

In [None]:
# Start from empty score lists in this cell so that re-running it does not
# mix scores from a previous run into the reported mean / std.
rmse_scores, pearson_scores = [], []

for train_idx, val_idx in kf.split(X_train):
    X_tr, X_val = X_train[train_idx], X_train[val_idx]
    y_tr, y_val = y_train[train_idx], y_train[val_idx]

    # Fresh model per fold, fitted only on that fold's training part.
    reg = Ridge(alpha=1.0)
    reg.fit(X_tr, y_tr)
    preds = reg.predict(X_val)

    # sqrt(MSE) instead of `squared=False`, which was deprecated in
    # scikit-learn 1.4 and later removed.
    rmse_scores.append(np.sqrt(mean_squared_error(y_val, preds)))
    pearson_scores.append(pearsonr(y_val, preds)[0])

print(f"\n5-Fold CV RMSE: {np.mean(rmse_scores):.4f} ± {np.std(rmse_scores):.4f}")
print(f"5-Fold CV Pearson: {np.mean(pearson_scores):.4f} ± {np.std(pearson_scores):.4f}")

In [None]:
# Embed every test clip that exists on disk. Clips listed in the table but
# absent from the audio directory are reported and left out, and
# used_filenames records which rows actually received an embedding.
test_embeddings, used_filenames = [], []
for fname in tqdm(testdf['filename']):
    audio_path = os.path.join(testaud, fname)
    if os.path.exists(audio_path):
        test_embeddings.append(extract_embedding(audio_path))
        used_filenames.append(fname)
    else:
        print(f"Missing: {fname}")

In [None]:
# Predict scores for the embedded test clips with the ridge model fitted on
# the full training set. `test_preds` was previously never assigned in the
# notebook, so this cell failed with a NameError on a fresh run.
test_preds = regressor.predict(np.stack(test_embeddings))

submission = pd.DataFrame({
    'filename': used_filenames,
    'label': test_preds
})

# Keep the log message tied to the path that is actually written.
submission_path = 'submission3.csv'
submission.to_csv(submission_path, index=False)
print(f"{submission_path} saved")