# SHL Intern Hiring Assessment 2025
## Grammar Scoring Engine for Spoken Audio

**Pipeline:** Whisper → Grammar Features → Wav2Vec2 → LightGBM → Submission

By Kunal Waghe

GitHub: https://github.com/KunalWaghe

---

## 1. Imports and Environment Setup

In [1]:

import os
import numpy as np
import pandas as pd
import torch
import librosa


## 2. Dataset Paths

In [2]:

# Root folder that holds train/, test/, train.csv and test.csv.
# Set the SHL_DATA_ROOT environment variable to run the notebook on another
# machine; without it the original local path is used.
DATA_ROOT = os.environ.get("SHL_DATA_ROOT", r"D:\SHL")

# Folders containing the audio recordings.
TRAIN_AUDIO_FOLDER = os.path.join(DATA_ROOT, "train")
TEST_AUDIO_FOLDER  = os.path.join(DATA_ROOT, "test")

# CSV files listing the recordings (and, for train, the labels).
TRAIN_CSV_PATH = os.path.join(DATA_ROOT, "train.csv")
TEST_CSV_PATH  = os.path.join(DATA_ROOT, "test.csv")


## 3. Load CSV Files

In [3]:

# Read the file listings and give each frame an empty "transcript" column
# that a later cell fills with the speech-to-text output.
train_df = pd.read_csv(TRAIN_CSV_PATH).assign(transcript="")
test_df = pd.read_csv(TEST_CSV_PATH).assign(transcript="")


## 4. Whisper Setup (CPU or GPU)

In [4]:

import imageio_ffmpeg
import whisper

# Put the folder of the ffmpeg binary bundled with imageio-ffmpeg on PATH.
ffmpeg_path = imageio_ffmpeg.get_ffmpeg_exe()
ffmpeg_dir = os.path.dirname(ffmpeg_path)

# Append only once, so re-running this cell does not keep growing PATH, and
# tolerate an environment where PATH is not set at all.
current_path = os.environ.get("PATH", "")
if ffmpeg_dir not in current_path.split(os.pathsep):
    os.environ["PATH"] = os.pathsep.join(p for p in (current_path, ffmpeg_dir) if p)
# NOTE(review): Whisper presumably invokes an executable literally named
# "ffmpeg"; the bundled binary may carry a versioned file name. If every
# transcript comes back empty, confirm that `ffmpeg` resolves on PATH.

# Use the GPU when one is available; the same flag later selects fp16 decoding.
device = "cuda" if torch.cuda.is_available() else "cpu"
model_whisper = whisper.load_model("base", device=device)


## 5. Whisper Transcription Function

In [5]:

def whisper_transcribe(filename, audio_folder):
    """Transcribe the ``.wav`` recording that belongs to ``filename``.

    Parameters
    ----------
    filename : str or any
        Identifier of the recording, with or without the ``.wav`` extension.
        It is converted with ``str()`` before matching.
    audio_folder : str
        Folder that is searched for the recording.

    Returns
    -------
    str
        The stripped Whisper transcript, or ``""`` when no matching file is
        found or transcription fails (a warning is emitted in the latter case).
    """
    import warnings

    filename = str(filename)

    # Sort so the choice is deterministic; os.listdir order is arbitrary.
    wav_files = sorted(
        f for f in os.listdir(audio_folder) if f.lower().endswith(".wav")
    )

    # Prefer an exact name/stem match so "audio_1" cannot resolve to
    # "audio_10.wav"; fall back to the original prefix match otherwise.
    exact = [
        f for f in wav_files
        if f == filename or os.path.splitext(f)[0] == filename
    ]
    candidates = exact or [f for f in wav_files if f.startswith(filename)]
    if not candidates:
        return ""

    audio_path = os.path.join(audio_folder, candidates[0])
    try:
        result = model_whisper.transcribe(audio_path, fp16=(device == "cuda"))
        return result["text"].strip()
    except Exception as exc:
        # Stay best-effort (empty transcript) but make the failure visible;
        # a bare `except:` would also swallow KeyboardInterrupt.
        warnings.warn(
            f"Whisper failed on {audio_path}: {exc!r}; using empty transcript"
        )
        return ""


## 6. Transcribe Train and Test Data

In [6]:

import warnings


def _load_or_transcribe(df, audio_folder, cache_path):
    """Return one transcript per row of ``df``, reusing ``cache_path`` if valid.

    The cached CSV is reused only when it lists the same filenames in the same
    order and contains at least one non-empty transcript; otherwise every file
    is transcribed again with ``whisper_transcribe``.
    """
    if os.path.exists(cache_path):
        cached = pd.read_csv(cache_path)
        if {"filename", "transcript"}.issubset(cached.columns):
            same_files = (
                cached["filename"].astype(str).tolist()
                == df["filename"].astype(str).tolist()
            )
            # read_csv turns empty cells into NaN; restore them to "".
            texts = cached["transcript"].fillna("").astype(str)
            if same_files and texts.str.strip().ne("").any():
                return pd.Series(texts.to_numpy(), index=df.index)
    return df["filename"].apply(
        lambda name: whisper_transcribe(name, audio_folder)
    )


train_df["transcript"] = _load_or_transcribe(
    train_df, TRAIN_AUDIO_FOLDER, "train_with_transcripts.csv"
)
test_df["transcript"] = _load_or_transcribe(
    test_df, TEST_AUDIO_FOLDER, "test_with_transcripts.csv"
)

# Empty transcripts make every text feature zero downstream, so surface it.
if not train_df["transcript"].astype(str).str.strip().ne("").any():
    warnings.warn(
        "All training transcripts are empty; check the audio folder and ffmpeg setup."
    )

# Persist the transcripts so a fresh kernel can skip the slow Whisper pass.
train_df.to_csv("train_with_transcripts.csv", index=False)
test_df.to_csv("test_with_transcripts.csv", index=False)


## 7. Grammar and Text Feature Extraction

In [5]:

import nltk
import language_tool_python
import textstat
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk import pos_tag

# Tokenizer and POS-tagger data needed by sent_tokenize / word_tokenize /
# pos_tag. Both the legacy names and the names used by newer NLTK releases are
# requested, so this cell is sufficient on its own.
for _resource in (
    "punkt",
    "punkt_tab",
    "averaged_perceptron_tagger",
    "averaged_perceptron_tagger_eng",
):
    nltk.download(_resource, quiet=True)

# Grammar checker used to count rule violations in each transcript.
tool = language_tool_python.LanguageTool("en-US")

def extract_text_features(text):
    """Turn one transcript into a 14-element numeric feature vector.

    Non-string or blank input yields 14 zeros. Otherwise the vector holds, in
    order: sentence count, token count, mean and standard deviation of tokens
    per sentence, LanguageTool match count, matches per token, noun / verb /
    adjective tag ratios, filler-word ratio, Flesch reading ease, tokens per
    sentence, comma count and period count.
    """
    if not isinstance(text, str) or not text.strip():
        return np.zeros(14)

    sentence_list = sent_tokenize(text)
    tokens = word_tokenize(text)
    tagged = pos_tag(tokens)

    sentence_count = len(sentence_list)
    token_count = len(tokens)
    tokens_per_sentence = [len(word_tokenize(sentence)) for sentence in sentence_list]
    error_count = len(tool.check(text))

    # Denominators are clamped to 1 so the ratios never divide by zero.
    tag_total = max(len(tagged), 1)
    token_total = max(token_count, 1)

    def tag_share(prefix):
        """Fraction of tagged tokens whose POS tag starts with ``prefix``."""
        matching = [tag for _, tag in tagged if tag.startswith(prefix)]
        return len(matching) / tag_total

    fillers = {"uh", "um", "er", "ah"}
    filler_hits = [tok for tok in tokens if tok.lower() in fillers]

    if tokens_per_sentence:
        length_mean = np.mean(tokens_per_sentence)
        length_std = np.std(tokens_per_sentence)
    else:
        length_mean = 0
        length_std = 0

    feature_values = [
        sentence_count,
        token_count,
        length_mean,
        length_std,
        error_count,
        error_count / token_total,
        tag_share("NN"),
        tag_share("VB"),
        tag_share("JJ"),
        len(filler_hits) / token_total,
        textstat.flesch_reading_ease(text),
        token_count / max(sentence_count, 1),
        text.count(","),
        text.count("."),
    ]
    return np.array(feature_values)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\knlwa\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\knlwa\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


## 8. Extract Text Features

In [6]:
import nltk

# Fetch the NLTK data packages by name; "punkt_tab" is requested here in
# addition to "punkt" and the perceptron tagger.
nltk.download("punkt")
nltk.download("punkt_tab")
# Last expression of the cell, so its return value is what the cell displays.
nltk.download("averaged_perceptron_tagger")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\knlwa\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\knlwa\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\knlwa\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [7]:

import warnings

# One 14-column feature row per transcript, stacked into an (n_rows, 14) matrix.
# A plain list is passed to np.vstack rather than a pandas Series.
X_text_train = np.vstack(
    [extract_text_features(text) for text in train_df["transcript"]]
)
X_text_test = np.vstack(
    [extract_text_features(text) for text in test_df["transcript"]]
)

# extract_text_features returns all zeros for blank transcripts; if every row
# is zero the text branch adds nothing to the model, so make that visible.
if not np.any(X_text_train):
    warnings.warn(
        "All training text features are zero; the transcripts are probably "
        "empty (was the transcription cell run, and did Whisper succeed?)."
    )


## 9. Wav2Vec2 Audio Feature Extraction (Mean + Std Pooling)

In [8]:

from transformers import Wav2Vec2Processor, Wav2Vec2Model

# Load the pretrained processor and encoder from the same checkpoint.
_W2V_CHECKPOINT = "facebook/wav2vec2-base-960h"

processor = Wav2Vec2Processor.from_pretrained(_W2V_CHECKPOINT)

wav2vec = Wav2Vec2Model.from_pretrained(_W2V_CHECKPOINT)
wav2vec = wav2vec.to(device)
wav2vec.eval()  # inference mode
# On GPU the encoder is converted to half precision.
wav2vec = wav2vec.half() if device == "cuda" else wav2vec

def extract_wav2vec_embedding(filename, audio_folder):
    """Embed one recording as mean+std pooled Wav2Vec2 hidden states.

    Parameters
    ----------
    filename : str or any
        Identifier of the recording, with or without the ``.wav`` extension.
        It is converted with ``str()`` before matching.
    audio_folder : str
        Folder that is searched for the recording.

    Returns
    -------
    numpy.ndarray
        Flat vector: the mean over dim 1 of the last hidden state followed by
        the std over dim 1. A zero vector of length 1536 is returned when no
        matching ``.wav`` file exists.
    """
    filename = str(filename)

    # Sort so the choice is deterministic; os.listdir order is arbitrary.
    wav_files = sorted(
        f for f in os.listdir(audio_folder) if f.lower().endswith(".wav")
    )

    # Prefer an exact name/stem match so "audio_1" cannot resolve to
    # "audio_10.wav"; fall back to the original prefix match otherwise.
    exact = [
        f for f in wav_files
        if f == filename or os.path.splitext(f)[0] == filename
    ]
    candidates = exact or [f for f in wav_files if f.startswith(filename)]
    if not candidates:
        return np.zeros(1536)

    audio_path = os.path.join(audio_folder, candidates[0])
    speech, _ = librosa.load(audio_path, sr=16000)

    inputs = processor(
        speech, sampling_rate=16000,
        return_tensors="pt", padding=True
    )

    input_values = inputs.input_values.to(device)
    if device == "cuda":
        # Match the half-precision weights used on GPU.
        input_values = input_values.half()

    with torch.no_grad():
        outputs = wav2vec(input_values)

    # Pool in float32: reducing directly in fp16 loses precision.
    # presumably shaped (1, frames, hidden) - TODO confirm
    hidden = outputs.last_hidden_state.float()
    mean = hidden.mean(dim=1)
    if hidden.shape[1] > 1:
        std = hidden.std(dim=1)
    else:
        # The sample std of a single frame is NaN; use 0 instead.
        std = torch.zeros_like(mean)

    return torch.cat([mean, std], dim=1).cpu().numpy().flatten()


  from .autonotebook import tqdm as notebook_tqdm
Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## 10. Extract and Cache Audio Features

In [9]:

def _load_or_extract_audio_features(df, audio_folder, cache_path):
    """Return the embedding matrix for ``df``, reusing ``cache_path`` if valid.

    A cached ``.npy`` file is reused only when it is 2-D and has one row per
    row of ``df``; otherwise every file is embedded again and the result is
    saved. Delete the ``.npy`` file to force a recomputation.
    """
    if os.path.exists(cache_path):
        cached = np.load(cache_path)
        if cached.ndim == 2 and cached.shape[0] == len(df):
            return cached

    features = np.vstack(
        [extract_wav2vec_embedding(name, audio_folder) for name in df["filename"]]
    )
    np.save(cache_path, features)
    return features


X_audio_train = _load_or_extract_audio_features(
    train_df, TRAIN_AUDIO_FOLDER, "X_audio_train_v2.npy"
)

X_audio_test = _load_or_extract_audio_features(
    test_df, TEST_AUDIO_FOLDER, "X_audio_test_v2.npy"
)


  speech, _ = librosa.load(audio_path, sr=16000)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)
  speech, _ = librosa.load(audio_path, sr=16000)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


## 11. Feature Fusion and Scaling

In [10]:

from sklearn.preprocessing import StandardScaler

# Join audio and text features column-wise (audio columns first).
X_train = np.concatenate((X_audio_train, X_text_train), axis=1)
X_test = np.concatenate((X_audio_test, X_text_test), axis=1)

# Regression target.
y = train_df["label"].values

# Standardise with statistics learned from the training matrix only, then
# apply the same transform to the test matrix.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


## 12. LightGBM Training with Cross-Validation

In [11]:

import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from scipy.stats import pearsonr

# 5-fold shuffled cross-validation with a fixed seed.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Hyper-parameters shared by every fold's regressor.
cv_params = {
    "n_estimators": 1200,
    "learning_rate": 0.02,
    "num_leaves": 32,
    "min_data_in_leaf": 20,
    "feature_fraction": 0.6,
    "bagging_fraction": 0.8,
    "bagging_freq": 1,
    "lambda_l1": 1.0,
    "lambda_l2": 1.0,
    "random_state": 42,
}

rmses, pears = [], []

for fold, (train_idx, val_idx) in enumerate(kf.split(X_train_scaled)):
    # `model` is rebound on every iteration, so after the loop it refers to
    # the regressor fitted on the last fold.
    model = lgb.LGBMRegressor(**cv_params)
    model.fit(X_train_scaled[train_idx], y[train_idx])

    fold_truth = y[val_idx]
    fold_preds = model.predict(X_train_scaled[val_idx])

    fold_rmse = np.sqrt(mean_squared_error(fold_truth, fold_preds))
    fold_corr, _ = pearsonr(fold_truth, fold_preds)
    rmses.append(fold_rmse)
    pears.append(fold_corr)

# Report the per-fold metrics averaged over the folds.
print("CV RMSE:", np.mean(rmses))
print("CV Pearson:", np.mean(pears))


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.008463 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 159810
[LightGBM] [Info] Number of data points in the train set: 327, number of used features: 1536
[LightGBM] [Info] Start training from score 2.868502
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.008925 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 160178




[LightGBM] [Info] Number of data points in the train set: 327, number of used features: 1536
[LightGBM] [Info] Start training from score 2.958716




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007592 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 159866
[LightGBM] [Info] Number of data points in the train set: 327, number of used features: 1536
[LightGBM] [Info] Start training from score 2.879205
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007249 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 160184
[LightGBM] [Info] Number of data points in the train set: 327, number of used features: 1536
[LightGBM] [Info] Start training from score 2.925076








[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009390 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 160612
[LightGBM] [Info] Number of data points in the train set: 328, number of used features: 1536
[LightGBM] [Info] Start training from score 2.922256
CV RMSE: 0.6849444340411905
CV Pearson: 0.4459711394986002




## 13. Feature Importance Pruning

In [12]:

# Rank the columns by the importance reported by `model` (whatever regressor
# that name is currently bound to) and keep the 400 highest-ranked ones.
importances = model.feature_importances_
ranked_columns = np.argsort(importances)[::-1]
idx = ranked_columns[:400]

# Apply the same column selection to both matrices.
X_train_pruned, X_test_pruned = X_train_scaled[:, idx], X_test_scaled[:, idx]


## 14. Train Final Model and Generate Submission

In [13]:

# Hyper-parameters of the final regressor.
final_params = {
    "n_estimators": 1200,
    "learning_rate": 0.02,
    "num_leaves": 32,
    "min_data_in_leaf": 20,
    "lambda_l1": 1.0,
    "lambda_l2": 1.0,
    "random_state": 42,
}

# Fit on every training row, using only the pruned feature columns.
final_model = lgb.LGBMRegressor(**final_params)
final_model.fit(X_train_pruned, y)

# Predict and clamp the scores to the closed interval [1, 5].
raw_test_preds = final_model.predict(X_test_pruned)
test_preds = np.clip(raw_test_preds, 1, 5)

# Two-column submission: filename and predicted label.
submission = test_df[["filename"]].assign(label=test_preds)
submission.to_csv("submission.csv", index=False)

print("submission.csv generated")


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003968 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 52543
[LightGBM] [Info] Number of data points in the train set: 409, number of used features: 400
[LightGBM] [Info] Start training from score 2.910758
submission.csv generated


