In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input tree and print the full path of every file in it.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/shl-intern-hiring-assessment/dataset/sample_submission.csv
/kaggle/input/shl-intern-hiring-assessment/dataset/train.csv
/kaggle/input/shl-intern-hiring-assessment/dataset/test.csv
/kaggle/input/shl-intern-hiring-assessment/dataset/audios_test/audio_885.wav
/kaggle/input/shl-intern-hiring-assessment/dataset/audios_test/audio_698.wav
/kaggle/input/shl-intern-hiring-assessment/dataset/audios_test/audio_1176.wav
/kaggle/input/shl-intern-hiring-assessment/dataset/audios_test/audio_1215.wav
/kaggle/input/shl-intern-hiring-assessment/dataset/audios_test/audio_66.wav
/kaggle/input/shl-intern-hiring-assessment/dataset/audios_test/audio_386.wav
/kaggle/input/shl-intern-hiring-assessment/dataset/audios_test/audio_1026.wav
/kaggle/input/shl-intern-hiring-assessment/dataset/audios_test/audio_330.wav
/kaggle/input/shl-intern-hiring-assessment/dataset/audios_test/audio_72.wav
/kaggle/input/shl-intern-hiring-assessment/dataset/audios_test/audio_858.wav
/kaggle/input/shl-intern-hiring-ass

# Using OpenAI's Whisper for grammar scoring

## Brief Report

### Objective
The goal is to predict grammar scores (1 to 5) for spoken English audio samples using a machine learning pipeline. The evaluation metric is Pearson correlation.

### Approach Overview
1. **Transcription**: Use OpenAI's Whisper model to transcribe the audio.
2. **Feature Engineering**: Extract handcrafted linguistic features from the transcribed text.
3. **Regression Model**: Use a Gradient Boosting Regressor to predict grammar scores based on features.
 
### Preprocessing Steps
- Audio is loaded using `librosa` at a 16 kHz sampling rate.
- Whisper is used for transcription in English.
- Extracted features include word count, unique word ratio, grammatical constructs, etc.
 
### Model & Pipeline Architecture
- Whisper for transcription.
- Feature engineering from transcripts.
- Gradient Boosting Regressor (200 estimators, learning rate 0.05).

### Evaluation
- Predictions on a held-out 20% validation split of the training data are evaluated using Pearson correlation.


## Importing libraries and initial set up

In [2]:
!pip install -q transformers librosa pandas scikit-learn torch tqdm
!sudo apt install -y ffmpeg

import string
import warnings

import librosa
import numpy as np
import pandas as pd
import torch
from scipy.stats import pearsonr
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from tqdm.auto import tqdm
from transformers import WhisperProcessor, WhisperForConditionalGeneration

# Suppress every warning for the rest of the session to keep cell output readable.
warnings.filterwarnings(action="ignore")

# Run on the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
ffmpeg is already the newest version (7:4.4.2-0ubuntu0.22.04.1).
0 upgraded, 0 newly installed, 0 to remove and 129 not upgraded.


## Loading Whisper for transcription and creating the scoring model

In [3]:
# Whisper for transcription: the processor and the model are loaded from the same
# checkpoint, and the model is moved onto the selected device.
WHISPER_CHECKPOINT = "openai/whisper-small"
whisper_processor = WhisperProcessor.from_pretrained(WHISPER_CHECKPOINT)
whisper_model = WhisperForConditionalGeneration.from_pretrained(WHISPER_CHECKPOINT)
whisper_model = whisper_model.to(device)
# Decoder prompt ids requesting English transcription; passed to generate() later.
forced_decoder_ids = whisper_processor.get_decoder_prompt_ids(task="transcribe", language="en")

# GBR for scoring. random_state is fixed (same seed as the train/validation split)
# so that repeated runs of the notebook fit the same model.
scoring_model = GradientBoostingRegressor(n_estimators=200, learning_rate=0.05, random_state=42)

preprocessor_config.json:   0%|          | 0.00/185k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/283k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/836k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.48M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.19k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.97k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/967M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/3.87k [00:00<?, ?B/s]

## Feature extraction function

 | Feature | Description |
 |--------|-------------|
 | `len(words)` | Total number of words – indicates fluency or verbosity |
 | `len(set(words)) / max(1, len(words))` | Lexical diversity – how many unique words are used |
 | `transcript.count('.')` | Approximate number of sentences – gives a hint of structure |
 | `sum(w.endswith('ing') for w in words)` | Count of words ending with '-ing' – suggests continuous tenses or gerunds |
 | `sum(w.endswith('ed') for w in words)` | Count of words ending with '-ed' – helps identify past tense verbs |
 | `sum(len(w) > 6 for w in words)` | Number of long words (length > 6) – may reflect vocabulary complexity |
 | `transcript.count('the')` | Frequency of the word 'the' – related to proper article usage |

In [4]:
def _transcribe(audio_path, chunk_seconds=30):
    """Transcribe one audio file to text with Whisper, one fixed-size window at a time.

    The Whisper feature extractor pads or truncates every input to a fixed
    30-second window by default, so a single pass over a longer recording would
    silently ignore everything after the first 30 seconds. The audio is therefore
    split into consecutive `chunk_seconds` windows that are transcribed
    separately and joined with spaces.
    """
    audio, sr = librosa.load(audio_path, sr=16000)
    chunk_len = int(chunk_seconds * sr)

    pieces = []
    # max(..., 1) keeps a single pass even for an empty clip, as the one-shot version did.
    for start in range(0, max(len(audio), 1), chunk_len):
        chunk = audio[start:start + chunk_len]
        inputs = whisper_processor(chunk, sampling_rate=sr, return_tensors="pt").to(device)
        with torch.no_grad():
            outputs = whisper_model.generate(**inputs, forced_decoder_ids=forced_decoder_ids)
        pieces.append(whisper_processor.decode(outputs[0], skip_special_tokens=True).strip())
    return " ".join(piece for piece in pieces if piece)


def _text_features(transcript):
    """Turn a transcript into the 7 handcrafted features used by the regressor."""
    # Lower-case and strip surrounding punctuation so that "Running." and "running"
    # are the same word; tokens that were pure punctuation are dropped.
    words = [token.strip(string.punctuation) for token in transcript.lower().split()]
    words = [word for word in words if word]
    return np.array([
        len(words),                                # total word count
        len(set(words)) / max(1, len(words)),      # unique-word ratio (0 for an empty transcript)
        transcript.count('.'),                     # rough sentence count
        sum(w.endswith('ing') for w in words),     # words ending in "-ing"
        sum(w.endswith('ed') for w in words),      # words ending in "-ed"
        sum(len(w) > 6 for w in words),            # long words (more than 6 characters)
        words.count('the'),                        # whole-word, case-insensitive count of "the"
    ])


def extract_features(audio_path):
    """Transcribe an audio file and return its linguistic feature vector.

    Parameters
    ----------
    audio_path : str
        Path of the audio file; it is loaded with librosa at 16 kHz.

    Returns
    -------
    numpy.ndarray
        Seven values, in order: word count, unique-word ratio, number of '.'
        characters, count of words ending in 'ing', count of words ending in
        'ed', count of words longer than 6 characters, count of the word 'the'.
    """
    return _text_features(_transcribe(audio_path))

## Loading the dataset, extracting features, and creating the train/validation split

In [None]:
train_df = pd.read_csv('/kaggle/input/shl-intern-hiring-assessment/dataset/train.csv')
test_df = pd.read_csv('/kaggle/input/shl-intern-hiring-assessment/dataset/test.csv')

# Train features: one feature vector per training audio file. This transcribes
# every file and is the expensive step, so the full matrix and labels are kept
# under their own names (X_all / y_all) instead of being overwritten by the split.
print("Extracting training features...")
X_all = np.array([extract_features(f"/kaggle/input/shl-intern-hiring-assessment/dataset/audios_train/{f}")
                  for f in tqdm(train_df['filename'])])
y_all = train_df['label'].values

# Train/val split: hold out 20% for validation, seeded for reproducibility.
X_train, X_val, y_train, y_val = train_test_split(X_all, y_all, test_size=0.2, random_state=42)

Extracting training features...


  0%|          | 0/444 [00:00<?, ?it/s]

## Train Gradient Boosting Regressor

In [None]:
# Fit the regressor on the training split, then score the held-out validation split.
scoring_model.fit(X_train, y_train)
val_pred = scoring_model.predict(X_val)

# pearsonr returns (correlation, p-value); only the correlation is reported.
val_pearson, _val_p_value = pearsonr(y_val, val_pred)
print(f"Validation Pearson: {val_pearson:.3f}")

## Processing test data and generating submission

In [None]:
print("Processing test set...")

# Build the feature matrix for the test audio, one row per file in test_df order.
test_audio_paths = [
    f"/kaggle/input/shl-intern-hiring-assessment/dataset/audios_test/{f}"
    for f in test_df['filename']
]
X_test = np.array([extract_features(audio_path) for audio_path in tqdm(test_audio_paths)])

# Predict, then clamp the predictions to the 1-5 range.
raw_test_pred = scoring_model.predict(X_test)
test_pred = np.clip(raw_test_pred, 1, 5)

# Write the submission file and show its first rows.
submission = pd.DataFrame({'filename': test_df['filename'], 'label': test_pred})
submission.to_csv("submission.csv", index=False)
print(submission.head())