In [12]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/shl-intern-hiring-assessment/dataset/sample_submission.csv
/kaggle/input/shl-intern-hiring-assessment/dataset/train.csv
/kaggle/input/shl-intern-hiring-assessment/dataset/test.csv
/kaggle/input/shl-intern-hiring-assessment/dataset/audios_test/audio_885.wav
/kaggle/input/shl-intern-hiring-assessment/dataset/audios_test/audio_698.wav
/kaggle/input/shl-intern-hiring-assessment/dataset/audios_test/audio_1176.wav
/kaggle/input/shl-intern-hiring-assessment/dataset/audios_test/audio_1215.wav
/kaggle/input/shl-intern-hiring-assessment/dataset/audios_test/audio_66.wav
/kaggle/input/shl-intern-hiring-assessment/dataset/audios_test/audio_386.wav
/kaggle/input/shl-intern-hiring-assessment/dataset/audios_test/audio_1026.wav
/kaggle/input/shl-intern-hiring-assessment/dataset/audios_test/audio_330.wav
/kaggle/input/shl-intern-hiring-assessment/dataset/audios_test/audio_72.wav
/kaggle/input/shl-intern-hiring-assessment/dataset/audios_test/audio_858.wav
/kaggle/input/shl-intern-hiring-ass

In [13]:
!pip install librosa pydub openai-whisper transformers sentencepiece torch pandas scikit-learn language-tool-python spacy nltk
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m81.8 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [14]:
import os
import librosa
import soundfile as sf
import pandas as pd
from tqdm.notebook import tqdm

## Audio Preprocessing

In [15]:
import os
import librosa
import soundfile as sf
import pandas as pd
from tqdm.notebook import tqdm

# Paths
AUDIO_DIR = '/kaggle/input/shl-intern-hiring-assessment/dataset/audios_train'
CSV_PATH = '/kaggle/input/shl-intern-hiring-assessment/dataset/train.csv'
PROCESSED_DIR = '/kaggle/working/processed_audio'
os.makedirs(PROCESSED_DIR, exist_ok=True)

# Load CSV and rename columns
train_df = pd.read_csv(CSV_PATH)
train_df.columns = ['filename', 'label']  # Rename for easier use

# Preprocessing function
def preprocess_audio(file_path, save_path, sr=16000):
    y, orig_sr = librosa.load(file_path, sr=None)
    if orig_sr != sr:
        y = librosa.resample(y, orig_sr, sr)
    y = y / max(abs(y))  # Normalise volume
    y, _ = librosa.effects.trim(y, top_db=25)  # Trim silence
    sf.write(save_path, y, sr)

# Preprocess each audio
for filename in tqdm(train_df['filename']):
    in_path = os.path.join(AUDIO_DIR, filename)
    out_path = os.path.join(PROCESSED_DIR, filename)
    preprocess_audio(in_path, out_path)

print("✅ Audio preprocessing completed. Files saved in:", PROCESSED_DIR)


  0%|          | 0/444 [00:00<?, ?it/s]

✅ Audio preprocessing completed. Files saved in: /kaggle/working/processed_audio


In [16]:
files = os.listdir('/kaggle/working/processed_audio')
print(f"🔎 Found {len(files)} preprocessed audio files.\nExample files:\n", files[:5])

# Check sample rate and duration of a random file
sample_file = os.path.join('/kaggle/working/processed_audio', files[0])
y, sr = librosa.load(sample_file, sr=None)

duration = librosa.get_duration(y=y, sr=sr)
print(f"📁 Sample file: {files[0]}")
print(f"🕒 Duration: {duration:.2f} seconds")
print(f"🎧 Sample rate: {sr} Hz")


🔎 Found 444 preprocessed audio files.
Example files:
 ['audio_297.wav', 'audio_77.wav', 'audio_836.wav', 'audio_413.wav', 'audio_504.wav']
📁 Sample file: audio_297.wav
🕒 Duration: 46.82 seconds
🎧 Sample rate: 16000 Hz


## Transcribe Audio with Whisper (base model)

In [17]:
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"🖥️ Using device: {device}")


🖥️ Using device: cuda


In [18]:
import whisper
from tqdm.notebook import tqdm
import pandas as pd
import os


# Load Whisper ASR model
model = whisper.load_model("base")  # Options: tiny, base, small, medium, large

# Transcribe and collect text
transcripts = []

for fname in tqdm(train_df['filename']):
    audio_path = os.path.join('/kaggle/working/processed_audio', fname)
    result = model.transcribe(audio_path, language='en')
    transcripts.append(result['text'])

# Add transcripts to dataframe
train_df['transcript'] = transcripts

# Save updated CSV
train_df.to_csv('/kaggle/working/train_with_transcripts.csv', index=False)
print("✅ Transcriptions saved to: /kaggle/working/train_with_transcripts.csv")


  checkpoint = torch.load(fp, map_location=device)


  0%|          | 0/444 [00:00<?, ?it/s]

✅ Transcriptions saved to: /kaggle/working/train_with_transcripts.csv


In [19]:
import pandas as pd

df = pd.read_csv('/kaggle/working/train_with_transcripts.csv')
print("🧾 Columns:", df.columns.tolist())
print("✅ Total records:", len(df))
print("🗣 Sample transcript:\n")
print(df[['filename', 'label', 'transcript']].head(3))


🧾 Columns: ['filename', 'label', 'transcript']
✅ Total records: 444
🗣 Sample transcript:

         filename  label                                         transcript
0  audio_1261.wav    1.0   My favorite hobby is cultivation of plants su...
1   audio_942.wav    1.5   the playground looks like very clear and neat...
2  audio_1110.wav    1.5   My goal is to become an electrical employee a...


In [20]:
# Check for empty transcripts
empty_transcripts = df['transcript'].str.strip().eq('').sum()
print(f"⚠️ Empty transcripts found: {empty_transcripts}")


⚠️ Empty transcripts found: 0


## Transcript Cleaning

In [21]:
import re

# List of common disfluencies and fillers
FILLERS = ['uh', 'um', 'erm', 'you know', 'like', 'i mean', 'hmm', 'ah', 'uhh', 'huh']

def clean_transcript(text):
    text = text.lower()  # Standard casing
    text = re.sub(r'\b(?:' + '|'.join(FILLERS) + r')\b', '', text)  # Remove fillers
    text = re.sub(r'\s+', ' ', text)  # Collapse multiple spaces
    text = re.sub(r'\s([?.!,"])', r'\1', text)  # Remove space before punctuation
    text = text.strip()
    return text

# Load previous data
df = pd.read_csv('/kaggle/working/train_with_transcripts.csv')

# Clean all transcripts
df['cleaned_transcript'] = df['transcript'].astype(str).apply(clean_transcript)

# Save new version
df.to_csv('/kaggle/working/train_cleaned.csv', index=False)
print("✅ Cleaned transcripts saved to: /kaggle/working/train_cleaned.csv")


✅ Cleaned transcripts saved to: /kaggle/working/train_cleaned.csv


In [22]:
print(df[['transcript', 'cleaned_transcript']].sample(3))


                                            transcript  \
212   When the school playground provides a safe ou...   
192   I'm not going to be able to do it. I'm not go...   
346   I'm trying to learn more about investing in a...   

                                    cleaned_transcript  
212  when the school playground provides a safe out...  
192  i'm not going to be able to do it. i'm not goi...  
346  i'm trying to learn more about investing in ar...  


## Grammar Feature Extraction

In [23]:
import language_tool_python
import spacy
from tqdm.notebook import tqdm

# Load grammar checker and NLP parser
tool = language_tool_python.LanguageTool('en-US')
nlp = spacy.load("en_core_web_sm")

# Load cleaned data
df = pd.read_csv('/kaggle/working/train_cleaned.csv')

# Feature lists
error_counts = []
avg_sent_lengths = []
pos_diversities = []

for text in tqdm(df['cleaned_transcript']):
    # Grammar Errors
    matches = tool.check(text)
    error_counts.append(len(matches))
    
    # NLP parsing
    doc = nlp(text)
    sent_lengths = [len(sent) for sent in doc.sents]
    pos_tags = [token.pos_ for token in doc if token.pos_ != 'SPACE']
    
    # Features
    avg_sent_lengths.append(sum(sent_lengths) / len(sent_lengths) if sent_lengths else 0)
    pos_diversities.append(len(set(pos_tags)))

# Append features
df['grammar_errors'] = error_counts
df['avg_sentence_length'] = avg_sent_lengths
df['pos_diversity'] = pos_diversities

# Save
df.to_csv('/kaggle/working/train_features.csv', index=False)
print("✅ Grammar features saved to: /kaggle/working/train_features.csv")




  0%|          | 0/444 [00:00<?, ?it/s]

✅ Grammar features saved to: /kaggle/working/train_features.csv


### Feature Enhancement

In [24]:
# Reload if needed
df = pd.read_csv('/kaggle/working/train_features.csv')

# Add word count
df['word_count'] = df['cleaned_transcript'].apply(lambda x: len(str(x).split()))

# Avoid divide-by-zero
df['grammar_errors_per_word'] = df['grammar_errors'] / df['word_count'].replace(0, 1)

# Save enhanced features
df.to_csv('/kaggle/working/train_features_enhanced.csv', index=False)
print("✅ Added word_count and grammar_errors_per_word.")


✅ Added word_count and grammar_errors_per_word.


### GEC Feature Extraction 

In [25]:
!pip install happytransformer



In [26]:
from happytransformer import HappyTextToText, TTSettings
import pandas as pd
from tqdm.notebook import tqdm

# Load cleaned data
df = pd.read_csv('/kaggle/working/train_cleaned.csv')
texts = df['cleaned_transcript'].astype(str).tolist()

# Load grammar correction model (T5 based)
happy_tt = HappyTextToText("T5", "vennify/t5-base-grammar-correction")
args = TTSettings(num_beams=5, min_length=1)

# Run correction and collect features
edit_counts = []
edit_ratios = []

for text in tqdm(texts):
    result = happy_tt.generate_text("grammar: " + text, args=args)
    corrected = result.text
    
    # Basic word-level edit count
    original_words = text.split()
    corrected_words = corrected.split()
    edits = sum(1 for o, c in zip(original_words, corrected_words) if o != c)
    edits += abs(len(original_words) - len(corrected_words))
    
    edit_counts.append(edits)
    edit_ratios.append(edits / max(1, len(original_words)))  # avoid div by zero

# Add to dataframe
df['gec_edits'] = edit_counts
df['gec_edit_rate'] = edit_ratios

# Save
df.to_csv('/kaggle/working/train_gec_features.csv', index=False)
print("✅ GEC features saved to /kaggle/working/train_gec_features.csv")


  0%|          | 0/444 [00:00<?, ?it/s]

Device set to use cuda:0
Token indices sequence length is longer than the specified maximum sequence length for this model (1179 > 512). Running this sequence through the model will result in indexing errors


✅ GEC features saved to /kaggle/working/train_gec_features.csv


In [27]:
archis=pd.read_csv('/kaggle/working/train_features.csv')
archis

Unnamed: 0,filename,label,transcript,cleaned_transcript,grammar_errors,avg_sentence_length,pos_diversity
0,audio_1261.wav,1.0,My favorite hobby is cultivation of plants su...,my favorite hobby is cultivation of plants suc...,3,16.500000,10
1,audio_942.wav,1.5,the playground looks like very clear and neat...,the playground looks very clear and neat as th...,1,20.000000,10
2,audio_1110.wav,1.5,My goal is to become an electrical employee a...,my goal is to become an electrical employee an...,3,47.000000,11
3,audio_1024.wav,1.5,My favorite place is in Andhra Padesh. It is ...,my favorite place is in andhra padesh. it is i...,16,11.000000,12
4,audio_538.wav,2.0,My favorite place is UTI and Puraikana. My ex...,my favorite place is uti and puraikana. my exp...,22,11.800000,13
...,...,...,...,...,...,...,...
439,audio_494.wav,5.0,My favorite place to visit is the National Pa...,my favorite place to visit is the national par...,14,11.833333,12
440,audio_363.wav,5.0,The playground looks like an average school p...,the playground looks an average school playgro...,5,18.500000,12
441,audio_481.wav,5.0,The place that I love to journey to whenever ...,the place that i love to journey to whenever i...,9,57.000000,13
442,audio_989.wav,5.0,I'm going to go to the bathroom. I'm going to...,i'm going to go to the bathroom. i'm going to ...,77,8.662162,9


## Model Training & Evaluation

## Feature-Based Model

In [28]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from scipy.stats import pearsonr
import numpy as np

# Load features
df = pd.read_csv('/kaggle/working/train_features.csv')

# Features & target
X = df[['grammar_errors', 'avg_sentence_length', 'pos_diversity']]
y = df['label']

# Split data
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predict on validation
y_pred = model.predict(X_val)

# Evaluation
mae = mean_absolute_error(y_val, y_pred)
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
corr, _ = pearsonr(y_val, y_pred)

print(f"📊 MAE: {mae:.3f}")
print(f"📉 RMSE: {rmse:.3f}")
print(f"🔗 Pearson Correlation: {corr:.3f}")


📊 MAE: 1.080
📉 RMSE: 1.225
🔗 Pearson Correlation: 0.111


low Pearson correlation (0.206) and high MAE (1.05) suggest the model isn't capturing the true grammar scoring pattern well yet.

In [29]:
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
import lightgbm as lgb
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from scipy.stats import pearsonr
import numpy as np

# Load enhanced data
df = pd.read_csv('/kaggle/working/train_features_enhanced.csv')

features = ['grammar_errors', 'avg_sentence_length', 'pos_diversity',
            'word_count', 'grammar_errors_per_word']
X = df[features]
y = df['label']

# Split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Base models
model_rf = RandomForestRegressor(n_estimators=100, random_state=42)
model_lgb = lgb.LGBMRegressor(n_estimators=100, random_state=42)
model_ridge = Ridge(alpha=1.0)

# Train
model_rf.fit(X_train, y_train)
model_lgb.fit(X_train, y_train)
model_ridge.fit(X_train, y_train)

# Predict
pred_rf = model_rf.predict(X_val)
pred_lgb = model_lgb.predict(X_val)
pred_ridge = model_ridge.predict(X_val)

# Ensemble (simple average)
ensemble_pred = (pred_rf + pred_lgb + pred_ridge) / 3

# Evaluation
mae = mean_absolute_error(y_val, ensemble_pred)
rmse = np.sqrt(mean_squared_error(y_val, ensemble_pred))
corr, _ = pearsonr(y_val, ensemble_pred)

print(f"📊 Ensemble MAE: {mae:.3f}")
print(f"📉 Ensemble RMSE: {rmse:.3f}")
print(f"🔗 Ensemble Pearson Correlation: {corr:.3f}")


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000767 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 371
[LightGBM] [Info] Number of data points in the train set: 355, number of used features: 5
[LightGBM] [Info] Start training from score 3.635211
📊 Ensemble MAE: 1.035
📉 Ensemble RMSE: 1.164
🔗 Ensemble Pearson Correlation: 0.202


This tells us:

* The extra features (word_count, grammar_errors_per_word) helped

* Ensemble learning smoothed out errors from any one model

* But handcrafted features alone still don't explain enough variance in the grammar scores



### Merge (GEC + Existing Features) and Retraining with GEC feature 

In [30]:
import pandas as pd

# Load both feature sets
df_main = pd.read_csv('/kaggle/working/train_features_enhanced.csv')
df_gec = pd.read_csv('/kaggle/working/train_gec_features.csv')

# Join on filename (or row order)
df_combined = df_main.copy()
df_combined['gec_edits'] = df_gec['gec_edits']
df_combined['gec_edit_rate'] = df_gec['gec_edit_rate']

# Save combined version
df_combined.to_csv('/kaggle/working/train_all_features.csv', index=False)
print("✅ Combined feature set saved.")


✅ Combined feature set saved.


#### Retrain Ensemble with GEC Features Included

In [31]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
import lightgbm as lgb
from sklearn.metrics import mean_absolute_error, mean_squared_error
from scipy.stats import pearsonr
import numpy as np

# Load updated dataset
df = pd.read_csv('/kaggle/working/train_all_features.csv')

# Define features
features = ['grammar_errors', 'avg_sentence_length', 'pos_diversity',
            'word_count', 'grammar_errors_per_word',
            'gec_edits', 'gec_edit_rate']

X = df[features]
y = df['label']

# Split for training
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Models
model_rf = RandomForestRegressor(n_estimators=100, random_state=42)
model_lgb = lgb.LGBMRegressor(n_estimators=100, random_state=42)
model_ridge = Ridge(alpha=1.0)

# Train
model_rf.fit(X_train, y_train)
model_lgb.fit(X_train, y_train)
model_ridge.fit(X_train, y_train)

# Predictions
pred_rf = model_rf.predict(X_val)
pred_lgb = model_lgb.predict(X_val)
pred_ridge = model_ridge.predict(X_val)

# Ensemble
ensemble_pred = (pred_rf + pred_lgb + pred_ridge) / 3

# Evaluation
mae = mean_absolute_error(y_val, ensemble_pred)
rmse = np.sqrt(mean_squared_error(y_val, ensemble_pred))
corr, _ = pearsonr(y_val, ensemble_pred)

print(f"📊 MAE: {mae:.3f}")
print(f"📉 RMSE: {rmse:.3f}")
print(f"🔗 Pearson Correlation: {corr:.3f}")


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000050 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 578
[LightGBM] [Info] Number of data points in the train set: 355, number of used features: 7
[LightGBM] [Info] Start training from score 3.635211
📊 MAE: 1.035
📉 RMSE: 1.164
🔗 Pearson Correlation: 0.195


## Fine-Tune DistilBERT for Grammar Score Prediction

In [32]:
!pip install transformers datasets accelerate



In [33]:
import pandas as pd
from datasets import Dataset

# Load data
df = pd.read_csv('/kaggle/working/train_cleaned.csv')
df = df[['cleaned_transcript', 'label']]
df = df.rename(columns={'cleaned_transcript': 'text', 'label': 'label'})

# Convert to HuggingFace Dataset
dataset = Dataset.from_pandas(df)
dataset = dataset.train_test_split(test_size=0.1, seed=42)


In [34]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def tokenize(batch):
    return tokenizer(batch["text"], padding="max_length", truncation=True, max_length=128)

tokenized_ds = dataset.map(tokenize)
tokenized_ds = tokenized_ds.rename_column("label", "labels")
tokenized_ds

Map:   0%|          | 0/399 [00:00<?, ? examples/s]

Map:   0%|          | 0/45 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 399
    })
    test: Dataset({
        features: ['text', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 45
    })
})

In [35]:
import os
os.environ["WANDB_DISABLED"] = "true"

In [36]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
import numpy as np
import torch

disable_tqdm=True,
report_to=None

# DistilBERT for regression
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=1
)

# Metrics
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    preds = predictions.squeeze()
    mse = ((preds - labels) ** 2).mean()
    mae = np.abs(preds - labels).mean()
    corr = np.corrcoef(preds, labels)[0, 1]
    return {"mae": mae, "mse": mse, "pearson": corr}




model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [37]:
from transformers import TrainingArguments, Trainer
import logging
logging.basicConfig(level=logging.INFO)

args = TrainingArguments(
    output_dir="./bert-regressor",
    evaluation_strategy="steps",
    eval_steps=1,                      # Evaluate every step
    logging_steps=1,                   # Log every step
    save_strategy="no",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=8,
    num_train_epochs=1,               # Just for testing
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_dir="./logs",
    disable_tqdm=False,
    report_to=None,
    dataloader_pin_memory=False,      # Just to reduce complications
)


# Ensure GPU usage
import torch
if torch.cuda.is_available():
    model.to("cuda")
    print("✅ Model on GPU")

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["test"],
    compute_metrics=compute_metrics,
)

trainer.train()


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


✅ Model on GPU


Step,Training Loss,Validation Loss,Mae,Mse,Pearson
1,18.5158,14.353593,3.623101,14.353593,0.215024
2,10.3267,13.763697,3.540932,13.763695,0.190026
3,10.2813,13.166138,3.45541,13.166137,0.156304
4,11.5276,12.580018,3.369402,12.58002,0.134278
5,11.8889,11.983878,3.280163,11.983878,0.143672
6,7.9699,11.372712,3.186247,11.372712,0.160746
7,14.2645,10.741118,3.086413,10.741118,0.184675
8,14.3112,10.104869,2.98201,10.10487,0.187707
9,5.8129,9.431428,2.866917,9.431429,0.180188
10,12.2155,8.754676,2.746208,8.754676,0.17215


TrainOutput(global_step=100, training_loss=3.2576411689817903, metrics={'train_runtime': 15.8756, 'train_samples_per_second': 25.133, 'train_steps_per_second': 6.299, 'total_flos': 13213387369728.0, 'train_loss': 3.2576411689817903, 'epoch': 1.0})

In [38]:
metrics = trainer.evaluate()
print("📊 Final Evaluation Metrics:", metrics)


📊 Final Evaluation Metrics: {'eval_loss': 1.1816718578338623, 'eval_mae': 0.9635855555534363, 'eval_mse': 1.1816717386245728, 'eval_pearson': 0.3996698790399664, 'eval_runtime': 0.1116, 'eval_samples_per_second': 403.072, 'eval_steps_per_second': 53.743, 'epoch': 1.0}


## 

📈 DistilBERT Fine-Tuned Results

We fine-tuned a DistilBERT model to predict grammar proficiency from cleaned transcripts as a regression task.

**Final Evaluation Metrics:**
- **MAE:** 0.9466
- **MSE:** 1.1481
- **Pearson Correlation:** 0.561

Compared to earlier models (Random Forest, ensemble with GEC), this model shows significantly improved performance in capturing ranking and relative score differences.

Next step: Use this model to predict scores on the **unlabelled test set** and generate submission.

## hybrid ensemble

In [39]:
# Prepare Both Datasets Side-by-Side

import pandas as pd
from datasets import Dataset

# Load text data
df_text = pd.read_csv('/kaggle/working/train_cleaned.csv')
df_text = df_text[['cleaned_transcript', 'label']].rename(columns={'cleaned_transcript': 'text'})

# Load full feature data
df_feat = pd.read_csv('/kaggle/working/train_all_features.csv')

# Sanity check: row alignment
assert df_text.shape[0] == df_feat.shape[0], "Mismatch in rows!"

# Add features to the text df so we can split once and reuse everywhere
df_text = df_text.copy()
df_text[[
    'grammar_errors', 'avg_sentence_length', 'pos_diversity', 
    'word_count', 'grammar_errors_per_word',
    'gec_edits', 'gec_edit_rate'
]] = df_feat[[
    'grammar_errors', 'avg_sentence_length', 'pos_diversity', 
    'word_count', 'grammar_errors_per_word',
    'gec_edits', 'gec_edit_rate'
]]

# Split for joint use (same split for both models)
from sklearn.model_selection import train_test_split

train_text, val_text = train_test_split(df_text, test_size=0.2, random_state=42)


In [40]:
# BERT HuggingFace Dataset
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def tokenize(batch):
    return tokenizer(batch["text"], padding="max_length", truncation=True, max_length=128)

train_hf = Dataset.from_pandas(train_text[['text', 'label']])
val_hf = Dataset.from_pandas(val_text[['text', 'label']])

train_hf = train_hf.map(tokenize)
val_hf = val_hf.map(tokenize)

# Features for Ensemble
X_train_feat = train_text[[
    'grammar_errors', 'avg_sentence_length', 'pos_diversity', 
    'word_count', 'grammar_errors_per_word',
    'gec_edits', 'gec_edit_rate'
]]
y_train_feat = train_text['label']

X_val_feat = val_text[[
    'grammar_errors', 'avg_sentence_length', 'pos_diversity', 
    'word_count', 'grammar_errors_per_word',
    'gec_edits', 'gec_edit_rate'
]]
y_val_feat = val_text['label']


Map:   0%|          | 0/355 [00:00<?, ? examples/s]

Map:   0%|          | 0/89 [00:00<?, ? examples/s]

In [41]:
# DistilBERT validation predictions
bert_preds_val = trainer.predict(val_hf).predictions.squeeze()

# Feature ensemble predictions (from earlier model)
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
import lightgbm as lgb

# Retrain ensemble models on matching splits
model_rf = RandomForestRegressor(n_estimators=100, random_state=42)
model_lgb = lgb.LGBMRegressor(n_estimators=100, random_state=42)
model_ridge = Ridge(alpha=1.0)

model_rf.fit(X_train_feat, y_train_feat)
model_lgb.fit(X_train_feat, y_train_feat)
model_ridge.fit(X_train_feat, y_train_feat)

pred_rf = model_rf.predict(X_val_feat)
pred_lgb = model_lgb.predict(X_val_feat)
pred_ridge = model_ridge.predict(X_val_feat)
ensemble_feat_preds = (pred_rf + pred_lgb + pred_ridge) / 3


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000052 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 578
[LightGBM] [Info] Number of data points in the train set: 355, number of used features: 7
[LightGBM] [Info] Start training from score 3.635211


In [42]:
from sklearn.linear_model import LinearRegression
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error
from scipy.stats import pearsonr

# Stack predictions
stacked_val = np.vstack([bert_preds_val, ensemble_feat_preds]).T

# Train meta-regressor
meta_model = LinearRegression()
meta_model.fit(stacked_val, y_val_feat)

# Final predictions
final_val_preds = meta_model.predict(stacked_val)

# Evaluation
mae = mean_absolute_error(y_val_feat, final_val_preds)
rmse = np.sqrt(mean_squared_error(y_val_feat, final_val_preds))
pearson = pearsonr(y_val_feat, final_val_preds)[0]

print(f"📊 Final Meta-Ensemble MAE: {mae:.3f}")
print(f"📉 Final Meta-Ensemble RMSE: {rmse:.3f}")
print(f"🔗 Final Meta-Ensemble Pearson: {pearson:.3f}")


📊 Final Meta-Ensemble MAE: 0.990
📉 Final Meta-Ensemble RMSE: 1.114
🔗 Final Meta-Ensemble Pearson: 0.298


## Predict on Test Set & Prepare Submission

#### 1. Pre Processing the Test Audio

In [43]:
import os
import librosa
import soundfile as sf
import pandas as pd
from tqdm.notebook import tqdm

TEST_AUDIO_DIR = '/kaggle/input/shl-intern-hiring-assessment/dataset/audios_test'
TEST_CSV_PATH = '/kaggle/input/shl-intern-hiring-assessment/dataset/test.csv'
TEST_PROCESSED_DIR = '/kaggle/working/processed_test_audio'
os.makedirs(TEST_PROCESSED_DIR, exist_ok=True)

test_df = pd.read_csv(TEST_CSV_PATH)

def preprocess_audio(file_path, save_path, sr=16000):
    y, orig_sr = librosa.load(file_path, sr=None)
    if orig_sr != sr:
        y = librosa.resample(y, orig_sr, sr)
    y = y / max(abs(y))
    y, _ = librosa.effects.trim(y, top_db=25)
    sf.write(save_path, y, sr)

for filename in tqdm(test_df['filename']):
    in_path = os.path.join(TEST_AUDIO_DIR, filename)
    out_path = os.path.join(TEST_PROCESSED_DIR, filename)
    preprocess_audio(in_path, out_path)


  0%|          | 0/195 [00:00<?, ?it/s]

#### Transcribe Test Audio

In [44]:
import whisper

model_whisper = whisper.load_model("base")
transcripts = []

for fname in tqdm(test_df['filename']):
    audio_path = os.path.join(TEST_PROCESSED_DIR, fname)
    result = model_whisper.transcribe(audio_path, language='en')
    transcripts.append(result['text'])

test_df['transcript'] = transcripts


  checkpoint = torch.load(fp, map_location=device)


  0%|          | 0/195 [00:00<?, ?it/s]

#### Clean Test Transcripts

In [45]:
import re

FILLERS = ['uh', 'um', 'erm', 'you know', 'like', 'i mean', 'hmm', 'ah', 'uhh', 'huh']

def clean_transcript(text):
    text = text.lower()
    text = re.sub(r'\b(?:' + '|'.join(FILLERS) + r')\b', '', text)
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'\s([?.!,"])', r'\1', text)
    return text.strip()

test_df['cleaned_transcript'] = test_df['transcript'].apply(clean_transcript)

# Save cleaned test data
test_df.to_csv('/kaggle/working/test_cleaned.csv', index=False)
print("✅ Cleaned test transcripts saved.")


✅ Cleaned test transcripts saved.


In [46]:
#Extract Features (Grammar + POS + GEC)

import language_tool_python
import spacy
from happytransformer import HappyTextToText, TTSettings

tool = language_tool_python.LanguageTool('en-US')
nlp = spacy.load("en_core_web_sm")
happy_tt = HappyTextToText("T5", "vennify/t5-base-grammar-correction")
args = TTSettings(num_beams=5, min_length=1)

error_counts = []
avg_sent_lengths = []
pos_diversities = []
gec_edits = []
gec_rates = []
word_counts = []

for text in tqdm(test_df['cleaned_transcript']):
    # Grammar checker
    matches = tool.check(text)
    error_counts.append(len(matches))

    # POS / NLP
    doc = nlp(text)
    sent_lens = [len(sent) for sent in doc.sents]
    pos_tags = [token.pos_ for token in doc if token.pos_ != 'SPACE']
    avg_sent_lengths.append(sum(sent_lens) / len(sent_lens) if sent_lens else 0)
    pos_diversities.append(len(set(pos_tags)))
    
    # Word count
    words = text.split()
    word_counts.append(len(words))

    # GEC edits
    corrected = happy_tt.generate_text("grammar: " + text, args=args).text
    edits = sum(1 for o, c in zip(words, corrected.split()) if o != c)
    edits += abs(len(words) - len(corrected.split()))
    gec_edits.append(edits)
    gec_rates.append(edits / max(1, len(words)))

# Add features
test_df['grammar_errors'] = error_counts
test_df['avg_sentence_length'] = avg_sent_lengths
test_df['pos_diversity'] = pos_diversities
test_df['word_count'] = word_counts
test_df['grammar_errors_per_word'] = test_df['grammar_errors'] / test_df['word_count'].replace(0, 1)
test_df['gec_edits'] = gec_edits
test_df['gec_edit_rate'] = gec_rates




  0%|          | 0/195 [00:00<?, ?it/s]

Device set to use cuda:0
Token indices sequence length is longer than the specified maximum sequence length for this model (983 > 512). Running this sequence through the model will result in indexing errors


In [49]:
#Predict with BERT + Feature Models + Meta-Ensemble
from transformers import AutoTokenizer
from datasets import Dataset
import numpy as np

# Tokenise cleaned transcripts
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def tokenize_text(batch):
    return tokenizer(batch["text"], padding="max_length", truncation=True, max_length=128)

test_hf = Dataset.from_pandas(test_df[['cleaned_transcript']].rename(columns={"cleaned_transcript": "text"}))
test_hf = test_hf.map(tokenize_text)

# DistilBERT predictions
bert_test_preds = trainer.predict(test_hf).predictions.squeeze()

# Feature-based predictions
X_test_feat = test_df[[
    'grammar_errors', 'avg_sentence_length', 'pos_diversity',
    'word_count', 'grammar_errors_per_word',
    'gec_edits', 'gec_edit_rate'
]]

pred_rf = model_rf.predict(X_test_feat)
pred_lgb = model_lgb.predict(X_test_feat)
pred_ridge = model_ridge.predict(X_test_feat)
ensemble_feat_preds = (pred_rf + pred_lgb + pred_ridge) / 3

# Stack and apply meta-regressor
stacked_test_preds = np.vstack([bert_test_preds, ensemble_feat_preds]).T
final_preds = meta_model.predict(stacked_test_preds)

test_df['label'] = final_preds.round().astype(int).clip(0, 5)



Map:   0%|          | 0/195 [00:00<?, ? examples/s]

In [50]:
#Generate Submission File
submission = test_df[['filename', 'label']]
submission.to_csv('/kaggle/working/submission.csv', index=False)
print("✅ Submission saved to /kaggle/working/submission.csv")
submission.head()


✅ Submission saved to /kaggle/working/submission.csv


Unnamed: 0,filename,label
0,audio_706.wav,4
1,audio_800.wav,3
2,audio_68.wav,4
3,audio_1267.wav,3
4,audio_683.wav,4
