In [None]:
# =========================
# CELL 1: PATH SETUP
# =========================
import sys
import os

# The notebook is expected to be launched from the training/ folder, so the
# backend code lives one level up in ../backend.
BASE_DIR = os.getcwd()
BACKEND_DIR = os.path.abspath(os.path.join(BASE_DIR, "../backend"))

# Register the backend folder on the import path only once, so re-running this
# cell does not keep appending duplicate entries to sys.path.
if BACKEND_DIR not in sys.path:
    sys.path.append(BACKEND_DIR)

print("Current working directory:", BASE_DIR)
print("Backend path added:", BACKEND_DIR)


In [None]:
# =========================
# CELL 2: CORE IMPORTS
# =========================
import pandas as pd
import numpy as np

# Report the versions of the core numeric libraries in use.
print(
    "\n".join(
        f"{caption} {library.__version__}"
        for caption, library in (("Pandas version:", pd), ("NumPy version:", np))
    )
)


In [None]:
# =========================
# CELL 3: IMPORT BACKEND MODULES
# =========================
# CELL 1 appends the backend folder to sys.path, so it must have been run in
# this kernel before these imports.
# NOTE(review): assumes `app` is a package directly under ../backend — confirm.
from app.asr.speech_to_text import SpeechToText  # instantiated in CELL 6 as `asr`
from app.features.grammar_features import extract_grammar_features  # called per transcript in CELL 8 and in score_audio

print("Backend modules imported successfully")


In [None]:
# =========================
# CELL 4: DATASET PATHS
# =========================
import os

# Resolve every dataset location relative to the directory the notebook was
# launched from (expected to be training/).
BASE_DIR = os.getcwd()
DATASET_DIR = os.path.join(BASE_DIR, "dataset")
AUDIO_DIR, LABEL_FILE = (
    os.path.join(DATASET_DIR, leaf)
    for leaf in ("speech_data", "processed_audio_sample_scoring.xlsx")
)

# Report which of the expected locations are actually present on disk.
print(
    "\n".join(
        f"{caption} {os.path.exists(location)}"
        for caption, location in (
            ("Dataset dir exists :", DATASET_DIR),
            ("Audio dir exists   :", AUDIO_DIR),
            ("Label file exists  :", LABEL_FILE),
        )
    )
)


In [None]:
# =========================
# CELL 5: LOAD LABEL FILE
# =========================
import pandas as pd

# Load the label spreadsheet into a DataFrame.
df = pd.read_excel(LABEL_FILE)

print("Dataset loaded successfully")
print("Total samples:", len(df))
print("\nColumns found:")
print(df.columns)

# Later cells read the audio name and the "Content" score from these columns;
# stop at the first one the sheet does not provide.
required_columns = ["Record Audio Name", "Content"]
col = next((name for name in required_columns if name not in df.columns), None)
if col is not None:
    raise ValueError(f"Missing required column: {col}")

df.head()


In [None]:
# =========================
# CELL 6: INITIALIZE ASR
# =========================
# Model size handed to the project's SpeechToText wrapper.
ASR_MODEL_SIZE = "base"

# Single ASR engine instance reused by the transcription loop and score_audio.
asr = SpeechToText(model_size=ASR_MODEL_SIZE)

print("ASR model initialized successfully")


In [None]:
# =========================
# CELL 7 (FINAL + VISUALIZER): GENERATE TRANSCRIPTS
# =========================
from tqdm import tqdm

# Index the audio folder by extension-less file name. Sorting the listing makes
# the outcome deterministic when two files share a stem (e.g. "a.mp3" and
# "a.wav"): the alphabetically last one wins instead of whatever order the
# filesystem happens to return.
audio_files = sorted(os.listdir(AUDIO_DIR))
audio_lookup = {os.path.splitext(file_name)[0]: file_name for file_name in audio_files}
audio_file_names = set(audio_files)

print("Total audio files found:", len(audio_lookup))


def _find_audio_file(record_name):
    """Map a "Record Audio Name" cell to a file name inside AUDIO_DIR.

    Args:
        record_name: Raw cell value from the label sheet. May be a string,
            a number (spreadsheets store all-digit names as numbers) or a
            missing value.

    Returns:
        The matching file name, or None when the cell is empty or nothing in
        the audio folder matches.
    """
    if not isinstance(record_name, str):
        if pd.isna(record_name):
            return None
        record_name = str(record_name)

    # Try the value exactly as written first, then without stray whitespace.
    for candidate in (record_name, record_name.strip()):
        if candidate in audio_lookup:
            return audio_lookup[candidate]
        # The sheet may already include the extension ("clip.mp3").
        if candidate in audio_file_names:
            return candidate
    return None


transcripts = []
missing_audio = 0

# tqdm gives a live progress bar
for excel_name in tqdm(df["Record Audio Name"], total=len(df), desc="Transcribing audio"):
    audio_file = _find_audio_file(excel_name)

    if audio_file is None:
        # Keep exactly one transcript per row so the new column lines up with
        # df; rows without audio get an empty transcript.
        transcripts.append("")
        missing_audio += 1
        continue

    transcripts.append(asr.transcribe(os.path.join(AUDIO_DIR, audio_file)))

df["transcript"] = transcripts

print("✅ Transcription completed")
print("❌ Missing audio files:", missing_audio)

df[["Record Audio Name", "transcript"]].head()


In [None]:
# =========================
# CELL 8 (WITH PROGRESS): GRAMMAR FEATURE EXTRACTION
# =========================
from tqdm import tqdm

# One feature record per transcript, in the same row order as df.
# NOTE(review): rows whose audio was missing carry an empty transcript (see
# CELL 7) and are still included here with their real score — confirm that is
# intended before trusting the metrics.
feature_rows = [
    extract_grammar_features(transcript)
    for transcript in tqdm(df["transcript"], total=len(df), desc="Extracting grammar features")
]

# Features become the model input; the "Content" column is the regression target.
X = pd.DataFrame(feature_rows)
y = df["Content"]

print("Feature matrix shape:", X.shape)
print("Target shape:", y.shape)

X.head()


In [None]:
# =========================
# CELL 9: TRAIN MODEL
# =========================
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

# Hold out 20% of the samples for validation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

print("Train size:", X_train.shape)
print("Validation size:", X_val.shape)

# Random forest regressor: 200 trees, seeded, n_jobs=-1 for parallel fitting.
rf_params = {"n_estimators": 200, "random_state": 42, "n_jobs": -1}
model = RandomForestRegressor(**rf_params)
model.fit(X_train, y_train)

# Score the held-out split.
y_pred = model.predict(X_val)
mae = mean_absolute_error(y_val, y_pred)
rmse = np.sqrt(mean_squared_error(y_val, y_pred))

print("MAE:", round(mae, 3))
print("RMSE:", round(rmse, 3))


In [None]:
# =========================
# CELL 10: SAVE MODEL & METADATA
# =========================
import joblib
import os
import json

# Target folder inside the backend (relative to the notebook's working
# directory) and the two artefact paths written below.
MODEL_DIR = "../backend/app/model"
model_path = os.path.join(MODEL_DIR, "grammar_scorer.pkl")
metadata_path = os.path.join(MODEL_DIR, "metadata.json")

# Make sure the folder exists, then persist the fitted estimator.
os.makedirs(MODEL_DIR, exist_ok=True)
joblib.dump(model, model_path)

# Inference must feed columns in the same order used for training, so the
# column list is stored next to the model together with the validation metrics.
feature_metadata = {
    "features": list(X.columns),
    "target": "Content",
    "model": "RandomForestRegressor",
    "mae": mae,
    "rmse": rmse
}
with open(metadata_path, "w") as meta_file:
    json.dump(feature_metadata, meta_file, indent=2)

print("✅ Model saved to:", model_path)
print("✅ Metadata saved to:", metadata_path)


In [None]:
# =========================
# CELL 11: END-TO-END INFERENCE PIPELINE (FIXED PATH)
# =========================
import joblib
import json
import os
import pandas as pd

# Locate the artefacts written by CELL 10 relative to the backend folder that
# CELL 1 resolved (BACKEND_DIR), instead of a machine-specific absolute path,
# so the notebook works from any checkout location and operating system.
MODEL_DIR = os.path.join(BACKEND_DIR, "app", "model")
MODEL_PATH = os.path.join(MODEL_DIR, "grammar_scorer.pkl")
META_PATH = os.path.join(MODEL_DIR, "metadata.json")

# Load model and metadata.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code; only
# load model files produced by this notebook or another trusted source.
model = joblib.load(MODEL_PATH)
with open(META_PATH, "r") as f:
    metadata = json.load(f)

# Column names in training order; score_audio uses them to order its input.
FEATURE_COLUMNS = metadata["features"]

print("✅ Model and metadata loaded successfully")
print("✅ Features used:", FEATURE_COLUMNS)


def score_audio(audio_path: str) -> float:
    """
    Score the grammar of a single recording.

    Pipeline: audio -> transcript -> grammar features -> model prediction.
    Relies on the notebook-level `asr`, `model` and `FEATURE_COLUMNS` objects.

    Args:
        audio_path: Path of the audio file to score.

    Returns:
        The model's prediction as a float rounded to two decimal places.
    """
    spoken_text = asr.transcribe(audio_path)
    feature_row = extract_grammar_features(spoken_text)

    # Select columns by name so they arrive in the order stored in the metadata.
    model_input = pd.DataFrame([feature_row])[FEATURE_COLUMNS]

    prediction = model.predict(model_input)[0]
    return round(float(prediction), 2)


In [22]:
# Manual smoke test of the full pipeline on a single recording.
# NOTE(review): the absolute Windows path is machine-specific; point it at a
# local audio file before re-running.
score_audio(r"D:\SHL\training\en-IN_Kajal_1700049178365.mp3")


80.57