# Grammar Scoring Engine (Advanced)
This notebook includes:
- Data Loading
- Feature Engineering (MFCC)
- Baseline XGBoost Model
- Deep Learning Model (LSTM)
- Test Set Prediction and Submission
- Evaluation: RMSE, Pearson Correlation
- Visualizations and Summary

In [2]:
!pip install tensorflow
!pip install pandas numpy matplotlib seaborn librosa scikit-learn xgboost

Collecting tensorflow
  Using cached tensorflow-2.19.0-cp312-cp312-win_amd64.whl.metadata (4.1 kB)
Collecting absl-py>=1.0.0 (from tensorflow)
  Using cached absl_py-2.2.2-py3-none-any.whl.metadata (2.6 kB)
Collecting astunparse>=1.6.0 (from tensorflow)
  Using cached astunparse-1.6.3-py2.py3-none-any.whl.metadata (4.4 kB)
Collecting gast!=0.5.0,!=0.5.1,!=0.5.2,>=0.2.1 (from tensorflow)
  Using cached gast-0.6.0-py3-none-any.whl.metadata (1.3 kB)
Collecting google-pasta>=0.1.1 (from tensorflow)
  Using cached google_pasta-0.2.0-py3-none-any.whl.metadata (814 bytes)
Collecting libclang>=13.0.0 (from tensorflow)
  Using cached libclang-18.1.1-py2.py3-none-win_amd64.whl.metadata (5.3 kB)
Collecting opt-einsum>=2.3.2 (from tensorflow)
  Using cached opt_einsum-3.4.0-py3-none-any.whl.metadata (6.3 kB)
Collecting protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0dev,>=3.20.3 (from tensorflow)
  Using cached protobuf-5.29.4-cp310-abi3-win_amd64.whl.metadata (592 bytes)
Colle

ERROR: Could not install packages due to an OSError: [WinError 32] The process cannot access the file because it is being used by another process: 'C:\\Users\\hp\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\clang\\native\\libclang.dll'
Consider using the `--user` option or check the permissions.







In [1]:
# General
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Audio processing
import librosa
import librosa.display

# ML & Deep Learning
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from scipy.stats import pearsonr
from xgboost import XGBRegressor
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping


# Warnings
import warnings
warnings.filterwarnings('ignore')



In [3]:
# Paths (CHANGE THESE)
# Built with os.path.join so the separator is right on every OS; the previous
# literal 'Dataset\\audios' only resolved correctly on Windows.
DATA_DIR = os.path.join('Dataset', 'audios')
TRAIN_AUDIO_DIR = os.path.join(DATA_DIR, 'train')
TEST_AUDIO_DIR = os.path.join(DATA_DIR, 'test')

# Load data: the label table, the list of test files, and the submission template
df_train = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))
df_test = pd.read_csv(os.path.join(DATA_DIR, 'test.csv'))
sample_submission = pd.read_csv(os.path.join(DATA_DIR, 'sample_submission.csv'))
df_train.head()

Unnamed: 0,filename,label
0,audio_710.wav,1.0
1,audio_1265.wav,1.0
2,audio_1114.wav,1.5
3,audio_946.wav,1.5
4,audio_1127.wav,2.0


In [4]:
# Extract MFCC features
def extract_mfcc_features(file_path, sr=16000, n_mfcc=13):
    """Compute MFCC features for a single audio file.

    Args:
        file_path: Path handed to ``librosa.load``.
        sr: Sampling rate passed to both ``librosa.load`` and
            ``librosa.feature.mfcc``.
        n_mfcc: Number of MFCC coefficients requested from librosa.

    Returns:
        A ``(summary, sequence)`` tuple: the MFCC matrix averaged along
        axis 1, and the transposed MFCC matrix.

    Any exception raised while loading or processing is printed rather than
    propagated; in that case all-zero placeholders of shape ``(n_mfcc,)`` and
    ``(100, n_mfcc)`` are returned.
    """
    try:
        waveform, _ = librosa.load(file_path, sr=sr)
        coeffs = librosa.feature.mfcc(y=waveform, sr=sr, n_mfcc=n_mfcc)
        summary = np.mean(coeffs, axis=1)
        sequence = coeffs.T
        return summary, sequence
    except Exception as err:
        # Best-effort: report the failure and keep the pipeline running.
        print(f'Failed for {file_path}: {err}')
        placeholder_summary = np.zeros(n_mfcc)
        placeholder_sequence = np.zeros((100, n_mfcc))
        return placeholder_summary, placeholder_sequence

In [6]:
# Prepare data
# Every sequence is forced to exactly MAX_TIMESTEPS rows: longer clips are
# truncated and shorter clips are zero-padded at the end. Slicing alone only
# truncates, which leaves short clips ragged and breaks np.array(X_seq).
MAX_TIMESTEPS = 100

X_mfcc, X_seq, y = [], [], []
for fname, label in zip(df_train['filename'], df_train['label']):
    mean_feat, seq_feat = extract_mfcc_features(os.path.join(TRAIN_AUDIO_DIR, fname))
    seq_feat = seq_feat[:MAX_TIMESTEPS]
    n_missing = MAX_TIMESTEPS - seq_feat.shape[0]
    if n_missing > 0:
        # Pad only along the time axis (rows); coefficient columns are untouched.
        seq_feat = np.pad(seq_feat, ((0, n_missing), (0, 0)), mode='constant')
    X_mfcc.append(mean_feat)
    X_seq.append(seq_feat)
    y.append(label)

X_mfcc = np.array(X_mfcc)
X_seq = np.array(X_seq)
y = np.array(y)

In [7]:
# XGBoost Model
# Baseline regressor on the per-file mean MFCC vectors, with a fixed 80/20
# train/validation split (random_state=42) so the split is repeatable.
X_train, X_val, y_train, y_val = train_test_split(X_mfcc, y, test_size=0.2, random_state=42)
xgb_model = XGBRegressor(objective='reg:squarederror', n_estimators=100, random_state=42)
xgb_model.fit(X_train, y_train)
y_pred_xgb = xgb_model.predict(X_val)
# RMSE is taken as sqrt(MSE) instead of passing `squared=False`, which newer
# scikit-learn releases no longer accept.
rmse_xgb = float(np.sqrt(mean_squared_error(y_val, y_pred_xgb)))
corr_xgb, _ = pearsonr(y_val, y_pred_xgb)
print(f'XGBoost RMSE: {rmse_xgb:.4f}, Pearson Corr: {corr_xgb:.4f}')

XGBoost RMSE: 1.0989, Pearson Corr: 0.3886


In [8]:
# LSTM Model
# Same 80/20 split parameters as the XGBoost cell, applied to the MFCC sequences.
X_seq_train, X_seq_val, y_seq_train, y_seq_val = train_test_split(X_seq, y, test_size=0.2, random_state=42)

lstm_model = Sequential([
    LSTM(64, return_sequences=False, input_shape=(X_seq.shape[1], X_seq.shape[2])),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dense(1)
])

lstm_model.compile(optimizer='adam', loss='mse')
# Stop once validation loss has not improved for 5 epochs and roll back to the
# best weights seen.
early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

lstm_model.fit(X_seq_train, y_seq_train, validation_data=(X_seq_val, y_seq_val),
                epochs=30, batch_size=16, callbacks=[early_stop], verbose=1)

# ravel() always yields a 1-D vector; squeeze() would collapse a single-sample
# validation set to a 0-d scalar and break the metrics below.
y_pred_lstm = lstm_model.predict(X_seq_val).ravel()
# RMSE is taken as sqrt(MSE) instead of passing `squared=False`, which newer
# scikit-learn releases no longer accept.
rmse_lstm = float(np.sqrt(mean_squared_error(y_seq_val, y_pred_lstm)))
corr_lstm, _ = pearsonr(y_seq_val, y_pred_lstm)
print(f'LSTM RMSE: {rmse_lstm:.4f}, Pearson Corr: {corr_lstm:.4f}')

Epoch 1/30
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 47ms/step - loss: 13.4815 - val_loss: 5.8493
Epoch 2/30
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 42ms/step - loss: 5.3271 - val_loss: 1.9993
Epoch 3/30
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 38ms/step - loss: 2.1117 - val_loss: 1.6982
Epoch 4/30
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 42ms/step - loss: 1.5088 - val_loss: 1.4787
Epoch 5/30
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 41ms/step - loss: 1.3023 - val_loss: 1.4446
Epoch 6/30
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 35ms/step - loss: 1.6459 - val_loss: 1.4683
Epoch 7/30
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 34ms/step - loss: 1.4062 - val_loss: 1.4633
Epoch 8/30
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 33ms/step - loss: 1.3289 - val_loss: 1.4512
Epoch 9/30
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━

In [10]:
# Test Set Predictions using XGBoost
X_test = []
for fname in df_test['filename']:
    mean_feat, _ = extract_mfcc_features(os.path.join(TEST_AUDIO_DIR, fname))
    X_test.append(mean_feat)
X_test = np.array(X_test)

test_preds = xgb_model.predict(X_test)

# Attach predictions by filename rather than by row position, so the result is
# correct even if sample_submission.csv lists files in a different order than
# test.csv.
pred_by_file = dict(zip(df_test['filename'], test_preds))
sample_submission['label'] = sample_submission['filename'].map(pred_by_file)

# A NaN label means a submission row had no matching test file; fail loudly
# instead of writing an incomplete submission.
unmatched = sample_submission.loc[sample_submission['label'].isna(), 'filename']
if len(unmatched) > 0:
    raise ValueError(
        f'{len(unmatched)} submission file(s) have no prediction, e.g. {unmatched.iloc[0]}'
    )

sample_submission.to_csv('submission.csv', index=False)
sample_submission.head()

Unnamed: 0,filename,label
0,audio_804.wav,3.823656
1,audio_1028.wav,4.333555
2,audio_865.wav,3.374312
3,audio_774.wav,3.152666
4,audio_1138.wav,3.489115


## Summary
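- **XGBoost** on MFCC mean features gives a modest baseline (validation RMSE 1.0989, Pearson correlation 0.3886); the correlation is weak, so there is substantial room for improvement.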
- **LSTM** added sequence learning but might need more tuning/data.
- Results were evaluated with **RMSE** and **Pearson Correlation**.
- Test predictions were saved for leaderboard submission.
- Next steps: try CNN, Wav2Vec2.0, pitch/energy/phoneme features.