## build an AI Grammar Teacher

In [None]:
import os
import pandas as pd
import whisper
import language_tool_python
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import numpy as np

# ---------------------------------------------------------
# 1. SETUP PATHS
# ---------------------------------------------------------
# I point to the "dataset" folder relative to this notebook
BASE_DIR = "dataset" 

# Define exact paths to your folders based on your structure
TRAIN_AUDIO_DIR = os.path.join(BASE_DIR, "audios", "train")
TEST_AUDIO_DIR = os.path.join(BASE_DIR, "audios", "test")
TRAIN_CSV_PATH = os.path.join(BASE_DIR, "csvs", "train.csv")
TEST_CSV_PATH = os.path.join(BASE_DIR, "csvs", "test.csv")

# ---------------------------------------------------------
# 2. LOAD MODELS
# ---------------------------------------------------------
print("‚è≥ Loading Whisper AI Model (Speech-to-Text)...")
# 'base' model is a good balance of speed and accuracy
whisper_model = whisper.load_model("base")

print("‚è≥ Loading Grammar Checker Tool...")
# This tool checks for sentence structure and syntax errors
tool = language_tool_python.LanguageTool('en-US')

print("‚úÖ SUCCESS: All libraries and models are loaded!")

‚è≥ Loading Whisper AI Model (Speech-to-Text)...
‚è≥ Loading Grammar Checker Tool...
‚úÖ SUCCESS: All libraries and models are loaded!


In [3]:
def analyze_grammar(filename, folder_path):
    """
    1. Fixes the filename (adds .wav if missing).
    2. Transcribes the audio to text using Whisper.
    3. Counts grammar mistakes using LanguageTool.
    """
    
    # --- STEP 1: Fix Missing .wav Extension ---
    # My CSV has "audio_141", but file is "audio_141.wav"
    filename_str = str(filename)
    if not filename_str.endswith('.wav'):
        filename_str = filename_str + ".wav"
        
    # Create the full path to the file
    full_path = os.path.join(folder_path, filename_str)
    
    # Safety Check: Does the file actually exist?
    if not os.path.exists(full_path):
        print(f"‚ö†Ô∏è WARNING: Could not find file {full_path}")
        return None, None

    try:
        # --- STEP 2: Transcribe Audio (Speech -> Text) ---
        # The AI listens to the file here
        audio_result = whisper_model.transcribe(full_path)
        text_content = audio_result['text']
        
        # --- STEP 3: Check Grammar ---
        # The tool reads the text and finds errors
        mistakes = tool.check(text_content)
        error_count = len(mistakes)
        word_count = len(text_content.split())
        
        # Avoid crashing if the file is silent (0 words)
        if word_count == 0: word_count = 1
            
        return error_count, word_count

    except Exception as e:
        print(f"‚ùå ERROR processing {filename}: {e}")
        return None, None

In [4]:
# 1. Read the Training CSV file
train_df = pd.read_csv(TRAIN_CSV_PATH)
print(f"üìÇ Found {len(train_df)} training samples in train.csv")

collected_data = []

print("üöÄ Starting to process training files... Please wait.")

# 2. Loop through every file in the CSV
for index, row in train_df.iterrows():
    f_name = row['filename']
    actual_score = row['label']
    
    # Call our function from Cell 2
    errors, words = analyze_grammar(f_name, TRAIN_AUDIO_DIR)
    
    if errors is not None:
        # Calculate "Mistakes per Word" (Error Density)
        # Logic: More mistakes per word = Lower Grammar Score
        density = errors / words
        
        collected_data.append({
            'filename': f_name,
            'error_density': density,
            'score': actual_score
        })
    
    # Print progress every 20 files
    if index % 20 == 0:
        print(f"Processed {index} / {len(train_df)} files...")

# 3. Save extracted data into a table
train_features = pd.DataFrame(collected_data)
print("‚úÖ DONE: Training data processing is complete!")
# Show the first 5 rows of what we learned
display(train_features.head())

üìÇ Found 409 training samples in train.csv
üöÄ Starting to process training files... Please wait.




Processed 0 / 409 files...
Processed 20 / 409 files...
Processed 40 / 409 files...
Processed 60 / 409 files...
Processed 80 / 409 files...
Processed 100 / 409 files...
Processed 120 / 409 files...
Processed 140 / 409 files...
Processed 160 / 409 files...
Processed 180 / 409 files...
Processed 200 / 409 files...
Processed 220 / 409 files...
Processed 240 / 409 files...
Processed 260 / 409 files...
Processed 280 / 409 files...
Processed 300 / 409 files...
Processed 320 / 409 files...
Processed 340 / 409 files...
Processed 360 / 409 files...
Processed 380 / 409 files...
Processed 400 / 409 files...
‚úÖ DONE: Training data processing is complete!


Unnamed: 0,filename,error_density,score
0,audio_173,0.0,3.0
1,audio_138,0.013072,3.0
2,audio_127,0.067797,2.0
3,audio_95,0.0,2.0
4,audio_73,0.016393,3.5


In [5]:
# --- STEP 1: Prepare Data for AI ---
# X = The input (Mistakes per word)
# y = The target (The grammar score 1-5)
X = train_features[['error_density']]
y = train_features['score']

# --- STEP 2: Split Data ---
# We keep 20% of data aside to test if our model is smart
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# --- STEP 3: Train the Model ---
# We use Linear Regression (Simple and effective for this)
model = LinearRegression()
model.fit(X_train, y_train)

# --- STEP 4: Calculate Accuracy (RMSE) ---
predictions = model.predict(X_val)
rmse_score = np.sqrt(mean_squared_error(y_val, predictions))

print("==================================================")
print(f"üìä REPORT FOR SUBMISSION:")
print(f"--------------------------------------------------")
print(f"Model Used: Linear Regression")
print(f"Feature Used: Error Density (Grammar Mistakes / Word Count)")
print(f"RMSE Score (Training Data): {rmse_score:.5f}") 
print("==================================================")

üìä REPORT FOR SUBMISSION:
--------------------------------------------------
Model Used: Linear Regression
Feature Used: Error Density (Grammar Mistakes / Word Count)
RMSE Score (Training Data): 0.76794


In [6]:
# 1. Read the Test CSV file
test_df = pd.read_csv(TEST_CSV_PATH)
print(f"üìÇ Found {len(test_df)} test files to predict.")

test_data = []

print("üöÄ Starting to process TEST files...")

# 2. Loop through every file in Test CSV
for index, row in test_df.iterrows():
    f_name = row['filename']
    
    # Note: We look in TEST_AUDIO_DIR this time
    errors, words = analyze_grammar(f_name, TEST_AUDIO_DIR)
    
    if errors is not None:
        density = errors / words
        test_data.append({
            'filename': f_name,
            'error_density': density
        })
        
    if index % 20 == 0:
        print(f"Processed {index} / {len(test_df)}...")

# 3. Convert to DataFrame
test_features_df = pd.DataFrame(test_data)

# 4. Predict the scores using our trained model
predicted_scores = model.predict(test_features_df[['error_density']])

# 5. Create the Submission DataFrame (filename, label)
submission = pd.DataFrame({
    'filename': test_df['filename'], # Use original filenames from CSV
    'label': predicted_scores
})

# 6. Save to CSV
submission.to_csv('submission.csv', index=False)
print("‚úÖ SUCCESS! 'submission.csv' has been created in your project folder.")

üìÇ Found 197 test files to predict.
üöÄ Starting to process TEST files...




Processed 0 / 197...
Processed 20 / 197...
Processed 40 / 197...
Processed 60 / 197...
Processed 80 / 197...
Processed 100 / 197...
Processed 120 / 197...
Processed 140 / 197...
Processed 160 / 197...
Processed 180 / 197...
‚úÖ SUCCESS! 'submission.csv' has been created in your project folder.
