In [1]:
import os
import cv2
import mediapipe as mp
import numpy as np
import pandas as pd
from tqdm import tqdm

In [177]:
# Data Preparation
# Preprocess the image dataset by converting the ASL sign images into numerical format suitable for machine learning.

# 1. Paths – adjust to your setup
# DATA_DIR is the dataset root; the extraction loop expects one sub-folder per
# class label inside it. Set the ASL_DATA_DIR environment variable to point at
# your own copy instead of editing this cell. The original location is kept as
# the fallback so existing setups keep working unchanged.
DATA_DIR = os.environ.get(
    "ASL_DATA_DIR",
    "D:/Downloads/ai_data/asl_alphabet_train/asl_alphabet_train",
)
# CSV file that the extracted landmark features are written to.
OUTPUT_CSV = "asl_landmark_features.csv"


In [178]:
# 2. Create the MediaPipe Hands detector used for the dataset images.
# The images are independent stills, hence static_image_mode=True, and at
# most one hand is requested per image.
mp_hands = mp.solutions.hands
detector_options = {"static_image_mode": True, "max_num_hands": 1}
hands = mp_hands.Hands(**detector_options)


In [179]:
# 3. Prepare the row accumulator and the CSV column names
# `rows` starts empty; the extraction loop appends one feature vector per image.
rows = []
# 21 hand landmarks, each contributing x / y / z columns (63 features),
# followed by a final "label" column holding the class name.
columns = [
    f"lm{landmark_idx}_{axis}"
    for landmark_idx in range(21)
    for axis in ("x", "y", "z")
]
columns.append("label")


In [180]:


# 4. Iterate over each class folder and image
# Start from an empty list so that re-running this cell does not append a
# second copy of every sample to `rows`.
rows = []
n_unreadable = 0  # files cv2.imread could not decode
n_no_hand = 0     # images in which MediaPipe detected no hand

for label in sorted(os.listdir(DATA_DIR)):
    class_dir = os.path.join(DATA_DIR, label)
    if not os.path.isdir(class_dir):
        continue
    # The "nothing" class is excluded from the feature table
    # (presumably because those images contain no hand to detect — confirm).
    if label == "nothing":
        continue

    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order (and therefore the later seeded train/validation split) reproducible.
    for img_name in tqdm(sorted(os.listdir(class_dir)), desc=label):
        img_path = os.path.join(class_dir, img_name)
        img = cv2.imread(img_path)
        if img is None:
            n_unreadable += 1
            continue

        # Convert BGR→RGB and run detection
        rgb    = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        result = hands.process(rgb)

        # Images without a detected hand are skipped entirely (no zero-filled
        # placeholder row is written), so they never reach the classifier.
        if not result.multi_hand_landmarks:
            n_no_hand += 1
            continue

        # Flatten the first detected hand into [x0, y0, z0, x1, y1, z1, ...]
        lm = result.multi_hand_landmarks[0]
        feature_vector = [coord for pt in lm.landmark for coord in (pt.x, pt.y, pt.z)]
        # Append the class label as the last column
        feature_vector.append(label)
        rows.append(feature_vector)

print(
    f"Kept {len(rows)} samples; skipped {n_no_hand} images with no detected hand "
    f"and {n_unreadable} unreadable files"
)


A: 100%|██████████| 3000/3000 [02:04<00:00, 24.06it/s]
B: 100%|██████████| 3000/3000 [01:54<00:00, 26.12it/s]
C: 100%|██████████| 3000/3000 [01:52<00:00, 26.71it/s]
D: 100%|██████████| 3000/3000 [01:57<00:00, 25.59it/s]
E: 100%|██████████| 3000/3000 [01:57<00:00, 25.62it/s]
F: 100%|██████████| 3000/3000 [02:04<00:00, 24.05it/s]
G: 100%|██████████| 3000/3000 [01:58<00:00, 25.26it/s]
H: 100%|██████████| 3000/3000 [01:58<00:00, 25.41it/s]
I: 100%|██████████| 3000/3000 [01:57<00:00, 25.45it/s]
J: 100%|██████████| 3000/3000 [02:01<00:00, 24.74it/s]
K: 100%|██████████| 3000/3000 [02:01<00:00, 24.59it/s]
L: 100%|██████████| 3000/3000 [01:59<00:00, 25.04it/s]
M: 100%|██████████| 3000/3000 [01:51<00:00, 27.01it/s]
N: 100%|██████████| 3000/3000 [01:45<00:00, 28.33it/s]
O: 100%|██████████| 3000/3000 [01:59<00:00, 25.13it/s]
P: 100%|██████████| 3000/3000 [01:43<00:00, 29.04it/s]
Q: 100%|██████████| 3000/3000 [01:15<00:00, 39.48it/s]
R: 100%|██████████| 3000/3000 [01:18<00:00, 38.37it/s]
S: 100%|██

In [189]:
# 6. Save to CSV
# Build the table once from the accumulated rows, then write it without the
# pandas row index so the file contains only the feature and label columns.
df = pd.DataFrame(data=rows, columns=columns)
df.to_csv(OUTPUT_CSV, index=False)
n_samples = len(df)
print(f"Processed {n_samples} samples → saved to {OUTPUT_CSV}")

Processed 63673 samples → saved to asl_landmark_features.csv


In [2]:
import pandas as pd

# 1.1 Read the landmark feature table back from disk
df = pd.read_csv("asl_landmark_features.csv")

# 1.2 Report (rows, columns)
n_rows, n_cols = df.shape
print("Data shape:", (n_rows, n_cols))

# 1.3 Show the first few rows as the cell output
df.head()


Data shape: (63673, 64)


Unnamed: 0,lm0_x,lm0_y,lm0_z,lm1_x,lm1_y,lm1_z,lm2_x,lm2_y,lm2_z,lm3_x,...,lm18_x,lm18_y,lm18_z,lm19_x,lm19_y,lm19_z,lm20_x,lm20_y,lm20_z,label
0,0.457428,0.583321,-6.474007e-07,0.570786,0.503428,-0.03558,0.640131,0.373976,-0.044407,0.651734,...,0.391186,0.321072,-0.070152,0.404947,0.404261,-0.055443,0.403822,0.457566,-0.028879,A
1,0.485155,0.613102,-7.789129e-07,0.602762,0.543648,-0.027501,0.674942,0.405732,-0.03282,0.688006,...,0.432534,0.362228,-0.074631,0.445967,0.451188,-0.063124,0.443897,0.5064,-0.041344,A
2,0.723598,0.674288,-6.069148e-07,0.796444,0.618316,-0.03192,0.847458,0.513981,-0.03761,0.862209,...,0.623643,0.453718,-0.038728,0.630732,0.510247,-0.031833,0.640717,0.556273,-0.014879,A
3,0.713132,0.752423,-4.974553e-07,0.83018,0.65073,-0.042259,0.89759,0.497577,-0.055653,0.895794,...,0.604212,0.449319,-0.108915,0.632593,0.545551,-0.094142,0.637691,0.612206,-0.063156,A
4,0.71694,0.759032,-4.832657e-07,0.833547,0.666843,-0.041384,0.903522,0.504841,-0.052302,0.90088,...,0.611438,0.459419,-0.115095,0.639612,0.55672,-0.102434,0.649539,0.625443,-0.072584,A


In [3]:
from sklearn.preprocessing import LabelEncoder

# 2.1 Learn the class-name → integer mapping from the label column, then
#     store the integer codes next to the original labels.
le = LabelEncoder()
le.fit(df['label'])
df['label_enc'] = le.transform(df['label'])

# 2.2 Print the classes in code order and spot-check the first rows
print("Classes (in order of 0…n-1):", le.classes_)
df.loc[:, ['label', 'label_enc']].head(10)


Classes (in order of 0…n-1): ['A' 'B' 'C' 'D' 'E' 'F' 'G' 'H' 'I' 'J' 'K' 'L' 'M' 'N' 'O' 'P' 'Q' 'R'
 'S' 'T' 'U' 'V' 'W' 'X' 'Y' 'Z' 'del' 'space']


Unnamed: 0,label,label_enc
0,A,0
1,A,0
2,A,0
3,A,0
4,A,0
5,A,0
6,A,0
7,A,0
8,A,0
9,A,0


In [4]:
# Step 6: Build the feature matrix / target vector and hold out a validation set

from sklearn.model_selection import train_test_split
import numpy as np

# 6.1 Features are every column except the two label columns;
#     the target is the integer-encoded label.
feature_frame = df.drop(columns=["label_enc", "label"])
X = feature_frame.values
y = df["label_enc"].values

# 6.2 Keep 80% for training and hold out 20% for validation.
#     The fixed random_state makes the split repeatable.
split_parts = train_test_split(X, y, test_size=0.20, random_state=42)
X_train, X_val, y_train, y_val = split_parts

# 6.3 Sanity-check the resulting shapes
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_val   shape:", X_val.shape)
print("y_val   shape:", y_val.shape)


X_train shape: (50938, 63)
y_train shape: (50938,)
X_val   shape: (12735, 63)
y_val   shape: (12735,)


In [None]:
# Grid search to find the best hyper-parameters for the RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# 1) Candidate values: 3 x 2 x 3 = 18 combinations
param_grid = dict(
    n_estimators=[50, 100, 150],
    max_depth=[10, 20],
    min_samples_split=[2, 5, 10],
)

# 2) Base estimator; the seed keeps each forest reproducible
rf = RandomForestClassifier(random_state=42)

# 3) 3-fold cross-validated search scored on accuracy, using all CPU cores
search_options = dict(cv=3, scoring='accuracy', n_jobs=-1, verbose=2)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, **search_options)

# 4) Run the search on the training split only
grid_search.fit(X_train, y_train)

# 5) Report the winning combination and its mean cross-validation accuracy
print("Best parameters found:", grid_search.best_params_)
print("Best cross-validation accuracy: {:.4f}".format(grid_search.best_score_))

# 6) Score the best estimator on the training split and on the held-out
#    split (X_val / y_val), which the printout labels "Test".
best_rf = grid_search.best_estimator_
train_acc = best_rf.score(X_train, y_train)
test_acc = best_rf.score(X_val, y_val)
print(f"Train Accuracy: {train_acc:.4f}")
print(f"Test  Accuracy: {test_acc:.4f}")


Fitting 3 folds for each of 18 candidates, totalling 54 fits
Best parameters found: {'max_depth': 20, 'min_samples_split': 2, 'n_estimators': 150}
Best cross-validation accuracy: 0.9777
Train Accuracy: 0.9997
Test  Accuracy: 0.5185


In [5]:
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier

# Train the final model. The hyper-parameters are the combination reported as
# best by the grid-search cell output (max_depth=20, min_samples_split=2,
# n_estimators=150); the seed keeps the fitted forest reproducible.
classifier = RandomForestClassifier(
    n_estimators=150,
    max_depth=20,
    min_samples_split=2,
    random_state=42,
)
classifier.fit(X_train, y_train)

# Predictions on both splits, consumed by the accuracy / report cells below.
y_pred_train = classifier.predict(X_train)
y_pred_test = classifier.predict(X_val)

In [6]:
# Accuracy on the data the model was fitted on
train_accuracy = accuracy_score(y_train, y_pred_train)
# Accuracy on the held-out split; y_val holds the integer class codes
test_accuracy = accuracy_score(y_val, y_pred_test)

print(f'train Accuracy: {train_accuracy * 100:.2f}%')
print(f'Test Accuracy: {test_accuracy * 100:.2f}%')

train Accuracy: 99.98%
Test Accuracy: 98.08%


In [7]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import joblib

# Overall accuracy on the held-out split
test_accuracy = accuracy_score(y_val, y_pred_test)
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")

# 4) Confusion matrix over every integer class code known to the encoder,
#    so its layout does not depend on which classes appear in y_val.
class_codes = np.arange(len(le.classes_))
cm = confusion_matrix(y_val, y_pred_test, labels=class_codes)
print("Confusion matrix (rows=true, cols=pred):\n", cm)

# 5) (Optional) Per-class precision / recall / F1, labelled with class names
report = classification_report(
    y_val,
    y_pred_test,
    labels=class_codes,
    target_names=le.classes_,
    zero_division=0,
)
print(report)

# 2) Persist the fitted classifier to disk
joblib.dump(classifier, "asl_rf_model.pkl")
print("Model saved to asl_rf_model.pkl")

Test Accuracy: 98.08%
Confusion matrix (rows=true, cols=pred):
 [[434   1   0   0   1   0   0   0   0   0   0   0   1   1   0   0   0   0
    1   0   0   0   0   0   0   0   0   0]
 [  0 428   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0]
 [  0   0 361   0   0   0   0   0   0   0   0   0   0   0   4   0   0   0
    0   0   0   0   0   0   0   0   0   1]
 [  0   0   3 490   0   0   0   0   1   0   0   0   0   0   5   0   0   0
    0   0   0   0   0   0   0   0   0   1]
 [  0   0   0   1 463   0   0   0   0   0   0   0   0   0   1   0   0   0
    1   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0 589   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0 492   1   0   0   0   0   1   0   0   0   0   0
    0   0   1   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   4 450   0   0   0   0   1   0   0   0   1   0
    0   0   0   0   0   2   0   0   0   0]


In [8]:
import cv2
import time
import numpy as np
import mediapipe as mp
import joblib
# On Windows for beep; on other platforms replace as needed
import winsound

In [12]:
# Restore the classifier written by the training section.
# NOTE(review): joblib.load unpickles the file, so only load model files you
# produced yourself or otherwise trust.
model_file = "asl_rf_model.pkl"
classifier = joblib.load(model_file)
print("Loaded RandomForest from asl_rf_model.pkl")

Loaded RandomForest from asl_rf_model.pkl


In [None]:
# ─── 1) LABEL TABLE ───────────────────────────────────────────────────────────
# This cell needs the fitted LabelEncoder `le` from the label-encoding cell to
# turn predicted integer codes back into class names. Reading the table here
# makes a missing `le` fail immediately, before the webcam is opened, and
# avoids one inverse_transform call per frame.
class_names = le.classes_

# ─── 2) SET UP MEDIAPIPE & WEBCAM ─────────────────────────────────────────────
mp_hands = mp.solutions.hands
hands    = mp_hands.Hands(
    static_image_mode=False,
    max_num_hands=1,
    min_detection_confidence=0.7,
    min_tracking_confidence=0.5
)

cap = cv2.VideoCapture(0)

# ─── 3) STATE FOR STABLE-LETTER DETECTION ───────────────────────────────────────
stable_idx     = None       # integer code of the current letter
stable_letter  = ""         # decoded letter
letter_start   = 0.0        # monotonic timestamp at which the current sign was first seen
captured_text  = ""
committed      = False      # tracks whether we’ve already committed the current hold
STABLE_SECONDS = 2.0
BEEP_FREQ      = 1000
BEEP_DUR       = 200       # ms

# Everything that touches the camera runs inside try/finally so that the
# webcam, the preview window and the MediaPipe graph are released even when
# the loop raises or the kernel is interrupted.
try:
    if not cap.isOpened():
        raise RuntimeError("Could not open webcam")

    print("Live ASL demo: hold a sign for 2s to capture. ESC to exit.")

    # ─── 4) LIVE LOOP ──────────────────────────────────────────────────────────
    while True:
        ret, frame = cap.read()
        if not ret:
            break

        # Mirror the frame so the preview behaves like a mirror for the user
        frame = cv2.flip(frame, 1)
        rgb   = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        res   = hands.process(rgb)

        pred_idx = None
        if res.multi_hand_landmarks:
            lm = res.multi_hand_landmarks[0]
            # Same flattening order as the training features: x, y, z per landmark
            feat = np.array([c for pt in lm.landmark for c in (pt.x, pt.y, pt.z)],
                            dtype=np.float32).reshape(1, -1)
            pred_idx = int(classifier.predict(feat)[0])

        # Monotonic clock: hold durations are unaffected by system clock changes
        now = time.monotonic()

        # ── reset if no hand detected ─────────────────────────────────────────
        if pred_idx is None:
            stable_idx     = None
            stable_letter  = ""
            committed      = False

        elif pred_idx == stable_idx:
            # same sign continuing: commit it once after it has been held long enough
            if now - letter_start >= STABLE_SECONDS and not committed:
                # "space" and "del" are control classes, not characters to print
                if stable_letter == "space":
                    captured_text += " "
                elif stable_letter == "del":
                    captured_text = captured_text[:-1]
                else:
                    captured_text += stable_letter
                try:
                    winsound.Beep(BEEP_FREQ, BEEP_DUR)
                except Exception:
                    # The beep is best-effort feedback; a failure must not stop
                    # the demo, but KeyboardInterrupt is no longer swallowed.
                    pass
                committed = True

        else:
            # new sign detected: restart the hold timer
            stable_idx     = pred_idx
            stable_letter  = class_names[stable_idx]
            letter_start   = now
            committed      = False

        # ── DRAW “in-flight” (yellow) and captured text (green) ────────────
        if stable_letter:
            cv2.putText(frame, f"> {stable_letter}",
                        (30, 80), cv2.FONT_HERSHEY_SIMPLEX, 2, (0,255,255), 4)
        cv2.putText(frame, f"Text: {captured_text}",
                    (30, 150), cv2.FONT_HERSHEY_SIMPLEX, 1.5, (0,255,0), 3)

        cv2.imshow("ASL Live Demo", frame)
        # 27 is the ESC key code
        if cv2.waitKey(1) & 0xFF == 27:
            break
finally:
    # ─── CLEAN UP ─────────────────────────────────────────────────────────────
    cap.release()
    cv2.destroyAllWindows()
    hands.close()

Live ASL demo: hold a sign for 2s to capture. ESC to exit.
