# Sign Language Digits Dataset

In [39]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.model_selection import train_test_split
import os
import warnings
#warnings.filterwarnings('ignore')

# Loading the Data 

In [40]:
wd = os.getcwd()
X_path = os.path.join(wd,"data","X_sign_language.npy")
y_path = os.path.join(wd,"data","y_sign_language.npy")

X_path = os.path.join(wd,"data","X.npy")
y_path = os.path.join(wd,"data","y.npy")
X = np.load(X_path)
y = np.load(y_path)

# Labels are wrongly assigned. 

In practice, changing the labels like this will not change anything for the following algorithms, but it will be more convenient for understanding. 

In [41]:
def decode_label(y):
    label = np.argmax(y)
    labels = {0:9,1:0, 2:7, 3:6, 4:1, 5:8, 6:4, 7:3, 8:2, 9:5}
    return labels[label]

In [42]:
def decode_OneHotEncoding(label):
    labels = []
    for target in label:
        labels.append(np.argmax(target))    
    return np.array(labels)

In [43]:
y = np.array([decode_label(label) for label in y])

# Quick EDA

In [44]:
print(X.shape)
print(y.shape)

(2062, 64, 64)
(2062,)


In [45]:
X[0]

array([[0.46666667, 0.4745098 , 0.47843137, ..., 0.5176471 , 0.5137255 ,
        0.5019608 ],
       [0.49803922, 0.4862745 , 0.4862745 , ..., 0.5254902 , 0.5176471 ,
        0.50980395],
       [0.54509807, 0.49411765, 0.49019608, ..., 0.5294118 , 0.5254902 ,
        0.5137255 ],
       ...,
       [0.5019608 , 0.5137255 , 0.5176471 , ..., 0.5529412 , 0.54509807,
        0.53333336],
       [0.49803922, 0.5058824 , 0.5137255 , ..., 0.54509807, 0.53333336,
        0.52156866],
       [0.49019608, 0.49803922, 0.5019608 , ..., 0.5294118 , 0.52156866,
        0.50980395]], dtype=float32)

Pixels are already standardized.

In [46]:
y[0]

9

In [47]:
numbers = [np.argmax(y_) for y_ in y]
px.histogram(numbers)

### EDA conclusion

Each image is represented by an array of 64*64 pixels, their values have already been normalized.   
Each class is equally divided, the labels are represented by vectors of size 10 composed of 0 and 1, where the position of the 1 corresponds to the number. 

# Data separation

In [48]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [49]:
X_train = X_train.reshape(len(X_train), 64 * 64)
X_test = X_test.reshape(len(X_test), 64 * 64)

In [50]:
i = 520
px.imshow(X[i], title=f"Ceci est un {y_train[i]}")

# Training some baseline algo

In [51]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

In [52]:
baseline = [KNeighborsClassifier(), LogisticRegression(multi_class='multinomial'), RandomForestClassifier(), SVC()]
for model in baseline:
    model.fit(X_train, y_train)
    print(f"Score moyen {model} : {cross_val_score(model, X_train, y_train, cv=5).mean()}")

Score moyen KNeighborsClassifier() : 0.678587086672193



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to th

Score moyen LogisticRegression(multi_class='multinomial') : 0.7556138896564428
Score moyen RandomForestClassifier() : 0.7665192963065303
Score moyen SVC() : 0.8138196555217831


# Hyper parameter tuning 

In [53]:
from sklearn.model_selection import GridSearchCV

param_grid = [
    # try 12 (3×4) combinations of hyperparameters
    {'n_estimators': [150, 200, 300], 'max_depth': [5, 10, 25, 50, None]}]

forest_clf = RandomForestClassifier(random_state=42, n_jobs=-1)
# train across 5 folds, that's a total of (3*4)*5=60 rounds of training 
grid_search = GridSearchCV(forest_clf, param_grid, cv=5,
                           return_train_score=True)
grid_search.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=RandomForestClassifier(n_jobs=-1, random_state=42),
             param_grid=[{'max_depth': [5, 10, 25, 50, None],
                          'n_estimators': [150, 200, 300]}],
             return_train_score=True)

In [54]:
grid_search.best_params_

{'max_depth': 25, 'n_estimators': 200}

In [55]:
grid_search.best_score_

0.776242055816524

In [65]:
from sklearn.model_selection import GridSearchCV

param_grid = [
    # try 12 (3×4) combinations of hyperparameters
    {'C': [0.01, 0.1, 0.5, 1], 'max_iter': [100, 1000], 'tol': [0.0001, 0.01, 1]}]

log_reg = LogisticRegression(random_state=42, n_jobs=-1)
# train across 5 folds, that's a total of (3*4)*5=60 rounds of training 
grid_search = GridSearchCV(log_reg, param_grid, cv=5,
                           return_train_score=True)
grid_search.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=LogisticRegression(n_jobs=-1, random_state=42),
             param_grid=[{'C': [0.01, 0.1, 0.5, 1], 'max_iter': [100, 1000],
                          'tol': [0.0001, 0.01, 1]}],
             return_train_score=True)

In [66]:
grid_search.best_params_

{'C': 0.1, 'max_iter': 1000, 'tol': 1}

In [67]:
grid_search.best_score_

0.766515612047527

In [None]:
from sklearn.model_selection import GridSearchCV

param_grid = [
    # try 12 (3×4) combinations of hyperparameters
    {'C': [0.01, 0.1, 0.5, 1], 'kernel': ["linear","rbf","sigmoid"], 'tol': [0.0001, 0.01, 1]}]

SVC = SVC(random_state=42)
# train across 5 folds, that's a total of (3*4)*5=60 rounds of training 
grid_search = GridSearchCV(SVC, param_grid, cv=5,
                           return_train_score=True)
grid_search.fit(X_train, y_train)

In [None]:
grid_search.best_params_

In [None]:
grid_search.best_score_

## Data Augmentation

In [None]:
from scipy.ndimage.interpolation import shift

In [None]:
def shift_image(image, dx, dy):
    image = image.reshape((28, 28))
    shifted_image = shift(image, [dy, dx], cval=0, mode="constant")
    return shifted_image.reshape([-1])

In [None]:

# evaluate multinomial logistic regression model
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.linear_model import LogisticRegression
# define dataset
X, y = make_classification(n_samples=1000, n_features=10, n_informative=5, n_redundant=5, n_classes=3, random_state=1)
# define the multinomial logistic regression model
model = LogisticRegression(multi_class='multinomial', solver='lbfgs')
# define the model evaluation procedure
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# evaluate the model and collect the scores
n_scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
# report the model performance
print('Mean Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))

In [None]:
row = [1.89149379, -0.39847585, 1.63856893, 0.01647165, 1.51892395, -3.52651223, 1.80998823, 0.58810926, -0.02542177, -0.52835426]
# predict a multinomial probability distribution
yhat = model.predict_proba([row])

In [None]:
# tune model

...
# define the multinomial logistic regression model with a default penalty
LogisticRegression(multi_class='multinomial', solver='lbfgs', penalty='l2', C=1.0)

C close to 1.0: Light penalty.
C close to 0.0: Strong penalty.