## Import required libraries

In [1]:
import csv
from pathlib import Path
from sys import __stdout__
from ast import literal_eval
from pickle import dump

import numpy as np
import pandas as pd
import sklearn as sk
import copy
import seaborn as sns
import matplotlib.pyplot as plt
from pandas import DataFrame
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.linear_model import LogisticRegression, LinearRegression, MultiTaskLasso
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from typing import Dict, Set, Tuple, List, Any, Optional, Callable
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_regression, VarianceThreshold
from sklearn.svm import SVR

from sklearn.pipeline import Pipeline as skPipeline
from sklearn.pipeline import make_pipeline

from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin

import cv2
import mediapipe as mp
mp_drawing = mp.solutions.drawing_utils # type: ignore
mp_drawing_styles = mp.solutions.drawing_styles # type: ignore
mp_face_mesh = mp.solutions.face_mesh # type: ignore
mp_face_mesh_connections = mp.solutions.face_mesh_connections # type: ignore

from settings import *
from auxiliary import plot_selected_features

## Prepare the Training Data

See [./landmarkgenerator.py](./landmarkgenerator.py)

## Define Data Structures and Classes

In [23]:
class BlendshapeData:
    def __init__(self, train_X, test_X, train_Y, test_Y) -> None:
        self.train_X = train_X
        self.test_X = test_X
        self.train_Y = train_Y
        self.test_Y = test_Y

class BlendshapeTrainingSet:
    def __init__(self, blendshape_idx, blendshape_name, X, Y):
        self.blendshape_idx = blendshape_idx
        self.blendshape_name = blendshape_name
        self.X = X
        self.Y = Y

    def __str__(self):
        return f"BlendshapeData({self.blendshape_idx})"
        
blendshape_training_set_lst: List[BlendshapeTrainingSet] = []


class DataTransformer:

    def __init__(self) -> None:
        pass
    
    def transform_X(self, X: DataFrame) -> DataFrame:
        raise NotImplementedError

class Pipeline:

    def __init__(self, pipeline_name: str, 
                dataset_transformer: DataTransformer, 
                model: Callable,
                split: Callable,
                selection_method: Optional[Callable] = None,
                test_size: float = 0.2,
                random_state: int = 42,
                shuffle: bool = True) -> None:
        self.pipeline_name = pipeline_name
        self.dataset_transformer: DataTransformer = dataset_transformer
        self.selection_method = selection_method
        self.model = model
        self.split = split
        self.test_size = test_size
        self.random_state = random_state
        self.shuffle = shuffle
    
    # def fit(self, train_df: DataFrame):
    #     selectors = dict()
    #     predictors = dict()
    #     results = []
    #     transformed_df = self.dataset_transformer(train_df)
    #     global blendshape_idx_lst
    #     for blendshape_i in blendshape_idx_lst:
    #         sub_df = transformed_df[transformed_df["blendshape_i"] == blendshape_i]
    #         reduced_X = sub_df.iloc[:, 2:]
    #         Y = sub_df.filter(regex="weight").to_numpy().flatten()
    #         selectors[f"{blendshape_i}"] = None
    #         if self.selection_method:
    #             predictor = self.selection_method(reduced_X, Y)
    #             reduced_X = predictor.fit_transform(reduced_X, Y)
    #             selectors[f"{blendshape_i}"] = copy.deepcopy(predictor)
    #         splitted_data = self.split(reduced_X, Y)
    #         self.model.fit(splitted_data.train_X, splitted_data.train_Y)
    #         score = self.model.score(splitted_data.test_X, splitted_data.test_Y)
    #         results.append(score)
    #         predictors[f"{blendshape_i}"] = copy.deepcopy(self.model)
    #         # print(f"{self.pipeline_name}: blendshape_i={blendshape_i}, score={score}")
    #     print(f"{self.pipeline_name}: mean={np.mean(results)}, min={np.min(results)}, max={np.max(results)}")
    #     return selectors, predictors
    
    def fit_(self, training_set_list: List[BlendshapeTrainingSet], verbose: int = 1):
        selectors = dict()
        predictors = dict()
        results = []
        for training_set in training_set_list:
            new_X = self.dataset_transformer.transform_X(training_set.X)
            if self.selection_method:
                predictor = self.selection_method(new_X, training_set.Y)
                new_X = predictor.fit_transform(new_X, training_set.Y)
                print(predictor.get_support())
                selectors[f"{training_set.blendshape_idx}"] = copy.deepcopy(predictor)
            X_train, X_test, Y_train, Y_test = train_test_split(
                new_X, training_set.Y, test_size=self.test_size, 
                random_state=self.random_state, shuffle=self.shuffle)
            self.model.fit(X_train, Y_train)
            score = self.model.score(X_test, Y_test)
            results.append(score)
            predictors[f"{training_set.blendshape_idx}"] = copy.deepcopy(self.model)
            break
        if verbose > 0:
            print(f"{self.pipeline_name}: mean={np.mean(results)}, min={np.min(results)}, max={np.max(results)}")
        return selectors, predictors    


## Helper Functions for Debugging

### Plot the selected features

See [./auxiliary.py:plot_selected_features](./auxiliary.py)

## Load Train Dataset

### For Multi-task Regression

For Multi-task Regression use. All the data from the dataset CSV are load into a single dataframe, such that

$ x_{i,j} \in X, i \leq n, j \leq m $ where $X$ is a $n * m$ matrix, is the $j^{th}$ landmark of the $i^{th}$ training data element.

$ y_i \in Y, i \leq n$ where $Y$ is a $n * 1$ matrix, is the list of blendshape of that element.

In [2]:
tmp_df = pd.read_csv(TRAIN_FILE, header=0, delimiter=",", index_col=False)
multitask_Y : DataFrame = tmp_df["weight"].to_frame()
multitask_Y = multitask_Y.applymap(literal_eval).applymap(np.array)
## Assuming 1st column blendshape index and 2nd column is the weight list.
multitask_X : DataFrame = tmp_df.iloc[:, 2:]
multitask_X = multitask_X.applymap(literal_eval).applymap(np.array)

In [3]:
print(multitask_X.shape)
print(multitask_Y.shape)

(19392, 478)
(19392, 1)


### For multi models regression

All the data from the dataset CSV are load into a list of dataframes, such that

$ dataframe_i$ in list is a dataframe for a specific blendshape.

$ x_{i,j} \in X, i \leq n, j \leq m $ where $X$ is a $n * m$ matrix, is the $j^{th}$ landmark of the $i^{th}$ training data element of this specific blendshape.

$ y_i \in y, i \leq n$ where $y$ is a **vector**, is the weight of that blendshape.

In [27]:
## %%script echo skip

blendshape_training_set_lst = []

def str_splitted_by_space_to_list(s: str) -> list:
    return [float(x) for x in s.strip("[").strip("]").split()]

all_X = None
OVERALL_Y = None

def load_data() -> None:
    # name_df = pd.read_csv("blendshapes_name.csv", header=0, delimiter=",", index_col=False)
    blendshape_idx_lst = []
    train_df = pd.read_csv(
        TRAIN_FILE, header=0, delimiter=",", index_col=False
    )
    # name_val = [name_df.loc[name_df["index"] == idx].iloc[0, 1].strip() for idx in train_df["blendshape_i"]]
    # train_df.insert(1, "blendshape_name", name_val)
    blendshape_idx_lst = train_df["blendshape_i"].drop_duplicates().to_list()
    landmarks = train_df.columns[3:].to_list()
    train_df[landmarks] = train_df[landmarks].applymap(literal_eval).applymap(np.array)
    global all_X
    all_X = train_df[landmarks]
    for blendshape_idx in blendshape_idx_lst:
        blendshape_training_set_lst.append(
            BlendshapeTrainingSet(
                blendshape_idx=blendshape_idx,
                # blendshape_name=train_df[train_df["blendshape_i"] == blendshape_idx]["blendshape_name"],
                blendshape_name = "",
                X=train_df[train_df["blendshape_i"] == blendshape_idx][landmarks],
                Y=train_df[train_df["blendshape_i"] == blendshape_idx]["weight"],
            )
        )
load_data()


NameError: name 'name_df' is not defined

In [None]:
from sklearn.datasets import make_regression

X, Y = make_regression(n_samples=10, n_features=30, n_targets=3, random_state=42)

print(X, Y)

In [None]:
print(blendshape_training_set_lst[0].blendshape_idx, blendshape_training_set_lst[0].blendshape_name)
print( blendshape_training_set_lst[0].Y)

## Transforming Dataset

### Default training set

In [None]:
class DefaultFeatures(BaseEstimator, TransformerMixin):

    def __init__(self):
        super().__init__()

    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        """Produce the training set that using 478 landmarks' coordinates (1434 in total) as features

        Args:
            input_df (DataFrame): the origin dataframe from csv file

        Returns:
            DataFrame: the training set with distance between landmarks
        """
        default_train_columns = (X.columns.to_list()
            + [f"{i}_landmark_{j}" for i in range(N_LANDMARKS) for j in ["x", "y", "z"]])

        default_X =[]

        for _, row in X.iterrows():
            default_X.append(np.concatenate(row.values))
        default_X = DataFrame(data=default_X, columns=default_train_columns)
        return default_X

### Distance training set

In [None]:
class Distance(BaseEstimator, TransformerMixin):

    def __init__(self):
        super().__init__()
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        """Produce the training set with distance between certain sets of landmarks

        Args:
            input_df (DataFrame): the origin dataframe from csv file

        Returns:
            DataFrame: the training set with distance between landmarks
        """
        vertices_sets: Dict[str, Set[Tuple[int, int]]] = {
            "FACEMESH_FACE_OVAL": mp_face_mesh_connections.FACEMESH_FACE_OVAL,
            "FACEMESH_LIPS": mp_face_mesh_connections.FACEMESH_LIPS, 
            "FACEMESH_LEFT_EYE": mp_face_mesh_connections.FACEMESH_LEFT_EYE,
            "FACEMESH_LEFT_IRIS": mp_face_mesh_connections.FACEMESH_LEFT_IRIS,
            "FACEMESH_LEFT_EYEBROW": mp_face_mesh_connections.FACEMESH_LEFT_EYEBROW,
            "FACEMESH_RIGHT_EYE": mp_face_mesh_connections.FACEMESH_RIGHT_EYE,
            "FACEMESH_RIGHT_EYEBROW": mp_face_mesh_connections.FACEMESH_RIGHT_EYEBROW,
            "FACEMESH_RIGHT_IRIS": mp_face_mesh_connections.FACEMESH_RIGHT_IRIS}
        vertices_sets_new_val = map(tuple_set_to_list, vertices_sets.values())
        vertices_sets = dict(zip(vertices_sets.keys(), vertices_sets_new_val))

        NOSE_IDX = 1
        TOP_DOWN_FACE = (10,152)
        LEFT_RIGHT_OUTER_EYE = (263, 33)
        LEFT_RIGHT_MOUSE= (61, 291)

        # define the column names
        new_columns = list()
        for name, vertices_set in vertices_sets.items():
            for _, vertices in enumerate(vertices_set):
                column = f"{name}_distance_{vertices}"
                new_columns.append(column)

        distance_X = DataFrame(columns=new_columns, dtype=np.float64)

        for i, row in X.iterrows():
            new_row = list()
            middle_point_x = np.mean([(row[LEFT_RIGHT_OUTER_EYE[0]][0] + row[LEFT_RIGHT_OUTER_EYE[1]][0]) / 2, 
                (row[LEFT_RIGHT_MOUSE[0]][0] + row[LEFT_RIGHT_MOUSE[1]][0]) / 2, row[NOSE_IDX][0]])
            middle_point_y = ((row[TOP_DOWN_FACE[0]] + row[TOP_DOWN_FACE[1]]) / 2)[1]
            middle_point_z = 0
            middle_point = [middle_point_x, middle_point_y, middle_point_z]
            normalised_distance = np.linalg.norm(row[TOP_DOWN_FACE[0]] - row[TOP_DOWN_FACE[1]])
            for name, vertices_set in vertices_sets.items():
                for _, vertex in enumerate(vertices_set):
                    distance = np.linalg.norm(row[vertex] - middle_point)
                    new_row.append(distance)
            distance_X.loc[i] = [distance / normalised_distance for distance in new_row] # type: ignore
        ## distance_X.to_csv("test_dis.csv", index=False)  
        return distance_X

    def _tuple_set_to_list(in_set: Set[Tuple]) -> List:
        return list({element for tuple_ in in_set for element in tuple_})

### Full Distance Training Set

The training set base on distances that includes all the points, labelled by their index in MediaPipe.

In [10]:
class FullDistance(BaseEstimator, TransformerMixin):

    def __init__(self):
        super().__init__()
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        """Produce the training set with distance between certain sets of landmarks

        Args:
            input_df (DataFrame): the origin dataframe from csv file

        Returns:
            DataFrame: the training set with distance between landmarks
        """

        NOSE_IDX = 1
        TOP_DOWN_FACE = (10,152)

        # define the column names
        new_columns = list()
        for idx, _ in enumerate(X.columns):
            new_columns.append(f"distance_{idx}")

        distance_X = DataFrame(columns=new_columns, dtype=np.float64)

        for i, row in X.iterrows():
            new_row = list()
            middle_point = row[NOSE_IDX][:]
            middle_point[1] = (row[TOP_DOWN_FACE[0]][1] + row[TOP_DOWN_FACE[1]][1]) / 2
            max_distance = row[TOP_DOWN_FACE[0]] - row[TOP_DOWN_FACE[1]]
            normalised_distance = np.linalg.norm(max_distance)
            for _, landmark in enumerate(row):
                distance = np.linalg.norm(landmark - middle_point)
                new_row.append(distance)
            ## distance_X.loc[i] = [distance / normalised_distance for distance in new_row] # type: ignore
            distance_X.loc[i] = [distance for distance in new_row] # type: ignore
        return distance_X

In [None]:
class PlaneCoordinate(BaseEstimator, TransformerMixin):

    def __init__(self):
        super().__init__()
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        """Produce the training set with the x, y coordinates of landmarks.

        The generated coordinates are 
        Args:
            input_df (DataFrame): the origin dataframe from csv file

        Returns:
            DataFrame: the training set with distance between landmarks
        """

        NOSE_IDX = 1
        TOP_DOWN_FACE = (10,152)
        LEFT_RIGHT_OUTER_EYE = (263, 33)
        LEFT_RIGHT_MOUSE= (61, 291)

        # define the column names
        new_columns = list()
        for idx, _ in enumerate(X.columns):
            new_columns.append(f"distance_{idx}")

        distance_X = DataFrame(columns=new_columns, dtype=np.float64)

        for i, row in X.iterrows():
            new_row = list()
            middle_point_x = np.mean([(row[LEFT_RIGHT_OUTER_EYE[0]][0] + row[LEFT_RIGHT_OUTER_EYE[1]][0]) / 2, 
                (row[LEFT_RIGHT_MOUSE[0]][0] + row[LEFT_RIGHT_MOUSE[1]][0]) / 2, row[NOSE_IDX][0]])
            middle_point_y = ((row[TOP_DOWN_FACE[0]] + row[TOP_DOWN_FACE[1]]) / 2)[1]
            middle_point_z = 0
            middle_point = [middle_point_x, middle_point_y, middle_point_z]
            normalised_distance = np.linalg.norm(row[TOP_DOWN_FACE[0]] - row[TOP_DOWN_FACE[1]])
            
            for _, landmark in enumerate(row):
                distance = np.linalg.norm(landmark - middle_point)
                new_row.append(distance)
            distance_X.loc[i] = [distance / normalised_distance for distance in new_row] # type: ignore
        return distance_X

## Splitting the Dataset into Train, Test and Validation

In [None]:
def default_split(X, Y) -> BlendshapeData:
    train_X, test_X, train_Y, test_Y = train_test_split(
            X, Y, test_size=0.2, random_state=42, shuffle=True
        )
    return BlendshapeData(train_X, test_X, train_Y, test_Y)

## Features selection

In [None]:
selection_methods: Dict[str, Callable] = dict()

### PCA

In [None]:
def selection_pca(X, Y) -> PCA:
    n: int = min(X.shape)
    pca = PCA(n_components=n)
    pca.fit(X=X, y=Y)
    explained_variance_ratios = pca.explained_variance_ratio_ # type: ignore
    for i in range(n):
        if sum(explained_variance_ratios[:i]) > 0.95:
            n = i
            break
    pca = PCA(n_components=n)
    return pca
    
selection_methods["PCA"] = selection_pca


### Selected K Best base on Chi2

In [None]:
def selection_k_best_chi2(X, Y) -> SelectKBest:
    # k = 0
    # selector = SelectKBest(chi2, k=10)
    # selector.fit(X, Y)
    # selected_features = selector.get_support(indices=True)
    # selected_features = sorted(selected_features, key = lambda x: selector.scores_[x], reverse=True)
    # for i in range(len(selected_features)):
    #     if selector.scores_[selected_features[i]] > 0.010:
    #         k += 1
    #     else:
    #         break
    selector = SelectKBest(chi2, k=7)
    return selector

selection_methods["chi2"] = selection_k_best_chi2

### Selected K Best base on ?

## Model Training and Evaluation

In [None]:
models: Dict[str, Callable]  = dict()

### Linear Regression

#### Linear Regression

In [None]:
models["linear-regression"] = LinearRegression()

#### Logistic Regression

In [None]:
models["logistic-regression"] = LogisticRegression()

### Ensemble Models

#### Random Forest Regressor

In [None]:
models["random_forest_regressor"] = RandomForestRegressor()

#### Ada Boost Regressor

In [None]:
models["ada_boost_regressor"] = AdaBoostRegressor()

## Training

### Define the pipelines

In [None]:
pipelines: Dict[str, Pipeline] = dict()

# pipelines["default_random_forest_regressor"] = Pipeline(
#     pipeline_name="default_random_forest_regressor",
#     dataset_transformer=training_set_transformers["default"],
#     selection_method=None,
#     model=models["random_forest_regressor"],
#     split=default_split)
# pipelines["distance_random_forest_regressor"] = Pipeline(
#     pipeline_name="distance_random_forest_regressor",
#     dataset_transformer=training_set_transformers["distance"],
#     selection_method=None,
#     model=models["random_forest_regressor"],
#     split=default_split)
# pipelines["default_ada_boost_regressor"] = Pipeline(
#     pipeline_name="default_ada_boost_regressor",
#     dataset_transformer=training_set_transformers["default"],
#     selection_method=None,
#     model=models["ada_boost_regressor"],
#     split=default_split)
# pipelines["distance_ada_boost_regressor"] = Pipeline(
#     pipeline_name="distance_ada_boost_regressor",
#     dataset_transformer=training_set_transformers["distance"],
#     selection_method=None,
#     model=models["ada_boost_regressor"],
#     split=default_split)
# pipelines["default_pca_ada_boost_regressor"] = Pipeline(
#     pipeline_name="default_pca_ada_boost_regressor",
#     dataset_transformer=training_set_transformers["default"],
#     selection_method=selection_methods["PCA"],
#     model=models["ada_boost_regressor"],
#     split=default_split)
# pipelines["distance_pca_ada_boost_regressor"] = Pipeline(
#     pipeline_name="distance_pca_ada_boost_regressor",
#     dataset_transformer=training_set_transformers["distance"],
#     selection_method=selection_methods["PCA"],
#     model=models["ada_boost_regressor"],
#     split=default_split)

## Chi2 only allow non-negative values,
## therefore, the default set is not applicable

# pipelines["distance_chi2_linear_regressor"] = Pipeline(
#     pipeline_name="distance_chi2_linear_regressor",
#     dataset_transformer=training_set_transformers["distance"],
#     selection_method=selection_methods["chi2"],
#     model=models["linear-regression"],
#     split=default_split)
# pipelines["distance_chi2_random_forest_regressor"] = Pipeline(
#     pipeline_name="distance_chi2_random_forest_regressor",
#     dataset_transformer=training_set_transformers["distance"],
#     selection_method=selection_methods["chi2"],
#     model=models["random_forest_regressor"],
#     split=default_split)
# pipelines["distance_chi2_ada_boost_regressor"] = Pipeline(
#     pipeline_name="distance_chi2_ada_boost_regressor",
#     dataset_transformer=training_set_transformers["distance"],
#     selection_method=selection_methods["chi2"],
#     model=models["ada_boost_regressor"],
#     split=default_split)
pipelines["full_distance_chi2_linear_regressor"] = Pipeline(
    pipeline_name="full_distance_chi2_linear_regressor",
    dataset_transformer=training_set_transformers["full_distance"],
    selection_method=selection_methods["chi2"],
    model=models["linear-regression"],
    split=default_split)
pipelines["full_distance_chi2_random_forest_regressor"] = Pipeline(
    pipeline_name="full_distance_chi2_random_forest_regressor",
    dataset_transformer=training_set_transformers["full_distance"],
    selection_method=selection_methods["chi2"],
    model=models["random_forest_regressor"],
    split=default_split)
pipelines["full_distance_chi2_ada_boost_regressor"] = Pipeline(
    pipeline_name="full_distance_chi2_ada_boost_regressor",
    dataset_transformer=training_set_transformers["full_distance"],
    selection_method=selection_methods["chi2"],
    model=models["ada_boost_regressor"],
    split=default_split)
# pipelines["distance_tsne_ada_boost_regressor"] = Pipeline(
#     pipeline_name="distance_tsne_ada_boost_regressor",
#     dataset_transformer=training_set_transformers["distance"],
#     selection_method=selection_methods["TSNE"],
#     model=models["ada_boost_regressor"],
#     split=default_split)
# pipelines["distance_tsne_ada_boost_regressor"] = Pipeline(
#     pipeline_name="distance_tsne_ada_boost_regressor",
#     dataset_transformer=training_set_transformers["distance"],
#     selection_method=selection_methods["TSNE"],
#     model=models["ada_boost_regressor"],
#     split=default_split)

In [None]:
result_pipelines = dict()

In [None]:
print(len(blendshape_training_set_lst))

NameError: name 'blendshape_training_set_lst' is not defined

In [8]:
class CustomVarianceThreshold(VarianceThreshold):

    def __init__(self, threshold=0.0, step=1.0e-7, max_iteration=1000):
        super().__init__(threshold=threshold)
        self.try_count = 0
        self.step = step
        self.max_iteration = max_iteration
    
    def fit_transform(self, X, y = None, **fit_params):
        try:
            return super().fit_transform(X, y, **fit_params)
        except ValueError:
            while True:
                self.threshold -= self.threshold
                if self.try_count > self.max_iteration:
                    self.threshold=0
                    print("No suitable threshold, new threshold = 0")
                    return super().fit_transform(X, y, **fit_params)
                self.max_iteration += 1
                try:
                    return super().fit_transform(X, y, **fit_params)
                except ValueError:
                    continue

weight = 100
searches = []
fulldistance_chi2_gridsearch_linear_regressor = skPipeline(steps=[('data_transform', FullDistance()), ('feature_selection', SelectKBest(chi2)), ('regression', LinearRegression())])
# param_grid =  {
#     'data_transform__k': [k for k in range(1,10)]
# }

fulldistance_infogain_gridsearch_linear_regressor = skPipeline(steps=[('data_transform', FullDistance()), ('feature_selection', SelectKBest(mutual_info_regression)), ('regression', LinearRegression())])
# param_grid =  {
#     'data_transform__k': [k for k in range(1,10)]
# }

fulldistance_varthershold_gridsearch_linear_regressor = skPipeline(steps=[('data_transform', FullDistance()), ('feature_selection', CustomVarianceThreshold()), ('regression', LinearRegression())])

fulldistance_varthershold_gridsearch_svr = skPipeline(steps=[('data_transform', FullDistance()), ('feature_selection', CustomVarianceThreshold()), ('regression', SVR())])

fulldistance_multilasso = skPipeline(steps=[('data_transform', FullDistance()), ('regression', MultiTaskLasso())])

def custom_score_func(pipeline, X, y):

    feature_selector = pipeline.named_steps['chi2']
    _landmarks = feature_selector.get_support(indices=True)
    

# searches.append(GridSearchCV(
#                 estimator=fulldistance_chi2_gridsearch_linear_regressor, 
#                 param_grid={
#                     'feature_selection__k': [k for k in range(1,10)]
#                 }, 
#                 return_train_score=True, 
#                 error_score='raise'))
# searches.append(GridSearchCV(
#                 estimator=fulldistance_infogain_gridsearch_linear_regressor, 
#                 param_grid={
#                     'feature_selection__k': [k for k in range(1,10)]
#                 }, 
#                 return_train_score=True, 
#                 error_score='raise'))
# searches.append(GridSearchCV(
#                 estimator=fulldistance_varthershold_gridsearch_linear_regressor,
#                 param_grid={
#                     'feature_selection__threshold': [2.5e-5]
#                 }, 
#                 return_train_score=True, 
#                 error_score='raise',
#                 verbose=3,
#                 cv=2)
#                 )
searches.append(GridSearchCV(
estimator=fulldistance_multilasso,
param_grid={
    'regression__alpha': [1.0, 1.5]
}, 
return_train_score=True, 
error_score='raise',
verbose=3,
cv=2)
)
searches[0].fit(multitask_X, multitask_Y)
# searches.append(GridSearchCV(
#                 estimator=fulldistance_varthershold_gridsearch_svr,
#                 param_grid=[{
#                     'feature_selection__threshold': [2.5e-5],
#                     'regression__C': [0.75, 1, 1,5],
#                     'regression__kernel': ['linear'],
#                     'regression__epsilon': [0.5, 1, 2],
#                 },
#                 {
#                     'feature_selection__threshold': [2.5e-5],
#                     'regression__C': [0.75, 1, 1,5],
#                     'regression__kernel': ['rbf'],
#                     'regression__epsilon': [0.5, 1, 2],
#                     'regression__gamma': ['auto', 'scale'],
#                 }], 
#                 return_train_score=True, 
#                 error_score='raise',
#                 verbose=3,
#                 cv=2)
#                 )

# def train_on_each_blendshape():
#     for search in searches:
#         tmp_res = []
#         for training_set in blendshape_training_set_lst[:]: 
#             print(training_set.blendshape_idx)
#             print(training_set.blendshape_name.to_list()[0])
#             search.fit(training_set.X, training_set.Y)
#             print("Best Estimator:", search.best_estimator_)
#             print("Best Train Score:", search.best_score_)
#             tmp_res.append(search.best_estimator_)
#             best_filter = search.best_estimator_.named_steps["feature_selection"]
#             best_landmarks = best_filter.get_support(indices=True)
#             filename_idx = training_set.blendshape_idx
#             filename_data_transform = search.best_estimator_.named_steps['data_transform'].__class__.__name__
#             filename_feature_selection = search.best_estimator_.named_steps['feature_selection'].__class__.__name__
#             if search.best_estimator_.named_steps['feature_selection'].__class__ == SelectKBest:
#                 filename_feature_selection += f"{search.best_estimator_.named_steps['feature_selection'].score_func.__name__}"
#             filename_regression_model = search.best_estimator_.named_steps['regression'].__class__.__name__
#             plot_selected_features(f"./index{training_set.blendshape_idx}-weight${weight}.png", best_landmarks, f"{filename_idx}_{filename_data_transform}_{filename_feature_selection}_{filename_regression_model}")
#         result_pipelines[f"{filename_data_transform}_{filename_feature_selection}_{filename_regression_model}"] = tmp_res

# def train_on_all_data():

#     def transform_y(y, idx: int):
#         target = y
#         target = [0 for blendshape in blendshape_training_set_lst[:idx] for j in range(blendshape.Y.shape[0])]\
#                 + target\
#                 + [0 for blendshape in blendshape_training_set_lst[min(len(blendshape_training_set_lst), idx+1):] for j in range(blendshape.Y.shape[0])]
#         return target

#     for search in searches:
#         tmp_res = []
#         for idx, training_set in enumerate(blendshape_training_set_lst[:]): 
#             print(training_set.blendshape_idx)
#             print(training_set.blendshape_name.to_list()[0])
#             search.fit(all_X, transform_y(training_set.Y.to_list(), idx))
#             print("Best Estimator:", search.best_estimator_)
#             print("Best Train Score:", search.best_score_)
#             tmp_res.append(search.best_estimator_)
#             best_filter = search.best_estimator_.named_steps["feature_selection"]
#             best_landmarks = best_filter.get_support(indices=True)
#             filename_idx = training_set.blendshape_idx
#             filename_data_transform = search.best_estimator_.named_steps['data_transform'].__class__.__name__
#             filename_feature_selection = search.best_estimator_.named_steps['feature_selection'].__class__.__name__
#             if search.best_estimator_.named_steps['feature_selection'].__class__ == SelectKBest:
#                 filename_feature_selection += f"{search.best_estimator_.named_steps['feature_selection'].score_func.__name__}"
#             filename_regression_model = search.best_estimator_.named_steps['regression'].__class__.__name__
#             # plot_selected_features(f"./index{training_set.blendshape_idx}-weight${weight}.png", best_landmarks, f"{filename_idx}_{filename_data_transform}_{filename_feature_selection}_{filename_regression_model}")
#         result_pipelines[f"all_{filename_data_transform}_{filename_feature_selection}_{filename_regression_model}"] = tmp_res

# train_on_each_blendshape()
# train_on_all_data()

Fitting 2 folds for each of 2 candidates, totalling 4 fits


KeyboardInterrupt: 

In [19]:
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras import models as keras_models
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted

class TensorFlowCNN(BaseEstimator, RegressorMixin):
    def __init__(self):
        self.model = None
        self.model_ = None

    def fit(self, X, y):
        X, y = check_X_y(X, y)
        self.model_ = self.build_model(X.shape, len(list(y)))

        self.model_.compile(optimizer='adam', loss='mse', metrics=['mae'])

        self.model_.fit(X, y, epochs=10, verbose=0)

        return self

    def predict(self, X):
        check_is_fitted(self, 'model_')
        X = check_array(X)
        return self.model_.predict(X)

    def build_model(self, input_shape, output_size):
        if self.model is None:
            model_layers = [layers.Input(shape=input_shape)]
            model_layers.append(tf.keras.layers.Conv2D(2, 3, activation='relu')(model_layers[-1]))
            # model_layers.append(tf.keras.layers.MaxPooling2D(pool_size=(2, 2))) # 2x2最大池化层
            # model_layers.append(tf.keras.layers.Conv2D(3, (6, 6), activation='relu'))
            output = [layers.Dense(1, activation='linear', name=f'task{i}_output')(hidden_layers[-1]) for i in range(output_size)]
            model = keras_models.Model(inputs=input_, outputs=output)
            print(model.summary())
            return model
        return self.model

# p = skPipeline(steps=[('data_transform', FullDistance()), ('feature_selection', CustomVarianceThreshold()), ('regression', TensorFlowCNN())])
cnn = TensorFlowCNN()
cnn.build_model((500, 500), 50)

ValueError: Input 0 of layer "conv2d_10" is incompatible with the layer: expected min_ndim=4, found ndim=3. Full shape received: (None, 500, 500)

In [20]:
for name in result_pipelines.keys():
    print(name)

FullDistance_CustomVarianceThreshold_LinearRegression


In [20]:
idx_to_plot = 68
BEGIN_IDX = 67
END_IDX = 114 - BEGIN_IDX
idx_to_plot -= BEGIN_IDX
SELECTED_PIPELINE = "FullDistance_CustomVarianceThreshold_LinearRegression"

transformed_X = result_pipelines[SELECTED_PIPELINE][idx_to_plot].named_steps['data_transform'].transform(all_X)
transformed_X = result_pipelines[SELECTED_PIPELINE][idx_to_plot].named_steps['feature_selection'].transform(transformed_X)
target = blendshape_training_set_lst[idx_to_plot].Y.to_list()
new_df = pd.DataFrame(transformed_X, columns=[f"feature_{i}" for i in range(len(transformed_X[0]))])
target = [0 for blendshape in blendshape_training_set_lst[:idx_to_plot] for j in range(blendshape.Y.shape[0])]\
            + target\
            + [0 for blendshape in blendshape_training_set_lst[min(END_IDX, idx_to_plot+1):] for j in range(blendshape.Y.shape[0])]
print(new_df)
new_df['target'] = target
g1 = sns.PairGrid(new_df, diag_sharey=False)
g1.map_lower(sns.scatterplot, hue=new_df['target'], palette='coolwarm', hue_norm=(0, 1))
g1.map_diag(sns.histplot, kde=True, color='blue')
g1.add_legend()

transformed_X = result_pipelines[SELECTED_PIPELINE][idx_to_plot].named_steps['data_transform'].transform(blendshape_training_set_lst[idx_to_plot].X)
transformed_X = result_pipelines[SELECTED_PIPELINE][idx_to_plot].named_steps['feature_selection'].transform(transformed_X)
new_df = pd.DataFrame(transformed_X, columns=[f"feature_{i}" for i in range(len(transformed_X[0]))])
target = blendshape_training_set_lst[idx_to_plot].Y.to_list()
new_df['target'] = target
g2 = sns.PairGrid(new_df, diag_sharey=False)
g2.map_lower(sns.scatterplot, hue=new_df['target'], palette='coolwarm', hue_norm=(0, 1))
g2.map_diag(sns.histplot, kde=True, color='blue')
g2.add_legend()
plt.show()

NameError: name 'result_pipelines' is not defined

In [595]:
print(target, new_df['target'])

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

### Call the fit for training

In [None]:
selectors_groups: Dict[str, Dict[str, Any]] = dict()
predictors_groups: Dict[str, Dict[str, Any]] = dict()

for pipeline in pipelines.values():
    selectors, predictors = pipeline.fit_(training_set_list=blendshape_training_set_lst)
    selectors_groups[pipeline.pipeline_name] = selectors
    predictors_groups[pipeline.pipeline_name] = predictors

In [186]:
print(training_set_transformers["distance"].transform_X)

<bound method DataTransformer.transform_X of <__main__.Distance object at 0x7fad1ae96640>>


In [660]:
def get_selected_point_idx(feature_name: str) -> int:
    return int(feature_name.split("_")[-1])

res = []

IDX = 99
WEIGHT = 100

IMAGE_FILES = [f"./index{IDX}-weight${WEIGHT}.png"]
# IMAGE_FILE = ["./test-img-01.jpg"]
name_df = pd.read_csv("./blendshapes_name.csv")
print(name_df.loc[name_df['index'] == IDX])


selectors = selectors_groups["full_distance_chi2_linear_regressor"]
predictors = predictors_groups["full_distance_chi2_linear_regressor"]

# for idx, selector in selectors.items():
#     selected_features = selector.get_support(indices=True)
#     print(idx)
#     print(selected_features)

# define a function to return a int, in range [min, max],
# input is a float.

def int_in_given_range(x: float, min: int, max: int) -> int:
    if x < min:
        return min
    elif x > max:
        return max
    else:
        return int(x)

drawing_spec = mp_drawing.DrawingSpec(thickness=1, circle_radius=1)
with mp_face_mesh.FaceMesh(
    static_image_mode=True,
    max_num_faces=1,
    refine_landmarks=True,
    min_detection_confidence=0.5,
) as face_mesh:
    for idx, file in enumerate(IMAGE_FILES):
        image = cv2.imread(file)
        # Convert the BGR image to RGB before processing.
        results = face_mesh.process(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))

        # Print and draw face mesh landmarks on the image.
        if not results.multi_face_landmarks:
            continue
        annotated_image = image.copy()
        arr = []
        for face_landmarks in results.multi_face_landmarks:
            for a in face_landmarks.landmark:
                arr.append(np.array([a.x, a.y, a.z]))
        predict_df = pd.DataFrame([arr], columns=HEADERS[2:])
        for pipeline in result_pipelines["FullDistance_CustomVarianceThreshold_LinearRegression"]:
            weight = pipeline.predict(predict_df)[0]
            print(int_in_given_range(weight, 0, 100), end=", ")
            res.append(int_in_given_range(weight, 0, 100))
        print("")
        mask = [True, True, True, True, True, False, False, False, False, False, False, False, False, False, False, True, True, True, False, True, True, False, True, True, False, False, False, True, False, False, False, False, True, False, False, False, False, False, False, True, True, False, True, True, False, False, False, False]
        for idx, r in enumerate(res):
            if not mask[idx]:
                res[idx] = 0
        print(res)
            

    index                           weight   filename
32     99   blendShape1.AU_27_MouthStretch        NaN
19, 30, 0, 0, 3, 100, 100, 0, 0, 0, 0, 0, 0, 100, 100, 67, 82, 90, 0, 88, 0, 0, 0, 88, 100, 46, 0, 100, 0, 0, 100, 100, 100, 100, 100, 73, 98, 96, 33, 0, 97, 100, 100, 7, 52, 50, 64, 0, 
[19, 30, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 67, 82, 90, 0, 88, 0, 0, 0, 88, 0, 0, 0, 100, 0, 0, 0, 0, 100, 0, 0, 0, 0, 0, 0, 0, 97, 0, 100, 7, 0, 0, 0, 0]


In [655]:
# print(blendshape_training_set_lst[IDX-67].blendshape_name)
mask = [True, True, True, True, True, False, False, False, False, False, False, False, False, False, False, True, True, True, False, True, True, False, True, True, False, False, False, True, False, False, False, False, True, False, False, False, False, False, False, True, True, False, True, True, False, False, False]
print(len(mask))
print(114 - 67)

46
47


## Export the ideal models

In [234]:
selector_group = selectors_groups["distance_chi2_linear_regressor"]
print("selector_group", selector_group)
for blendshape_i, selector in selector_group.items():
    with open(f"fm2bs_selector_{blendshape_i}.pkl", "wb") as f:
        dump(selector, f)
predictors_group = predictors_groups["distance_chi2_linear_regressor"]
print("predictors_group", predictors_group)
for blendshape_i, model in predictors_group.items():
    with open(f"fm2bs_model_{blendshape_i}.pkl", "wb") as f:
        dump(model, f)

selector_group {'67': SelectKBest(k=7, score_func=<function chi2 at 0x7fad7c3664c0>), '68': SelectKBest(k=7, score_func=<function chi2 at 0x7fad7c3664c0>), '69': SelectKBest(k=7, score_func=<function chi2 at 0x7fad7c3664c0>), '70': SelectKBest(k=7, score_func=<function chi2 at 0x7fad7c3664c0>), '71': SelectKBest(k=7, score_func=<function chi2 at 0x7fad7c3664c0>), '72': SelectKBest(k=7, score_func=<function chi2 at 0x7fad7c3664c0>), '73': SelectKBest(k=7, score_func=<function chi2 at 0x7fad7c3664c0>), '74': SelectKBest(k=7, score_func=<function chi2 at 0x7fad7c3664c0>), '75': SelectKBest(k=7, score_func=<function chi2 at 0x7fad7c3664c0>), '76': SelectKBest(k=7, score_func=<function chi2 at 0x7fad7c3664c0>), '77': SelectKBest(k=7, score_func=<function chi2 at 0x7fad7c3664c0>), '78': SelectKBest(k=7, score_func=<function chi2 at 0x7fad7c3664c0>), '79': SelectKBest(k=7, score_func=<function chi2 at 0x7fad7c3664c0>), '80': SelectKBest(k=7, score_func=<function chi2 at 0x7fad7c3664c0>), '81'

## Draw different parts on mediapipe

In [13]:
half_X = multitask_X.iloc[0::2, ::]
half_Y = multitask_Y.iloc[0::2, ::]

                                              landmark_0  \
0      [0.4988866150379181, 0.689608097076416, -0.030...   
2      [0.49904048442840576, 0.6896460652351379, -0.0...   
4      [0.498867005109787, 0.6893705725669861, -0.030...   
6      [0.49839678406715393, 0.6897587776184082, -0.0...   
8      [0.49848517775535583, 0.6896651983261108, -0.0...   
...                                                  ...   
19382  [0.4998423457145691, 0.6506761312484741, -0.03...   
19384  [0.5001189112663269, 0.6512563228607178, -0.03...   
19386  [0.5004903674125671, 0.6509153246879578, -0.03...   
19388  [0.5006995797157288, 0.6506898403167725, -0.03...   
19390  [0.5005106925964355, 0.6506888270378113, -0.03...   

                                              landmark_1  \
0      [0.499955415725708, 0.570995032787323, -0.0658...   
2      [0.5003747940063477, 0.570743128657341, -0.065...   
4      [0.5005361437797546, 0.5708734691143036, -0.06...   
6      [0.5009960532188416, 0.571255758

In [12]:
n_blendshape = 2424
distance_X = FullDistance().transform(half_X)
print((distance_X.iloc[:,296]))
Y = []
for row in half_Y.iterrows():
    Y.append(row[1].values[0][0])
print(Y)
print("Done")
colors = ["red","blue","yellow","purple"]
for i, color in enumerate(colors):
    plt.scatter(distance_X.iloc[i*n_blendshape:(i+1)*n_blendshape, 296], Y[i*n_blendshape:(i+1)*n_blendshape], c=color)
plt.show()

KeyboardInterrupt: 

In [29]:
n_targets = multitask_Y.iloc[0].values[0].shape
n_features = distance_X.shape[1]
fig, axes = plt.subplots(nrows=n_targets, ncols=n_features, figsize=(15, 8))

for i in range(n_targets):
    for j in range(n_features):
        axes[i, j].scatter(X[:, j], Y[:, i], alpha=0.5)
        axes[i, j].set_xlabel(f"Feature {j+1}")
        axes[i, j].set_ylabel(f"Target {i+1}")

fig.tight_layout()
plt.show()

(48,) 478
