<a href="https://colab.research.google.com/github/GrandMoff100/MiscColabNotebooks/blob/main/MolecularSmellBruteForceModelSelection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install func_timeout
import func_timeout

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import itertools
import random
import numpy as np
import logging
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn import tree, svm, decomposition, neural_network
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import FactorAnalysis, IncrementalPCA, KernelPCA, TruncatedSVD, SparsePCA, FastICA
from sklearn.cluster import KMeans
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score, train_test_split, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier

In [3]:
# Load Datasets
# Shape: (700ish, 4857ish)
train_df = pd.read_csv(
    "https://raw.githubusercontent.com/trevorkarn/MLCamp2022/main/molecular_smell_training_data.csv"
)

# Discard the VALENCE.PLEASANTNESS column, then clean the data by encoding
# every "low" / "high" cell (in any column) as 0 / 1.
cleaned_df = train_df.drop(columns=["VALENCE.PLEASANTNESS"]).replace(
    to_replace=["low", "high"],
    value=[0, 1],
)

In [4]:
# Separate the SWEETORSOUR target column from the remaining feature columns.
y_all = cleaned_df["SWEETORSOUR"]
X_all = cleaned_df.drop(columns=["SWEETORSOUR"])

# Hold out 20% of the rows as a test set; the fixed random_state keeps the
# split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_all, y_all, test_size=0.2, random_state=42
)

In [5]:
@func_timeout.func_set_timeout(10)
def score_supervised_model(model):
    """Cross-validate ``model`` on the training split and return its mean score.

    The estimator is evaluated with ``cross_val_score`` on the module-level
    ``X_train`` / ``y_train`` using a shuffled ``StratifiedKFold`` seeded with
    42. The mean score is written to the root logger at DEBUG level, followed
    by an empty record that acts as a separator in the log.

    The function is wrapped by ``func_timeout.func_set_timeout(10)``, i.e. the
    call is subject to a 10-second limit enforced by that decorator.

    Args:
        model: An estimator (or pipeline) accepted by ``cross_val_score``.

    Returns:
        The mean of the per-fold cross-validation scores.
    """
    splitter = StratifiedKFold(shuffle=True, random_state=42)
    mean_score = cross_val_score(model, X_train, y_train, cv=splitter).mean()

    logging.debug("%r", mean_score)
    logging.debug("")  # blank separator line between models in the log
    return mean_score


In [6]:
def generate_knns():
    """Yield a KNeighborsClassifier (``n_jobs=-1``) for each k in 2..49."""
    yield from (
        KNeighborsClassifier(n_neighbors=neighbour_count, n_jobs=-1)
        for neighbour_count in range(2, 50)
    )


def generate_svms():
    """Yield one ``svm.SVC`` per kernel: poly, rbf, then sigmoid."""
    kernel_names = ("poly", "rbf", "sigmoid")
    yield from (svm.SVC(kernel=name) for name in kernel_names)


def generate_trees():
    """Yield decision trees with ``max_depth`` 2, 5, 8, ... (step 3, below 40)."""
    depth_limits = range(2, 40, 3)
    yield from (
        tree.DecisionTreeClassifier(max_depth=limit) for limit in depth_limits
    )


def generate_kmeans():
    """Yield a single two-cluster ``KMeans`` estimator."""
    two_cluster_model = KMeans(n_clusters=2)
    yield two_cluster_model


def generate_kernel_pca():
    """Yield ``KernelPCA`` transformers over a grid of sizes and kernels.

    The grid is every ``n_components`` in ``range(2, 100, 3)`` combined with
    the kernels rbf, sigmoid and cosine, with the component count varying
    slowest.
    """
    # An ordered tuple instead of a set: iterating a set of strings depends on
    # the interpreter's per-process hash seed, which made the order of the
    # generated candidates differ from one kernel restart to the next.
    kernels = ("rbf", "sigmoid", "cosine")
    for n_components, kernel in itertools.product(range(2, 100, 3), kernels):
        yield KernelPCA(n_components=n_components, kernel=kernel)


def generate_factor_analysis():
    """Yield ``FactorAnalysis`` transformers over a grid of sizes and rotations.

    The grid is every ``n_components`` in ``range(2, 100, 3)`` combined with
    the varimax and quartimax rotations, with the component count varying
    slowest.
    """
    # An ordered tuple instead of a set: iterating a set of strings depends on
    # the interpreter's per-process hash seed, which made the order of the
    # generated candidates differ from one kernel restart to the next.
    rotations = ("varimax", "quartimax")
    for n_components, rotation in itertools.product(range(2, 100, 3), rotations):
        yield FactorAnalysis(n_components=n_components, rotation=rotation)


def generate_incremental_pca():
    """Yield ``IncrementalPCA`` transformers, unwhitened then whitened per size.

    Sizes are ``range(2, 100, 3)``; for each size the ``whiten=False`` variant
    is produced before the ``whiten=True`` one.
    """
    grid = itertools.product(range(2, 100, 3), (False, True))
    yield from (
        IncrementalPCA(n_components=size, whiten=use_whitening)
        for size, use_whitening in grid
    )


def generate_truncated_svd():
    """Yield a ``TruncatedSVD`` for each ``n_components`` in ``range(2, 100, 3)``."""
    yield from (TruncatedSVD(n_components=size) for size in range(2, 100, 3))


def generate_sparse_pca():
    """Yield ``SparsePCA`` transformers over a size x alpha grid.

    ``n_components`` runs over ``range(2, 100, 3)`` (varying slowest) and
    ``alpha`` over ``range(1, 100, 3)``.
    """
    sizes = range(2, 100, 3)
    alphas = range(1, 100, 3)
    for size, alpha in itertools.product(sizes, alphas):
        yield SparsePCA(n_components=size, alpha=alpha)


def generate_fast_ica():
    """Yield a ``FastICA`` for each ``n_components`` in ``range(2, 100, 3)``."""
    component_counts = range(2, 100, 3)
    yield from (FastICA(n_components=count) for count in component_counts)



def generate_supervised_models():
    """Yield a two-step pipeline for every (transformer, classifier) pairing.

    Transformers come, in order, from the kernel PCA, factor analysis,
    incremental PCA, truncated SVD, sparse PCA and FastICA generators.
    Each transformer is paired with every KNN, SVM and decision-tree
    candidate, and the pair is combined with ``make_pipeline``.
    """
    transformer_factories = (
        generate_kernel_pca,
        generate_factor_analysis,
        generate_incremental_pca,
        generate_truncated_svd,
        generate_sparse_pca,
        generate_fast_ica,
    )
    reducers = itertools.chain.from_iterable(
        factory() for factory in transformer_factories
    )

    for reducer in reducers:
        # The classifier generators are single-use, so a fresh chain is built
        # for every transformer. Note the same transformer instance is passed
        # to each pipeline it is paired with.
        classifiers = itertools.chain(
            generate_knns(), generate_svms(), generate_trees()
        )
        for classifier in classifiers:
            yield make_pipeline(reducer, classifier)


In [None]:
import os
import sys

import logging

# Send every DEBUG-and-above record from the root logger to omni.log.
LOG_PATH = "omni.log"
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

# Re-running this cell used to stack another FileHandler on the root logger,
# duplicating every record. Remove handlers for this file left by earlier runs.
for stale_handler in list(logger.handlers):
    if (
        isinstance(stale_handler, logging.FileHandler)
        and stale_handler.baseFilename == os.path.abspath(LOG_PATH)
    ):
        logger.removeHandler(stale_handler)
        stale_handler.close()

# mode="w" starts each run with an empty log file.
logger.addHandler(logging.FileHandler(LOG_PATH, mode="w", encoding="utf-8"))


# Score a random quarter of all candidate pipelines. A dedicated seeded RNG
# keeps the sampled subset repeatable without touching the global random state.
models = list(generate_supervised_models())
models = random.Random(42).sample(models, len(models) // 4)
total = len(models)

for i, model in enumerate(models, start=1):
    logging.debug("(%i/%i) %r", i, total, model)
    try:
        score_supervised_model(model)
    except func_timeout.FunctionTimedOut:
        # FunctionTimedOut derives from BaseException, so it needs its own
        # clause; a model that exceeds the time limit is simply skipped.
        logging.debug("timed out")
        logging.debug("")
    except Exception:
        # Record why the candidate failed instead of discarding the error.
        # KeyboardInterrupt is deliberately not caught so the loop can be
        # stopped from the notebook.
        logging.debug("failed", exc_info=True)
        logging.debug("")