In [None]:
#Imports
import os, sys
path_to_package = os.path.abspath(os.path.join('../'))
if path_to_package not in sys.path:
    sys.path.append(path_to_package)


from src.io import get_filepaths
from src.models import data_preparation
from src.utils import get_filename

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy as sp

import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import median_absolute_error
from sklearn.linear_model import Ridge, RidgeCV

In [None]:
# Dataset locations. The root defaults to the original workstation path but can
# be overridden with the SC_REGMOD_DATASET_ROOT environment variable, so the
# notebook can run on another machine without editing this cell.
DATASET_ROOT = os.environ.get(
    "SC_REGMOD_DATASET_ROOT", "/home/khaldrem/code/sc_regmod/dataset"
)

ANOVA_DATASET_PATH = os.path.join(DATASET_ROOT, "anova")
INDEX_PATH = os.path.join(DATASET_ROOT, "index")
PHENOTYPES_PATH = os.path.join(DATASET_ROOT, "phenotypes", "clean_phenotypes.csv")

In [None]:
# LOAD FILE
# Build the alignment path from ANOVA_DATASET_PATH rather than repeating the
# absolute dataset prefix, so the location is configured in one place only.
filepath = os.path.join(
    ANOVA_DATASET_PATH,
    'anova_at_least_one_phenotype',
    'p_value_0_05',
    'all',
    'YDR543C.phylip',
)
filename = get_filename(filepath)
# data_preparation returns the prepared frame plus a length that later cells
# use as the count of leading feature columns (df.iloc[:, 0:data_length]).
df, data_length = data_preparation(filepath, PHENOTYPES_PATH)


In [None]:
# Name of the column in `df` that the cells below use as the regression target.
choosen_phenotype = 'SM300-Efficiency'

In [None]:
# Target vector: values of the selected phenotype column.
labels = np.array(df[choosen_phenotype])

# Design matrix: dummy-encode the first `data_length` columns of df.
raw_feature_columns = df.iloc[:, :data_length]
features_ohe = pd.get_dummies(raw_feature_columns)
features_list = features_ohe.columns.tolist()
print(f"features_list: {len(features_list)}")
features = np.array(features_ohe)

# Re-wrap the plain array so the frame carries the dummy column names on a
# fresh default index.
pd_features = pd.DataFrame(features, columns=features_ohe.columns)

# Sequential split: the final 25% of rows become the test set.
# NOTE(review): with shuffle=False, random_state has no effect (see the
# scikit-learn train_test_split documentation).
X_train, X_test, y_train, y_test = train_test_split(
    pd_features,
    labels,
    test_size=0.25,
    random_state=42,
    shuffle=False,
)


In [None]:
# NOTE(review): disabled experiment, a fixed-alpha Ridge fit. The `alpha=`
# argument was never filled in, so this cannot run as written.
# model = Ridge(alpha=)
# model.fit(X_train, y_train)

# # y_pred = model.predict(X_train)

In [None]:
# Fit a ridge regression whose regularisation strength is selected by 5-fold
# cross-validation. This cell has to be active: the cells below read `model`
# (intercept_, coef_, feature_names_in_, predict) and `y_pred` before any other
# cell defines them, which raises NameError on a fresh kernel.
model = RidgeCV(cv=5)
model.fit(X_train, y_train)

# Training-set predictions for the diagnostics that follow.
y_pred = model.predict(X_train)

In [None]:
# NOTE(review): disabled diagnostics for the fitted model. The last print is
# incomplete (`model.` names no attribute) and must be completed or removed
# before this cell is re-enabled.
# print(f"alphas: {model.alphas}")
# print(f"best score: {model.best_score_}")
# print(f"coef_: {model.coef_.shape}")
# print(f"cv: {model.cv}")
# # print(f"features: {model.feature_names_in_}")
# print(f": {model.}")




In [None]:
# Quick sanity check of the fitted model, one value per output line:
# the intercept, the number of coefficients, and the training matrix shape.
print(model.intercept_, len(model.coef_), X_train.shape, sep="\n")

# Disabled exploratory plots, kept for reference:
# plt.scatter(y_train, y_pred)
# plt.scatter(X_train)
# plt.plot(model.coef_)
# print(X_train)
# plt.scatter(X_train.iloc[:, 0], y_train)
# plt.scatter(X_train.iloc[:, 1], y_train)
# plt.scatter(X_train.iloc[:, 2], y_train)
# plt.scatter(X_train.iloc[:, 3], y_train)
# print(X_train.shape)
# print(len(y_train))
# plt.boxplot(X_train.iloc[:, 0], y_train)




In [None]:
# Median absolute error on the training split. Predict on X_train explicitly
# instead of relying on whatever `y_pred` happens to hold from another cell.
y_pred_train = model.predict(X_train)
mae_train = median_absolute_error(y_train, y_pred_train)
string_score = f"MAE on training set: {mae_train:.2f}"

# Same metric on the held-out split. `y_pred` is left holding the test
# predictions because the scatter plot in the next cell pairs it with y_test.
y_pred = model.predict(X_test)
mae = median_absolute_error(y_test, y_pred)
string_score += f"\nMAE on testing set: {mae:.2f}"

In [None]:
# Predicted versus observed target values (y_pred against y_test).
fig, ax = plt.subplots(figsize=(5, 5))
ax.scatter(y_test, y_pred)
ax.set_title("Ridge model, small regularization")
ax.set_ylabel("Model predictions")
ax.set_xlabel("Truths")
# Optional decorations, currently disabled:
# ax.plot([0, 1], [0, 1], transform=ax.transAxes, ls="--", c="red")
# plt.text(0.2, 1.3, string_score)
# plt.xlim([0, 27])
# _ = plt.ylim([0, 27])

plt.show()

In [None]:
# One row per input feature, holding the fitted coefficient for that feature.
coefs = pd.DataFrame(
    {"Coefficients": model.coef_},
    index=model.feature_names_in_,
)

coefs

In [None]:
# Order features from lowest to highest coefficient and show the first ten.
coefs = coefs.sort_values("Coefficients")
coefs.head(10)


In [None]:
# Horizontal bars for the first ten rows of `coefs`, with a vertical reference
# line at zero.
coef_ax = coefs.head(10).plot.barh(figsize=(9, 7))
coef_ax.set_title("Ridge model, small regularization")
coef_ax.axvline(x=0, color=".5")
coef_ax.set_xlabel("Raw coefficient values")
# plt.subplots_adjust(left=0.1)


In [None]:
# Per-feature standard deviation of the training matrix. The data plotted now
# matches the title and axis label; plotting `coefs` here would present
# coefficients as standard deviations.
# The cast to float lets boolean dummy columns go through .std() like numeric ones.
feature_std = X_train.astype(float).std(axis=0)
feature_std.plot.barh(figsize=(9, 7))
plt.title("Feature ranges")
plt.xlabel("Std. dev. of feature values")
plt.subplots_adjust(left=0.3)

In [None]:
import numpy as np

In [None]:
# Feature frame: every column of df except the last 12.
# NOTE(review): presumably the trailing 12 columns are the phenotype columns.
# Confirm, or slice with data_length as the earlier feature cell does.
X = df.iloc[:, :-12]

# Target: the chosen phenotype column as a NumPy array.
Y = np.array(df[choosen_phenotype])

Y

In [None]:
# Seeded split of the raw (not yet encoded) features; no test_size is passed,
# so the library default proportion applies. This rebinds X_train/X_test/
# y_train/y_test that the earlier one-hot cell had defined.
X_train, X_test, y_train, y_test = train_test_split(X, Y, random_state = 42)

In [None]:
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Every column of X is routed through the one-hot encoder; with all columns
# listed, the "passthrough" remainder has nothing left to forward.
categorical_columns = X.columns

one_hot_encoder = OneHotEncoder(drop="if_binary", handle_unknown="ignore")
preprocessor = make_column_transformer(
    (one_hot_encoder, categorical_columns),
    remainder="passthrough",
    verbose_feature_names_out=False,
)

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import Ridge
from sklearn.compose import TransformedTargetRegressor


In [None]:
# Ridge with near-zero regularisation, fitted on log10(y); predictions are
# mapped back through 10**x so they land on the original target scale.
# NOTE(review): log10 needs strictly positive targets. Confirm the chosen
# phenotype never contains zero or negative values.
log_target_ridge = TransformedTargetRegressor(
    regressor=Ridge(alpha=1e-10),
    func=np.log10,
    inverse_func=sp.special.exp10,
)
model = make_pipeline(preprocessor, log_target_ridge)

In [None]:
# Fit the whole pipeline (encoder step, then the log-target ridge) on the training split.
model.fit(X_train, y_train)

In [None]:
from sklearn.metrics import median_absolute_error

# Median absolute error on the training split.
y_pred = model.predict(X_train)
mae = median_absolute_error(y_train, y_pred)
print(f"MAE on training set: {mae:.2f}")

# Same metric on the held-out split. The message now names the testing set;
# it previously repeated "training set" for this number.
# After this, `y_pred` holds the test predictions used by the scatter plot below.
y_pred = model.predict(X_test)
mae = median_absolute_error(y_test, y_pred)
print(f"MAE on testing set: {mae:.2f}")


In [None]:
# Predicted versus observed target values for the pipeline model.
fig, ax = plt.subplots(figsize=(5, 5))
ax.scatter(y_test, y_pred)
# Optional decorations, currently disabled:
# ax.plot([0, 1], [0, 1], transform=ax.transAxes, ls="--", c="red")
# plt.text(3, 20, string_score)
# plt.xlim([0, 27])
# _ = plt.ylim([0, 27])
ax.set_title("Ridge model, small regularization")
ax.set_ylabel("Model predictions")
ax.set_xlabel("Truths")

In [None]:
# Output column names from every pipeline step except the final regressor.
preprocessing_steps = model[:-1]
feature_names = preprocessing_steps.get_feature_names_out()

# Coefficients of the ridge regressor fitted inside the final pipeline step,
# one row per encoded feature.
fitted_ridge = model[-1].regressor_
coefs = pd.DataFrame(
    {"Coefficients": fitted_ridge.coef_},
    index=feature_names,
)

coefs

In [None]:
# Horizontal bars for every coefficient, with a vertical reference line at zero.
coef_ax = coefs.plot.barh(figsize=(9, 7))
coef_ax.set_title("Ridge model, small regularization")
coef_ax.axvline(x=0, color=".5")
coef_ax.set_xlabel("Raw coefficient values")
# Widen the left margin to leave room for the feature-name tick labels.
plt.subplots_adjust(left=0.3)

In [None]:
# Encoded training matrix, wrapped in a DataFrame labelled with the encoder's
# output column names.
# NOTE(review): this presumably relies on the column transformer returning a
# dense array. Verify; a SciPy sparse result would need .toarray() first.
encoded_train = model[:-1].transform(X_train)
X_train_preprocessed = pd.DataFrame(encoded_train, columns=feature_names)

# Per-feature standard deviation of the encoded training data.
std_ax = X_train_preprocessed.std(axis=0).plot.barh(figsize=(9, 7))
std_ax.set_title("Feature ranges")
std_ax.set_xlabel("Std. dev. of feature values")
plt.subplots_adjust(left=0.3)

In [None]:
# NOTE(review): this repeats the X_train_preprocessed construction from the
# preceding cell verbatim; one of the two can be removed.
X_train_preprocessed = pd.DataFrame(
    model[:-1].transform(X_train), columns=feature_names
)

In [None]:
# Display the dimensions of the current training feature set.
X_train.shape

In [None]:
import matplotlib.pyplot as plt
import numpy as np

from sklearn.datasets import make_regression
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

# Sweep the ridge regularisation strength. For each alpha, record the fitted
# coefficients and the mean squared error on the held-out split.
clf = Ridge()

alphas = np.logspace(1.2, 1.4, 200)

coefs10 = []
# Size the error buffer from `alphas` so a change to the grid cannot leave the
# two out of step (the buffer length used to be a separate hard-coded 200).
errors = np.zeros((len(alphas), 1))

# Train the model with different regularisation strengths
# NOTE(review): X_train/X_test are used as-is and Ridge needs numeric input, so
# these are presumably the already-encoded features. Confirm which split is
# live when this cell runs top-to-bottom.
# NOTE(review): the error is measured on X_test, so picking alpha from it
# tunes on the test set. Consider a validation split or RidgeCV.
for idx, a in enumerate(alphas):
    clf.set_params(alpha=a)
    clf.fit(X_train, y_train)
    coefs10.append(clf.coef_)
    y_pred = clf.predict(X_test)
    errors[idx] = mean_squared_error(y_test, y_pred)

# Display results
plt.figure(figsize=(20, 6))

# Left panel: coefficient paths across the alpha grid.
plt.subplot(121)
ax = plt.gca()
ax.plot(alphas, coefs10)
ax.set_xscale("log")
plt.xlabel("alpha")
plt.ylabel("weights")
plt.title("Ridge coefficients as a function of the regularization")
plt.axis("tight")

# Right panel: held-out mean squared error across the alpha grid. The labels
# name the plotted quantity (test MSE, not a coefficient error).
plt.subplot(122)
ax = plt.gca()
ax.plot(alphas, errors)
ax.set_xscale("log")
plt.xlabel("alpha")
plt.ylabel("mean squared error (test)")
plt.title("Test MSE as a function of the regularization")
plt.axis("tight")

plt.show()

In [None]:
# Flat index of the smallest recorded error; errors[idx] was filled per alpha,
# so this is also the position of the lowest-error alpha in `alphas`.
errors.argmin()

In [None]:
# Alpha with the lowest recorded error, looked up from `errors` instead of a
# hard-coded position that goes stale whenever the sweep or data changes.
alphas[errors.argmin()]

In [None]:

# Refit a plain ridge model at the alpha with the lowest recorded error. The
# index is derived from `errors` rather than hard-coded, so it stays correct
# if the alpha grid, the data, or the split changes.
# Note that this rebinds `model`, replacing the pipeline defined earlier.
best_alpha = alphas[errors.argmin()]
model = Ridge(alpha=best_alpha)
model.fit(X_train, y_train)



