In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from ppi_py import ppi_mean_ci
from sklearn.linear_model import LinearRegression
from ppi_py import ppi_ols_ci
from ppi_py.datasets import load_dataset
from ppi_py import ppi_ols_pointestimate


In [4]:
df=pd.read_csv('cyanobacteria_thermotolerance_amino_acid_composition_wide.csv')

In [5]:
amino_acids = ['A', 'R', 'N','D','C','Q','E','G','H','I','L','K','M','F','P','S','T','W','Y','V']
# Normalize by total AA count
df['total_aa'] = df[amino_acids].sum(axis=1)
for aa in amino_acids:
    df[f'{aa}_freq'] = df[aa] / df['total_aa']

In [6]:
high_quality_df = df[df['ogt'].notnull()].copy()
low_quality_df = df[df['genome_type']=='single_cell'].copy()

In [26]:
high_quality_df.shape

(39, 54)

In [27]:
low_quality_df.shape

(716, 54)

In [7]:

aa_cols = amino_acids 

# Labeled data (isolates)
X_labeled = high_quality_df[aa_cols].values
y_labeled = high_quality_df['ogt'].values

# Unlabeled data (single cells)
X_unlabeled = low_quality_df[aa_cols].values

# Fit model on labeled data
model = LinearRegression()
model.fit(X_labeled, y_labeled)

# Predict for labeled and unlabeled
yhat_labeled = model.predict(X_labeled)
yhat_unlabeled = model.predict(X_unlabeled)

alpha = 0.05  
ci_low, ci_high = ppi_ols_ci(X_labeled, y_labeled, yhat_labeled, X_unlabeled, yhat_unlabeled, alpha=alpha)
point_estimates = ppi_ols_pointestimate(X_labeled, y_labeled, yhat_labeled, X_unlabeled, yhat_unlabeled)


for i, aa in enumerate(aa_cols):
    print(f"{aa}:  CI = [{ci_low[i]:.3f}, {ci_high[i]:.3f}] | Point Estimate = {point_estimates[i]:.3f}")


A:  CI = [-0.020, 0.003] | Point Estimate = -0.009
R:  CI = [0.045, 0.082] | Point Estimate = 0.063
N:  CI = [-0.055, 0.026] | Point Estimate = -0.014
D:  CI = [-0.038, 0.028] | Point Estimate = -0.005
C:  CI = [0.081, 0.194] | Point Estimate = 0.137
Q:  CI = [0.000, 0.028] | Point Estimate = 0.014
E:  CI = [-0.076, -0.026] | Point Estimate = -0.051
G:  CI = [-0.033, 0.012] | Point Estimate = -0.010
H:  CI = [-0.125, -0.033] | Point Estimate = -0.079
I:  CI = [-0.041, 0.010] | Point Estimate = -0.016
L:  CI = [-0.028, 0.011] | Point Estimate = -0.008
K:  CI = [0.050, 0.094] | Point Estimate = 0.072
M:  CI = [-0.089, 0.004] | Point Estimate = -0.043
F:  CI = [-0.060, -0.000] | Point Estimate = -0.030
P:  CI = [-0.054, -0.006] | Point Estimate = -0.030
S:  CI = [0.009, 0.052] | Point Estimate = 0.031
T:  CI = [-0.045, 0.012] | Point Estimate = -0.016
W:  CI = [-0.006, 0.089] | Point Estimate = 0.042
Y:  CI = [0.011, 0.067] | Point Estimate = 0.039
V:  CI = [-0.012, 0.043] | Point Estimat

## PPI counts

In [37]:

# use counts
aa_cols = amino_acids
X_all = high_quality_df[aa_cols].values
y_all = high_quality_df["ogt"].values

# Split into small labeled + large "pseudo-unlabeled"
np.random.seed(105)
n_total = X_all.shape[0]
n_labeled = 20  # simulate small labeled set
indices = np.random.permutation(n_total)

labeled_idx = indices[:n_labeled]
unlabeled_idx = indices[n_labeled:]

X_labeled = X_all[labeled_idx]
y_labeled = y_all[labeled_idx]
X_unlabeled = X_all[unlabeled_idx]
y_unlabeled_true = y_all[unlabeled_idx]  # only for validation

# Train a model on labeled data and get predictions
model = LinearRegression()
model.fit(X_labeled, y_labeled)
yhat_labeled = model.predict(X_labeled)
yhat_unlabeled = model.predict(X_unlabeled)

# Apply PPI
alpha = 0.05
ci_low, ci_high = ppi_ols_ci(X_labeled, y_labeled, yhat_labeled, X_unlabeled, yhat_unlabeled, alpha=alpha)
ppi_beta = ppi_ols_pointestimate(X_labeled, y_labeled, yhat_labeled, X_unlabeled, yhat_unlabeled)

# Compare with full regression (ground truth)
full_model = LinearRegression()
full_model.fit(X_all, y_all)
full_beta = full_model.coef_

# # Print comparison
# print("Amino Acid    |  PPI Beta  |  Full Beta")
# print("-" * 35)
# for i, aa in enumerate(aa_cols):
#     print(f"{aa:<12}  {ppi_beta[i]:>9.3f}   {full_beta[i]:>9.3f}")
# Coefficients from naive model (trained only on labeled data)
naive_beta = model.coef_

# Print comparison table with 3 models: Naive, PPI, Full
print("Amino Acid    |  Naive Beta |  PPI Beta  |  Full Beta")
print("-" * 50)
for i, aa in enumerate(aa_cols):
    print(f"{aa:<12}  {naive_beta[i]:>11.3f}   {ppi_beta[i]:>9.3f}   {full_beta[i]:>9.3f}")



Amino Acid    |  Naive Beta |  PPI Beta  |  Full Beta
--------------------------------------------------
A                  -0.010      -0.015       0.003
R                  -0.014      -0.006      -0.011
N                  -0.028      -0.069       0.004
D                  -0.091      -0.155      -0.016
C                   0.013       0.010      -0.001
Q                   0.014       0.088      -0.006
E                   0.016      -0.140       0.002
G                  -0.012       0.076       0.009
H                  -0.043      -0.066      -0.005
I                   0.061       0.079       0.016
L                   0.021       0.047      -0.004
K                   0.091       0.148       0.013
M                  -0.003      -0.010      -0.015
F                   0.044       0.179      -0.028
P                  -0.021       0.026       0.005
S                   0.038       0.051       0.016
T                  -0.031      -0.154      -0.016
W                  -0.075      -0.198      -0

In [38]:
from sklearn.metrics import mean_squared_error, r2_score


ppi_intercept = model.intercept_           # PPI uses same model intercept as naive
full_intercept = full_model.intercept_
naive_beta = model.coef_

# Predictions using coefficients + intercepts
y_pred_ppi = X_unlabeled @ ppi_beta + ppi_intercept
y_pred_full = X_unlabeled @ full_beta + full_intercept
y_pred_naive = X_unlabeled @ naive_beta + ppi_intercept  # same model

# Evaluation function
def evaluate(y_true, y_pred, label):
    mse = mean_squared_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    print(f"{label:<20} → MSE: {mse:.3f}, R²: {r2:.3f}")

print("\n Comparing Predictions on Unlabeled Isolates")
evaluate(y_unlabeled_true, y_pred_naive, "Naive (Labeled Only)")
evaluate(y_unlabeled_true, y_pred_ppi,   "PPI-Corrected")
evaluate(y_unlabeled_true, y_pred_full,  "Full Model")



 Comparing Predictions on Unlabeled Isolates
Naive (Labeled Only) → MSE: 108.209, R²: -15.276
PPI-Corrected        → MSE: 285444.499, R²: -42932.689
Full Model           → MSE: 1.158, R²: 0.826


In [39]:
# Get model coefficients and intercepts
naive_beta = model.coef_
ppi_intercept = model.intercept_         # Use same intercept for PPI
full_intercept = full_model.intercept_

# Predictions on full data (not just unlabeled)
y_pred_naive = X_all @ naive_beta + ppi_intercept
y_pred_ppi   = X_all @ ppi_beta + ppi_intercept
y_pred_full  = X_all @ full_beta + full_intercept

# Evaluation function
def evaluate(y_true, y_pred, label):
    mse = mean_squared_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    print(f"{label:<20} → MSE: {mse:.3f}, R²: {r2:.3f}")

print("\n Comparing Predictions on All Isolates")
evaluate(y_all, y_pred_naive, "Naive (Labeled only)")
evaluate(y_all, y_pred_ppi,   "PPI-Corrected")
evaluate(y_all, y_pred_full,  "Full Model ")



 Comparing Predictions on All Isolates
Naive (Labeled only) → MSE: 52.717, R²: -8.169
PPI-Corrected        → MSE: 280529.232, R²: -48793.155
Full Model           → MSE: 0.889, R²: 0.845


### PPI frequency 

In [31]:

# Use frequency columns
aa_cols = [f'{aa}_freq' for aa in amino_acids]
X_all = high_quality_df[aa_cols].values
y_all = high_quality_df["ogt"].values

# Split into small labeled + large "pseudo-unlabeled"
np.random.seed(105)
n_total = X_all.shape[0]
n_labeled = 20
indices = np.random.permutation(n_total)

labeled_idx = indices[:n_labeled]
unlabeled_idx = indices[n_labeled:]

X_labeled = X_all[labeled_idx]
y_labeled = y_all[labeled_idx]
X_unlabeled = X_all[unlabeled_idx]
y_unlabeled_true = y_all[unlabeled_idx]

# Train naive model
model = LinearRegression()
model.fit(X_labeled, y_labeled)
yhat_labeled = model.predict(X_labeled)
yhat_unlabeled = model.predict(X_unlabeled)

# PPI correction
alpha = 0.05
ci_low, ci_high = ppi_ols_ci(X_labeled, y_labeled, yhat_labeled, X_unlabeled, yhat_unlabeled, alpha=alpha)
ppi_beta = ppi_ols_pointestimate(X_labeled, y_labeled, yhat_labeled, X_unlabeled, yhat_unlabeled)

# Full model
full_model = LinearRegression()
full_model.fit(X_all, y_all)
full_beta = full_model.coef_

# Naive model beta
naive_beta = model.coef_

# Print comparison table
print("Amino Acid    |  Naive Beta |  PPI Beta  |  Full Beta")
print("-" * 50)
for i, aa in enumerate(amino_acids):
    print(f"{aa + '_freq':<12}  {naive_beta[i]:>11.3f}   {ppi_beta[i]:>9.3f}   {full_beta[i]:>9.3f}")


Amino Acid    |  Naive Beta |  PPI Beta  |  Full Beta
--------------------------------------------------
A_freq         105922.348   -15578.978   841298814748705.750
R_freq          57555.992   -32015.291   841298814734615.250
N_freq          69293.183     932.662   841298814750122.125
D_freq         -27669.222   -140857.297   841298814734663.375
C_freq        -151612.315   -221698.367   841298814744235.625
Q_freq         138569.186   29762.449   841298814742293.875
E_freq          93194.026   -23175.295   841298814748657.500
G_freq         122803.541   28690.760   841298814755382.875
H_freq         -55850.885   -167609.614   841298814743534.500
I_freq         265212.347   135159.859   841298814753681.125
L_freq          58711.757   -38275.401   841298814742021.125
K_freq         268794.410   123140.132   841298814758532.750
M_freq          20718.614   -36116.685   841298814731417.625
F_freq         -53476.170   -140991.796   841298814725870.000
P_freq         148542.662   45339.744   

In [30]:

ppi_intercept = model.intercept_           # PPI uses same model intercept as naive
full_intercept = full_model.intercept_
naive_beta = model.coef_

# Predictions using coefficients + intercepts
y_pred_ppi = X_unlabeled @ ppi_beta + ppi_intercept
y_pred_full = X_unlabeled @ full_beta + full_intercept
y_pred_naive = X_unlabeled @ naive_beta + ppi_intercept  # same model

# Evaluation function
def evaluate(y_true, y_pred, label):
    mse = mean_squared_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    print(f"{label:<20} → MSE: {mse:.3f}, R²: {r2:.3f}")

print("\n Comparing Predictions on Unlabeled Isolates")
evaluate(y_unlabeled_true, y_pred_naive, "Naive (Labeled Only)")
evaluate(y_unlabeled_true, y_pred_ppi,   "PPI-Corrected")
evaluate(y_unlabeled_true, y_pred_full,  "Full Model")


 Comparing Predictions on Unlabeled Isolates
Naive (Labeled Only) → MSE: 1044.245, R²: -156.065
PPI-Corrected        → MSE: 11242536564.232, R²: -1690989229.044
Full Model           → MSE: 1.193, R²: 0.821


In [35]:
from sklearn.metrics import mean_squared_error, r2_score

# Get model coefficients and intercepts
naive_beta = model.coef_
ppi_intercept = model.intercept_         # Use same intercept for PPI
full_intercept = full_model.intercept_

# Predictions on full data (not just unlabeled)
y_pred_naive = X_all @ naive_beta + ppi_intercept
y_pred_ppi   = X_all @ ppi_beta + ppi_intercept
y_pred_full  = X_all @ full_beta + full_intercept

# Evaluation function
def evaluate(y_true, y_pred, label):
    mse = mean_squared_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    print(f"{label:<20} → MSE: {mse:.3f}, R²: {r2:.3f}")

print("\n Comparing Predictions on All Isolates")
evaluate(y_all, y_pred_naive, "Naive (Labeled only)")
evaluate(y_all, y_pred_ppi,   "PPI-Corrected")
evaluate(y_all, y_pred_full,  "Full Model ")



 Comparing Predictions on All Isolates
Naive (Labeled only) → MSE: 508.735, R²: -87.487
PPI-Corrected        → MSE: 11243581616.577, R²: -1955664518.415
Full Model           → MSE: 1.046, R²: 0.818
