In [None]:
import os
import pandas as pd
import numpy as np
import pickle
import tcrgp
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from mhcflurry import Class1AffinityPredictor

In [None]:
# Load the MHCflurry Class I affinity predictor once; calculate_binding_affinity
# below reads it as the module-level global `predictor`.
# NOTE(review): presumably requires the MHCflurry model files to be downloaded
# beforehand — confirm for a fresh environment.
predictor = Class1AffinityPredictor.load()

In [None]:
# Input/output locations, given as relative paths.
dir_patient = 'data/synthetic.csv'  # patient CSV (a file path, despite the "dir_" prefix)
training_data = 'data/training_data_RF.csv'  # training CSV; its 'neo_vs_anti' column is used as the label below
output_directory = 'results'  # passed to neoantigen_classification as the model output directory

# Read both CSV files into DataFrames.
patient = pd.read_csv(dir_patient)
training = pd.read_csv(training_data)

In [None]:
import pandas as pd

def onehot_encode_antigens(df, antigen_column='antigen', label_column=None, max_length=None):
    """
    One-hot encode peptide sequences position by position.

    Every encoded position gets 20 indicator columns, one per standard amino
    acid, named ``{antigen_column}_{amino_acid}_{position}`` and ordered by
    position first, then by amino acid. Positions past the end of a shorter
    sequence, and residues outside the 20-letter alphabet, are all zeros.

    :param df: DataFrame holding the sequences (and optionally the labels).
    :param antigen_column: Name of the column containing the antigen sequences.
    :param label_column: Optional label column to append as the last column.
        It is skipped when it is not present in ``df``.
    :param max_length: Number of positions to encode. Defaults to the longest
        sequence in ``df``. Pass the length used for the training data when
        encoding new data so both frames share the same columns; sequences
        longer than ``max_length`` are truncated.
    :return: DataFrame with a fresh 0..n-1 index, ``max_length * 20`` integer
        indicator columns and, if requested, the label column.
    :raises TypeError: If a value in ``antigen_column`` is not a string.
    :raises ValueError: If ``max_length`` is negative.
    """
    # The 20 standard amino acids; the order defines the column order per position.
    characters = 'ACDEFGHIKLMNPQRSTVWY'
    n_chars = len(characters)
    char_offset = {amino_acid: offset for offset, amino_acid in enumerate(characters)}

    antigens = df[antigen_column].tolist()
    for antigen in antigens:
        # Fail early with a clear message instead of a cryptic len()/range() error on NaN.
        if not isinstance(antigen, str):
            raise TypeError(
                f"Column '{antigen_column}' must contain only strings, got {antigen!r}"
            )

    if max_length is None:
        max_length = max((len(antigen) for antigen in antigens), default=0)
    max_length = int(max_length)
    if max_length < 0:
        raise ValueError(f"max_length must be non-negative, got {max_length}")

    # Fill one dense integer matrix instead of growing one Python list per column.
    encoded = np.zeros((len(antigens), max_length * n_chars), dtype=np.int64)
    for row, antigen in enumerate(antigens):
        for pos, residue in enumerate(antigen[:max_length]):
            offset = char_offset.get(residue)
            if offset is not None:
                encoded[row, pos * n_chars + offset] = 1

    columns = [
        f'{antigen_column}_{amino_acid}_{pos}'
        for pos in range(max_length)
        for amino_acid in characters
    ]
    onehot_df = pd.DataFrame(encoded, columns=columns)

    # Attach labels by position: the encoded frame has a fresh RangeIndex, so an
    # index-aligned concat would misplace labels whenever df has another index.
    if label_column and label_column in df.columns:
        labels = df[label_column].reset_index(drop=True)
        onehot_df = pd.concat([onehot_df, labels], axis=1)

    return onehot_df
# Encode the labelled training antigens; the 'neo_vs_anti' label is appended after the features.
onehot_encoded_df = onehot_encode_antigens(training, antigen_column='antigen', label_column='neo_vs_anti')

# Encode the patient antigens and align them to the training feature columns.
# The encoder sizes its output from the longest sequence in the frame it is given,
# so without this step the patient frame can have more or fewer columns than the
# model was trained on. Missing positions are zero-filled; extra ones are dropped.
onehot_patient_encoded_df = (
    onehot_encode_antigens(patient, antigen_column='antigen', label_column=None)
    .reindex(
        columns=[col for col in onehot_encoded_df.columns if col != 'neo_vs_anti'],
        fill_value=0,
    )
)


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import pickle

def neoantigen_classification(one_hot, output_dir):
    """
    Train, tune, evaluate and save a random-forest classifier.

    The frame is split 80/20 into train and test sets, a grid search with
    3-fold cross-validation picks ``n_estimators`` and ``max_depth`` by
    accuracy, the best estimator is scored on the held-out test set, and the
    fitted model is pickled to ``best_rf_model.sav`` inside ``output_dir``.

    :param one_hot: DataFrame of feature columns plus the 'neo_vs_anti' label column.
    :param output_dir: Directory for the pickled model; created if it does not exist.
    :return: None. Progress, the chosen parameters and the test metrics are printed.
    """
    # Splitting the dataset into training and testing sets (fixed seed for repeatability)
    train, test = train_test_split(one_hot, test_size=0.2, random_state=25)

    # Separating the features and the label
    x_train = train.drop('neo_vs_anti', axis=1)
    y_train = train['neo_vs_anti']
    x_test = test.drop('neo_vs_anti', axis=1)
    y_test = test['neo_vs_anti']

    # Displaying the number of samples in training and testing sets
    print(f"No. of training samples: {train.shape[0]}")
    print(f"No. of testing samples: {test.shape[0]}")

    # Setting up the parameter grid for hyperparameter tuning
    param_grid = {
        'n_estimators': [10, 20, 50, 100],
        'max_depth': [5, 10, 20, 50]
    }

    # Initializing the RandomForestClassifier
    rf = RandomForestClassifier(random_state=42)

    # Hyperparameter tuning using GridSearchCV (all cores, 3-fold CV, accuracy)
    grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2, scoring='accuracy')
    grid_search.fit(x_train, y_train)

    # Extracting the best model (refit on the full training split by GridSearchCV)
    best_rf = grid_search.best_estimator_
    print(f"Optimal parameters: {grid_search.best_params_}")

    # Model evaluation on the held-out test split
    y_pred = best_rf.predict(x_test)
    print(f"Accuracy on test set: {accuracy_score(y_test, y_pred)}")
    print("Classification Report:")
    print(classification_report(y_test, y_pred))

    # Saving the model; create the output directory first so a fresh checkout
    # does not fail with FileNotFoundError after the whole grid search has run.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    filename = os.path.join(output_dir, 'best_rf_model.sav')
    with open(filename, 'wb') as f:
        pickle.dump(best_rf, f)
    print('Best model saved.')

# Example usage
# neoantigen_classification(onehot_encoded_df, 'path_to_output_directory')


In [None]:
# Train, evaluate and pickle the random-forest model on the encoded training data;
# the model file is written into output_directory.
neoantigen_classification(onehot_encoded_df, output_directory)

In [None]:
# Load the classifier that neoantigen_classification saved into output_directory,
# rather than depending on an absolute path that exists on one machine only.
# NOTE: pickle.load can execute arbitrary code; only load model files you created yourself.
model_filename = os.path.join(output_directory, 'best_rf_model.sav')
with open(model_filename, 'rb') as file:
    trained_model = pickle.load(file)

In [None]:
# Classify every patient antigen with the loaded model.
patient_predictions = trained_model.predict(onehot_patient_encoded_df)
# Store the predictions on the patient frame (adds or overwrites the column in place).
patient['Classification'] = patient_predictions

In [None]:
def calculate_binding_affinity(df, antigen_col='antigen', hla_col='hla', affinity_predictor=None):
    """
    Calculate binding affinity between antigens and MHC molecules and normalize the values.

    All antigen/allele pairs are predicted in a single call, and the result is
    min-max scaled to [0, 1] and written to the 'mhc_presentation_score' column.
    The input frame is modified in place and also returned.

    NOTE(review): MHCflurry documents its predictions as nM affinities, where a
    smaller value means stronger binding. After min-max scaling the highest
    score therefore belongs to the weakest predicted binder — confirm that the
    downstream ">= threshold" filters expect this orientation.

    :param df: DataFrame containing antigens and corresponding HLA alleles.
    :param antigen_col: Name of the column containing the antigen sequences.
    :param hla_col: Name of the column containing the HLA alleles.
    :param affinity_predictor: Optional object with a
        ``predict(peptides=..., alleles=...)`` method. Defaults to the
        module-level ``predictor``.
    :return: The same DataFrame with the normalized 'mhc_presentation_score'
        column added. If all predictions are identical (including a single-row
        frame) the scores are 0.0 rather than NaN.
    """
    model = predictor if affinity_predictor is None else affinity_predictor

    peptides = df[antigen_col].tolist()
    alleles = df[hla_col].tolist()

    # One batched call instead of one call per row; skip it for an empty frame.
    if peptides:
        raw_affinities = np.asarray(model.predict(peptides=peptides, alleles=alleles), dtype=float)
    else:
        raw_affinities = np.array([], dtype=float)

    # Add the binding affinity results to the DataFrame (assigned by position)
    df['mhc_presentation_score'] = raw_affinities

    # Normalize binding affinity
    scores = df['mhc_presentation_score']
    min_affinity = scores.min()
    spread = scores.max() - min_affinity
    if spread > 0:
        df['mhc_presentation_score'] = (scores - min_affinity) / spread
    else:
        # Zero spread (or nothing to scale): avoid 0/0 and report 0.0;
        # NaN predictions stay NaN.
        df['mhc_presentation_score'] = scores * 0.0

    return df



In [None]:
import pandas as pd
import random

def calculate_tcr_probability(df, antigen_col='antigen', tcr_col='tcr_sequence'):
    """
    Calculate a TCR probability for each antigen-TCR pair in the DataFrame.

    ``tcrgp.predict`` is called once per row with that row's antigen and TCR
    sequence (each wrapped in a one-element list) and whatever it returns is
    stored unchanged in the 'tcr_probability' column. The input frame is
    modified in place and also returned.

    NOTE(review): the call shape ``tcrgp.predict([antigen], [tcr])`` is kept
    from the original code; confirm it matches the installed tcrgp API and that
    the return value is a scalar that can be compared against a threshold.

    :param df: DataFrame containing antigen and TCR sequence columns.
    :param antigen_col: Name of the column containing the antigen sequences.
    :param tcr_col: Name of the column containing the TCR sequences.
    :return: DataFrame with an added column for TCR probabilities.
    """
    # Pass each row's actual sequences, not the column names, to the predictor.
    tcr_probabilities = [
        tcrgp.predict([antigen], [tcr_sequence])
        for antigen, tcr_sequence in zip(df[antigen_col], df[tcr_col])
    ]

    # Add the TCR probabilities to the DataFrame
    df['tcr_probability'] = tcr_probabilities

    return df

### Synthetic

In [None]:
# Re-encode the patient antigens and align them to the training feature columns.
# The encoder sizes its output from the longest sequence in the frame it is given,
# so without this step the patient frame can have more or fewer columns than the
# model was trained on. Missing positions are zero-filled; extra ones are dropped.
onehot_patient_encoded_df = (
    onehot_encode_antigens(patient, antigen_column='antigen', label_column=None)
    .reindex(
        columns=[col for col in onehot_encoded_df.columns if col != 'neo_vs_anti'],
        fill_value=0,
    )
)

In [None]:
# Classify the re-encoded patient antigens (repeats the earlier prediction cell).
patient_predictions = trained_model.predict(onehot_patient_encoded_df)
# Overwrites the 'Classification' column on the patient frame in place.
patient['Classification'] = patient_predictions

In [None]:
# Add the normalized 'mhc_presentation_score' column, reading alleles from 'mhc_molecule'.
# The function writes the column onto the frame it is given and returns that same
# frame, so patient_df_normalized and patient refer to the same object.
patient_df_normalized = calculate_binding_affinity(patient, antigen_col='antigen', hla_col='mhc_molecule')

In [None]:
# Add the 'tcr_probability' column; as above, the input frame is modified in place and returned.
patient_df_normalized_with_tcr = calculate_tcr_probability(patient_df_normalized, antigen_col='antigen', tcr_col='tcr_sequence')

In [None]:
# Display the scored patient table (classification, MHC score and TCR probability columns).
patient_df_normalized_with_tcr

In [None]:
# Cut-offs used by the two candidate filters below.
# NOTE(review): the origin of these numeric values is not documented in this notebook — confirm.
# Rows at or above both of these thresholds are selected by the first filter.
mhc_threshold = 0.85
tcr_threshold = 0.53

# Lower bounds for the second filter: rows that clear both minimums but fall
# short of at least one of the thresholds above.
min_mhc = 0.78
min_tcr = 0.46

In [None]:
# Short alias for the scored patient frame (same object, not a copy).
synth = patient_df_normalized_with_tcr

In [None]:
# Candidates that meet or exceed both the MHC and the TCR threshold.
synth.loc[
    (synth['tcr_probability'] >= tcr_threshold)
    & (synth['mhc_presentation_score'] >= mhc_threshold)
]

In [None]:
# Borderline candidates: both scores clear their minimum, but at least one of
# them is still below its main threshold.
synth[
    (synth['mhc_presentation_score'] >= min_mhc)
    & (synth['tcr_probability'] >= min_tcr)
    & (
        (synth['mhc_presentation_score'] < mhc_threshold)
        | (synth['tcr_probability'] < tcr_threshold)
    )
]
