In [None]:
"""# Function to calculate molecular weight and amino acid composition
def calculate_features(sequence):
    analysis = ProteinAnalysis(sequence)
    features = {
        'molecular_weight': analysis.molecular_weight(),
        'isoelectric_point': analysis.isoelectric_point(),
        'aromaticity': analysis.aromaticity(),
        'instability_index': analysis.instability_index(),
    }
    amino_acid_percent = analysis.get_amino_acids_percent()
    features.update(amino_acid_percent)
    return features"""

In [None]:
"""# Read the dataset
train_file_path = '/content/train_dataset.csv'
test_file_path = '/content/test2_dataset.csv'

train_data = pd.read_csv(train_file_path, usecols=['simple_fasta', 'Species', 'Tm'])
test_data = pd.read_csv(test_file_path, usecols=['simple_fasta', 'Species', 'Tm'])

# Feature Engineering: Extract enhanced features from simple_fasta
train_features_df = train_data['simple_fasta'].apply(calculate_features).apply(pd.Series)
test_features_df = test_data['simple_fasta'].apply(calculate_features).apply(pd.Series)

# Concatenate the features with the original dataset
train_data = pd.concat([train_data, train_features_df], axis=1)
test_data = pd.concat([test_data, test_features_df], axis=1)

# Label encoding for 'Species'
le = LabelEncoder()
train_data['Species_encoded'] = le.fit_transform(train_data['Species'])
test_data['Species_encoded'] = le.fit_transform(test_data['Species'])

# Select only the required columns and prepare for normalization
feature_columns = ['Species_encoded', 'molecular_weight', 'isoelectric_point', 'aromaticity', 'instability_index']
train_dt = train_data[feature_columns + ['Tm']].copy()
test_dt = test_data[feature_columns + ['Tm']].copy()"""

In [None]:
"""# Initialize the StandardScaler
scaler = StandardScaler()

# Normalize continuous features
continuous_features = ['molecular_weight', 'isoelectric_point', 'aromaticity', 'instability_index', 'Tm']
train_dt[continuous_features] = scaler.fit_transform(train_dt[continuous_features])
test_dt[continuous_features] = scaler.transform(test_dt[continuous_features])  # Use transform instead of fit_transform for test data

# Step 2: Augment Data using LLM
# Using a pre-trained language model for data augmentation (example: GPT-2)
generator = pipeline('text-generation', model='gpt2', truncation=True)  # Add truncation
set_seed(42)

# Function to augment data
def augment_data(sequence):
    augmented_texts = generator(sequence, max_length=1024, num_return_sequences=5)
    return [augmented['generated_text'] for augmented in augmented_texts]

# Augment training data
train_data['augmented_sequences'] = train_data['simple_fasta'].apply(lambda seq: augment_data(seq[:100]))  # Truncate sequence to avoid issues
augmented_sequences = train_data.explode('augmented_sequences')

# Recalculate features for augmented data
augmented_features_df = augmented_sequences['augmented_sequences'].apply(calculate_features).apply(pd.Series)
augmented_data = pd.concat([augmented_sequences, augmented_features_df], axis=1)

# Combine original and augmented data
combined_data = pd.concat([train_data, augmented_data])

# Select features and target for combined data
combined_features = combined_data[feature_columns]
combined_target = combined_data['Tm']

# Step 3: Train Regression Model
X_train, X_val, y_train, y_val = train_test_split(combined_features, combined_target, test_size=0.2, random_state=42)

model = LinearRegression()
model.fit(X_train, y_train)

# Step 4: Evaluate Model
y_pred_train = model.predict(X_train)
y_pred_val = model.predict(X_val)
y_pred_test = model.predict(test_dt[feature_columns])

train_mse = mean_squared_error(y_train, y_pred_train)
val_mse = mean_squared_error(y_val, y_pred_val)
test_mse = mean_squared_error(test_dt['Tm'], y_pred_test)

print(f'Training MSE: {train_mse}')
print(f'Validation MSE: {val_mse}')
print(f'Test MSE: {test_mse}')"""

In [None]:
"""# Step 5: Visualize Results
plt.figure(figsize=(10, 6))
plt.scatter(y_train, y_pred_train, label='Training Data', alpha=0.6)
plt.scatter(test_dt['Tm'], y_pred_test, label='Test Data', alpha=0.6)
plt.plot([min(test_dt['Tm']), max(test_dt['Tm'])], [min(test_dt['Tm']), max(test_dt['Tm'])], color='red', linestyle='--')
plt.xlabel('Actual Melting Point')
plt.ylabel('Predicted Melting Point')
plt.title('Actual vs Predicted Melting Point')
plt.legend()
plt.show()"""

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder, StandardScaler
from transformers import pipeline, set_seed
!pip install biopython
from Bio.SeqUtils.ProtParam import ProteinAnalysis



In [None]:
# Step 1: Set up the environment
# Shell escape that installs the third-party packages this cell imports.
# NOTE(review): versions are not pinned, so a re-run may resolve different releases.
!pip install transformers biopython pandas scikit-learn

# Step 2: Load the dataset (no cleaning or filtering is applied in this cell)
import pandas as pd

# Load the dataset
# Absolute path; '/content' is presumably the Colab working directory - TODO confirm.
file_path = '/content/test2_dataset.csv'
data = pd.read_csv(file_path)

# Display the first few rows of the dataset
print(data.head())

# Step 3: Apply DistilBERT for data augmentation
from transformers import DistilBertTokenizer, DistilBertModel
import torch

# Pretrained tokenizer and encoder, both read as globals by `augment_sequence`.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
# NOTE: the name `model` is rebound to a LinearRegression in Step 5 of this same cell,
# so `augment_sequence` only works while this assignment is the most recent one.
model = DistilBertModel.from_pretrained('distilbert-base-uncased')

def augment_sequence(sequence):
    """Round-trip a sequence through DistilBERT and decode the per-token argmax.

    Parameters
    ----------
    sequence : str
        Text handed to the notebook-level ``tokenizer``.

    Returns
    -------
    str
        ``tokenizer.decode`` applied to, for every token position of the first
        (only) batch item, the index of the largest value along the last axis
        of ``last_hidden_state``.

    Notes
    -----
    Relies on the notebook globals ``tokenizer`` and ``model``.

    NOTE(review): ``DistilBertModel`` is the bare encoder, so the argmax runs
    over hidden features rather than vocabulary logits; the decoded text is
    unlikely to be a meaningful variant of ``sequence``. Confirm whether a
    masked-LM head (``DistilBertForMaskedLM``) was intended.
    """
    # Truncate explicitly: inputs that tokenize to more positions than the
    # encoder supports (512 for distilbert-base-uncased) would otherwise raise.
    inputs = tokenizer(sequence, return_tensors='pt', truncation=True, max_length=512)

    # Pure inference: skip autograd bookkeeping so no graph is kept per call.
    with torch.no_grad():
        outputs = model(**inputs)

    token_ids = torch.argmax(outputs.last_hidden_state, dim=2)[0]
    return tokenizer.decode(token_ids)

# Run every raw sequence through `augment_sequence` and keep the decoded text
# next to the original in a new column.
raw_sequences = data['simple_fasta']
data['augmented_sequence'] = raw_sequences.apply(augment_sequence)

# Step 4: Compute features like isoelectric point and aromaticity
from Bio.SeqUtils.ProtParam import ProteinAnalysis

def compute_features(sequence):
    """Return ``(isoelectric_point, aromaticity)`` for ``sequence``.

    Both values come from a single ``ProteinAnalysis`` instance built from the
    given sequence string; the isoelectric point is computed first, then the
    aromaticity, and the pair is returned as a tuple in that order.
    """
    protein = ProteinAnalysis(sequence)
    return protein.isoelectric_point(), protein.aromaticity()

# Features of the original sequences: unzip the per-row (pI, aromaticity) pairs
# into two column-length tuples, then store each as its own column.
original_pi, original_aromaticity = zip(*data['simple_fasta'].apply(compute_features))
data['isoelectric_point'] = original_pi
data['aromaticity'] = original_aromaticity

# Same two features, computed on the decoded "augmented" strings.
decoded_pi, decoded_aromaticity = zip(*data['augmented_sequence'].apply(compute_features))
data['augmented_isoelectric_point'] = decoded_pi
data['augmented_aromaticity'] = decoded_aromaticity

# Step 5: Train a model before augmentation and evaluate performance
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Features and target variable
# Only the two features computed from the original sequences are used here.
X = data[['isoelectric_point', 'aromaticity']]
y = data['Tm']  # the 'Tm' column is the regression target

# Split the data
# Fixed random_state so the 80/20 partition is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
# NOTE: this rebinds `model`, which until now referred to the DistilBERT encoder
# used by `augment_sequence`; calling that function after this line will fail.
model = LinearRegression()
model.fit(X_train, y_train)

# Predict and evaluate
# Scores are computed on the held-out 20% only.
y_pred = model.predict(X_test)
mse_before = mean_squared_error(y_test, y_pred)
r2_before = r2_score(y_test, y_pred)

print(f"MSE before augmentation: {mse_before}")
print(f"R² before augmentation: {r2_before}")

# Step 6: Augment the data and retrain the model
#
# The "after" model is trained on the original training rows PLUS the rows
# derived from the augmented sequences, and it is scored on the same original
# hold-out (`X_test`, `y_test`) as the Step 5 baseline. Training and scoring
# only on the augmented feature columns would replace the data rather than
# augment it, and would make the before/after metrics incomparable.
augmented_X = data[['augmented_isoelectric_point', 'augmented_aromaticity']]
augmented_y = y

# Split the augmented data
# Same sample count, test_size and random_state as Step 5, so each row lands in
# the same partition; augmented copies of hold-out rows never reach training.
augmented_X_train, augmented_X_test, augmented_y_train, augmented_y_test = train_test_split(augmented_X, augmented_y, test_size=0.2, random_state=42)
assert augmented_X_train.index.equals(X_train.index), "augmented split must mirror the Step 5 split"

# Give the augmented features the baseline column names so the two frames
# stack into one training set with a single, consistent schema.
augmented_to_base_columns = {
    'augmented_isoelectric_point': 'isoelectric_point',
    'augmented_aromaticity': 'aromaticity',
}
combined_X_train = pd.concat(
    [X_train, augmented_X_train.rename(columns=augmented_to_base_columns)],
    ignore_index=True,
)
combined_y_train = pd.concat([y_train, augmented_y_train], ignore_index=True)

# Retrain the model
augmented_model = LinearRegression()
augmented_model.fit(combined_X_train, combined_y_train)

# Predict and evaluate after augmentation (on the original hold-out)
augmented_y_pred = augmented_model.predict(X_test)
mse_after = mean_squared_error(y_test, augmented_y_pred)
r2_after = r2_score(y_test, augmented_y_pred)

print(f"MSE after augmentation: {mse_after}")
print(f"R² after augmentation: {r2_after}")


Collecting biopython
  Downloading biopython-1.84-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downloading biopython-1.84-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m29.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: biopython
Successfully installed biopython-1.84
   Unnamed: 0       Protein ID  \
0           0  Q72HG4_TT_C1523   
1           1  Q745T7_TT_P0220   
2           2      Q72G97_recG   
3           3  Q745Z3_TT_P0162   
4           4  Q72HN7_TT_C1449   

                                        simple_fasta         Species      Tm  
0  MREVEPLAERLRPRSLDEVLGQPHLTGPKGLLRRMLEAGRLSSMVL...  T.thermophilus  74.007  
1  MRLDPNHPRPTLQRPGWRSLEGHWDFALSEAEAPGGVRFDRKILVP...  T.thermophilus  77.065  
2  MTWEELEERLARGQDERTLFLPQDISPEDLARYAAGLANHKGGTLF...  T.thermophilus  72.153  
3  MTKAKRTYEPWYWANAHTRLYMRRGYLLPGVSVEERVKEIAQRAEA...  T.

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

MSE before augmentation: 122.5715292796728
R² before augmentation: 0.0020051908700364773
MSE after augmentation: 123.10451859267947
R² after augmentation: -0.002334483855639169
