In [1]:
import pandas as pd
import numpy as np
import random
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import lightgbm as lgb

# ------------------------
# Step 1: Create Dummy Dataset
# ------------------------
def random_dna_seq(length=20):
    return ''.join(random.choice("ATGC") for _ in range(length))

np.random.seed(42)
data = pd.DataFrame({
    "sequence": [random_dna_seq(20) for _ in range(200)],   # 200 dummy sgRNAs
    "efficiency": np.random.rand(200)                      # random efficiency values
})

print("Dummy CRISPR dataset created")
print(data.head())

# ------------------------
# Step 2: Feature Engineering
# ------------------------
def featurize(seq):
    """Extract simple nucleotide features"""
    features = {
        "A_count": seq.count("A"),
        "T_count": seq.count("T"),
        "G_count": seq.count("G"),
        "C_count": seq.count("C"),
        "GC_content": (seq.count("G") + seq.count("C")) / len(seq)
    }
    return features

features = pd.DataFrame([featurize(seq) for seq in data["sequence"]])
dataset = pd.concat([features, data["efficiency"]], axis=1)

# ------------------------
# Step 3: Train-Test Split
# ------------------------
X = dataset.drop("efficiency", axis=1)
y = dataset["efficiency"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
# Step 4: Train LightGBM Model
# ------------------------
train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

params = {
    "objective": "regression",
    "metric": "rmse",
    "verbosity": -1
}

# Fix: Use callbacks parameter for early stopping instead of early_stopping_rounds
model = lgb.train(
    params, 
    train_data, 
    valid_sets=[test_data], 
    num_boost_round=50,
    callbacks=[lgb.early_stopping(stopping_rounds=10)]  # Use callbacks instead of early_stopping_rounds
)

# If the above doesn't work with your LightGBM version, try this alternative:
# model = lgb.train(
#     params, 
#     train_data, 
#     valid_sets=[test_data], 
#     num_boost_round=50
# )

Dummy CRISPR dataset created
               sequence  efficiency
0  CTAATTATCGGTGTGTAGTA    0.374540
1  GCCTGGGATCGGTTAGGTTG    0.950714
2  CGGTCCCAAGAAAGATCAAT    0.731994
3  TAGATTGGACTAACGGCCAC    0.598658
4  AGACGTGCGGCTCTGAAAAG    0.156019
Training until validation scores don't improve for 10 rounds
Did not meet early stopping. Best iteration is:
[50]	valid_0's rmse: 0.282162
