In [None]:
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from collections import Counter
from sklearn.decomposition import PCA

# Step 1: Load the merged eDNA spreadsheet.
df = pd.read_excel(
    '/kaggle/input/edna/edna new merged data.xlsx',
    engine='openpyxl',
)

# Strip stray whitespace from the header cells before any lookup by column name.
df.columns = df.columns.str.strip()
print("Columns in dataset:", df.columns.tolist())

# Step 2: Fail fast when the label column is absent.
if 'scientificName' not in df.columns:
    raise ValueError("Error: 'scientificName' column not found in the dataset.")

# Columns that are dropped from the feature frame (the target itself plus
# identifier / free-text / primer / date / locality fields).
exclude_cols = [
    'scientificName',
    'id',
    'occurrenceID',
    'eventID',
    'fieldNumber',
    'verbatimIdentification',
    'pcr_primer_forward',
    'pcr_primer_reverse',
    'pcr_primer_reference',
    'eventDate',
    'locality',
]
y = df['scientificName']
X = df.drop(columns=exclude_cols)

# Step 3: Keep only the classes that have at least `min_class_size` rows.
min_class_size = 3
y_counts = y.value_counts()
valid_classes = y_counts.loc[y_counts >= min_class_size].index
is_valid_row = y.isin(valid_classes)
X_filtered = X.loc[is_valid_row].copy()  # copy so later column writes never touch `X`
y_filtered = y.loc[is_valid_row]
# The label says "Original", but the counts are taken after the rare-class filter.
print("Original class distribution:", Counter(y_filtered))

# Step 4: Preprocess features
# Column groups used by the rest of the pipeline: numeric measurements, categorical
# descriptors (one-hot encoded later) and the raw sequence text (turned into k-mers later).
numerical_cols = ['organismQuantity', 'sampleSizeValue', 'decimalLatitude', 'decimalLongitude']
categorical_cols = ['env_broad_scale', 'lib_layout', 'target_gene', 'seq_meth',
                    'pcr_primer_name_forward', 'pcr_primer_name_reverse', 'basisOfRecord',
                    'organismQuantityType', 'occurrenceStatus', 'sampleSizeUnit',
                    'country', 'geodeticDatum']
text_cols = ['DNA_sequence']

# Handle missing values: column mean for numerics, the literal 'Unknown' for categoricals,
# and the empty string for the sequence text.
#
# Whole-column assignment (`frame[cols] = ...`) is used instead of `frame.loc[:, cols] = ...`.
# The `.loc` form writes into the existing columns, so filling 'Unknown' into a column that was
# read as numeric (e.g. an entirely empty categorical column) is an incompatible-dtype write,
# which pandas >= 2.1 deprecates. Replacing the columns outright lets each one take the dtype
# of its filled values.
X_filtered[numerical_cols] = X_filtered[numerical_cols].fillna(X_filtered[numerical_cols].mean())
X_filtered[categorical_cols] = X_filtered[categorical_cols].fillna('Unknown')
X_filtered[text_cols] = X_filtered[text_cols].fillna('')

# K-mer extraction
def extract_kmers(sequence, k=3):
    """Count every overlapping substring of length ``k`` in ``sequence``.

    Parameters
    ----------
    sequence : object
        The sequence text. Non-string values are converted with ``str``.
        ``None``, NaN, the empty string and other falsy values are treated
        as "no sequence".
    k : int, default 3
        Length of each k-mer; must be at least 1.

    Returns
    -------
    collections.Counter
        Mapping of k-mer -> number of occurrences. Empty when the sequence
        is missing or shorter than ``k``.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be a positive integer")

    # NaN is truthy, so a plain truthiness test would let it through and
    # str() would turn it into the text "nan", which then yields a bogus k-mer.
    is_nan = isinstance(sequence, float) and sequence != sequence
    if is_nan or not sequence:
        return Counter()

    sequence = str(sequence)
    # range() is empty when the sequence is shorter than k, so no clamp is needed.
    return Counter(sequence[i:i + k] for i in range(len(sequence) - k + 1))

# Build the k-mer count matrix: one row per sample, one column per distinct k-mer.
# A k-mer that never occurs in a given sequence is absent from its Counter, hence fillna(0).
kmer_features = X_filtered['DNA_sequence'].apply(extract_kmers)
kmer_df = pd.DataFrame(kmer_features.tolist(), index=X_filtered.index).fillna(0)

# PCA on k-mers
# PCA cannot return more components than min(n_samples, n_features) and raises otherwise.
# Clamp the requested 50 so that a small filtered dataset, or one with fewer than 50
# distinct k-mers, still runs.
n_kmer_components = min(50, kmer_df.shape[0], kmer_df.shape[1])
pca = PCA(n_components=n_kmer_components, random_state=42)
kmer_reduced = pca.fit_transform(kmer_df)
kmer_columns = [f'kmer_pca_{i}' for i in range(kmer_reduced.shape[1])]
# Replace the raw counts with the reduced representation, keeping the original row index
# so the frame can be concatenated with the other features.
kmer_df = pd.DataFrame(kmer_reduced, index=kmer_df.index, columns=kmer_columns)

# Combine features: numeric + categorical columns side by side with the k-mer PCA components,
# then one-hot encode the categoricals (first level of each dropped).
# Note: `X_filtered` is overwritten here, so this step cannot be re-run on its own output.
tabular_part = X_filtered[numerical_cols + categorical_cols]
X_filtered = pd.get_dummies(
    pd.concat([tabular_part, kmer_df], axis=1),
    columns=categorical_cols,
    drop_first=True,
)
features = X_filtered.columns.tolist()

# Step 5: SMOTE
# NOTE(review): the resampled data is later handed to cross_val_score. Because resampling
# happens before the folds are split, synthetic rows built from a fold's held-out samples
# can end up in that fold's training part, so the cross-validation scores are likely
# optimistic. Consider resampling inside each fold instead.
# NOTE(review): plain SMOTE interpolates the one-hot columns as if they were continuous;
# SMOTENC may be the better fit here. Confirm.
smote = SMOTE(k_neighbors=2, random_state=42)
X_res, y_res = smote.fit_resample(X_filtered, y_filtered)
resampled_counts = Counter(y_res)
print("Resampled class distribution:", resampled_counts)

# Step 6: Encode labels as integers (the encoder is kept so class names can be recovered).
label_encoder = LabelEncoder()
label_encoder.fit(y_res)
y_res_encoded = label_encoder.transform(y_res)

# Step 7: Train model
rf_params = dict(
    n_estimators=200,
    max_depth=10,
    min_samples_split=10,
    max_features='sqrt',
    class_weight='balanced',
    random_state=42,
)
clf = RandomForestClassifier(**rf_params)

# 5-fold cross-validation on the resampled data.
cross_val_scores = cross_val_score(clf, X_res, y_res_encoded, cv=5)
mean_cv_score = cross_val_scores.mean()
print(f"Cross-validation accuracy scores: {cross_val_scores}")
print(f"Mean cross-validation accuracy: {mean_cv_score}")

# Refit on everything and report.
# NOTE(review): the predictions below are made on the same rows the model was just fitted on,
# so this report describes training fit, not generalisation.
clf.fit(X_res, y_res_encoded)
y_pred = clf.predict(X_res)
report = classification_report(y_res_encoded, y_pred, target_names=label_encoder.classes_)
print("Classification Report:")
print(report)


In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder

# Integer-encode the resampled labels (a fresh encoder, fitted on y_res).
label_encoder = LabelEncoder()
label_encoder.fit(y_res)
y_res_encoded = label_encoder.transform(y_res)

# Gradient boosting configuration.
gb_params = dict(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=3,
    random_state=42,
)
clf = GradientBoostingClassifier(**gb_params)

# 5-fold cross-validation, then show the per-fold accuracies and their mean.
cross_val_scores = cross_val_score(clf, X_res, y_res_encoded, cv=5)
mean_cv_score = cross_val_scores.mean()
print(f"Cross-validation accuracy scores: {cross_val_scores}")
print(f"Mean cross-validation accuracy: {mean_cv_score}")

# Fit on the full resampled set; the report is computed on those same rows.
clf.fit(X_res, y_res_encoded)
y_pred = clf.predict(X_res)
report = classification_report(y_res_encoded, y_pred, target_names=label_encoder.classes_)
print("Classification Report:")
print(report)


In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from xgboost import XGBClassifier

# Encode the resampled labels as integers 0..n_classes-1 for XGBoost.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y_res)  # Using y_res after resampling

# Use XGBoost Classifier
# Two arguments from the earlier version are intentionally not passed:
#   * use_label_encoder - deprecated and ignored by current XGBoost releases; the labels are
#     already integer-encoded above, so nothing is lost.
#   * scale_pos_weight  - only re-weights the positive class of a binary objective, and 1 is
#     its default, so it had no effect on this multi-class model.
clf = XGBClassifier(
    objective="multi:softmax",
    eval_metric="mlogloss",
    random_state=42
)

# Fit the model
clf.fit(X_res, y_encoded)

# Make predictions (on the training rows, so the report below reflects training fit).
y_pred = clf.predict(X_res)

# Print classification report, labelled with the original class names so it reads the same
# way as the reports produced for the other models.
print("Classification Report:")
print(classification_report(y_encoded, y_pred, target_names=label_encoder.classes_))

# Position i in this array is the class encoded as integer i.
print("Class labels:", label_encoder.classes_)


In [None]:
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report

# LightGBM settings gathered in one mapping; verbose=-1 silences the training messages.
lgbm_params = dict(
    boosting_type='gbdt',
    objective='multiclass',
    num_class=len(set(y_res)),  # number of distinct labels in the resampled target
    class_weight='balanced',
    n_jobs=-1,  # use every available core
    random_state=42,
    verbose=-1,
)
clf = LGBMClassifier(**lgbm_params)

# Train on the resampled data, then predict those same rows.
clf.fit(X_res, y_res)
y_pred = clf.predict(X_res)

# Per-class precision / recall / F1 for the predictions above.
report = classification_report(y_res, y_pred)
print("Classification Report:")
print(report)


In [None]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import cross_val_score

# Extra-Trees ensemble: balanced class weights, fixed seed, all cores.
et_params = dict(class_weight='balanced', n_jobs=-1, random_state=42)
clf = ExtraTreesClassifier(**et_params)

# 5-fold cross-validation on the resampled data.
cross_val_scores = cross_val_score(clf, X_res, y_res, cv=5)
mean_cv_score = cross_val_scores.mean()
print(f"Cross-validation accuracy scores: {cross_val_scores}")
print(f"Mean cross-validation accuracy: {mean_cv_score}")

# Refit on everything; the report is computed on the rows used for fitting.
clf.fit(X_res, y_res)
y_pred = clf.predict(X_res)
report = classification_report(y_res, y_pred)
print("Classification Report:")
print(report)

In [None]:
from sklearn.ensemble import HistGradientBoostingClassifier

# Histogram-based gradient boosting with balanced class weights and a fixed seed.
hgb_params = dict(class_weight='balanced', random_state=42)
clf = HistGradientBoostingClassifier(**hgb_params)

# 5-fold cross-validation on the resampled data.
cross_val_scores = cross_val_score(clf, X_res, y_res, cv=5)
mean_cv_score = cross_val_scores.mean()
print(f"Cross-validation accuracy scores: {cross_val_scores}")
print(f"Mean cross-validation accuracy: {mean_cv_score}")

# Refit on everything; the report is computed on the rows used for fitting.
clf.fit(X_res, y_res)
y_pred = clf.predict(X_res)
report = classification_report(y_res, y_pred)
print("Classification Report:")
print(report)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Use K-Nearest Neighbors Classifier
# KNN ranks neighbours by raw distance, so a feature with a large numeric range would
# dominate the others. The feature matrix mixes quantities, coordinates, PCA components
# and one-hot indicators, so every column is standardised first. The scaler sits inside
# the pipeline, which means it is re-fitted on the training part of each CV fold.
clf = make_pipeline(
    StandardScaler(),
    KNeighborsClassifier(
        n_neighbors=5,
        weights='distance'  # Uses distance-weighted neighbors
    ),
)

# Perform cross-validation
cross_val_scores = cross_val_score(clf, X_res, y_res, cv=5)

# Print cross-validation accuracy scores
print(f"Cross-validation accuracy scores: {cross_val_scores}")
print(f"Mean cross-validation accuracy: {cross_val_scores.mean()}")

# Refit on everything; the report is computed on the rows used for fitting. With
# distance weighting each training row is its own zero-distance neighbour, so this
# report is expected to look near-perfect regardless of real accuracy.
clf.fit(X_res, y_res)
y_pred = clf.predict(X_res)
print("Classification Report:")
print(classification_report(y_res, y_pred))

In [None]:
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Use Voting Classifier with different models
# Logistic regression and the RBF SVM are both sensitive to feature scale, so each is
# wrapped in its own pipeline with a StandardScaler (fitted only on the data the member
# is trained on). The tree-based gradient boosting member does not need scaling.
# max_iter is raised from the default 100 so the logistic regression solver has room to
# converge on this multi-class problem.
clf = VotingClassifier(
    estimators=[
        ('lr', make_pipeline(
            StandardScaler(),
            LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42),
        )),
        ('gb', GradientBoostingClassifier(random_state=42)),
        ('svc', make_pipeline(
            StandardScaler(),
            SVC(kernel='rbf', probability=True, random_state=42),  # probability=True is required for soft voting
        )),
    ],
    voting='soft'
)

# Perform cross-validation
cross_val_scores = cross_val_score(clf, X_res, y_res, cv=5)

# Print cross-validation accuracy scores
print(f"Cross-validation accuracy scores: {cross_val_scores}")
print(f"Mean cross-validation accuracy: {cross_val_scores.mean()}")

# Refit on everything; the report is computed on the rows used for fitting.
clf.fit(X_res, y_res)
y_pred = clf.predict(X_res)
print("Classification Report:")
print(classification_report(y_res, y_pred))