### Importing modules

In [100]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [101]:
import pandas as pd
import re
import xgboost as xgb
nltk.download('wordnet')
nltk.download('punkt')

from tqdm import tqdm
from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import word_tokenize, pos_tag
from nltk.corpus import inaugural, stopwords
from wordcloud import WordCloud, STOPWORDS
import lightgbm as lgb
from sklearn.metrics import f1_score, accuracy_score, classification_report, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [102]:
data = pd.read_csv(r'C:\Users\user\Videos\GeneExp\data\labelled_train_data.csv')

In [103]:
data.head()

Unnamed: 0.1,Unnamed: 0,geo_accession,gse_id,ctrl,pert,channel_count,characteristics_ch1,contact_address,contact_city,contact_country,...,extract_protocol_ch2,label_ch2,label_protocol_ch2,molecule_ch2,organism_ch2,source_name_ch2,taxid_ch2,treatment_protocol_ch2,biomaterial_provider_ch2,growth_protocol_ch2
0,0,GSM1617977,GSE66250,0,1,1,facs sorting: CD44low/CD24high,Am Hubland,Wuerzburg,Germany,...,,,,,,,,,,
1,1,GSM1617983,GSE66250,0,1,1,facs sorting: Unsorted,Am Hubland,Wuerzburg,Germany,...,,,,,,,,,,
2,2,GSM1617982,GSE66250,1,0,1,facs sorting: CD44low/CD24high,Am Hubland,Wuerzburg,Germany,...,,,,,,,,,,
3,3,GSM1617975,GSE66250,0,1,1,facs sorting: CD44high/CD24low,Am Hubland,Wuerzburg,Germany,...,,,,,,,,,,
4,0,GSM1267968,GSE52505,0,1,1,tissue: human nasal polyp,"148, Gurodong-ro, Guro-gu",Seoul,South Korea,...,,,,,,,,,,


In [104]:
# Describing the dataset
print("The shape of the dataset is:", data.shape)
print("The columns in the dataset are:", data.columns.tolist())
print("The information about the dataset is:", data.info())
print("The statistical summary of the dataset is:", data.describe())
# To check for missing values
print("Missing values in each column:\n", data.isnull().sum())
# To check for duplicate rows
print("Number of duplicate rows:", data.duplicated().sum())
# To check the distribution of the target variable
target = ['ctrl', 'pert']
print("Distribution of the target variable:\n", data[target].value_counts().reset_index(drop=True))

The shape of the dataset is: (623, 53)
The columns in the dataset are: ['Unnamed: 0', 'geo_accession', 'gse_id', 'ctrl', 'pert', 'channel_count', 'characteristics_ch1', 'contact_address', 'contact_city', 'contact_country', 'contact_department', 'contact_email', 'contact_institute', 'contact_name', 'contact_state', 'data_processing', 'data_row_count', 'description', 'extract_protocol_ch1', 'growth_protocol_ch1', 'hyb_protocol', 'label_ch1', 'label_protocol_ch1', 'last_update_date', 'molecule_ch1', 'organism_ch1', 'platform_id', 'scan_protocol', 'source_name_ch1', 'status', 'submission_date', 'supplementary_file', 'taxid_ch1', 'title', 'treatment_protocol_ch1', 'type', 'contact_phone', 'contact_laboratory', 'relation', 'contact_fax', 'biomaterial_provider_ch1', 'contact_web_link', 'characteristics_ch2', 'extract_protocol_ch2', 'label_ch2', 'label_protocol_ch2', 'molecule_ch2', 'organism_ch2', 'source_name_ch2', 'taxid_ch2', 'treatment_protocol_ch2', 'biomaterial_provider_ch2', 'growth_pr

In [105]:
# Copy of the original data
data_copy = data.copy()
data_copy.head()

Unnamed: 0.1,Unnamed: 0,geo_accession,gse_id,ctrl,pert,channel_count,characteristics_ch1,contact_address,contact_city,contact_country,...,extract_protocol_ch2,label_ch2,label_protocol_ch2,molecule_ch2,organism_ch2,source_name_ch2,taxid_ch2,treatment_protocol_ch2,biomaterial_provider_ch2,growth_protocol_ch2
0,0,GSM1617977,GSE66250,0,1,1,facs sorting: CD44low/CD24high,Am Hubland,Wuerzburg,Germany,...,,,,,,,,,,
1,1,GSM1617983,GSE66250,0,1,1,facs sorting: Unsorted,Am Hubland,Wuerzburg,Germany,...,,,,,,,,,,
2,2,GSM1617982,GSE66250,1,0,1,facs sorting: CD44low/CD24high,Am Hubland,Wuerzburg,Germany,...,,,,,,,,,,
3,3,GSM1617975,GSE66250,0,1,1,facs sorting: CD44high/CD24low,Am Hubland,Wuerzburg,Germany,...,,,,,,,,,,
4,0,GSM1267968,GSE52505,0,1,1,tissue: human nasal polyp,"148, Gurodong-ro, Guro-gu",Seoul,South Korea,...,,,,,,,,,,


In [106]:
# Drop unnecessary columns without data entries
cols = ['contact_phone', 'contact_laboratory', 'relation', 'contact_fax', 'biomaterial_provider_ch1', 'contact_web_link', 'characteristics_ch2', 'extract_protocol_ch2', 'label_ch2', 'label_protocol_ch2', 'molecule_ch2', 'organism_ch2', 'source_name_ch2', 'taxid_ch2', 'treatment_protocol_ch2', 'biomaterial_provider_ch2', 'growth_protocol_ch2', 'Unnamed: 0', 'contact_address', 'contact_city', 'contact_email', 'contact_institute', 'contact_state', 'supplementary_file', 'status']
data_df1 = data_copy.drop(columns=cols, axis=1)
print("Snippet of the cleaned data:\n", data_df1.head())


Snippet of the cleaned data:
   geo_accession    gse_id  ctrl  pert  channel_count  \
0    GSM1617977  GSE66250     0     1              1   
1    GSM1617983  GSE66250     0     1              1   
2    GSM1617982  GSE66250     1     0              1   
3    GSM1617975  GSE66250     0     1              1   
4    GSM1267968  GSE52505     0     1              1   

              characteristics_ch1 contact_country  \
0  facs sorting: CD44low/CD24high         Germany   
1          facs sorting: Unsorted         Germany   
2  facs sorting: CD44low/CD24high         Germany   
3  facs sorting: CD44high/CD24low         Germany   
4       tissue: human nasal polyp     South Korea   

                             contact_department    contact_name  \
0  Chair for Biochemistry and Molecular Biology  Martin,,Eilers   
1  Chair for Biochemistry and Molecular Biology  Martin,,Eilers   
2  Chair for Biochemistry and Molecular Biology  Martin,,Eilers   
3  Chair for Biochemistry and Molecular Biolog

In [107]:
data_df1.columns
data_df1.shape


(623, 28)

In [108]:
# # Drop columns with more than 50% missing values
# threshold = len(data) * 0.5
# data = data.loc[:, data.isnull().sum() < threshold]

threshold_null = data_df1.shape[0] // 2

cols_drop = []
for col in data_df1.columns:
    if data_df1.loc[ : , col].isnull().sum() > threshold_null:
        cols_drop.append(col)
        print(cols_drop)
data_df1.drop(columns=cols_drop, axis=1, inplace=True)
data_df1.info()

['growth_protocol_ch1']
['growth_protocol_ch1', 'treatment_protocol_ch1']
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 623 entries, 0 to 622
Data columns (total 26 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   geo_accession         623 non-null    object
 1   gse_id                623 non-null    object
 2   ctrl                  623 non-null    int64 
 3   pert                  623 non-null    int64 
 4   channel_count         623 non-null    int64 
 5   characteristics_ch1   623 non-null    object
 6   contact_country       623 non-null    object
 7   contact_department    503 non-null    object
 8   contact_name          623 non-null    object
 9   data_processing       623 non-null    object
 10  data_row_count        623 non-null    int64 
 11  description           476 non-null    object
 12  extract_protocol_ch1  623 non-null    object
 13  hyb_protocol          513 non-null    object
 14  label_ch1       

In [109]:
#  Detecting more object columns
obj_cols = [ col for col in data_df1.columns if data_df1[col].dtype == 'object']
print("Object columns in the dataset:", obj_cols)
data_df1[obj_cols].head()

Object columns in the dataset: ['geo_accession', 'gse_id', 'characteristics_ch1', 'contact_country', 'contact_department', 'contact_name', 'data_processing', 'description', 'extract_protocol_ch1', 'hyb_protocol', 'label_ch1', 'label_protocol_ch1', 'last_update_date', 'molecule_ch1', 'organism_ch1', 'platform_id', 'scan_protocol', 'source_name_ch1', 'submission_date', 'title', 'type']


Unnamed: 0,geo_accession,gse_id,characteristics_ch1,contact_country,contact_department,contact_name,data_processing,description,extract_protocol_ch1,hyb_protocol,...,label_protocol_ch1,last_update_date,molecule_ch1,organism_ch1,platform_id,scan_protocol,source_name_ch1,submission_date,title,type
0,GSM1617977,GSE66250,facs sorting: CD44low/CD24high,Germany,Chair for Biochemistry and Molecular Biology,"Martin,,Eilers",Basecalling was performed with the real time a...,cd44high_dox_vs_etoh.txt and cd44high_etoh_vs_...,Total RNA was extracted using the RNeasy Mini ...,,...,,May 15 2019,polyA RNA,Homo sapiens,GPL10999,,Sorted HMLE cell line,Feb 24 2015,rep1_cd44low_dox,SRA
1,GSM1617983,GSE66250,facs sorting: Unsorted,Germany,Chair for Biochemistry and Molecular Biology,"Martin,,Eilers",Basecalling was performed with the real time a...,imecs_myc_dox_vs_vector_dox.txt,Total RNA was extracted using the RNeasy Mini ...,,...,,May 15 2019,polyA RNA,Homo sapiens,GPL10999,,IMEC cell line,Feb 24 2015,rep1_imecs_myc_dox,SRA
2,GSM1617982,GSE66250,facs sorting: CD44low/CD24high,Germany,Chair for Biochemistry and Molecular Biology,"Martin,,Eilers",Basecalling was performed with the real time a...,cd44high_dox_vs_etoh.txt and cd44high_etoh_vs_...,Total RNA was extracted using the RNeasy Mini ...,,...,,May 15 2019,polyA RNA,Homo sapiens,GPL10999,,Sorted HMLE cell line,Feb 24 2015,rep2_cd44low_etoh,SRA
3,GSM1617975,GSE66250,facs sorting: CD44high/CD24low,Germany,Chair for Biochemistry and Molecular Biology,"Martin,,Eilers",Basecalling was performed with the real time a...,cd44high_dox_vs_etoh.txt and cd44high_etoh_vs_...,Total RNA was extracted using the RNeasy Mini ...,,...,,May 15 2019,polyA RNA,Homo sapiens,GPL10999,,Sorted HMLE cell line,Feb 24 2015,rep1_cd44high_dox,SRA
4,GSM1267968,GSE52505,tissue: human nasal polyp,South Korea,Department of otorhinolaryngology,"Heung-Man,,Lee",All data normalization and selection of fold-c...,Gene expression of LPS-stimulated nasal fibrob...,Nasal fibroblasts were exposed to LPS (10 μg/m...,"After checking labeling efficiency, fragmentat...",...,Amplified and labeled cRNA was purified on cRN...,Dec 31 2015,total RNA,Homo sapiens,GPL13497,The hybridized images were scanned using Agile...,nasal polyp,Nov 19 2013,lipopolysaccharide treated sample replication 2,RNA


In [110]:
# Dropping some object columns
obj_cols_drop = ['data_processing', 'description', 'platform_id', 'title', 'submission_date', 'hyb_protocol', 'label_protocol_ch1', 'scan_protocol', 'contact_department', 'contact_country','type']
data_df1.drop(columns=obj_cols_drop, axis=1, inplace=True)
data_df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 623 entries, 0 to 622
Data columns (total 15 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   geo_accession         623 non-null    object
 1   gse_id                623 non-null    object
 2   ctrl                  623 non-null    int64 
 3   pert                  623 non-null    int64 
 4   channel_count         623 non-null    int64 
 5   characteristics_ch1   623 non-null    object
 6   contact_name          623 non-null    object
 7   data_row_count        623 non-null    int64 
 8   extract_protocol_ch1  623 non-null    object
 9   label_ch1             513 non-null    object
 10  last_update_date      623 non-null    object
 11  molecule_ch1          623 non-null    object
 12  organism_ch1          623 non-null    object
 13  source_name_ch1       623 non-null    object
 14  taxid_ch1             623 non-null    int64 
dtypes: int64(5), object(10)
memory usage: 73

In [111]:
# Prep data for modeling
y = data_df1['ctrl']
X = data_df1.drop(columns=['ctrl', 'pert', 'gse_id', 'geo_accession'], axis=1)

In [112]:
obj = [cols for cols in data_df1.columns if data_df1[cols].dtype == 'object']
obj.remove('geo_accession')
obj.remove('gse_id')
obj.remove('contact_name')
obj.remove('last_update_date')
# obj.remove('label_protocol_ch1')
# obj.remove('hyb_protocol')
print(obj)

['characteristics_ch1', 'extract_protocol_ch1', 'label_ch1', 'molecule_ch1', 'organism_ch1', 'source_name_ch1']


In [113]:
def preprocess(data_df):
    data_df1['cleaned'] = ''
    
    # Initializing Stopwords and Lemmatization objects
    stop_words = set(stopwords.words('english'))
    wordnet_lemm = WordNetLemmatizer()
    
    # Pattern to detect characters which are not alphabets or numbers so they can removed
    alpha_or_numeric = "[^a-zA-Z0-9- ]"
    
    for index, row in tqdm(data_df.iterrows(), total = data_df.shape[0]):
        sample = row['feature']
        
        # Replacing characters which are not alphabets or numbers with blank space and ...
        # ... changing text to lowercase. These two steps are for cleaning text data, ...
        # ... you can add more on top of this to make your data cleaner.
        pre_txt = re.sub(alpha_or_numeric, " ", sample)
        pre_txt = sample.lower()
        
        # Removing stop words and lemmatizing different words in preprocessed text ...
        # ... and making the final processed text
        sample_words = [wordnet_lemm.lemmatize(w) for w in pre_txt.split() \
            if w not in stop_words and len(w)>1]
        pre_proc_ver = ' '.join(sample_words)
        
        data_df.loc[index, 'cleaned'] = pre_proc_ver
    return data_df

In [114]:
df_text = data_df1.loc[ : , obj]
df_text['feature'] = data_df1.apply(lambda row: ' '.join([str(row[i]) for i in obj]), axis=1)
df_text = preprocess(pd.DataFrame(df_text))

# Removing the Unnecessary Features
df_text = pd.DataFrame(df_text['cleaned'])
df_text.head()

  0%|          | 0/623 [00:00<?, ?it/s]

100%|██████████| 623/623 [00:00<00:00, 834.53it/s] 
100%|██████████| 623/623 [00:00<00:00, 834.53it/s] 


Unnamed: 0,cleaned
0,facs sorting: cd44low/cd24high total rna extra...
1,facs sorting: unsorted total rna extracted usi...
2,facs sorting: cd44low/cd24high total rna extra...
3,facs sorting: cd44high/cd24low total rna extra...
4,tissue: human nasal polyp nasal fibroblast exp...


In [115]:
# ... downloaded when importing libraries.
stop_words = set(stopwords.words('english'))
vect = TfidfVectorizer(analyzer="word", preprocessor=None, stop_words=list(stop_words), max_features=4000)
df_text = vect.fit_transform(df_text['cleaned'])
df = pd.DataFrame(df_text.toarray())
print(df.shape)

(623, 1093)


In [116]:
pca = PCA(n_components=.99, svd_solver='full')
df = pca.fit_transform(df)
print(df.shape)

(623, 161)


In [117]:
X_train, X_val, y_train, y_val = train_test_split(df, y, test_size=0.2, random_state=42)
print(X_train.shape, X_val.shape, y_train.shape, y_val.shape)

(498, 161) (125, 161) (498,) (125,)


In [118]:
models = {
        'logistic_regression': LogisticRegression(
            max_iter=1000, 
            random_state=42,
            class_weight='balanced'
        ),
        'random_forest': RandomForestClassifier(
            n_estimators=100,
            max_depth=10,
            random_state=42,
            class_weight='balanced',
            n_jobs=-1
        ),
        'gradient_boosting': GradientBoostingClassifier(
            n_estimators=100,
            learning_rate=0.1,
            max_depth=5,
            random_state=42
        ),
        'xgboost': xgb.XGBClassifier(
            n_estimators=100,
            learning_rate=0.1,
            max_depth=5,
            random_state=42,
            eval_metric='logloss'
        ),
        'lightgbm': lgb.LGBMClassifier(
            n_estimators=100,
            learning_rate=0.1,
            max_depth=5,
            random_state=42,
            verbose=-1
        )
    }

In [119]:
results = []

for name, model in models.items():
    print(f"\n{'='*60}")
    print(f"Training {name}...")
    print('='*60)
    
    # Train model
    model.fit(X_train, y_train)
    
    # Predictions
    y_train_pred = model.predict(X_train)
    y_val_pred = model.predict(X_val)
    
    # Metrics for this model
    train_f1 = f1_score(y_train, y_train_pred)
    val_f1 = f1_score(y_val, y_val_pred)
    train_acc = accuracy_score(y_train, y_train_pred)
    val_acc = accuracy_score(y_val, y_val_pred)
    conf_m = confusion_matrix(y_val, y_val_pred)
    class_report = classification_report(y_val, y_val_pred)
    
    # Print metrics for this model
    print(f"\n{name}:")
    print(f"  Training F1-Score:   {train_f1:.4f}")
    print(f"  Validation F1-Score: {val_f1:.4f}")
    print(f"  Training Accuracy:   {train_acc:.4f}")
    print(f"  Validation Accuracy: {val_acc:.4f}")
    print("  Confusion Matrix:\n", conf_m)
    print("  Classification Report:\n", class_report)
    
    # Store results
    results.append({
        'model': name,
        'train_f1': train_f1,
        'val_f1': val_f1,
        'train_acc': train_acc,
        'val_acc': val_acc,
        'classification_report': class_report
    })

# Display summary table
print(f"\n{'='*60}")
print("SUMMARY - ALL MODELS")
print('='*60)
results_df = pd.DataFrame(results)
print(results_df.drop(columns=['classification_report']).to_string(index=False))



Training logistic_regression...

logistic_regression:
  Training F1-Score:   0.9000
  Validation F1-Score: 0.6984
  Training Accuracy:   0.8996
  Validation Accuracy: 0.6960
  Confusion Matrix:
 [[43 22]
 [16 44]]
  Classification Report:
               precision    recall  f1-score   support

           0       0.73      0.66      0.69        65
           1       0.67      0.73      0.70        60

    accuracy                           0.70       125
   macro avg       0.70      0.70      0.70       125
weighted avg       0.70      0.70      0.70       125


Training random_forest...

random_forest:
  Training F1-Score:   0.9939
  Validation F1-Score: 0.7967
  Training Accuracy:   0.9940
  Validation Accuracy: 0.8000
  Confusion Matrix:
 [[51 14]
 [11 49]]
  Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.78      0.80        65
           1       0.78      0.82      0.80        60

    accuracy                           



In [120]:
# --- PSEUDO-LABELING: One iteration ---
# This cell: 1) loads unlabeled data, 2) preprocesses with your existing `vect` and `pca`,
# 3) predicts with the best model (selected from `results_df`), 4) keeps high-confidence
#    predictions as pseudo-labels, 5) saves them and 6) retrains the best model on the
#    augmented training set.

import os
import numpy as np
import pandas as pd

# Configuration
UNLABELED_FILE = r'C:\Users\user\Videos\GeneExp\data\unlabelled_train_data.csv'  # change if different
CONF_THRESH = 0.6
MAX_PSEUDO = 5000
PSEUDO_WEIGHT = 0.5
PSEUDO_OUTPUT = 'pseudo_labels.csv'

if not os.path.exists(UNLABELED_FILE):
    print(f"Unlabeled file not found: {UNLABELED_FILE}. Update the path and re-run.")
else:
    print(f"Loading unlabeled data: {UNLABELED_FILE}")
    df_unl = pd.read_csv(UNLABELED_FILE)

    # Build the same 'feature' used earlier: concatenate object columns in `obj`
    try:
        df_unl_features = df_unl[obj].astype(str).agg(' '.join, axis=1)
    except Exception as e:
        # Fallback: try building from columns present in training
        print("Warning building text features using `obj` failed:", e)
        common = [c for c in obj if c in df_unl.columns]
        if not common:
            raise RuntimeError("No overlapping feature columns found between labeled and unlabeled data.")
        df_unl_features = df_unl[common].astype(str).agg(' '.join, axis=1)

    df_unl_feat = pd.DataFrame({'feature': df_unl_features})

    # Reuse the same text preprocessing function used earlier
    df_unl_proc = preprocess(df_unl_feat)
    docs = df_unl_proc['cleaned'].astype(str)

    # Vectorize and project with existing transformers
    X_unl_tfidf = vect.transform(docs)
    X_unl_array = X_unl_tfidf.toarray()
    X_unl = pca.transform(X_unl_array)

    print(f"Unlabeled features prepared: {X_unl.shape}")

    # Select best model from results_df if available
    try:
        best_name = results_df.sort_values('val_f1', ascending=False).iloc[0]['model']
        print(f"Using best model from results: {best_name}")
    except Exception:
        best_name = list(models.keys())[0]
        print(f"Could not read results_df; defaulting to first model: {best_name}")

    best_model = models[best_name]

    # Predict probabilities (binary case expected)
    if hasattr(best_model, 'predict_proba'):
        proba = best_model.predict_proba(X_unl)
        if proba.ndim == 2 and proba.shape[1] == 2:
            pos_proba = proba[:, 1]
        else:
            pos_proba = proba.max(axis=1)
    else:
        print("Model has no predict_proba; using predict and accepting all pseudo-labels (not recommended).")
        preds_all = best_model.predict(X_unl)
        pos_proba = np.ones(len(preds_all))

    preds = best_model.predict(X_unl)

    accept_mask = pos_proba >= CONF_THRESH
    accepted_idx = np.where(accept_mask)[0]
    print(f"Total unlabeled samples: {len(X_unl)} | High-confidence (>= {CONF_THRESH}): {len(accepted_idx)}")

    if len(accepted_idx) == 0:
        print("No pseudo-labels passed the confidence threshold. Consider lowering the threshold.")
    else:
        if len(accepted_idx) > MAX_PSEUDO:
            accepted_idx = accepted_idx[:MAX_PSEUDO]
            print(f"Limiting pseudo-labels to first {MAX_PSEUDO} samples")

        X_pseudo = X_unl[accepted_idx]
        y_pseudo = preds[accepted_idx]

        # Save pseudo-labels (with original unlabeled metadata if present)
        df_pseudo = df_unl.loc[df_unl.index[accepted_idx]].copy()
        df_pseudo['pseudo_label'] = y_pseudo
        df_pseudo['pseudo_proba'] = pos_proba[accepted_idx]
        df_pseudo.to_csv(PSEUDO_OUTPUT, index=False)
        print(f"Saved {len(accepted_idx)} pseudo-labeled samples to: {PSEUDO_OUTPUT}")

        # Augment training data
        X_train_arr = np.asarray(X_train)
        y_train_arr = np.asarray(y_train)

        X_aug = np.vstack([X_train_arr, X_pseudo])
        y_aug = np.concatenate([y_train_arr, y_pseudo])

        # Sample weights (original=1, pseudo=<1) - used if model.fit supports sample_weight
        sample_weight = np.concatenate([np.ones(len(X_train_arr)), np.full(len(X_pseudo), PSEUDO_WEIGHT)])

        # Retrain best_model on augmented data (try with sample_weight, fallback to without)
        print(f"Retraining {best_name} on augmented data (orig={len(X_train_arr)}, pseudo={len(X_pseudo)})...")
        try:
            best_model.fit(X_aug, y_aug, sample_weight=sample_weight)
            print("Retrained with sample_weight")
        except TypeError:
            best_model.fit(X_aug, y_aug)
            print("Retrained without sample_weight (estimator does not support it)")

        # Evaluate on validation set
        y_val_pred_new = best_model.predict(X_val)
        new_val_f1 = f1_score(y_val, y_val_pred_new)
        new_val_acc = accuracy_score(y_val, y_val_pred_new)
        print(f"Validation F1 after pseudo-label retrain: {new_val_f1:.4f}")
        print(f"Validation Acc after pseudo-label retrain: {new_val_acc:.4f}")

        # Optionally update models dict with retrained model
        models[best_name] = best_model

        print("Pseudo-labeling iteration complete. Review saved pseudo-labels and validation metrics before repeating.")


Loading unlabeled data: C:\Users\user\Videos\GeneExp\data\unlabelled_train_data.csv


100%|██████████| 19456/19456 [00:12<00:00, 1548.42it/s]


Unlabeled features prepared: (19456, 161)
Using best model from results: random_forest
Total unlabeled samples: 19456 | High-confidence (>= 0.6): 320
Saved 320 pseudo-labeled samples to: pseudo_labels.csv
Retraining random_forest on augmented data (orig=498, pseudo=320)...
Retrained with sample_weight
Validation F1 after pseudo-label retrain: 0.7581
Validation Acc after pseudo-label retrain: 0.7600
Pseudo-labeling iteration complete. Review saved pseudo-labels and validation metrics before repeating.


## Next Steps: Model Saving & FastAPI Frontend
You will now save your retrained model and preprocessing pipeline, then scaffold a FastAPI backend for gene signature classification.

In [121]:
# Save retrained model and preprocessing pipeline for deployment
import joblib
joblib.dump(models[best_name], 'gene_signature_model.pkl')
joblib.dump(vect, 'tfidf_vectorizer.pkl')
joblib.dump(pca, 'pca_transformer.pkl')
print('Saved model, vectorizer, and PCA transformer for deployment.')

Saved model, vectorizer, and PCA transformer for deployment.


## FastAPI Backend: Gene Signature Classification API
The following code scaffolds a FastAPI backend that loads your saved model and preprocessing pipeline, and exposes a `/classify` endpoint for molecular sample classification.

In [127]:
# FastAPI backend for gene signature classification
from fastapi import FastAPI, UploadFile, File, Form
from fastapi.responses import JSONResponse
import joblib
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from tqdm import tqdm
import uvicorn
app = FastAPI()

# Load model and pipeline
model = joblib.load('gene_signature_model.pkl')
vect = joblib.load('tfidf_vectorizer.pkl')
pca = joblib.load('pca_transformer.pkl')

def preprocess_text(text):
    stop_words = set(stopwords.words('english'))
    wordnet_lemm = WordNetLemmatizer()
    alpha_or_numeric = "[^a-zA-Z0-9- ]"
    pre_txt = re.sub(alpha_or_numeric, " ", text)
    pre_txt = pre_txt.lower()
    sample_words = [wordnet_lemm.lemmatize(w) for w in pre_txt.split() if w not in stop_words and len(w)>1]
    return ' '.join(sample_words)

@app.post('/classify')
async def classify_sample(sample: str = Form(...)):
    cleaned = preprocess_text(sample)
    X_vec = vect.transform([cleaned])
    X_pca = pca.transform(X_vec.toarray())
    pred = model.predict(X_pca)[0]
    if hasattr(model, 'predict_proba'):
        conf = float(model.predict_proba(X_pca)[0][1])
    else:
        conf = None
    return JSONResponse({'predicted_signature': str(pred), 'confidence': conf})

# To run:
# uvicorn filename:app --reload

## Simple HTML Frontend for Gene Signature Classification
This HTML form lets users submit a molecular sample to your FastAPI backend and displays the predicted gene signature and confidence.

In [126]:
# Save this as index.html and open in your browser
html_code = '''
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>Gene Signature Classifier</title>
    <style>
        body { font-family: Arial, sans-serif; margin: 40px; background: #f7f7f7; }
        .container { max-width: 500px; margin: auto; background: #fff; padding: 30px; border-radius: 8px; box-shadow: 0 2px 8px #ccc; }
        h2 { text-align: center; }
        textarea { width: 100%; height: 100px; margin-bottom: 15px; }
        button { padding: 10px 20px; background: #007bff; color: #fff; border: none; border-radius: 4px; cursor: pointer; }
        .result { margin-top: 20px; padding: 15px; background: #e9ffe9; border-radius: 6px; }
    </style>
</head>
<body>
    <div class="container">
        <h2>Gene Signature Classifier</h2>
        <form id="classifyForm">
            <label for="sample">Paste molecular sample (text):</label><br>
            <textarea id="sample" name="sample" required></textarea><br>
            <button type="submit">Classify</button>
        </form>
        <div id="result" class="result" style="display:none;"></div>
    </div>
    <script>
        document.getElementById('classifyForm').onsubmit = async function(e) {
            e.preventDefault();
            const sample = document.getElementById('sample').value;
            const formData = new FormData();
            formData.append('sample', sample);
            const response = await fetch('http://127.0.0.1:8000/classify', {
                method: 'POST',
                body: formData
            });
            const data = await response.json();
            document.getElementById('result').style.display = 'block';
            document.getElementById('result').innerHTML = 
                `<b>Predicted Signature:</b> ${data.predicted_signature} <br>` +
                `<b>Confidence:</b> ${data.confidence !== null ? data.confidence.toFixed(3) : 'N/A'}`;
        };
    </script>
</body>
</html>
'''
with open('index.html', 'w', encoding='utf-8') as f:
    f.write(html_code)
print('HTML frontend code saved as index.html')

HTML frontend code saved as index.html
