In [None]:
import re
import spacy
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Download necessary NLTK data
# 'stopwords' backs the stop-word filter inside extract_gene_entities.
# NOTE(review): no code visible in this cell calls an NLTK tokenizer, so the
# 'punkt' download may be unnecessary - confirm before removing.
nltk.download('stopwords')
nltk.download('punkt')

# Load the English language model
# `nlp` is the shared spaCy pipeline used by preprocess_text and
# extract_gene_entities below.
nlp = spacy.load("en_core_web_sm")

# Sample biotechnology text (replace with your actual dataset)
# Each sentence is paired by position with the entry at the same index in `labels`.
biotech_texts = [
    "The BRCA1 gene is associated with breast cancer risk.",
    "TP53 mutations are common in many types of cancer.",
    "The insulin gene (INS) is responsible for producing insulin.",
    "EGFR overexpression is observed in some lung cancers.",
]

# Sample labels (replace with your actual labels)
# One label per sentence above: three "cancer" examples and one "metabolic".
labels = ["cancer", "cancer", "metabolic", "cancer"]

# Preprocess the text
def preprocess_text(text):
    """Reduce *text* to a space-separated string of lowercase word tokens.

    The text is parsed with the module-level spaCy pipeline ``nlp``. A token
    is kept only if spaCy does not flag it as a stop word and its ``is_alpha``
    flag is set; kept tokens are lowercased and joined with single spaces.

    Args:
        text: Raw input string.

    Returns:
        The cleaned string (empty if no token survives the filters).
    """
    kept = []
    for tok in nlp(text):
        # Skip stop words and anything that is not purely alphabetic.
        if tok.is_stop or not tok.is_alpha:
            continue
        kept.append(tok.text.lower())
    return " ".join(kept)

# Clean every sample sentence with the spaCy-based preprocessor
processed_texts = list(map(preprocess_text, biotech_texts))

# Hold out 20% of the samples for evaluation; the fixed seed keeps the split stable
X_train, X_test, y_train, y_test = train_test_split(
    processed_texts,
    labels,
    test_size=0.2,
    random_state=42,
)

# Learn the TF-IDF vocabulary on the training split only, then reuse it for the test split
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Fit a multinomial Naive Bayes model on the training features
clf = MultinomialNB().fit(X_train_tfidf, y_train)

# Predict the held-out split
y_pred = clf.predict(X_test_tfidf)

# Summarise precision / recall / F1 for the held-out split
print(classification_report(y_test, y_pred))

# Improved function to extract gene entities
def extract_gene_entities(text):
    """Return candidate gene symbols found in *text*.

    Candidates come from two sources that are merged and de-duplicated:

    1. Regular expressions for common gene-symbol shapes.
    2. Entities from the module-level spaCy pipeline ``nlp`` whose label is
       ``"GENE"`` or ``"PROTEIN"``.

    Regex candidates of two characters or fewer, or whose lowercase form is an
    NLTK English stop word, are discarded.

    Args:
        text: Raw input string.

    Returns:
        A list of unique candidate strings in order of first appearance
        (regex matches first, then spaCy entities).
    """
    # Rule-based patterns for gene names.
    # The mixed-case pattern requires at least one trailing digit: with
    # optional digits it matched every capitalised word ("Alzheimer",
    # "Studies", ...), producing false positives. The trade-off is that
    # mixed-case symbols without a digit are no longer picked up by the regex.
    gene_patterns = [
        r'\b[A-Z]+[0-9]*\b',        # Uppercase letters followed by optional numbers (BRCA1, APOE)
        r'\b[A-Z]+[a-z]+[0-9]+\b',  # Uppercase start, then lowercase, then numbers (Brca1)
        r'\b[a-z]+[0-9]+\b',        # Lowercase letters followed by numbers (p53)
    ]

    # Combine patterns into a single alternation and find all matches
    combined_pattern = '|'.join(gene_patterns)
    potential_genes = re.findall(combined_pattern, text)

    # Filter out common words and short sequences
    stop_words = set(stopwords.words('english'))
    filtered_genes = [
        gene for gene in potential_genes
        if len(gene) > 2 and gene.lower() not in stop_words
    ]

    # Use spaCy for additional entity recognition.
    # NOTE(review): the general-purpose English model loaded in this file is
    # not expected to emit GENE/PROTEIN labels, so this step likely only
    # contributes when `nlp` is a biomedical model - confirm.
    doc = nlp(text)
    spacy_genes = [ent.text for ent in doc.ents if ent.label_ in ["GENE", "PROTEIN"]]

    # dict.fromkeys removes duplicates while keeping first-seen order, so the
    # result is deterministic (a plain set gives an arbitrary order).
    all_genes = list(dict.fromkeys(filtered_genes + spacy_genes))

    return all_genes

# Function to classify a new text
def classify_text(text):
    """Predict the category label for one raw text.

    The text is cleaned with ``preprocess_text``, vectorised with the
    module-level fitted ``vectorizer`` and scored by the module-level
    classifier ``clf``.

    Args:
        text: Raw input string.

    Returns:
        The first (and only) element of the classifier's prediction.
    """
    # The vectorizer expects an iterable of documents, hence the one-item list.
    features = vectorizer.transform([preprocess_text(text)])
    return clf.predict(features)[0]

# Example usage
# Run both helpers on one unseen sentence: gene extraction works on the raw
# text, while classification preprocesses it internally first.
new_text = "The HER2 gene plays a role in breast cancer development. BRCA1 and BRCA2 are also important."
extracted_genes = extract_gene_entities(new_text)
classification = classify_text(new_text)

# Show the extracted candidates and the predicted label
print(f"Extracted genes: {extracted_genes}")
print(f"Classification: {classification}")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


              precision    recall  f1-score   support

      cancer       1.00      1.00      1.00         1

    accuracy                           1.00         1
   macro avg       1.00      1.00      1.00         1
weighted avg       1.00      1.00      1.00         1

Extracted genes: ['BRCA2', 'BRCA1', 'HER2']
Classification: cancer


In [None]:
import re
import random
import spacy
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import numpy as np

# Download necessary NLTK data
# quiet=True suppresses the downloader's progress messages.
# 'stopwords' backs the stop-word filter inside extract_gene_entities.
# NOTE(review): no code visible in this cell calls an NLTK tokenizer, so the
# 'punkt' download may be unnecessary - confirm before removing.
nltk.download('stopwords', quiet=True)
nltk.download('punkt', quiet=True)

# Load the English language model
# `nlp` is the shared spaCy pipeline used by preprocess_text and
# extract_gene_entities below.
nlp = spacy.load("en_core_web_sm")

# Create a custom dataset
def create_dataset(num_samples=100, *, seed=None):
    """Build a synthetic list of ``(text, category)`` pairs.

    Each sample fills a randomly chosen sentence template with a randomly
    chosen gene symbol and a randomly chosen disease category. The gene and
    the category are drawn independently, so only the category word in the
    sentence carries information about the label.

    Args:
        num_samples: Number of samples to generate. Zero or a negative value
            yields an empty list.
        seed: Optional seed for reproducible output. When given, a private
            ``random.Random(seed)`` is used and the global RNG state is left
            untouched. When ``None`` (the default), the module-level
            ``random`` functions are used, exactly as before.

    Returns:
        A list of ``(text, category)`` tuples of length ``num_samples``.
    """
    genes = ["BRCA1", "BRCA2", "TP53", "EGFR", "HER2", "KRAS", "PTEN", "APC", "BRAF", "ALK",
             "RET", "MET", "VEGF", "IL6", "TNF", "MTHFR", "APOE", "ACE", "CFTR", "DMD"]

    categories = ["cancer", "cardiovascular", "neurodegenerative", "metabolic", "immune"]

    templates = [
        "The {gene} gene is associated with {category} diseases.",
        "Mutations in {gene} can lead to increased risk of {category} disorders.",
        "Research shows that {gene} plays a crucial role in {category} pathways.",
        "Overexpression of {gene} is observed in some {category} conditions.",
        "The {gene} protein is a key regulator in {category}-related cellular processes.",
        "Studies indicate that {gene} variants may contribute to {category} susceptibility.",
        "Therapies targeting {gene} show promise in treating certain {category} diseases.",
        "Abnormal {gene} activity is linked to various {category} disorders.",
        "The {gene} gene product is involved in {category} signaling cascades.",
        "Genetic testing for {gene} mutations is common in {category} risk assessment.",
    ]

    # Without a seed, fall back to the module-level functions so callers that
    # seed the global RNG themselves keep getting the same sequence as before.
    rng = random if seed is None else random.Random(seed)

    dataset = []
    for _ in range(num_samples):
        # Draw order (gene, category, template) is kept stable on purpose:
        # changing it would change the output for a given RNG state.
        gene = rng.choice(genes)
        category = rng.choice(categories)
        text = rng.choice(templates).format(gene=gene, category=category)
        dataset.append((text, category))

    return dataset

# Create the dataset
# Seed the global RNG first: the train/test split further down already uses a
# fixed random_state, but without this the synthetic corpus itself changes on
# every run, so the reported metrics could not be reproduced.
random.seed(42)
full_dataset = create_dataset(500)  # 500 samples

# Split the dataset into parallel tuples of sentences and their labels
texts, labels = zip(*full_dataset)

# Preprocess the text
def preprocess_text(text):
    """Reduce *text* to a space-separated string of lowercase word tokens.

    The text is parsed with the module-level spaCy pipeline ``nlp``. A token
    is kept only if spaCy does not flag it as a stop word and its ``is_alpha``
    flag is set; kept tokens are lowercased and joined with single spaces.

    Args:
        text: Raw input string.

    Returns:
        The cleaned string (empty if no token survives the filters).
    """
    kept = []
    for tok in nlp(text):
        # Skip stop words and anything that is not purely alphabetic.
        if tok.is_stop or not tok.is_alpha:
            continue
        kept.append(tok.text.lower())
    return " ".join(kept)

# Clean every generated sentence with the spaCy-based preprocessor
processed_texts = list(map(preprocess_text, texts))

# Hold out 20% of the samples for evaluation; the fixed seed keeps the split stable
X_train, X_test, y_train, y_test = train_test_split(
    processed_texts,
    labels,
    test_size=0.2,
    random_state=42,
)

# Learn the TF-IDF vocabulary on the training split only, then reuse it for the test split
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Fit a multinomial Naive Bayes model on the training features
clf = MultinomialNB().fit(X_train_tfidf, y_train)

# Predict the held-out split and score it
y_pred = clf.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, y_pred)

# Print classification report and accuracy
print("Classification Report:")
print(classification_report(y_test, y_pred))
print(f"\nOverall Accuracy: {accuracy:.2f}")

# Per-class figures: accuracy restricted to the samples whose true label is
# the class in question, i.e. the share of that class predicted correctly
# (the same quantity as the recall column of the report above).
unique_labels = np.unique(y_test)
y_true_arr = np.asarray(y_test)
y_pred_arr = np.asarray(y_pred)
print("\nAccuracy for each class:")
for label in unique_labels:
    class_mask = y_true_arr == label
    label_accuracy = accuracy_score(y_true_arr[class_mask], y_pred_arr[class_mask])
    print(f"{label}: {label_accuracy:.2f}")

# Improved function to extract gene entities
def extract_gene_entities(text):
    """Return candidate gene symbols found in *text*.

    Candidates come from two sources that are merged and de-duplicated:

    1. Regular expressions for common gene-symbol shapes.
    2. Entities from the module-level spaCy pipeline ``nlp`` whose label is
       ``"GENE"`` or ``"PROTEIN"``.

    Regex candidates of two characters or fewer, or whose lowercase form is an
    NLTK English stop word, are discarded.

    Args:
        text: Raw input string.

    Returns:
        A list of unique candidate strings in order of first appearance
        (regex matches first, then spaCy entities).
    """
    # Rule-based patterns for gene names.
    # The mixed-case pattern requires at least one trailing digit: with
    # optional digits it matched every capitalised word ("Alzheimer",
    # "Studies", ...), producing false positives. The trade-off is that
    # mixed-case symbols without a digit are no longer picked up by the regex.
    gene_patterns = [
        r'\b[A-Z]+[0-9]*\b',        # Uppercase letters followed by optional numbers (BRCA1, APOE)
        r'\b[A-Z]+[a-z]+[0-9]+\b',  # Uppercase start, then lowercase, then numbers (Brca1)
        r'\b[a-z]+[0-9]+\b',        # Lowercase letters followed by numbers (p53)
    ]

    # Combine patterns into a single alternation and find all matches
    combined_pattern = '|'.join(gene_patterns)
    potential_genes = re.findall(combined_pattern, text)

    # Filter out common words and short sequences
    stop_words = set(stopwords.words('english'))
    filtered_genes = [
        gene for gene in potential_genes
        if len(gene) > 2 and gene.lower() not in stop_words
    ]

    # Use spaCy for additional entity recognition.
    # NOTE(review): the general-purpose English model loaded in this file is
    # not expected to emit GENE/PROTEIN labels, so this step likely only
    # contributes when `nlp` is a biomedical model - confirm.
    doc = nlp(text)
    spacy_genes = [ent.text for ent in doc.ents if ent.label_ in ["GENE", "PROTEIN"]]

    # dict.fromkeys removes duplicates while keeping first-seen order, so the
    # result is deterministic (a plain set gives an arbitrary order).
    all_genes = list(dict.fromkeys(filtered_genes + spacy_genes))

    return all_genes

# Function to classify a new text
def classify_text(text):
    """Predict the category label for one raw text.

    The text is cleaned with ``preprocess_text``, vectorised with the
    module-level fitted ``vectorizer`` and scored by the module-level
    classifier ``clf``.

    Args:
        text: Raw input string.

    Returns:
        The first (and only) element of the classifier's prediction.
    """
    # The vectorizer expects an iterable of documents, hence the one-item list.
    features = vectorizer.transform([preprocess_text(text)])
    return clf.predict(features)[0]

# Example usage
# Run both helpers on one unseen sentence: gene extraction works on the raw
# text, while classification preprocesses it internally first.
print("\nExample Usage:")
new_text = "The HER2 and BRCA1 genes play crucial roles in breast cancer development. APOE is associated with Alzheimer's disease risk."
extracted_genes = extract_gene_entities(new_text)
classification = classify_text(new_text)

# Show the input alongside the extracted candidates and the predicted label
print(f"Text: {new_text}")
print(f"Extracted genes: {extracted_genes}")
print(f"Classification: {classification}")

# Evaluate gene extraction on a sample of the dataset
# This is a qualitative spot check: the extractor's output is printed for
# manual inspection and is not compared against the gene that was actually
# inserted into each sentence.
print("\nGene Extraction Evaluation:")
sample_size = 10
# random.sample draws without replacement and raises ValueError if `texts`
# holds fewer than `sample_size` items.
sample_texts = random.sample(texts, sample_size)
for text in sample_texts:
    genes = extract_gene_entities(text)
    print(f"Text: {text}")
    print(f"Extracted genes: {genes}")
    print()

Classification Report:
                   precision    recall  f1-score   support

           cancer       0.94      0.94      0.94        16
   cardiovascular       0.94      0.76      0.84        21
           immune       0.83      0.95      0.89        21
        metabolic       0.95      1.00      0.97        18
neurodegenerative       0.88      0.88      0.88        24

         accuracy                           0.90       100
        macro avg       0.91      0.91      0.90       100
     weighted avg       0.90      0.90      0.90       100


Overall Accuracy: 0.90

Accuracy for each class:
cancer: 0.94
cardiovascular: 0.76
immune: 0.95
metabolic: 1.00
neurodegenerative: 0.88

Example Usage:
Text: The HER2 and BRCA1 genes play crucial roles in breast cancer development. APOE is associated with Alzheimer's disease risk.
Extracted genes: ['Alzheimer', 'BRCA1', 'APOE', 'HER2']
Classification: cancer

Gene Extraction Evaluation:
Text: Studies indicate that MTHFR variants may contr