<a href="https://colab.research.google.com/github/Madhuanabala/capstone/blob/main/datacollection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import os
import numpy as np
!pip install biopython


Collecting biopython
  Downloading biopython-1.84-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downloading biopython-1.84-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.2 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/3.2 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━[0m [32m1.7/3.2 MB[0m [31m51.3 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m3.2/3.2 MB[0m [31m64.6 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m42.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: biopython
Successfully installed biopython-1.84


In [None]:

import csv
import requests
from Bio import Entrez, SeqIO
# Contact address handed to Biopython's Entrez module. The UniProt helper
# defined below talks to rest.uniprot.org through `requests` and does not read it.
# NOTE(review): a personal address is hardcoded here — consider loading it from
# an environment variable before sharing the notebook.
Entrez.email = "madhuanabala567@gmail.com"
_UNIPROT_SEARCH_URL = "https://rest.uniprot.org/uniprotkb/search"
# The search endpoint rejects page sizes above this; larger requests are paged.
_UNIPROT_MAX_PAGE_SIZE = 500
# Without a timeout `requests` can block forever on a stalled connection.
_REQUEST_TIMEOUT_SECONDS = 30


def _fetch_uniprot_results(search_term, max_results):
    """Return up to ``max_results`` UniProtKB entries, following ``Link: rel="next"`` pages."""
    params = {
        "query": search_term,
        "fields": "accession,organism_name,sequence",
        "format": "json",
        "size": min(max_results, _UNIPROT_MAX_PAGE_SIZE),
    }
    url = _UNIPROT_SEARCH_URL
    results = []
    while url and len(results) < max_results:
        response = requests.get(url, params=params, timeout=_REQUEST_TIMEOUT_SECONDS)
        response.raise_for_status()
        results.extend(response.json().get("results", []))
        # The "next" link already embeds the query, cursor and page size,
        # so follow-up requests must not re-send the original parameters.
        url = response.links.get("next", {}).get("url")
        params = None
    return results[:max_results]


def _fasta_path_for(output_csv):
    """Derive the FASTA file path from the CSV path without ever colliding with it."""
    stem, extension = os.path.splitext(output_csv)
    if extension.lower() == ".fasta":
        # Swapping the extension would yield the CSV path itself.
        return output_csv + ".fasta"
    return stem + ".fasta"


def fetch_uniprot_antigen_data(search_term, max_results=500, output_csv="antigen_data.csv"):
    """
    Fetch antigen entries from UniProt and save them as a CSV file and a FASTA file.

    For every entry the accession, organism name and protein sequence are
    retrieved. The CSV file gets one row per entry (header: Accession,
    Organism, Sequence). The FASTA file is written next to it, using the
    CSV path with its extension replaced by ".fasta".

    Parameters:
    - search_term: UniProt query string (e.g., "antigen").
    - max_results: Maximum number of entries to fetch (default: 500). Values
      above 500 are retrieved in several pages.
    - output_csv: Path of the output CSV file (default: "antigen_data.csv").

    Returns:
    - None. Progress and error messages are printed. Network, file and
      value errors are reported with "An error occurred: ..." instead of
      being raised; other exception types propagate.

    Notes:
    - Entries without a sequence are written to the CSV with "N/A" but are
      left out of the FASTA file, since "N/A" is not a valid sequence.
    """
    try:
        if max_results < 1:
            raise ValueError(f"max_results must be at least 1, got {max_results}")

        print(f"Searching UniProt for '{search_term}'...")
        results = _fetch_uniprot_results(search_term, max_results)

        if not results:
            print("No sequences found!")
            return

        print(f"Found {len(results)} sequences. Fetching details...")

        # Prepare data for CSV and FASTA
        rows = []
        fasta_entries = []
        for result in results:
            accession = result.get("primaryAccession", "N/A")
            organism = result.get("organism", {}).get("scientificName", "Unknown")
            sequence = result.get("sequence", {}).get("value")

            rows.append([accession, organism, sequence or "N/A"])

            if sequence:
                # Each record ends with a newline so the file can be safely
                # concatenated with other FASTA files.
                fasta_entries.append(f">{accession} {organism}\n{sequence}\n")

        # Write to CSV
        with open(output_csv, mode="w", newline="", encoding="utf-8") as csv_file:
            writer = csv.writer(csv_file)
            writer.writerow(["Accession", "Organism", "Sequence"])
            writer.writerows(rows)

        print(f"Data saved to {output_csv}.")

        # Write sequences to FASTA file
        fasta_file = _fasta_path_for(output_csv)
        with open(fasta_file, mode="w", encoding="utf-8") as fasta_output:
            fasta_output.writelines(fasta_entries)

        print(f"FASTA sequences saved to {fasta_file}.")

    except (requests.RequestException, OSError, ValueError) as e:
        # Expected failure modes (network, HTTP status, bad JSON, file system,
        # invalid arguments) are reported; programming errors still surface.
        print(f"An error occurred: {e}")

# Example usage: query UniProt for up to 500 matching entries. With the default
# output name this writes antigen_data.csv and antigen_data.fasta.
search_term = "antigen"  # Replace with specific search terms for antigens
fetch_uniprot_antigen_data(search_term, max_results=500)

Searching UniProt for 'antigen'...
Found 500 sequences. Fetching details...
Data saved to antigen_data.csv.
FASTA sequences saved to antigen_data.fasta.
