# Project: 2

In [5]:

import pandas as pd
import numpy as np


In [6]:
# Load the NCBI protein-sequence export for H5N1.
df2 = pd.read_csv("../Data/NCBI_Protein_seq_H5N1.csv", low_memory=False)

# Keep only the rows whose Geo_Location text contains 'USA' (rows with a
# missing location are excluded via na=False), then drop the columns that
# are not carried forward.
df2_usa = (
    df2[df2['Geo_Location'].str.contains('USA', na=False)]
    .drop(columns=[
        'GenBank_RefSeq',
        'Submitters',
        'Geo_Location',
        'Isolate',
        'Tissue_Specimen_Source',
        'Release_Date',
    ])
)
df2_usa.info()

FileNotFoundError: [Errno 2] No such file or directory: '../Data/NCBI_Protein_seq_H5N1.csv'

In [None]:
# Load the BV-BRC outbreak genome table sourced from GenBank and preview it.
df3 = pd.read_csv(
    "../Data/H5N1_Outbreak_Genome_GenBank.csv",
    low_memory=False,
)
df3.head()

In [None]:
# Load the BV-BRC outbreak genome table sourced from the SRA database and preview it.
df4 = pd.read_csv(
    "../Data/H5N1_Outbreak_Genome_SRA_database.csv",
    low_memory=False,
)
df4.head()

In [None]:
# GenBank table (df3): print the column names, non-null counts and dtypes

df3.info()

In [None]:
# SRA table (df4): print the column names, non-null counts and dtypes
df4.info()

# Data Cleaning

In [None]:
# Stack the GenBank (df3) and SRA (df4) outbreak tables row-wise under a
# fresh 0..n-1 index.
bv_brc_df = pd.concat([df3, df4], axis="index", ignore_index=True)

# Discard every column that contains no data at all.
bv_brc_df = bv_brc_df.dropna(axis="columns", how="all")

bv_brc_df.info()


In [None]:
# Keep the first row seen for each 'Genome ID', then rename 'SRA Accession'
# to 'SRA_Accession' so the column name matches the other dataset.
bv_brc_df = (
    bv_brc_df
    .drop_duplicates(subset=['Genome ID'], keep='first')
    .rename(columns={'SRA Accession': 'SRA_Accession'})
)

bv_brc_df.head()

In [None]:
# Summary of bv_brc_df after removing duplicate 'Genome ID' rows
bv_brc_df.info()

In [None]:
# Place the BV-BRC table and the USA-filtered NCBI protein table side by side.
# Column-wise concatenation aligns rows on their index labels (outer join),
# so column names present in both inputs appear twice in the result.
# NOTE(review): the two indexes are leftovers of independent filtering, so
# rows are paired by label rather than by a shared key — confirm this is
# intended, or whether a merge on a common identifier was meant.
h5n1_records_df = pd.concat(
    [bv_brc_df, df2_usa],
    axis="columns",
)
h5n1_records_df.info()


In [None]:
# Drop the duplicates on 'SRA_Accession' column from the concat
# The column is cast to str first, so the comparison is done on text values.
# NOTE(review): the str cast turns missing accessions into one shared string
# value, so with keep='first' all rows lacking an accession presumably
# collapse into a single row — confirm this is intended.
# NOTE(review): if 'SRA_Accession' exists in both concatenated inputs, the
# label is duplicated at this point and selects more than one column — verify.
h5n1_records_df['SRA_Accession'] = h5n1_records_df['SRA_Accession'].astype(str)
h5n1_records_df.drop_duplicates(subset=['SRA_Accession'], keep='first', inplace=True)
h5n1_records_df.head()

In [None]:
# Where a column name occurs more than once, keep only its first occurrence.
h5n1_records_df = h5n1_records_df.loc[
    :, ~h5n1_records_df.columns.duplicated(keep='first')
]
h5n1_records_df.head()

In [None]:
# Drop the columns that are not needed for this model.
h5n1_records_df = h5n1_records_df.drop(
    columns=[
        'Serovar',
        'Geographic Group',
        'Host Name',
        'Lab Host',
        'Molecule_type',
    ]
)
h5n1_records_df.info()

In [None]:
# Build the genetic-information table from a fixed subset of columns.
h5n1_genetic_df = h5n1_records_df[[
    'Nucleotide',
    'Family',
    'Genus',
    'Species',
    'Strain',
    'Genotype',
    'Segment',
    'H5 Clade',
    'Subclade',
    'Host Group',
    'Geographic Location',
    'Host Common Name',
]]
h5n1_genetic_df.info()

In [None]:
# Keep the first row for each distinct 'Nucleotide' value, then renumber the
# rows from zero.
h5n1_genetic_df = h5n1_genetic_df.drop_duplicates(subset=['Nucleotide'], keep='first')
h5n1_genetic_df = h5n1_genetic_df.reset_index(drop=True)
h5n1_genetic_df.head()

In [None]:
# Remove every row that has no 'Nucleotide' value.
h5n1_genetic_df = h5n1_genetic_df.dropna(
    axis="index",
    subset=['Nucleotide'],
)
h5n1_genetic_df.info()

In [None]:
# Preview the first rows of the genetic table at this stage of cleaning
h5n1_genetic_df.head()

In [None]:
# Remove every row that has no 'Subclade' value.
h5n1_genetic_df = h5n1_genetic_df.dropna(
    axis="index",
    subset=['Subclade'],
)
h5n1_genetic_df.info()

In [None]:
# Remove every row where 'Host Group' or 'Host Common Name' is missing.
h5n1_genetic_df = h5n1_genetic_df.dropna(
    axis="index",
    subset=['Host Group', 'Host Common Name'],
)
h5n1_genetic_df.info()

In [None]:
# Fill only the *missing* 'H5 Clade' values with 2.3.4.4b.
# A plain assignment of the constant would overwrite every row, including
# rows that already carry a clade label; fillna leaves existing labels as-is.
# assign() returns a new frame, so the cell does not mutate in place.
h5n1_genetic_df = h5n1_genetic_df.assign(
    **{'H5 Clade': h5n1_genetic_df['H5 Clade'].fillna('2.3.4.4b')}
)
h5n1_genetic_df.info()

In [None]:
# Collect the 'Nucleotide' values of the cleaned table as a plain Python list.
nucleotide_list = h5n1_genetic_df['Nucleotide'].tolist()

# Write the final dataframe as csv file.
# index=False omits the row labels altogether. index_label=False only drops
# the header cell of the index column, so the leftover, non-contiguous row
# labels from the earlier filtering would still be written as a first column.
h5n1_genetic_df.to_csv("../Data/h5n1_genetic_data.csv", index=False)


# Data Processing


In [None]:

# Read FASTA sequences from file

from Bio import Entrez, SeqIO

# Parse every record of the FASTA file into a list.
sequences = list(SeqIO.parse("sequence.fasta", "fasta"))

# Print the first record's id, its sequence and the sequence length,
# one value per line.
print(
    sequences[0].id,
    sequences[0].seq,
    len(sequences[0].seq),
    sep="\n",
)