In [1]:
import os
import numpy as np
import pandas as pd
import csv

In [17]:
datapath99 = ('../data/cycogs/cycogsgenomes.tsv') # basic information on each genome
datapath3 = ('../data/cycogs/ortholog-metadata.csv') # matches geneID to cycog number
# REFS = '../data/img_data_cycog6/'
REF = '../data/hackl_data/pro-623-allmaps.gff'
REF_sqrd = '../data/hackl_data/pro-623-allmaps-islands-novt.tsv'

genomes_df = pd.read_csv(datapath99, sep='\t')
cycogs_df = pd.read_csv(datapath3)
gi_locations_df = pd.read_csv(REF_sqrd, sep='\t')


In [7]:
unique_values = set()
with open(REF_sqrd, 'r') as file:
    for line in file:
        # Split the line by tab to get columns
        columns = line.strip().split('\t')
        
        # Check if the line has at least one column and if the first column contains "_c"
        if len(columns) > 0:
        # and "_c" in columns[0]:
            unique_values.add(columns[0])

# Print each unique value
# for value in unique_values:
#     print(value)
print (len(unique_values))

624


In [18]:
gff_df = pd.read_csv(
    '../data/hackl_data/pro-623-allmaps.gff', sep='\t', header=None, 
    names=['contig_id', 'source', 'type', 'start', 'end', 'score', 'strand', 'phase', 'attributes']
)
# down-select to only concatenated contig scaffolds used for genomic island predictions
gff_df = gff_df[gff_df.contig_id.isin(gi_locations_df.contig_id)]
# strip out GeneID
gff_df['GeneID'] = gff_df.attributes.str.split(';', n=1, expand=True)[0].str.strip('ID=')
# drop gap features
gff_df = gff_df[gff_df.type != 'gap'].reset_index(drop=True)
# determine stranded start of sequence feature
gff_df['StartNt'] = gff_df['start']
gff_df.loc[gff_df.strand.eq('-'), 'StartNt'] = gff_df.loc[gff_df.strand.eq('-'), 'end']

gff_df

Unnamed: 0,contig_id,source,type,start,end,score,strand,phase,attributes,GeneID,StartNt
0,150NLHA_c,Prodigal:2.6,CDS,106,204,.,+,0,ID=150NLHA_02566;inference=ab initio predictio...,150NLHA_02566,106
1,150NLHA_c,Prodigal:2.6,CDS,201,1367,.,+,0,ID=150NLHA_02567;eC_number=2.7.7.7;Name=dnaN;g...,150NLHA_02567,201
2,150NLHA_c,Prodigal:2.6,CDS,1371,2147,.,+,0,ID=150NLHA_02568;inference=ab initio predictio...,150NLHA_02568,1371
3,150NLHA_c,Prodigal:2.6,CDS,2187,4589,.,+,0,ID=150NLHA_02569;eC_number=6.3.5.3;inference=a...,150NLHA_02569,2187
4,150NLHA_c,Prodigal:2.6,CDS,4649,6106,.,+,0,ID=150NLHA_02570;eC_number=2.4.2.14;Name=purF;...,150NLHA_02570,4649
...,...,...,...,...,...,...,...,...,...,...,...
885336,W11_c,Prodigal:2.6,CDS,1500110,1500433,.,-,0,ID=W11_00397;inference=ab initio prediction:Pr...,W11_00397,1500433
885337,W11_c,Prodigal:2.6,CDS,1500448,1501380,.,-,0,ID=W11_00398;Name=rbn;gene=rbn;inference=ab in...,W11_00398,1501380
885338,W11_c,Prodigal:2.6,CDS,1501464,1501691,.,-,0,ID=W11_00399;eC_number=3.1.3.15;inference=ab i...,W11_00399,1501691
885339,W11_c,Prodigal:2.6,CDS,1544863,1546809,.,+,0,ID=W11_00459;eC_number=4.1.1.19;inference=ab i...,W11_00459,1544863


In [28]:
target_df = gff_df[gff_df.contig_id == "150NLHA_c"]
target_genes = target_df.GeneID.tolist()
display(target_df)
# print (target_genes)

Unnamed: 0,contig_id,source,type,start,end,score,strand,phase,attributes,GeneID,StartNt
0,150NLHA_c,Prodigal:2.6,CDS,106,204,.,+,0,ID=150NLHA_02566;inference=ab initio predictio...,150NLHA_02566,106
1,150NLHA_c,Prodigal:2.6,CDS,201,1367,.,+,0,ID=150NLHA_02567;eC_number=2.7.7.7;Name=dnaN;g...,150NLHA_02567,201
2,150NLHA_c,Prodigal:2.6,CDS,1371,2147,.,+,0,ID=150NLHA_02568;inference=ab initio predictio...,150NLHA_02568,1371
3,150NLHA_c,Prodigal:2.6,CDS,2187,4589,.,+,0,ID=150NLHA_02569;eC_number=6.3.5.3;inference=a...,150NLHA_02569,2187
4,150NLHA_c,Prodigal:2.6,CDS,4649,6106,.,+,0,ID=150NLHA_02570;eC_number=2.4.2.14;Name=purF;...,150NLHA_02570,4649
...,...,...,...,...,...,...,...,...,...,...,...
2613,150NLHA_c,Prodigal:2.6,CDS,2496792,2499752,.,-,0,ID=150NLHA_02561;inference=ab initio predictio...,150NLHA_02561,2499752
2614,150NLHA_c,Prodigal:2.6,CDS,2499798,2501480,.,-,0,ID=150NLHA_02562;Name=recN;gene=recN;inference...,150NLHA_02562,2501480
2615,150NLHA_c,Prodigal:2.6,CDS,2501560,2503428,.,+,0,ID=150NLHA_02563;inference=ab initio predictio...,150NLHA_02563,2501560
2616,150NLHA_c,Prodigal:2.6,CDS,2503453,2504040,.,+,0,ID=150NLHA_02564;inference=ab initio predictio...,150NLHA_02564,2503453


In [30]:
row = target_df[target_df["GeneID"] == "150NLHA_02573"]

# Check if the row exists
if not row.empty:
    # Extract the StartNt value
    start_nt_value = row["StartNt"].values[0]

    # Define the range
    lower_bound = start_nt_value - 10000
    upper_bound = start_nt_value + 10000

    # Filter the DataFrame based on the range
    filtered_df = target_df[(target_df["StartNt"] >= lower_bound) & (target_df["StartNt"] <= upper_bound)]

    # Print the filtered rows
    display(filtered_df)
else:
    print("GeneID not found.")

Unnamed: 0,contig_id,source,type,start,end,score,strand,phase,attributes,GeneID,StartNt
2,150NLHA_c,Prodigal:2.6,CDS,1371,2147,.,+,0,ID=150NLHA_02568;inference=ab initio predictio...,150NLHA_02568,1371
3,150NLHA_c,Prodigal:2.6,CDS,2187,4589,.,+,0,ID=150NLHA_02569;eC_number=6.3.5.3;inference=a...,150NLHA_02569,2187
4,150NLHA_c,Prodigal:2.6,CDS,4649,6106,.,+,0,ID=150NLHA_02570;eC_number=2.4.2.14;Name=purF;...,150NLHA_02570,4649
5,150NLHA_c,Prodigal:2.6,CDS,6142,8637,.,-,0,ID=150NLHA_02571;eC_number=5.99.1.3;Name=gyrA_...,150NLHA_02571,8637
6,150NLHA_c,Prodigal:2.6,CDS,8715,9608,.,-,0,ID=150NLHA_02572;inference=ab initio predictio...,150NLHA_02572,9608
7,150NLHA_c,Prodigal:2.6,CDS,9618,10592,.,-,0,ID=150NLHA_02573;inference=ab initio predictio...,150NLHA_02573,10592
8,150NLHA_c,Prodigal:2.6,CDS,10679,11293,.,+,0,ID=150NLHA_02574;inference=ab initio predictio...,150NLHA_02574,10679
9,150NLHA_c,Prodigal:2.6,CDS,11366,12115,.,+,0,ID=150NLHA_02575;inference=ab initio predictio...,150NLHA_02575,11366
10,150NLHA_c,Prodigal:2.6,CDS,12145,12780,.,+,0,ID=150NLHA_02576;Name=nusB;gene=nusB;inference...,150NLHA_02576,12145
11,150NLHA_c,Prodigal:2.6,CDS,12780,14234,.,+,0,ID=150NLHA_02577;Name=ftsY;gene=ftsY;inference...,150NLHA_02577,12780


In [26]:
target_df[target_df.GeneID == '150NLHA_02573']

Unnamed: 0,contig_id,source,type,start,end,score,strand,phase,attributes,GeneID,StartNt
7,150NLHA_c,Prodigal:2.6,CDS,9618,10592,.,-,0,ID=150NLHA_02573;inference=ab initio predictio...,150NLHA_02573,10592


In [23]:
gi_locations_df[gi_locations_df.contig_id == '150NLHA_c']

Unnamed: 0,genome_id,contig_id,start,end
5255,150NLHA,150NLHA_c,89947,134093
5256,150NLHA,150NLHA_c,1847422,1892281
5257,150NLHA,150NLHA_c,2081798,2103630
5258,150NLHA,150NLHA_c,2248955,2259352
5259,150NLHA,150NLHA_c,2292896,2376881
5260,150NLHA,150NLHA_c,268914,339830
5261,150NLHA,150NLHA_c,412155,433249
5262,150NLHA,150NLHA_c,588921,611217
5263,150NLHA,150NLHA_c,904482,1111189
5264,150NLHA,150NLHA_c,1268710,1282680


In [33]:
cycogs_df

Unnamed: 0,MappingName,OrthologID,GenomeName,GeneID,Annotation
0,WH8102_2607658325,60000001,WH8102,2607658325,membrane protease FtsH catalytic subunit
1,MIT0917_2681971350,60000001,MIT0917,2681971350,membrane protease FtsH catalytic subunit
2,AG-424-P18_2717338506,60000001,AG-424-P18,2717338506,membrane protease FtsH catalytic subunit
3,scB245a_521A19_2655604637,60000001,scB245a_521A19,2655604637,membrane protease FtsH catalytic subunit
4,GFB01_2638208352,60000001,GFB01,2638208352,membrane protease FtsH catalytic subunit
...,...,...,...,...,...
964917,AG-363-C02_2667889608,60040295,AG-363-C02,2667889608,hypothetical protein
964918,AG-363-C02_2667889615,60040295,AG-363-C02,2667889615,hypothetical protein
964919,AG-363-C02_2667890048,60040295,AG-363-C02,2667890048,hypothetical protein
964920,AG-363-C02_2667890054,60040295,AG-363-C02,2667890054,hypothetical protein


In [45]:

cycog = 60003220
target_df = cycogs_df[cycogs_df.OrthologID == cycog]
ID_list = target_df['GeneID'].tolist()
cycog_gene_ids = []
better_list = []

for genome in os.listdir(REFS):
    if not os.path.isdir(os.path.join(REFS, genome)):
        continue
        # Path to the GFF file for the current genome
    gff_path = os.path.join(REFS, genome, f'{genome}.gff')
    with open(gff_path, 'r') as file:
        linenumber = 0
        for line in file:
            parts = line.strip().split()
            test_list = []
            if len(parts) > 7:  # Ensure there are enough parts to parse start and end
                try:
                    start = int(parts[3])
                    end = int(parts[4])
                    # annot = parts[8]
                except ValueError:
                    continue  # Skip lines that do not have valid integers in the expected columns
                gene_id = int(parts[8].split('=')[1].split(';')[0])
                cycog_gene_ids.append(gene_id)
                if gene_id in ID_list:
                    for index, row in target_df.iterrows():
                        if row['GeneID'] == gene_id:
                            genome_name = row['GenomeName']
                            test_list.append(genome_name)
                    test_list.append(genome)
                    test_list.append(start)
                    test_list.append(end)
                    test_list.append(parts[6])
                    test_list.append(gene_id)
                    better_list.append(test_list)

parts_df = pd.DataFrame(better_list, columns=["Genome_Name", "GenomeID", "Start", "End", "Strand", "GeneID"])
parts_df

Unnamed: 0,Genome_Name,GenomeID,Start,End,Strand,GeneID
0,AG-402-B19,2716884732,75734,75976,-,2717659556
1,MIT9107,2606217692,21749,24718,+,2608235998
2,W8,2551306554,23960,27061,-,2553538053
3,AG-679-B05,2716884670,3308,6415,+,2717597201
4,AG-311-J23,2716884685,67380,70469,+,2717616792
5,WH7805,2623620868,340116,343088,-,2626027976
6,AG-670-O07,2716884481,10083,13187,+,2717407834
7,AG-670-A04,2716884607,6406,9378,+,2717523028
8,AG-676-L21,2716884808,30252,33230,-,2717758195
9,AG-331-B23,2716884653,3,2138,+,2717580141


In [None]:
for index, row in parts_df.iterrows():
    genome_id = row['GenomeID']
    file_path = os.path.join(REFS, str(genome_id), f'{genome_id}.gff')
    
    # Check if the file exists
    if os.path.isfile(file_path):
        with open(file_path, 'r') as file:
            # Print the first line of the file
            first_line = file.readline().strip()
            print(f"First line of {genome_id}: {first_line}")
    else:
        print(f"File {genome_id} does not exist in the directory {REFS}")

In [None]:
for index, row in parts_df.iterrows():
    genome_id = row['GenomeID']
    start_value = row['Start']
    file_path = os.path.join(REFS, str(genome_id), f'{genome_id}.gff')
    
    # Check if the file exists
    if os.path.isfile(file_path):
        with open(file_path, 'r') as file:
            found = False
            for line in file:
                if str(start_value) in line.split('\t'):
                    print(f"Line with start value {start_value} in {genome_id}: {line.strip()}")
                    found = True
                    break
            if not found:
                print(f"Start value {start_value} not found in {genome_id}")
    else:
        print(f"File {genome_id} does not exist in the directory {REFS}")

In [None]:
for index, row in parts_df.iterrows():
    genome_id = row['GenomeID']
    start_value = row['Start']
    target_start_value = max(0, start_value - 10000)  # Adjust the start value by subtracting 10,000, ensuring it's not negative
    tolerance = 1500
    file_path = os.path.join(REFS, str(genome_id), f'{genome_id}.gff')
    
    closest_line = None
    closest_diff = float('inf')  # Initialize with a large number
    
    # Check if the file exists
    if os.path.isfile(file_path):
        with open(file_path, 'r') as file:
            for line in file:
                parts = line.split('\t')
                if len(parts) > 3:  # Ensure there are enough parts to parse the start value
                    try:
                        start = int(parts[3])
                        diff = abs(start - target_start_value)
                        if diff <= tolerance and diff < closest_diff:
                            closest_diff = diff
                            closest_line = line.strip()
                    except ValueError:
                        continue  # Skip lines that do not have valid integers in the expected columns
            
            if closest_line:
                print(f"Closest line with adjusted start value {target_start_value} (within {tolerance} tolerance) in {genome_id}: {closest_line}")
            else:
                print(f"No line found within tolerance for adjusted start value {target_start_value} in {genome_id}")
    else:
        print(f"File {genome_id} does not exist in the directory {REFS}")

In [None]:
tolerance = 1500

for index, row in parts_df.iterrows():
    genome_id = row['GenomeID']
    start_value = row['Start']
    end_value = row['End']
    
    target_start_value = max(0, start_value - 10000)  # Adjust the start value by subtracting 10,000, ensuring it's not negative
    target_end_value = end_value + 10000  # Adjust the end value by adding 10,000
    
    file_path = os.path.join(REFS, str(genome_id), f'{genome_id}.gff')
    
    closest_start_line = None
    closest_start_diff = float('inf')  # Initialize with a large number
    
    closest_end_line = None
    closest_end_diff = float('inf')  # Initialize with a large number
    highest_value_line = None
    highest_value = float('-inf')  # Initialize with a very small number
    
    # Check if the file exists
    if os.path.isfile(file_path):
        with open(file_path, 'r') as file:
            for line in file:
                parts = line.split('\t')
                if len(parts) > 3:  # Ensure there are enough parts to parse the start and end values
                    try:
                        start = int(parts[3])
                        end = int(parts[4])
                        
                        start_diff = abs(start - target_start_value)
                        if start_diff <= tolerance and start_diff < closest_start_diff:
                            closest_start_diff = start_diff
                            closest_start_line = line.strip()
                            
                        end_diff = abs(end - target_end_value)
                        if end_diff <= tolerance and end_diff < closest_end_diff:
                            closest_end_diff = end_diff
                            closest_end_line = line.strip()
                        
                        if end > highest_value:
                            highest_value = end
                            highest_value_line = line.strip()
                    except ValueError:
                        continue  # Skip lines that do not have valid integers in the expected columns
            
            if closest_start_line:
                print(f"Closest line with adjusted start value {target_start_value} (within {tolerance} tolerance) in {genome_id}: {closest_start_line}")
            else:
                print(f"No line found within tolerance for adjusted start value {target_start_value} in {genome_id}")
                
            if closest_end_line:
                print(f"Closest line with adjusted end value {target_end_value} (within {tolerance} tolerance) in {genome_id}: {closest_end_line}")
            else:
                print(f"No line found within tolerance for adjusted end value {target_end_value} in {genome_id}. Highest value line: {highest_value_line}")
    else:
        print(f"File {genome_id} does not exist in the directory {REFS}")


In [None]:

tolerance = 1500

for index, row in parts_df.iterrows():
    genome_id = row['GenomeID']
    start_value = row['Start']
    end_value = row['End']
    
    target_start_value = max(0, start_value - 10000)  # Adjust the start value by subtracting 10,000, ensuring it's not negative
    target_end_value = end_value + 10000  # Adjust the end value by adding 10,000
    
    file_path = os.path.join(REFS, str(genome_id), f'{genome_id}.gff')
    
    closest_start_start = None
    closest_start_end = None
    closest_start_diff = float('inf')  # Initialize with a large number
    
    closest_end_start = None
    closest_end_end = None
    closest_end_diff = float('inf')  # Initialize with a large number
    
    lines = []
    
    # Check if the file exists
    if os.path.isfile(file_path):
        with open(file_path, 'r') as file:
            for line in file:
                parts = line.split('\t')
                if len(parts) > 8:  # Ensure there are enough parts to parse the attributes
                    try:
                        start = int(parts[3])
                        end = int(parts[4])
                        
                        # Check for the closest start value within tolerance
                        start_diff = abs(start - target_start_value)
                        if start_diff <= tolerance and start_diff < closest_start_diff:
                            closest_start_diff = start_diff
                            closest_start_start = start
                            closest_start_end = end
                            
                        # Check for the closest end value within tolerance
                        end_diff = abs(end - target_end_value)
                        if end_diff <= tolerance and end_diff < closest_end_diff:
                            closest_end_diff = end_diff
                            closest_end_start = start
                            closest_end_end = end
                        
                        # Collect relevant data for all lines
                        line_data = None
                        if len(parts) > 8 and '=' in parts[8]:
                            line_data = int(parts[8].split('=')[1].split(';')[0])
                        lines.append((start, line_data))
                    except ValueError:
                        continue  # Skip lines that do not have valid integers in the expected columns
            
            # Sort lines by their start values
            lines.sort(key=lambda x: x[0])
            
            # Gather the data from lines between the closest start and end lines (inclusive)
            collected_data = []
            collecting = False
            for start, line_data in lines:
                if closest_start_start is not None and start == closest_start_start:
                    collecting = True
                if collecting and line_data is not None:
                    collected_data.append(line_data)
                if closest_end_end is not None and start == closest_end_end:
                    break
            
            if collected_data:
                print(f"Data between adjusted start value {target_start_value} and adjusted end value {target_end_value} (inclusive) in {genome_id}:")
                print(collected_data)
            else:
                print(f"No data found between adjusted start value {target_start_value} and adjusted end value {target_end_value} in {genome_id}")
    else:
        print(f"File {genome_id} does not exist in the directory {REFS}")


In [46]:
cycog_neighbors_list = []

for index, row in parts_df.iterrows():
    genome_id = row['GenomeID']
    start_value = row['Start']
    end_value = row['End']
    file_path = os.path.join(REFS, str(genome_id), f'{genome_id}.gff')
    
    # Adjust start_value if it's less than 10,000
    start_value_adjusted = max(start_value - 10000, 0)
    
    # Adjust end_value + 10,000
    end_value_adjusted = end_value + 10000
    
    if os.path.isfile(file_path):
        with open(file_path, 'r') as file:
            start_found = False
            end_found = False
            lines_to_collect = []
            
            # Variables to store closest lines
            closest_start_line = None
            closest_start_distance = float('inf')
            closest_end_line = None
            closest_end_distance = float('inf')
            
            # Read the file and find lines close to start_value_adjusted and end_value_adjusted
            for line in file:
                parts = line.strip().split('\t')
                try:
                    current_start = int(parts[3])
                    current_end = int(parts[4])
                except (IndexError, ValueError):
                    continue
                
                if current_start <= start_value_adjusted <= current_end:
                    closest_start_line = line.strip()
                    start_found = True
                
                if current_start <= end_value_adjusted <= current_end:
                    closest_end_line = line.strip()
                    end_found = True
                
                # Check if the line is within the range of closest lines
                if start_found and end_found:
                    if current_start <= end_value_adjusted and current_end >= start_value_adjusted:
                        if len(parts) > 8 and '=' in parts[8]:
                            lines_to_collect.append(int(parts[8].split('=')[1].split(';')[0]))
            
            # If the end line isn't found, use the line with the highest value
            if not end_found:
                with open(file_path, 'r') as file:
                    highest_line = None
                    highest_value = -1
                    for line in file:
                        parts = line.strip().split('\t')
                        try:
                            current_end = int(parts[4])
                        except (IndexError, ValueError):
                            continue
                        
                        if current_end > highest_value:
                            highest_value = current_end
                            highest_line = line.strip()
                    
                    if highest_line:
                        parts = highest_line.split('\t')
                        lines_to_collect.append(int(parts[8].split('=')[1].split(';')[0]))
            
            # Append the results to the list
            if start_found or end_found:
                gene_id = row['GeneID']
                cycog_neighbors_list.append({
                    'GeneID': gene_id,
                    'Neighbor_GeneIDs': lines_to_collect
                })
    else:
        print(f"File {genome_id} does not exist in the directory {REFS}")

# print(cycog_neighbors_list)


In [49]:
cycog_neighbors_list = []

for index, row in parts_df.iterrows():
    genome_id = row['GenomeID']
    start_value = row['Start']
    end_value = row['End']
    file_path = os.path.join(REFS, str(genome_id), f'{genome_id}.gff')
    
    # Adjust start_value if it's less than 10,000
    start_value_adjusted = max(start_value - 10000, 0)
    
    # Adjust end_value + 10,000
    end_value_adjusted = end_value + 10000
    
    if os.path.isfile(file_path):
        lines_to_collect = []
        
        with open(file_path, 'r') as file:
            lines = file.readlines()
            
            # Variables to store closest lines
            closest_start_line = None
            closest_end_line = None
            closest_start_distance = float('inf')
            closest_end_distance = float('inf')
            
            # Check if the start_value_adjusted is negative
            if start_value_adjusted == 0:
                start_index = 0
            else:
                start_index = None
                for i, line in enumerate(reversed(lines)):
                    parts = line.strip().split('\t')
                    try:
                        current_start = int(parts[3])
                        if current_start <= start_value_adjusted:
                            start_index = len(lines) - 1 - i
                            break
                    except (IndexError, ValueError):
                        continue
            
            if start_index is None:
                start_index = 0  # Default to start of file if not found
            
            # Check if the end_value_adjusted exceeds the file length
            if end_value_adjusted >= len(lines):
                end_index = len(lines) - 1
            else:
                end_index = None
                for i, line in enumerate(lines):
                    parts = line.strip().split('\t')
                    try:
                        current_end = int(parts[4])
                        if current_end >= end_value_adjusted:
                            end_index = i
                            break
                    except (IndexError, ValueError):
                        continue
            
            if end_index is None:
                end_index = len(lines) - 1  # Default to end of file if not found
            
            # Collect lines between start_index and end_index
            for i in range(start_index, end_index + 1):
                line = lines[i].strip()
                parts = line.split('\t')
                try:
                    gene_id = int(parts[8].split('=')[1].split(';')[0])
                    lines_to_collect.append(gene_id)
                except (IndexError, ValueError):
                    continue
            
            # Append the results to the list
            gene_id = row['GeneID']
            cycog_neighbors_list.append({
                'GeneID': gene_id,
                'Neighbor_GeneIDs': lines_to_collect
            })
    else:
        print(f"File {genome_id} does not exist in the directory {REFS}")

print(cycog_neighbors_list)


[{'GeneID': 2717659556, 'Neighbor_GeneIDs': [2717660262]}, {'GeneID': 2608235998, 'Neighbor_GeneIDs': [2608237302]}, {'GeneID': 2553538053, 'Neighbor_GeneIDs': [2553538615, 2553538616, 2553538617, 2553538618, 2553538619, 2553538620, 2553538621, 2553538622, 2553538623, 2553538624, 2553538625, 2553538626, 2553538627, 2553538628]}, {'GeneID': 2717597201, 'Neighbor_GeneIDs': [2717595867, 2717595868, 2717595869, 2717595870, 2717595871, 2717595872, 2717595873, 2717595874, 2717595875, 2717595876, 2717595877, 2717595878, 2717595879, 2717595880, 2717595881, 2717595882, 2717595883, 2717595884, 2717595885, 2717595886, 2717595887, 2717595888, 2717595889, 2717595890, 2717595891, 2717595892, 2717595893, 2717595894, 2717595895, 2717595896, 2717595897, 2717595898, 2717595899, 2717595900, 2717595901, 2717595902, 2717595903, 2717595904, 2717595905, 2717595906, 2717595907, 2717595908, 2717595909, 2717595910, 2717595911, 2717595912, 2717595913, 2717595914, 2717595915, 2717595916, 2717595917, 2717595918, 2

In [47]:
# for item in cycog_neighbors_list:
#     print (type(item))

print (len(cycog_neighbors_list))

36


In [48]:
print (cycog_neighbors_list)

[{'GeneID': 2717659556, 'Neighbor_GeneIDs': [2717658540, 2717658704, 2717658705, 2717658706, 2717658707, 2717658708, 2717658709, 2717658710, 2717658711, 2717658712, 2717658713, 2717658714, 2717658715, 2717658716, 2717658717, 2717658718, 2717658719, 2717658720, 2717658721, 2717658722, 2717658723, 2717658724, 2717658873, 2717658874, 2717658875, 2717658876, 2717658877, 2717658878, 2717658879, 2717658880, 2717658881, 2717658882, 2717658883, 2717658884, 2717658885, 2717658886, 2717658887, 2717658888, 2717658889, 2717658890, 2717658891, 2717658892, 2717658893, 2717658894, 2717659045, 2717659046, 2717659047, 2717659048, 2717659049, 2717659050, 2717659051, 2717659052, 2717659053, 2717659054, 2717659055, 2717659056, 2717659057, 2717659058, 2717659059, 2717659060, 2717659061, 2717659062, 2717659063, 2717659206, 2717659207, 2717659208, 2717659209, 2717659210, 2717659211, 2717659212, 2717659213, 2717659214, 2717659215, 2717659216, 2717659217, 2717659218, 2717659219, 2717659220, 2717659221, 2717659