In [158]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [159]:
# Load the species rows CSV file
species_rows_file = 'D:/Users/ernes/Documents/1KCL/Zelezniak/DeepExpressionWorking/2018_11_26/gfp_data_combined_output/expression_species_rows.csv'
species_rows_df = pd.read_csv(species_rows_file)

# Load the predictions from the npy file
pred_file = 'D:/Users/ernes/Documents/1KCL/Zelezniak/DeepExpressionWorking/2018_11_26/gfp_data_combined_output/GFP_Ascomycota_pred.npy'
pred_array = np.load(pred_file)
pred_array = pred_array.flatten()  # This makes sure that pred_array is a 1-dimensional array

# Create a dictionary using species names as keys and data points as values
gfp_pred_dict = {}
for species in species_rows_df.columns[:]:  # Ignore the first column ('Unnamed: 0')
    indices = species_rows_df[species]
    # Remove NaNs, negative values, and convert to integer
    indices = indices[indices >= 0].dropna().astype(int)
    # Make sure indices are within the length of pred_array
    indices = indices[indices < len(pred_array)]
    
    # Split the species name by "_" and take the first two parts
    name_parts = species.split("_")[:2]
    # Capitalize the second part
    name_parts[1] = name_parts[1].capitalize()
    # Join them back together
    new_species = "_".join(name_parts)

    gfp_pred_dict[new_species] = pred_array[indices]

# Print the structure of the dictionary
for species, value in gfp_pred_dict.items():
    print(species, value)


Aaosphaeria_Arxii [ 9.86516    8.702513   7.4766626 ...  9.60866    7.981915  10.158073 ]
Amniculicola_Lignicola [6.9574475 9.051777  9.563999  ... 6.133963  6.300941  8.168323 ]
Ampelomyces_Quisqualis [9.57017  8.078639 8.346651 ... 8.210822 9.01806  8.355364]
Ascobolus_Immersus [8.482015 8.43309  7.164706 ... 8.78146  8.966575 8.835614]
Ascodesmis_Nigricans [8.486106  9.533059  8.81864   ... 8.962505  6.8227954 9.083183 ]
Aspergillus_Alliaceus [8.286455  7.195724  9.1061535 ... 8.878536  8.342495  8.768535 ]
Aspergillus_Arachidicola [9.079378  8.072748  7.8133264 ... 8.166818  8.159739  7.7989035]
Aspergillus_Avenaceus [8.498599  9.564416  6.9947953 ... 6.583483  9.838404  8.241159 ]
Aspergillus_Bertholletiae [9.405075  9.6075735 8.738061  ... 8.647374  8.546244  8.248696 ]
Aspergillus_Caelatus [ 8.838922  10.088196   7.563749  ...  9.232365   8.9788475  9.521281 ]
Aspergillus_Coremiiformis [8.49796   9.942291  8.329149  ... 8.389869  7.2478337 8.935309 ]
Aspergillus_Costaricaensis [

In [160]:
# Dictionaries to store the top 10 and bottom 10 expression levels and indices
top_10_expr = {}
bottom_10_expr = {}
top_10_indices = {}
bottom_10_indices = {}

# For each species
for species, values in gfp_pred_dict.items():
    # Convert the values to a Series for easier handling
    values_series = pd.Series(values)
    
    # Get the top 10 and bottom 10 expression levels and their indices
    top_10_expr[species] = values_series.nlargest(10).values
    bottom_10_expr[species] = values_series.nsmallest(10).values
    top_10_indices[species] = values_series.nlargest(10).index.values
    bottom_10_indices[species] = values_series.nsmallest(10).index.values
    
# Create DataFrames from the dictionaries
df_top_10_expr = pd.DataFrame(top_10_expr)
df_bottom_10_expr = pd.DataFrame(bottom_10_expr)
df_top_10_indices = pd.DataFrame(top_10_indices)
df_bottom_10_indices = pd.DataFrame(bottom_10_indices)

In [161]:
print(df_top_10_expr)
print(df_bottom_10_expr)

   Aaosphaeria_Arxii  Amniculicola_Lignicola  Ampelomyces_Quisqualis  \
0          11.482256               12.026109               11.662296   
1          11.360446               11.526175               11.614923   
2          11.304045               11.358045               11.455908   
3          11.099888               11.309372               11.416203   
4          11.088264               11.303117               11.400668   
5          11.015469               11.176522               11.335292   
6          10.871590               11.155830               11.310605   
7          10.841323               11.127237               11.270744   
8          10.837256               11.090908               11.247834   
9          10.809961               11.048904               11.231232   

   Ascobolus_Immersus  Ascodesmis_Nigricans  Aspergillus_Alliaceus  \
0           10.972997             11.418514              11.593937   
1           10.902094             11.171214              11.321283 

In [162]:
print(df_top_10_indices)
print(df_bottom_10_indices)

   Aaosphaeria_Arxii  Amniculicola_Lignicola  Ampelomyces_Quisqualis  \
0               1284                    1220                    2417   
1               3801                    2882                    1938   
2               1732                    1651                    2550   
3                798                     417                     395   
4               2082                    3491                     284   
5               2391                     779                    3094   
6                224                    3145                    3053   
7                843                    1302                    1825   
8               2723                     197                    1178   
9                865                    1177                     240   

   Ascobolus_Immersus  Ascodesmis_Nigricans  Aspergillus_Alliaceus  \
0                3452                   977                    745   
1                 531                   708                    915 

In [164]:
#Load the species rows CSV file
species_rows_file = 'D:/Users/ernes/Documents/1KCL/Zelezniak/DeepExpressionWorking/2018_11_26/gfp_data_combined_output/expression_species_rows.csv'
species_rows_df = pd.read_csv(species_rows_file)
for species in species_rows_df.columns[:]:  # Ignore the first column ('Unnamed: 0')
    indices = species_rows_df[species]
    # Remove NaNs, negative values, and convert to integer
    indices = indices[indices >= 0].dropna().astype(int)
    # Make sure indices are within the length of pred_array
    indices = indices[indices < len(pred_array)]
    
    # Split the species name by "_" and take the first two parts
    name_parts = species.split("_")[:2]
    # Capitalize the second part
    name_parts[1] = name_parts[1].capitalize()
    # Join them back together
    new_species = "_".join(name_parts)
    species_rows_df.rename(columns={species: new_species}, inplace=True)
print(species_rows_df)

      Aaosphaeria_Arxii  Amniculicola_Lignicola  Ampelomyces_Quisqualis  \
0                   0.0                  4137.0                  7857.0   
1                   1.0                  4138.0                  7858.0   
2                   2.0                  4139.0                  7859.0   
3                   3.0                  4140.0                  7860.0   
4                   4.0                  4141.0                  7861.0   
...                 ...                     ...                     ...   
4597                NaN                     NaN                     NaN   
4598                NaN                     NaN                     NaN   
4599                NaN                     NaN                     NaN   
4600                NaN                     NaN                     NaN   
4601                NaN                     NaN                     NaN   

      Ascobolus_Immersus  Ascodesmis_Nigricans  Aspergillus_Alliaceus  \
0                11247.0  

In [165]:
# Initialize new dictionaries
top_10_indices_updated = {}
bottom_10_indices_updated = {}

# For each species
for species in gfp_pred_dict.keys():
    # Convert the values to a Series for easier handling
    top_10_indices_series = pd.Series(top_10_indices[species])
    bottom_10_indices_series = pd.Series(bottom_10_indices[species])

    # Replace the indices with the corresponding values from species_rows_df
    top_10_indices_updated[species] = species_rows_df.loc[top_10_indices_series, species].values
    bottom_10_indices_updated[species] = species_rows_df.loc[bottom_10_indices_series, species].values

# Create DataFrames from the updated dictionaries
df_top_10_indices_updated = pd.DataFrame(top_10_indices_updated)
df_bottom_10_indices_updated = pd.DataFrame(bottom_10_indices_updated)

# Print the updated DataFrames
print(df_top_10_indices_updated)
print(df_bottom_10_indices_updated)

   Aaosphaeria_Arxii  Amniculicola_Lignicola  Ampelomyces_Quisqualis  \
0             1284.0                  5357.0                 10274.0   
1             3801.0                  7019.0                  9795.0   
2             1732.0                  5788.0                 10407.0   
3              798.0                  4554.0                  8252.0   
4             2082.0                  7628.0                  8141.0   
5             2391.0                  4916.0                 10951.0   
6              224.0                  7282.0                 10910.0   
7              843.0                  5439.0                  9682.0   
8             2723.0                  4334.0                  9035.0   
9              865.0                  5314.0                  8097.0   

   Ascobolus_Immersus  Ascodesmis_Nigricans  Aspergillus_Alliaceus  \
0             14699.0               16361.0                19164.0   
1             11778.0               16092.0                19334.0 

In [175]:
fasta_file = 'D:/Users/ernes/Documents/1KCL/Zelezniak/DeepExpressionWorking/2018_11_26/gfp_data_combined_output/Ascomycota_GFP.seqs.csv'
features_file = 'D:/Users/ernes/Documents/1KCL/Zelezniak/DeepExpressionWorking/2018_11_26/gfp_data_combined_output/Ascomycota_GFP.Xval.npy'

In [257]:
beginning_sequence = {}
end_sequence = {}

# Iterate over species in the species_rows_df DataFrame
for species in species_rows_df.columns[:]:
    indices = species_rows_df[species]
    indices = indices[indices >= 0].dropna().astype(int)
    indices = indices[indices < len(pred_array)]

    name_parts = species.split("_")[:2]
    name_parts[1] = name_parts[1].capitalize()
    new_species = "_".join(name_parts)

    # Get the index row for the species from FASTA file
    fasta_row = fasta_df.loc[indices]

    # Concatenate prom and 5UTR sequences and add to the beginning_sequence dictionary
    beginning_sequence[new_species] = fasta_row['prom'] + 'H' + fasta_row['5UTR']

    # Concatenate 3UTR and term sequences and add to the end_sequence dictionary
    end_sequence[new_species] = fasta_row['3UTR'] + 'H' + fasta_row['term']

# Print the dictionaries
print("Beginning Sequence Dictionary:")
for species, sequence in beginning_sequence.items():
    print(f"{species}: {sequence}\n")

print("End Sequence Dictionary:")
for species, sequence in end_sequence.items():
    print(f"{species}: {sequence}\n")

Beginning Sequence Dictionary:
Aaosphaeria_Arxii: 0       ACAGGGTGGGAAAAGCAGAAGGATTGAGATCAACAACAGGGGTTCT...
1       AGGAACCGCACATACATTATGCCGCCGCCCCTGCCATGCTCGTTCT...
2       GAAAGTGAAAGGCATTGAGATTAGAGGCAAAGGCGTCGATTTACGG...
3       CAACAATTTCCATTTCGATTTGGGTGGCCTTGGACCTAGTGAGCTG...
4       TCGTAGTGCTTGATGATAAAATTCACTGCAACAATTATACATGTCA...
                              ...                        
4132    TCAACGATAATTTATTTCTCACGTTTCTCATTCTCTATGAGCTTTT...
4133    ACCGGCGAGACCACCCTCTCAGCAACGCCTCTAATTTCAGTATGTA...
4134    GTCCGAAGTGGCCCCATTCTAGTACACAACTCCTCCCGAGATGAGC...
4135    ATTGTGCGATGCACTCGATGTGGAAGAGAGATTAGCCATTGTTAGT...
4136    TTTATAGCTATTCAGATAACAGAACTTAGGAAATAGGCTTCCTCCC...
Length: 4137, dtype: object

Amniculicola_Lignicola: 4137    CAGAGCTTGATCCGTGCTGGGCTGGGTATCTGATGACAGGTCAACG...
4138    CACCTCCACACTCCCAAATATATCCTCTGGCCACGCAATCCTCCCC...
4139    CCCGGCAGTCGCAAATGGTCGTACGACCGGTACGACGTTCGAGGGG...
4140    CCGGCCAAGTGAAAATTACGGGCTCTCTGACCACCGTTGTTATGGA...
4141    TACGTTGCGAAAGTTTATA

In [258]:
top_10_beginning_sequence = {
    species: beginning_sequence[species][df_top_10_indices_updated[species]]
    for species in df_top_10_indices_updated.columns
}

top_10_end_sequence = {
    species: end_sequence[species][df_top_10_indices_updated[species]]
    for species in df_top_10_indices_updated.columns
}

bottom_10_beginning_sequence = {
    species: beginning_sequence[species][df_bottom_10_indices_updated[species]]
    for species in df_bottom_10_indices_updated.columns
}

bottom_10_end_sequence = {
    species: end_sequence[species][df_bottom_10_indices_updated[species]]
    for species in df_bottom_10_indices_updated.columns
}



# Print the new dictionaries
print("Top 10 Beginning Sequence Dictionary:")
for species, sequence in top_10_beginning_sequence.items():
    print(f"{species}: {sequence}\n")

print("Top 10 End Sequence Dictionary:")
for species, sequence in top_10_end_sequence.items():
    print(f"{species}: {sequence}\n")

print("Bottom 10 Beginning Sequence Dictionary:")
for species, sequence in bottom_10_beginning_sequence.items():
    print(f"{species}: {sequence}\n")

print("Bottom 10 End Sequence Dictionary:")
for species, sequence in bottom_10_end_sequence.items():
    print(f"{species}: {sequence}\n")

Top 10 Beginning Sequence Dictionary:
Aaosphaeria_Arxii: 1284.0    TATAGTGTTTGGTGGAAATAGTAAGGAGTGGTGTATGGATATTTCT...
3801.0    ATTCAATCCCTTTTCCTTCTTCTCTTTTTTCTTCCACCTCCGCTGT...
1732.0    TATGCCTACTTGTATACTCGTAATAGTTAGTCTGATCGGATGGGGT...
798.0     TTCCAAGCCTAGGCTTGATATTCGCGCTACGAGCTGAGGAAGCAGA...
2082.0    CTCCGTTGTCCCGCATCGCATCGCATCGCATCGAATCTCCCAAGCG...
2391.0    GCAGTCCCAGCAGGCAATCGACGATCTTGCAGTCCAGATGGTCGAG...
224.0     ATAGCACGCCTGCTGGCTTCAAACTTCTGCGAGCCAAACATGGATC...
843.0     CTGCTTGAGTTCGACAGAAAGTTCTCCCAGATCAATTGAGATACTC...
2723.0    TTCCAGTCTGCGGATTCCCGACATCACTCTCTTTTACACTCTTTTG...
865.0     TTGGAATGCAAGCAACTGCCCAGCAGTTCGCGGATGGATTAACAAT...
dtype: object

Amniculicola_Lignicola: 5357.0    GATGCTTTCCAGCGCGCAGGTGAACGGTGTTCTGAACCGGAAAAGG...
7019.0    TCAGCCGTTTGTTCTTCCGGGGTCCGGCGGCGGATGAATAGAGACG...
5788.0    CCCGGCACGCCCATCGTGGGCTGATGACGCTTTCCCGGTGCACGTC...
4554.0    GCGTGCAGAAACCGGGAGCGAGGCATCCGTGATTGAGGCAGATCTG...
7628.0    TTGCCGTGGTGCAGCTGCCCCGCCAGCGCGACGTTGCGCACCTGCT...
4916

In [562]:
features_file = 'D:/Users/ernes/Documents/1KCL/Zelezniak/DeepExpressionWorking/2018_11_26/gfp_data_combined_output/Ascomycota_GFP.Xval.npy'
#Could not do the same way due to npy vs csv
beginning_sequence = {}
end_sequence = {}

# Load features file as a NumPy array
features_array = np.load(features_file)

# Iterate over species in the species_rows_df DataFrame
for species_index, species in enumerate(species_rows_df.columns[1:]):
    indices = species_rows_df[species]
    indices = indices[indices >= 0].dropna().astype(int)
    indices = indices[indices < len(pred_array)]

    name_parts = species.split("_")[:2]
    name_parts[1] = name_parts[1].capitalize()
    new_species = "_".join(name_parts)

    # Get the index row for the species from features file
    features_row = features_array[indices]

    # Extract specific values for the beginning sequence as integers
    beginning_values = features_row[:, [0, 3]].astype(int)

    # Extract specific values for the end sequence as integers
    end_values = features_row[:, [2, 4]].astype(int)

    # Add the values to the respective dictionaries with index
    beginning_sequence[new_species] = {index: value for index, value in enumerate(beginning_values)}
    end_sequence[new_species] = {index: value for index, value in enumerate(end_values)}

In [563]:
features_file = 'D:/Users/ernes/Documents/1KCL/Zelezniak/DeepExpressionWorking/2018_11_26/gfp_data_combined_output/Ascomycota_GFP.Xval.npy'

top_10_beginning_features = {}
top_10_end_features = {}
bottom_10_beginning_features = {}
bottom_10_end_features = {}

# Load features file as a NumPy array
features_array = np.load(features_file)

# Iterate over species in the df_top_10_indices_updated and df_bottom_10_indices_updated columns
for species in df_top_10_indices_updated.columns:
    top_10_indices = df_top_10_indices_updated[species].astype(int)
    bottom_10_indices = df_bottom_10_indices_updated[species].astype(int)

    name_parts = species.split("_")[:2]
    name_parts[1] = name_parts[1].capitalize()
    new_species = "_".join(name_parts)

    # Get the index rows for the species from features file
    top_10_features = features_array[top_10_indices]
    bottom_10_features = features_array[bottom_10_indices]

    # Extract specific values for the beginning sequence as integers
    top_10_beginning_values = top_10_features[:, [0, 3]].astype(int)
    bottom_10_beginning_values = bottom_10_features[:, [0, 3]].astype(int)

    # Extract specific values for the end sequence as integers
    top_10_end_values = top_10_features[:, [2, 4]].astype(int)
    bottom_10_end_values = bottom_10_features[:, [2, 4]].astype(int)

    # Add the values to the respective dictionaries with index
    top_10_beginning_features[new_species] = {index: value for index, value in enumerate(top_10_beginning_values)}
    top_10_end_features[new_species] = {index: value for index, value in enumerate(top_10_end_values)}
    bottom_10_beginning_features[new_species] = {index: value for index, value in enumerate(bottom_10_beginning_values)}
    bottom_10_end_features[new_species] = {index: value for index, value in enumerate(bottom_10_end_values)}

# Print the dictionaries with index
print("Top 10 Beginning Features Dictionary:")
for species, values in top_10_beginning_features.items():
    print(species)
    for index, value in values.items():
        print(f"{index}: {value}")
    print()

print("Top 10 End Features Dictionary:")
for species, values in top_10_end_features.items():
    print(species)
    for index, value in values.items():
        print(f"{index}: {value}")
    print()

print("Bottom 10 Beginning Features Dictionary:")
for species, values in bottom_10_beginning_features.items():
    print(species)
    for index, value in values.items():
        print(f"{index}: {value}")
    print()

print("Bottom 10 End Features Dictionary:")
for species, values in bottom_10_end_features.items():
    print(species)
    for index, value in values.items():
        print(f"{index}: {value}")
    print()


Top 10 Beginning Features Dictionary:
Aaosphaeria_Arxii
0: [ 35 333]
1: [328 440]
2: [ 87 511]
3: [107 407]
4: [135 433]
5: [ 82 421]
6: [142 377]
7: [185 456]
8: [128 379]
9: [106 457]

Amniculicola_Lignicola
0: [132 646]
1: [ 99 510]
2: [126 740]
3: [205 524]
4: [  7 625]
5: [124 632]
6: [ 88 674]
7: [ 99 620]
8: [119 483]
9: [101 735]

Ampelomyces_Quisqualis
0: [258 563]
1: [ 28 724]
2: [465 547]
3: [152 633]
4: [352 535]
5: [ 69 657]
6: [ 79 537]
7: [ 19 500]
8: [133 656]
9: [438 578]

Ascobolus_Immersus
0: [137 420]
1: [146 476]
2: [ 13 500]
3: [149 486]
4: [ 43 477]
5: [304 432]
6: [ 92 430]
7: [141 429]
8: [ 23 416]
9: [ 77 435]

Ascodesmis_Nigricans
0: [133 485]
1: [  7 375]
2: [ 87 568]
3: [100 445]
4: [ 73 486]
5: [115 500]
6: [118 462]
7: [184 491]
8: [119 525]
9: [130 511]

Aspergillus_Alliaceus
0: [ 99 480]
1: [131 446]
2: [337 393]
3: [108 330]
4: [ 89 444]
5: [229 408]
6: [100 366]
7: [ 46 382]
8: [106 439]
9: [259 546]

Aspergillus_Arachidicola
0: [ 60 360]
1: [322 495]

Aaosphaeria_Arxii
0: [282 650]
1: [470 486]
2: [342 518]
3: [   2 1000]
4: [446 624]
5: [302 600]
6: [404 488]
7: [303 582]
8: [322 597]
9: [431 553]

Amniculicola_Lignicola
0: [498 543]
1: [   1 1000]
2: [482 577]
3: [   1 1000]
4: [480 469]
5: [386 558]
6: [441 477]
7: [327 560]
8: [   2 1000]
9: [   1 1000]

Ampelomyces_Quisqualis
0: [462 544]
1: [468 562]
2: [436 688]
3: [1119  491]
4: [368 682]
5: [356 691]
6: [498 529]
7: [419 561]
8: [471 563]
9: [423 558]

Ascobolus_Immersus
0: [497 546]
1: [475 592]
2: [357 536]
3: [   1 1000]
4: [440 546]
5: [   1 1000]
6: [488 486]
7: [247 568]
8: [   1 1000]
9: [215 634]

Ascodesmis_Nigricans
0: [484 571]
1: [343 595]
2: [454 514]
3: [   1 1000]
4: [461 575]
5: [414 542]
6: [291 633]
7: [420 624]
8: [443 479]
9: [   2 1000]

Aspergillus_Alliaceus
0: [1464  477]
1: [1306  461]
2: [837 510]
3: [722 496]
4: [647 535]
5: [729 532]
6: [507 584]
7: [676 456]
8: [745 458]
9: [706 469]

Aspergillus_Arachidicola
0: [1342  552]
1: [345 557]
2: [702 4

In [564]:
# Creating final dictionaries
top_10_final_sequence = {}
top_10_final_feature = {}
bottom_10_final_sequence = {}
bottom_10_final_feature = {}

for species in df_top_10_indices_updated.columns:
    top_10_final_sequence[species] = []
    top_10_final_feature[species] = []
    bottom_10_final_sequence[species] = []
    bottom_10_final_feature[species] = []
    
    # Generate all possible pairings of sequences and features
    for (beginning_seq, end_seq) in product(top_10_beginning_sequence[species], top_10_end_sequence[species]):
        top_10_final_sequence[species].append(str(beginning_seq) + 'H' + str(end_seq))

    for (beginning_feature, end_feature) in product(top_10_beginning_features[species].values(), top_10_end_features[species].values()):
        top_10_final_feature[species].append(beginning_feature.tolist() + end_feature.tolist())
    
    for (beginning_seq, end_seq) in product(bottom_10_beginning_sequence[species], bottom_10_end_sequence[species]):
        bottom_10_final_sequence[species].append(str(beginning_seq) + 'H' + str(end_seq))

    for (beginning_feature, end_feature) in product(bottom_10_beginning_features[species].values(), bottom_10_end_features[species].values()):
        bottom_10_final_feature[species].append(beginning_feature.tolist() + end_feature.tolist())

In [565]:
print("Top 10 Final Feature Dictionary:")
for species, feature in top_10_final_feature.items():
    print(f"{species}: {feature}\n")

Top 10 Final Feature Dictionary:
Aaosphaeria_Arxii: [[35, 333, 77, 384], [35, 333, 24, 280], [35, 333, 49, 480], [35, 333, 36, 378], [35, 333, 39, 300], [35, 333, 71, 361], [35, 333, 14, 266], [35, 333, 145, 417], [35, 333, 44, 422], [35, 333, 100, 376], [328, 440, 77, 384], [328, 440, 24, 280], [328, 440, 49, 480], [328, 440, 36, 378], [328, 440, 39, 300], [328, 440, 71, 361], [328, 440, 14, 266], [328, 440, 145, 417], [328, 440, 44, 422], [328, 440, 100, 376], [87, 511, 77, 384], [87, 511, 24, 280], [87, 511, 49, 480], [87, 511, 36, 378], [87, 511, 39, 300], [87, 511, 71, 361], [87, 511, 14, 266], [87, 511, 145, 417], [87, 511, 44, 422], [87, 511, 100, 376], [107, 407, 77, 384], [107, 407, 24, 280], [107, 407, 49, 480], [107, 407, 36, 378], [107, 407, 39, 300], [107, 407, 71, 361], [107, 407, 14, 266], [107, 407, 145, 417], [107, 407, 44, 422], [107, 407, 100, 376], [135, 433, 77, 384], [135, 433, 24, 280], [135, 433, 49, 480], [135, 433, 36, 378], [135, 433, 39, 300], [135, 433, 71,

In [566]:
print("Bottom 10 Final Feature Dictionary:")
for species, feature in bottom_10_final_feature.items():
    print(f"{species}: {feature}\n")

Bottom 10 Final Feature Dictionary:
Aaosphaeria_Arxii: [[212, 511, 282, 650], [212, 511, 470, 486], [212, 511, 342, 518], [212, 511, 2, 1000], [212, 511, 446, 624], [212, 511, 302, 600], [212, 511, 404, 488], [212, 511, 303, 582], [212, 511, 322, 597], [212, 511, 431, 553], [423, 542, 282, 650], [423, 542, 470, 486], [423, 542, 342, 518], [423, 542, 2, 1000], [423, 542, 446, 624], [423, 542, 302, 600], [423, 542, 404, 488], [423, 542, 303, 582], [423, 542, 322, 597], [423, 542, 431, 553], [363, 513, 282, 650], [363, 513, 470, 486], [363, 513, 342, 518], [363, 513, 2, 1000], [363, 513, 446, 624], [363, 513, 302, 600], [363, 513, 404, 488], [363, 513, 303, 582], [363, 513, 322, 597], [363, 513, 431, 553], [232, 429, 282, 650], [232, 429, 470, 486], [232, 429, 342, 518], [232, 429, 2, 1000], [232, 429, 446, 624], [232, 429, 302, 600], [232, 429, 404, 488], [232, 429, 303, 582], [232, 429, 322, 597], [232, 429, 431, 553], [322, 424, 282, 650], [322, 424, 470, 486], [322, 424, 342, 518], [3

In [567]:
print("Top 10 Final Sequence Dictionary:")
for species, feature in top_10_final_sequence.items():
    print(f"{species}: {feature}\n")

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [568]:
print("Bottom 10 Final Feature Dictionary:")
for species, feature in bottom_10_final_sequence.items():
    print(f"{species}: {feature}\n")

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [261]:
pd.DataFrame(top_10_final_sequence).transpose().to_csv('D:/Users/ernes/Documents/1KCL/Zelezniak/DeepExpressionWorking/2018_11_26/gfp_data_combined_output/top_10_final_sequence.csv')
pd.DataFrame(bottom_10_final_sequence).transpose().to_csv('D:/Users/ernes/Documents/1KCL/Zelezniak/DeepExpressionWorking/2018_11_26/gfp_data_combined_output/bottom_10_final_sequence.csv')

In [372]:
from tensorflow.keras.utils import to_categorical
import numpy as np

def zero_padding(inp,length,start):
    #Zero pad input one hot matrix to desired length
    #Start .. boolean if pad start of sequence (True) or end (False)
    assert len(inp) < length
    out = np.zeros((length,inp.shape[1]))
    if start:
        out[-inp.shape[0]:] = inp
    else:
        out[0:inp.shape[0]] = inp
    return out

def NT_to_onehot(data):
    alphabet = 'ACGT'
    char_to_int = dict((c, i) for i, c in enumerate(alphabet))
    integer_encoded = []
    for char in data:
        if char in alphabet:
            integer_encoded.append(char_to_int[char])
        else:
            integer_encoded.append(0)
    onehot = np.array(to_categorical(integer_encoded,num_classes=4),dtype=np.int8)
    onehot[[char not in alphabet for char in data]] = [0,0,0,0]
    return onehot

In [335]:
# Read the CSV file into a DataFrame
df = pd.read_csv('D:/Users/ernes/Documents/1KCL/Zelezniak/DeepExpressionWorking/2018_11_26/gfp_data_combined_output/top_10_final_sequence.csv')

# Drop the species column
df = df.drop(df.columns[0], axis=1)

# Convert the DataFrame into a Series, dropping any NaN values
sequences = df.stack().reset_index(drop=True)

# Convert the Series back into a DataFrame
df_sequences = pd.DataFrame(sequences, columns=['Sequence'])

print(df_sequences)

In [399]:
# Split 'Sequence' column on 'H', expand=True splits the string into separate columns
df_split = df_sequences['Sequence'].str.split('H', expand=True)

# Rename columns
df_split.columns = ['prom', '5UTR', '3UTR', 'term']

# Replace NaN values with an empty string
df_split = df_split.fillna('')

# Display the updated dataframe
print(df_split.head())

                                                prom  \
0  TATAGTGTTTGGTGGAAATAGTAAGGAGTGGTGTATGGATATTTCT...   
1  TATAGTGTTTGGTGGAAATAGTAAGGAGTGGTGTATGGATATTTCT...   
2  TATAGTGTTTGGTGGAAATAGTAAGGAGTGGTGTATGGATATTTCT...   
3  TATAGTGTTTGGTGGAAATAGTAAGGAGTGGTGTATGGATATTTCT...   
4  TATAGTGTTTGGTGGAAATAGTAAGGAGTGGTGTATGGATATTTCT...   

                                   5UTR  \
0  AGTGTGTTCATACACCATATTGTTACTTCATACAGA   
1  AGTGTGTTCATACACCATATTGTTACTTCATACAGA   
2  AGTGTGTTCATACACCATATTGTTACTTCATACAGA   
3  AGTGTGTTCATACACCATATTGTTACTTCATACAGA   
4  AGTGTGTTCATACACCATATTGTTACTTCATACAGA   

                                                3UTR  \
0  AGTGTGCTAAAATATTGCAAGTTGATTACGGTATTTACTATCCCAT...   
1                          AGAATTTCCTAGTCCATTTTCTTTT   
2  GGATGGGATATCTACCAAGCATGCTATGCTATGTTCGAAACCGCAA...   
3              CACTTATTCTTTCTTCATGCGTGAAGTTACCTTTGAC   
4           ACGATTTTCGAGTTCAAATAAAATAATGCTCTGTTCTTTG   

                                                term  
0  AGGGGGGCG

In [461]:
def process_onehot_vars(df):
    len_utr5 = int(300)
    len_utr3 = int(350)

    onehot_data = []
    
    for i in range(len(df)):
        row_data = []
        # one hot
        row_data.append(NT_to_onehot(df.loc[i, 'prom']))
        row_data.append(NT_to_onehot(df.loc[i, 'term']))
        
        # zero padding
        utr5 = df.loc[i, '5UTR']
        assert (len(utr5) <= len_utr5)
        if len(utr5) == len_utr5: # no zero padding 
            row_data.append(NT_to_onehot(utr5))
        elif len(utr5) > 0: # zero padding
            row_data.append(zero_padding(NT_to_onehot(utr5),len_utr5,True))
        else: # pass np.zeros(1,4)
            row_data.append(zero_padding(np.zeros((1,4)),len_utr5,True))

        utr3 = df.loc[i, '3UTR']
        assert (len(utr3) <= len_utr3)     
        if len(utr3) == len_utr3:
            row_data.append(NT_to_onehot(utr3))
        elif len(utr3) > 0: # zero padding
            row_data.append(zero_padding(NT_to_onehot(utr3),len_utr3,True))
        else:
            row_data.append(zero_padding(np.zeros((1,4)),len_utr3,True))
            
        onehot_data.append(row_data)
        
    return onehot_data

def generate_output_arrays(onehot_data):
    '''make separate output arrays for one_hots and variables'''
    Xhot = []

    for row in onehot_data:
        tmp = []
        tmp.append(np.asarray(row[0],dtype=np.int8)) #prom
        tmp.append(np.asarray(row[2],dtype=np.int8)) #5UTR
        tmp.append(np.asarray(row[3],dtype=np.int8)) #3UTR
        tmp.append(np.asarray(row[1],dtype=np.int8)) #term
        Xhot.append(np.concatenate(tmp))

    return Xhot


In [535]:
df_split_process = df_split
onehot_data = process_onehot_vars(df_split_process)
Xhot = generate_output_arrays(onehot_data)

# Create a placeholder for the stacked arrays
stacked_array = np.empty((0, 2150, 4))

# List to store indices of arrays with incorrect shape
top_incorrect_shape_indices = []

# Loop over the elements in Xhot and stack them
for i in range(len(Xhot)):
    print(i)
    array_to_stack = np.asarray(Xhot[i])
    
    # Check if the array shape is (2150, 4)
    if array_to_stack.shape != (2150, 4):
        # Store the index of the array with incorrect shape
        top_incorrect_shape_indices.append(i)
        # Skip this iteration
        continue

    # Ensure that the array is 3D
    array_to_stack = np.expand_dims(array_to_stack, axis=0)
    # Stack the array
    stacked_array = np.vstack((stacked_array, array_to_stack))

print(stacked_array.shape)
print("Indices with incorrect shape:", top_incorrect_shape_indices)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
27

1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061


3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702


In [536]:
shape = stacked_array.shape
print("Shape:", shape)

Shape: (4890, 2150, 4)


In [537]:
output_folder = 'gfp_data_permutate'
fname = 'top_permutate'
append=''
np.save(os.path.join(output_folder, fname+'.Xhot'+str(append)), stacked_array)

In [541]:
# Read the CSV file into a DataFrame
df = pd.read_csv('D:/Users/ernes/Documents/1KCL/Zelezniak/DeepExpressionWorking/2018_11_26/gfp_data_combined_output/bottom_10_final_sequence.csv')

# Drop the species column
df = df.drop(df.columns[0], axis=1)

# Convert the DataFrame into a Series, dropping any NaN values
sequences = df.stack().reset_index(drop=True)

# Convert the Series back into a DataFrame
df_sequences = pd.DataFrame(sequences, columns=['Sequence'])

print(df_sequences)

                                               Sequence
0     CCTTCGTGTCCGCCGCGTCCATAGCCAGCGGCGCGATCGCTACCTA...
1     CCTTCGTGTCCGCCGCGTCCATAGCCAGCGGCGCGATCGCTACCTA...
2     CCTTCGTGTCCGCCGCGTCCATAGCCAGCGGCGCGATCGCTACCTA...
3     CCTTCGTGTCCGCCGCGTCCATAGCCAGCGGCGCGATCGCTACCTA...
4     CCTTCGTGTCCGCCGCGTCCATAGCCAGCGGCGCGATCGCTACCTA...
...                                                 ...
4895  CTTTCCTTTGTATCTGGGTTAGATCATGTCTCTGTAGGAGGTGTTG...
4896  CTTTCCTTTGTATCTGGGTTAGATCATGTCTCTGTAGGAGGTGTTG...
4897  CTTTCCTTTGTATCTGGGTTAGATCATGTCTCTGTAGGAGGTGTTG...
4898  CTTTCCTTTGTATCTGGGTTAGATCATGTCTCTGTAGGAGGTGTTG...
4899  CTTTCCTTTGTATCTGGGTTAGATCATGTCTCTGTAGGAGGTGTTG...

[4900 rows x 1 columns]


In [542]:
# Split 'Sequence' column on 'H', expand=True splits the string into separate columns
df_split = df_sequences['Sequence'].str.split('H', expand=True)

# Rename columns
df_split.columns = ['prom', '5UTR', '3UTR', 'term']

# Replace NaN values with an empty string
df_split = df_split.fillna('')

# Display the updated dataframe
print(df_split.head())

                                                prom  \
0  CCTTCGTGTCCGCCGCGTCCATAGCCAGCGGCGCGATCGCTACCTA...   
1  CCTTCGTGTCCGCCGCGTCCATAGCCAGCGGCGCGATCGCTACCTA...   
2  CCTTCGTGTCCGCCGCGTCCATAGCCAGCGGCGCGATCGCTACCTA...   
3  CCTTCGTGTCCGCCGCGTCCATAGCCAGCGGCGCGATCGCTACCTA...   
4  CCTTCGTGTCCGCCGCGTCCATAGCCAGCGGCGCGATCGCTACCTA...   

                                                5UTR  \
0  CTATAATGCGACACGGTGAATACCGACTAGTAGGAGGAGGAGTAGC...   
1  CTATAATGCGACACGGTGAATACCGACTAGTAGGAGGAGGAGTAGC...   
2  CTATAATGCGACACGGTGAATACCGACTAGTAGGAGGAGGAGTAGC...   
3  CTATAATGCGACACGGTGAATACCGACTAGTAGGAGGAGGAGTAGC...   
4  CTATAATGCGACACGGTGAATACCGACTAGTAGGAGGAGGAGTAGC...   

                                                3UTR  \
0  GACGCGACTGCTGAGGAAAACAATACTGACTGACAGACAGACAGGC...   
1  TTGGGCTCCCCACCTCCCCCTTAGTCTGGAAGGGCAACAGAAGAAC...   
2  GGGGTGCGTGTGCACGTCCGCTTGGCATATGAGCGTTTCCTTTTCA...   
3                                                GCC   
4  CAGTTCCCTCAAGACCTTCAAATCGTCCTCACGAATACCCACC

In [547]:
df_split_process = df_split
onehot_data = process_onehot_vars(df_split_process)
Xhot = generate_output_arrays(onehot_data)

# Create a placeholder for the stacked arrays
stacked_array = np.empty((0, 2150, 4))

# List to store indices of arrays with incorrect shape
bottom_incorrect_shape_indices = []

# Loop over the elements in Xhot and stack them
for i in range(len(Xhot)):
    print(i)
    array_to_stack = np.asarray(Xhot[i])
    
    # Check if the array shape is (2150, 4)
    if array_to_stack.shape != (2150, 4):
        # Store the index of the array with incorrect shape
        bottom_incorrect_shape_indices.append(i)
        # Skip this iteration
        continue

    # Ensure that the array is 3D
    array_to_stack = np.expand_dims(array_to_stack, axis=0)
    # Stack the array
    stacked_array = np.vstack((stacked_array, array_to_stack))

print(stacked_array.shape)
print("Indices with incorrect shape:", bottom_incorrect_shape_indices)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
27

1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062


3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702


In [548]:
shape = stacked_array.shape
print("Shape:", shape)

Shape: (4900, 2150, 4)


In [549]:
output_folder = 'gfp_data_permutate'
fname = 'bottom_permutate'
append=''
np.save(os.path.join(output_folder, fname+'.Xhot'+str(append)), stacked_array)

In [554]:
features_file = np.load('D:/Users/ernes/Documents/1KCL/Zelezniak/DeepExpressionWorking/2018_11_26/gfp_data_combined_output/Ascomycota_GFP.Xval.npy')

features_file.shape

(168066, 72)

In [574]:
import numpy as np

# Load the features_file
features_file = np.load('D:/Users/ernes/Documents/1KCL/Zelezniak/DeepExpressionWorking/2018_11_26/gfp_data_combined_output/Ascomycota_GFP.Xval.npy')

# Extract the desired rows (up to row 4900)
bottom_features_permute_final_np = np.copy(features_file[:4900])
top_features_permute_final_np = np.copy(features_file[:4900])

# Print the shapes of the copied arrays
print("Shape of bottom_features_permute_final_np:", bottom_features_permute_final_np.shape)
print("Shape of top_features_permute_final_np:", top_features_final_np.shape)

Shape of bottom_features_permute_final_np: (4900, 72)
Shape of top_features_final_np: (4900, 72)


In [569]:
import pandas as pd

# Flatten the lists and create a new dataframe
df_top = pd.DataFrame([item for sublist in top_10_final_feature.values() for item in sublist])
df_bottom = pd.DataFrame([item for sublist in bottom_10_final_feature.values() for item in sublist])

In [570]:
print(df_top)

       0    1   2    3
0     35  333  77  384
1     35  333  24  280
2     35  333  49  480
3     35  333  36  378
4     35  333  39  300
...   ..  ...  ..  ...
4895  94  452  37  236
4896  94  452  76  324
4897  94  452  60  245
4898  94  452  76  363
4899  94  452  41  357

[4900 rows x 4 columns]


In [571]:
print(df_bottom)

        0    1    2     3
0     212  511  282   650
1     212  511  470   486
2     212  511  342   518
3     212  511    2  1000
4     212  511  446   624
...   ...  ...  ...   ...
4895  290  515  369   540
4896  290  515  683   437
4897  290  515   17   777
4898  290  515  353   542
4899  290  515  162   588

[4900 rows x 4 columns]


In [576]:
print(top_features_final_np)

[[  13. 1014.   84. ...    5.    5.    9.]
 [ 227. 1014.  102. ...    5.    5.    9.]
 [ 252. 1014.  196. ...    5.    5.    9.]
 ...
 [ 237. 1014.  194. ...    5.    5.    9.]
 [ 118. 1014.  241. ...    5.    5.    9.]
 [ 241. 1014.   16. ...    5.    5.    9.]]


In [577]:
print(bottom_features_permute_final_np)

[[  13. 1014.   84. ...    5.    5.    9.]
 [ 227. 1014.  102. ...    5.    5.    9.]
 [ 252. 1014.  196. ...    5.    5.    9.]
 ...
 [ 237. 1014.  194. ...    5.    5.    9.]
 [ 118. 1014.  241. ...    5.    5.    9.]
 [ 241. 1014.   16. ...    5.    5.    9.]]


In [584]:
def replace_values(df, np_array):
    for i in range(len(df)):
        row = df.iloc[i]
        # replace values in np_array with corresponding rows from dataframe
        np_array[i, 0] = row[0]
        np_array[i, 3] = row[1]
        np_array[i, 2] = row[2]
        np_array[i, 4] = row[3]
        
    return np_array

top_features_permute_final_np = replace_values(df_top, top_features_permute_final_np)
bottom_features_permute_final_np = replace_values(df_bottom, bottom_features_permute_final_np)

In [585]:
top_features_permute_final_np = np.delete(top_features_permute_final_np, top_incorrect_shape_indices, axis=0)
bottom_features_permute_final_np = np.delete(bottom_features_permute_final_np, bottom_incorrect_shape_indices, axis=0)

In [586]:
def save_np_array(fname, np_array):
    output_folder = 'gfp_data_permutate'
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    # Save numpy array as .npy file
    np.save(os.path.join(output_folder, fname+'.npy'), np_array)

    # Print shape information
    print(f"Shape of {fname}: {np_array.shape}")

save_np_array('top_features_permute_final', top_features_permute_final_np)
save_np_array('bottom_features_permute_final', bottom_features_permute_final_np)


Shape of top_features_permute_final: (4890, 72)
Shape of bottom_features_permute_final: (4900, 72)
