#### NCI-60 molecular characterization data from [CellMinerCDB](https://discover.nci.nih.gov/rsconnect/cellminercdb/), including:

- Gene expression: 
    - data_NCI-60_xsq.txt
    - footnotes_NCI-60_xsq.csv
    - Cell_line_annotation_nci60.txt
    - Table_Drugs_Synonyms_cdb.txt

- microRNA expression:
    - data_NCI-60_mir.txt
    - footnotes_NCI-60_mir.csv
    - Cell_line_annotation_nci60.txt
    - Table_Drugs_Synonyms_cdb.txt

- Protein abundance:
    - data_NCI-60_swa.txt
    - footnotes_NCI-60_swa.csv
    - Cell_line_annotation_nci60.txt
    - Table_Drugs_Synonyms_cdb.txt

In [1]:
# Read the Gene expression: data_NCI-60_xsq.txt file

import pandas as pd

# Path to the tab-separated gene expression table
txt_file_path = "/nfs/turbo/med-kayvan-lab/Projects/DrugCombination/b-DrugCombination/DC_Data/NCI60-GeneExpression/data_NCI-60_xsq.txt"

# Read the text file into a DataFrame
df = pd.read_csv(txt_file_path, sep='\t')

# Summarize the DataFrame. DataFrame.info() writes its report itself and
# returns None, so it is called bare: wrapping it in print() appended a
# stray "None" line to the output.
df.info()
print(df.head())

# Save the DataFrame to a new CSV file in the current working directory
df.to_csv("NCI60_GeneExpression.csv", index=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23826 entries, 0 to 23825
Data columns (total 65 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   ID              23826 non-null  object 
 1   chr             23826 non-null  object 
 2   txStart         23826 non-null  int64  
 3   txEnd           23826 non-null  int64  
 4   entrez.gene.id  23804 non-null  float64
 5   BR:MCF7         23826 non-null  float64
 6   BR:MDA-MB-231   23826 non-null  float64
 7   BR:HS 578T      23826 non-null  float64
 8   BR:BT-549       23826 non-null  float64
 9   BR:T-47D        23826 non-null  float64
 10  CNS:SF-268      23826 non-null  float64
 11  CNS:SF-295      23826 non-null  float64
 12  CNS:SF-539      23826 non-null  float64
 13  CNS:SNB-19      23826 non-null  float64
 14  CNS:SNB-75      23826 non-null  float64
 15  CNS:U251        23826 non-null  float64
 16  CO:COLO 205     23826 non-null  float64
 17  CO:HCC-2998     23826 non-null 

In [4]:
# Read the microRNA expression: data_NCI-60_mir.txt file

import pandas as pd

# Path to the tab-separated microRNA expression table
txt_file_path = "/nfs/turbo/med-kayvan-lab/Projects/DrugCombination/b-DrugCombination/DC_Data/NCI60-microRNA/data_NCI-60_mir.txt"

# Read the text file into a DataFrame
df = pd.read_csv(txt_file_path, sep='\t')

# Summarize the DataFrame. DataFrame.info() writes its report itself and
# returns None, so it is called bare: wrapping it in print() appended a
# stray "None" line to the output.
df.info()
print(df.head())

# Save the DataFrame to a new CSV file in the current working directory
df.to_csv("NCI60_microRNAExpression.csv", index=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 417 entries, 0 to 416
Data columns (total 71 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Probe id           417 non-null    object 
 1   Gene name          0 non-null      float64
 2   Entrez gene id     0 non-null      float64
 3   Chromosome         417 non-null    object 
 4   Start              417 non-null    int64  
 5   End                417 non-null    int64  
 6   Cytoband           0 non-null      float64
 7   RefSeq(mRNA)       0 non-null      float64
 8   RefSeq(protein)    0 non-null      float64
 9   miRNA Accession #  417 non-null    object 
 10  MirBase Name       417 non-null    object 
 11  BR:MCF7            417 non-null    float64
 12  BR:MDA-MB-231      416 non-null    float64
 13  BR:HS 578T         417 non-null    float64
 14  BR:BT-549          417 non-null    float64
 15  BR:T-47D           417 non-null    float64
 16  CNS:SF-268         417 non

In [10]:
# Read the Protein abundance: data_NCI-60_swa.txt file

import pandas as pd
import numpy as np

# Path to the tab-separated protein abundance table
txt_file_path = "/nfs/turbo/med-kayvan-lab/Projects/DrugCombination/b-DrugCombination/DC_Data/NCI60-Protein/data_NCI-60_swa.txt"

# Zero-based position of the first value column; the transformation below is
# applied from this column (the 11th) through the last column.
FIRST_CELL_LINE_COL = 10

# Read the text file into a DataFrame
df = pd.read_csv(txt_file_path, sep='\t')

# Apply log2(x + 1) to every column from the 11th to the last in one
# vectorized call (np.log2 works element-wise on the whole sub-frame).
# NOTE(review): confirm the source values are raw, not already log-scaled,
# before relying on this transformation.
df.iloc[:, FIRST_CELL_LINE_COL:] = np.log2(df.iloc[:, FIRST_CELL_LINE_COL:] + 1)

# Summarize the DataFrame. DataFrame.info() writes its report itself and
# returns None, so it is called bare: wrapping it in print() appended a
# stray "None" line to the output.
df.info()
print(df.head())

# Save the transformed DataFrame to a new CSV file in the current working directory
df.to_csv("NCI60_ProteinAbundance_Log2Transformed.csv", index=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3167 entries, 0 to 3166
Data columns (total 70 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   ID              3167 non-null   object 
 1   probe_nm        3167 non-null   object 
 2   uniprot_id      3167 non-null   object 
 3   Gene_name       3167 non-null   object 
 4   Entrez_gene     3167 non-null   int64  
 5   Chromosome      3167 non-null   object 
 6   Start_pos       3167 non-null   int64  
 7   End_pos         3167 non-null   int64  
 8   Cytoband        3167 non-null   object 
 9   M_score         3167 non-null   float64
 10  BR:MCF7         3167 non-null   float64
 11  BR:MDA-MB-231   3167 non-null   float64
 12  BR:HS 578T      3167 non-null   float64
 13  BR:BT-549       3167 non-null   float64
 14  BR:T-47D        3167 non-null   float64
 15  CNS:SF-268      3167 non-null   float64
 16  CNS:SF-295      3167 non-null   float64
 17  CNS:SF-539      3167 non-null   f

In [33]:
# Concatenate three dataframes, make it as "60CellLinesDescriptor.csv"

import pandas as pd

# Number of leading non cell line (annotation) columns in each table
N_GENE_ANNOTATION_COLS = 5
N_MICRORNA_ANNOTATION_COLS = 11
N_PROTEIN_ANNOTATION_COLS = 10

# Load the three dataframes
gene_expression_df = pd.read_csv("NCI60_GeneExpression.csv")
microRNA_expression_df = pd.read_csv("NCI60_microRNAExpression.csv")
protein_abundance_df = pd.read_csv("NCI60_ProteinAbundance_Log2Transformed.csv")

# Prefix the non cell line column names with their source table so they stay
# distinct after concatenation. Each table is renamed in a single call
# instead of rebuilding the frame once per column.
gene_expression_df = gene_expression_df.rename(
    columns={name: f"GeneExp_{name}" for name in gene_expression_df.columns[:N_GENE_ANNOTATION_COLS]}
)
microRNA_expression_df = microRNA_expression_df.rename(
    columns={name: f"microRNAExp_{name}" for name in microRNA_expression_df.columns[:N_MICRORNA_ANNOTATION_COLS]}
)
protein_abundance_df = protein_abundance_df.rename(
    columns={name: f"ProteinExp_{name}" for name in protein_abundance_df.columns[:N_PROTEIN_ANNOTATION_COLS]}
)

# Concatenate the dataframes vertically (rows stacked). With join='outer'
# the columns are the union of all three tables; cells with no value in a
# given source table are filled with NaN.
concatenated_df = pd.concat(
    [gene_expression_df, microRNA_expression_df, protein_abundance_df],
    join='outer',
    ignore_index=True,
)

# Move the cell line columns to the end. They are taken from the gene
# expression table (everything after its annotation columns) rather than
# from a hard-coded positional slice, so the step does not depend on the
# exact number of columns in that table.
columns_to_move = gene_expression_df.columns[N_GENE_ANNOTATION_COLS:]
cell_line_names = set(columns_to_move)
annotation_columns = [name for name in concatenated_df.columns if name not in cell_line_names]
concatenated_df = concatenated_df[annotation_columns + list(columns_to_move)]

# Display information about the concatenated dataframe. DataFrame.info()
# writes its report itself and returns None, so it is not wrapped in print().
concatenated_df.info()

# Save the dataframe to a new CSV file
concatenated_df.to_csv("60CellLinesDescriptor.csv", index=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27410 entries, 0 to 27409
Data columns (total 86 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   GeneExp_ID                     23826 non-null  object 
 1   GeneExp_chr                    23826 non-null  object 
 2   GeneExp_txStart                23826 non-null  float64
 3   GeneExp_txEnd                  23826 non-null  float64
 4   GeneExp_entrez.gene.id         23804 non-null  float64
 5   microRNAExp_Probe id           417 non-null    object 
 6   microRNAExp_Gene name          0 non-null      float64
 7   microRNAExp_Entrez gene id     0 non-null      float64
 8   microRNAExp_Chromosome         417 non-null    object 
 9   microRNAExp_Start              417 non-null    float64
 10  microRNAExp_End                417 non-null    float64
 11  microRNAExp_Cytoband           0 non-null      float64
 12  microRNAExp_RefSeq(mRNA)       0 non-null     