# (Purpose) Prepare the project folder once you have the aggregated datasets

- Notice that the files have over 50000 columns (= genes), so they have not been filtered for protein-coding genes yet.
- Notice that the number of columns is the same for both datasets, so both have already been cleaned to share the same common genes.

In [None]:
# Entry point of the notebook: announce the precondition, pick the independent
# dataset, define the input paths and print the size of both input files.
echo "   NOTICE that at this point, you should have the final cleaned TCGA/GTEx datasets generated"
pwd
echo

echo "     create new variables that will be used for the copied files in this portion of the pipeline"
# Independent dataset paired with TCGA; keep exactly one assignment active.
WORKING_WITH_INDEPENDENT="GTEX"
# WORKING_WITH_INDEPENDENT="ICGC"
# WORKING_WITH_INDEPENDENT="GEO"
# WORKING_WITH_INDEPENDENT="ICGCGEO"

# Input files. The "gtex" variable holds whichever independent dataset is in
# use; swap the active line together with WORKING_WITH_INDEPENDENT above.
tcga_log2tpm_ALL_GENES="data/tcga_original_data/TCGA.TPM.log.txt"       # local computer
gtex_log2tpm_ALL_GENES="data/gtex_original_data/GTEX.TPM.log.txt"       # local computer
# gtex_log2tpm_ALL_GENES="data/icgc_original_data/ICGC.TPM.log.txt"       # local computer
# gtex_log2tpm_ALL_GENES="data/geo_original_data/GEO.TPM.log.txt"         # local computer
# gtex_log2tpm_ALL_GENES="data/icgcgeo_original_data/ICGC.GEO.TPM.log.txt"         # local computer

# Work out the one-line notice for the selected dataset, then print it framed
# by the separator line.
separator_line=" -----------------------------------------------------------------------"
case "$WORKING_WITH_INDEPENDENT" in
    GTEX)    selection_notice=" [NOTICE] WORKING_WITH_INDEPENDENT is GTEX" ;;
    ICGC)    selection_notice=" [NOTICE] WORKING_WITH_INDEPENDENT is ICGC" ;;
    GEO)     selection_notice=" [NOTICE] WORKING_WITH_INDEPENDENT is GEO" ;;
    ICGCGEO) selection_notice=" [NOTICE] WORKING_WITH_INDEPENDENT is ICGC and GEO combined" ;;
    *)       selection_notice=" [ERROR] WORKING_WITH_INDEPENDENT is not defined as expected" ;;
esac
echo "$separator_line"
echo "$selection_notice"
echo "$separator_line"

# Print a long listing of a dataset file, its line count and, when the line
# count succeeded, the number of tab-separated fields in its first line.
#   $1 - dataset name used in the message
#   $2 - path to the dataset file
show_file_dimensions() {
    echo "     checking $1 file; display dimensions too"
    ls -lF "$2"
    wc -l "$2" && head -1 "$2" | awk -F"\t" '{print NF; exit}'
}

show_file_dimensions "TCGA" "$tcga_log2tpm_ALL_GENES"
echo
show_file_dimensions "GTEx" "$gtex_log2tpm_ALL_GENES"

## Make necessary directories for entire project folder

In [None]:
# Build the results/ tree in a single call. -p makes the cell safe to re-run
# (existing directories are not an error) and -v reports each directory that
# is actually created.
mkdir -pv \
    results/ \
    results/calculations \
    results/classification_CV_RESULTS \
    results/classification_PCA_PLOTS \
    results/classification_PCA_PLOTS/pdfs_without_a_title \
    results/classification_UMAP_PLOTS \
    results/classification_UMAP_PLOTS/pdfs_without_a_title \
    results/classification_PREDICTIONS \
    results/classification_PREDICTIONS/excel_files

## Prepare files for preprocessing procedures

In [None]:
# Copy the two aggregated input files into data/preprocessing_combinations/ so
# that every later step edits the copies and the original files stay untouched.
# Reads:  tcga_log2tpm_ALL_GENES, gtex_log2tpm_ALL_GENES (set in the first cell)
# Writes: tcga_unscaled_unnormalized_nobatchcorrection,
#         gtex_unscaled_unnormalized_nobatchcorrection (paths of the copies)
echo "   Copy the aggregated datasets into our new folder so the original file remains untouched"

echo "     create new variables that will be used for the copied files in this portion of the pipeline"
tcga_unscaled_unnormalized_nobatchcorrection=data/preprocessing_combinations/tcga_unscaled_unnormalized_nobatchcorrection.tsv
gtex_unscaled_unnormalized_nobatchcorrection=data/preprocessing_combinations/gtex_unscaled_unnormalized_nobatchcorrection.tsv

# cp does not create missing parent directories, and the directory-setup cell
# of this notebook only creates results/. Make sure the destination exists;
# with -p this is a no-op when it is already there.
mkdir -pv data/preprocessing_combinations

echo "     copying over the final versions of cleaned tcga and gtex datasets"
cp -v -- "$tcga_log2tpm_ALL_GENES" "$tcga_unscaled_unnormalized_nobatchcorrection"
cp -v -- "$gtex_log2tpm_ALL_GENES" "$gtex_unscaled_unnormalized_nobatchcorrection"

In [None]:
# Rename the first two header columns of a TSV in place:
#   column 1 'ID'   -> 'sample_id'
#   column 2 'Type' -> 'label'
# Arguments: $1 - path of the TSV file to edit
# Returns:   0 on success; non-zero if the file is missing or sed fails
#
# The match is anchored to the exact column instead of "first occurrence in
# the header line". An unanchored 1s/ID/sample_id/ rewrites the first gene name
# containing "ID" (for example IDH1) as soon as column 1 is no longer literally
# "ID", which is exactly what happens when this cell is run a second time.
# Anchoring makes the rename a no-op on an already renamed file.
rename_header_columns() {
    local file=$1
    local tab=$'\t'
    local tmp

    if [ ! -f "$file" ]; then
        echo "   [ERROR] cannot rename header columns; file not found: '$file'" >&2
        return 1
    fi

    # Scratch file next to the target (same filesystem, so mv is a rename) with
    # a per-process name, so the two datasets never share one tmp.tsv.
    tmp="${file}.tmp.$$"
    if sed -E \
        -e "1s/^ID(${tab}|\$)/sample_id\1/" \
        -e "1s/^([^${tab}]*${tab})Type(${tab}|\$)/\1label\2/" \
        "$file" > "$tmp"; then
        mv -f -- "$tmp" "$file"
    else
        rm -f -- "$tmp"
        return 1
    fi
}

# cut splits on tabs by default, so no explicit delimiter is needed for the previews.
echo "   Checking the header before making a change"
head -n 1 "$tcga_unscaled_unnormalized_nobatchcorrection" | cut -f1-5
head -n 1 "$gtex_unscaled_unnormalized_nobatchcorrection" | cut -f1-5

echo "   Changing the column name of 'ID' to 'sample_id' in each dataset that was copied over"
echo "   Changing the column name of 'Type' to 'label' in each dataset that was copied over"
rename_header_columns "$tcga_unscaled_unnormalized_nobatchcorrection"
rename_header_columns "$gtex_unscaled_unnormalized_nobatchcorrection"

echo "   Checking the header after making a change"
head -n 1 "$tcga_unscaled_unnormalized_nobatchcorrection" | cut -f1-5
head -n 1 "$gtex_unscaled_unnormalized_nobatchcorrection" | cut -f1-5

In [None]:
echo "   Sorting the files by label(2nd column) then by sample_id(1st column); keeping the header as first line"
# (source) https://unix.stackexchange.com/questions/78925/how-to-sort-by-multiple-columns

# Sort the data rows of a TSV in place by column 2, then column 1, keeping the
# first line (header) at the top.
# Arguments: $1 - path of the TSV file to sort
# Returns:   0 on success; non-zero if the file is missing or head/sort fails
#
# The sorted output is assembled in a scratch file beside the target and only
# moved over the dataset when both the header copy and the sort succeeded, so a
# failed sort can no longer leave a truncated dataset behind. The scratch name
# is per-file and per-process instead of a shared tmp.tsv in the working dir.
# NOTE(review): the resulting row order follows the current locale's collation,
# as before; prefix sort with LC_ALL=C if a byte-wise order is wanted.
sort_by_label_then_id() {
    local file=$1
    local tab=$'\t'
    local tmp

    if [ ! -f "$file" ]; then
        echo "     [ERROR] cannot sort; file not found: '$file'" >&2
        return 1
    fi

    tmp="${file}.tmp.$$"
    # Without pipefail the status of "tail | sort" is sort's own status.
    if { head -n 1 "$file" && tail -n +2 "$file" | sort -t "$tab" -k2,2 -k1,1; } > "$tmp"; then
        mv -f -- "$tmp" "$file"
    else
        rm -f -- "$tmp"
        return 1
    fi
}

# Show the first 3 lines / 5 columns of a dataset before and after sorting it.
#   $1 - dataset name used in the messages
#   $2 - path of the TSV file
sort_dataset_with_preview() {
    echo "     previewing $1 data before sort"
    head -n 3 "$2" | cut -f1-5

    echo "     begin $1 data sort"
    sort_by_label_then_id "$2"

    echo "     previewing $1 data after sort"
    head -n 3 "$2" | cut -f1-5
}

sort_dataset_with_preview "tcga" "$tcga_unscaled_unnormalized_nobatchcorrection"
sort_dataset_with_preview "gtex" "$gtex_unscaled_unnormalized_nobatchcorrection"

In [None]:
# Assemble a small "mock" file from fixed line windows of a source file.
# Arguments:
#   $1   - source file
#   $2   - destination file (overwritten)
#   $3   - number of leading lines to copy (includes the header line)
#   $4   - number of trailing lines to copy
#   $5.. - windows written as END:COUNT; each one appends the output of
#          "head -n END | tail -n COUNT", in the order given
# The leading lines are written first, then the windows, then the trailing lines.
build_mock_file() {
    local src=$1 dest=$2 leading=$3 trailing=$4
    local window
    shift 4

    head -n "$leading" "$src" > "$dest"
    for window in "$@"; do
        head -n "${window%%:*}" "$src" | tail -n "${window##*:}" >> "$dest"
    done
    tail -n "$trailing" "$src" >> "$dest"
}

echo "   Creating Mock data version of the tcga datasets"
tcga__mockData=data/preprocessing_combinations/tcga_unscaled_unnormalized_nobatchcorrection__mockData.tsv
# Only the first 5 columns are kept. tmp_mock.tsv is removed by the cleanup
# cell at the end of the notebook.
cut -f1-5 "$tcga_unscaled_unnormalized_nobatchcorrection" > tmp_mock.tsv
# END:COUNT windows into the label-sorted file. The label in each comment is
# the cancer type the window is meant to fall in; the offsets are hard-coded
# for the current dataset and are not checked against the data.
tcga_windows=(
    600:100     # BRCA
    1700:100    # CESC
    2500:100    # COAD
    3000:100    # GI
    3500:100    # HNSC
    4000:100    # KIRC
    4900:100    # LIHC
    5500:100    # LUAD
    6300:100    # PAAD
    6450:100    # PCPG
    6600:100    # PRAD
    7200:100    # THCA
)
# The leading 101 lines are BLCA plus the header; the trailing 100 lines are UCEC.
build_mock_file tmp_mock.tsv "$tcga__mockData" 101 100 "${tcga_windows[@]}"
wc -l "$tcga__mockData" && head -1 "$tcga__mockData" | awk -F"\t" '{print NF; exit}'

# The independent dataset always goes through the gtex_* variables, whichever
# dataset is actually selected.
independent_mock_created="yes"
case "$WORKING_WITH_INDEPENDENT" in
    GTEX)
        # Real GTEx data: sample fixed windows, as for TCGA above.
        echo "   Creating Mock data version of the gtex datasets"
        gtex__mockData=data/preprocessing_combinations/gtex_unscaled_unnormalized_nobatchcorrection__mockData.tsv
        cut -f1-5 "$gtex_unscaled_unnormalized_nobatchcorrection" > tmp_mock.tsv
        gtex_windows=(
            300:100     # BRCA
            321:5       # CESC
            500:100     # COAD
            1000:100    # GI
            1408:100    # HNSC
            1450:40     # KIRC
            1600:100    # LIHC
            2000:100    # LUAD
            2300:100    # PAAD
            2500:100    # PCPG
            2700:100    # PRAD
            3200:300    # THCA
        )
        # The leading 11 lines are BLCA plus the header; the trailing 100 lines are UCEC.
        build_mock_file tmp_mock.tsv "$gtex__mockData" 11 100 "${gtex_windows[@]}"
        wc -l "$gtex__mockData" && head -1 "$gtex__mockData" | awk -F"\t" '{print NF; exit}'
        ;;
    ICGC|GEO|ICGCGEO)
        # For these datasets the mock file keeps every row and only trims to
        # the first 5 columns. ICGCGEO is shown as ICGC_GEO in the message.
        independent_name=${WORKING_WITH_INDEPENDENT/ICGCGEO/ICGC_GEO}
        echo "   Creating Mock data version of ${independent_name} (but using gtex as variable) datasets"
        gtex__mockData=data/preprocessing_combinations/gtex_unscaled_unnormalized_nobatchcorrection__mockData.tsv
        cut -f1-5 "$gtex_unscaled_unnormalized_nobatchcorrection" > "$gtex__mockData"
        # Guarantee the scratch file exists so the cleanup cell's rm succeeds.
        touch tmp_mock.tsv
        wc -l "$gtex__mockData" && head -1 "$gtex__mockData" | awk -F"\t" '{print NF; exit}'
        ;;
    *)
        independent_mock_created="no"
        {
            echo " -----------------------------------------------------------------------"
            echo " [ERROR] WORKING_WITH_INDEPENDENT is not defined as expected"
            echo " -----------------------------------------------------------------------"
        } >&2
        ;;
esac

# Report completion only when the independent-dataset mock file was really
# produced; an unrecognised selection must not look like a success.
if [ "$independent_mock_created" = "yes" ]; then
    echo "   done creating mock versions"
fi

In [None]:
# Final tidy-up for this notebook: delete the scratch file that the mock-data
# cell leaves in the working directory.
rm -- tmp_mock.tsv