# Predicting Cancer Type from Single Cell RNA-Seq Data

## Dataloader

### Elsa Bismuth & Jonathan Mathews

Updated On: 11/09/2022

# Import Statements

In [None]:
# Mount Google Drive inside the Colab runtime; the paths used in the rest of
# this notebook all live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import torch
import torch.nn as nn
import torch.utils.data
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import os

# Collecting and Processing RNA-Seq Data

  - Reads each `.tsv` file into a dataframe
  - Records each file's gene set and cancer label (the genes common to all datasets are computed later in the notebook)

In [None]:
# Scan DATADIR for .tsv files and record, for each one, its file name, the set
# of row labels of its table, and its cancer label (as a name and a class index).
DATADIR = "/content/drive/My Drive/ML4FG_Project/Data/"
# Maps the cancer-type prefix of a file name to an integer class index.
numerizeClass = {'AML': 0, 'CRC': 1, 'GBM': 2, 'LUAD': 3, 'MPAL': 4, 'PDAC': 5}
allGenes = []       # one set of row labels per file
fileNames = []      # .tsv file names, parallel to allGenes
cancerLabels = []   # cancer-type prefix of each file name
classDist = []      # integer class index of each file

# os.listdir returns entries in arbitrary order; sorting keeps the order of
# fileNames/classDist (and so the seeded train/validation/test split that is
# built from them) reproducible from run to run.
for tsvFile in sorted(os.listdir(DATADIR)):

  if not tsvFile.endswith('.tsv'):
    continue

  # The cancer type is the part of the file name before the first '-'.
  cancerLabel = tsvFile.split('-')[0]
  if cancerLabel not in numerizeClass:
    # Fail before appending anything so the parallel lists never get out of sync.
    raise KeyError('Unknown cancer label %r in file name %r' % (cancerLabel, tsvFile))

  data = pd.read_csv(os.path.join(DATADIR, tsvFile), sep = '\t')
  # NOTE(review): read_csv is called without index_col, so data.index holds gene
  # identifiers only if pandas infers the first column as the index (header row
  # one field shorter than the data rows) -- confirm against the files.
  data = set(data.index)

  fileNames.append(tsvFile)
  allGenes.append(data)
  cancerLabels.append(cancerLabel)
  classDist.append(numerizeClass[cancerLabel])

  # Progress indicator: number of files processed so far.
  print(len(allGenes), len(cancerLabels))

1 1
2 2
3 3
4 4
5 5
6 6
7 7
8 8
9 9
10 10
11 11
12 12
13 13
14 14
15 15


  exec(code_obj, self.user_global_ns, self.user_ns)


16 16


  exec(code_obj, self.user_global_ns, self.user_ns)


17 17


  exec(code_obj, self.user_global_ns, self.user_ns)


18 18


  exec(code_obj, self.user_global_ns, self.user_ns)


19 19


  exec(code_obj, self.user_global_ns, self.user_ns)


20 20


  exec(code_obj, self.user_global_ns, self.user_ns)


21 21


  exec(code_obj, self.user_global_ns, self.user_ns)


22 22


  exec(code_obj, self.user_global_ns, self.user_ns)


23 23


  exec(code_obj, self.user_global_ns, self.user_ns)


24 24


  exec(code_obj, self.user_global_ns, self.user_ns)


25 25


  exec(code_obj, self.user_global_ns, self.user_ns)


26 26


  exec(code_obj, self.user_global_ns, self.user_ns)


27 27


  exec(code_obj, self.user_global_ns, self.user_ns)


28 28


  exec(code_obj, self.user_global_ns, self.user_ns)


29 29


  exec(code_obj, self.user_global_ns, self.user_ns)


30 30
31 31
32 32
33 33
34 34
35 35
36 36
37 37
38 38
39 39
40 40
41 41
42 42
43 43
44 44
45 45
46 46
47 47
48 48
49 49
50 50
51 51
52 52
53 53
54 54
55 55
56 56
57 57
58 58
59 59
60 60
61 61
62 62
63 63
64 64
65 65
66 66
67 67
68 68
69 69
70 70
71 71
72 72
73 73
74 74
75 75
76 76
77 77
78 78
79 79
80 80
81 81
82 82
83 83
84 84


# Split Data into Train/Validation/Test Sets

- 50% Training
- 25% Validation
- 25% Testing
- Stratifies by cancer label so that each split has the same class distribution

In [None]:
# Step 1: hold out 25% of the files as the test set, stratified by class index.
(filesTrainVal, filesTest,
 yTrainVal, yTest) = train_test_split(
  fileNames, classDist,
  test_size = 0.25, random_state = 42, shuffle = True, stratify = classDist,
)

# Step 2: split the remaining files again, sending 33% of them to validation
# (about a quarter of all files) and keeping the rest for training.
(filesTrain, filesVal,
 yTrain, yVal) = train_test_split(
  filesTrainVal, yTrainVal,
  test_size = 0.33, random_state = 42, shuffle = True, stratify = yTrainVal,
)

# Prepare Files for Dataloader

- Stores the filenames belonging to each split (train, validation, test) in its own text file, one filename per line
- Allows the file lists to be passed easily to a PyTorch dataloader

In [None]:
# Write one manifest per split: a text file listing that split's file names,
# one per line.
with open('/content/drive/My Drive/ML4FG_Project/Train_Files.txt', 'w') as manifest:
  manifest.writelines('%s\n' % name for name in filesTrain)

with open('/content/drive/My Drive/ML4FG_Project/Validation_Files.txt', 'w') as manifest:
  manifest.writelines('%s\n' % name for name in filesVal)

with open('/content/drive/My Drive/ML4FG_Project/Test_Files.txt', 'w') as manifest:
  manifest.writelines('%s\n' % name for name in filesTest)

# Find and Store Common Genes

- Finds the genes common to all datasets so that every sample presents the same set of features to the machine learning models

In [None]:
# Intersect the per-file gene sets to get the genes present in every dataset.
if not allGenes:
  # set.intersection(*[]) would fail with an unhelpful TypeError.
  raise ValueError('allGenes is empty: no .tsv files were loaded')

# Set iteration order is not stable between interpreter runs, so sort the
# result to keep the gene order the same every time the notebook is executed.
# key=str keeps the sort well-defined even if the labels are not all strings.
commonGenes = sorted(set.intersection(*allGenes), key = str)

In [None]:
# Save the common genes: the first line holds the number of genes, followed by
# one gene per line.
with open('/content/drive/My Drive/ML4FG_Project/Common_Genes.txt', 'w') as geneFile:
  geneFile.write('%s\n' % len(commonGenes))
  geneFile.writelines('%s\n' % geneName for geneName in commonGenes)

In [None]:
# Count how many files carry each cancer label.
cancerLabelDF = pd.Series(cancerLabels)
cancerClassCounts = cancerLabelDF.value_counts()

# Save the per-label counts as a CSV, with the label written as the index column.
cancerClassCounts.to_csv(
  '/content/drive/My Drive/ML4FG_Project/Class_Labels_Distribution.csv',
  index = True,
)