# Data Processing

In this notebook you will find code for CSV processing. The raw CSV is formatted like this:

<img src="mRNA_raw_csv.png" width="400">

Column labels represent <font color = 'red'> <b> patients </b> </font>: RSV-positive patients (with mild or severe disease), RSV-negative patients, and patients infected with another pathogen. Row labels represent <font color = 'red'> <b> genes </b> </font>. Each cell holds the <font color = 'red'> <b> non-normalized mRNA count </b> </font> of that gene in that patient.

Let's start by importing our packages, then loading in our data.

In [91]:
# Imports
import math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the RNA-seq count matrix and the patient data. The rest of this cell
# reformats the gene names and replaces the case numbers with GSM numbers to
# make the counts easier to compare with the patient data.

# raw counts: tab-separated, first line is the header, first column is the row index
rna_seq = pd.read_csv(
    'GSE155925_Raw_counts_matrix.txt.gz',
    sep='\t',
    header=0,
    index_col=0,
)

# patient data, indexed by the values of its 'Accession' column
# (the column itself is kept in the frame)
patient_data = pd.read_csv("../RAW_DATA/patient_data.csv")
patient_data.index = patient_data['Accession']

# remove Ensembl ID from gene names
def strip_name(gene):
    """Return the part of ``gene`` that precedes the first ':'.

    Parameters
    ----------
    gene : str
        Row label from the raw counts matrix.

    Returns
    -------
    str
        Everything before the first ':'. A label that contains no ':' is
        returned unchanged (slicing on ``str.find``'s -1 result would instead
        drop the label's last character).
    """
    return gene.partition(':')[0]

# Pass every row label through strip_name.
rna_seq = rna_seq.rename(index=strip_name)

# rename cases by GSM number (raw counts data labels cases as "Case 1, Case 2, etc.")
# Columns are numbered consecutively, starting at GSM4715941, in their existing order.
gsm = ['GSM' + str(4715941 + offset) for offset in range(len(rna_seq.columns))]
rna_seq.columns = gsm

display(rna_seq.head())

Unnamed: 0,GSM4715941,GSM4715942,GSM4715943,GSM4715944,GSM4715945,GSM4715946,GSM4715947,GSM4715948,GSM4715949,GSM4715950,...,GSM4715995,GSM4715996,GSM4715997,GSM4715998,GSM4715999,GSM4716000,GSM4716001,GSM4716002,GSM4716003,GSM4716004
TSPAN6,7,12,7,7,5,6,5,3,3,4,...,18,3,60,34,22,37,1,18,6,29
TNMD,0,1,0,0,0,0,0,0,0,0,...,0,0,19,1,0,2,0,1,0,0
DPM1,227,201,259,224,154,143,257,248,133,147,...,428,211,76,312,307,391,172,333,194,263
SCYL3,500,635,454,643,473,309,519,585,413,333,...,1058,540,295,892,820,1011,378,759,397,834
C1orf112,158,174,173,157,134,131,144,219,116,79,...,361,134,105,317,304,341,128,201,174,209


In [92]:
# Add rows to include patient data.
#
# Each row is written directly under its final label ('Group', 'Sex',
# 'Severity') instead of under an integer position that is renamed afterwards.
# This keeps the cell independent of the number of gene rows, and re-running
# the cell overwrites the three rows rather than appending new ones.

# pathogen groups: 0 = negative, 1 = RSV, 2 = other
# pulled manually from site
groups = [0, 1, 1, 2, 0, 0, 1, 1, 0, 2, 1, 1, 1, 1, 2, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 0, 1, 2, 1, 2, 0, 1, 2, 0, 2, 0, 2, 1, 1, 1, 1, 1, 0, 0, 2, 1, 1, 1, 1, 2, 1, 2, 2, 1, 1, 1]

rna_seq.loc['Group'] = groups


# sex: 0 = M, 1 = F
# pulled manually from site
sex = [0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0]

rna_seq.loc['Sex'] = sex


# severity of RSV: 0 = patient does not have RSV
# The default row must exist before it is assigned, so it is built first,
# with one entry per patient column.
severity = [0] * len(rna_seq.columns)
rna_seq.loc['Severity'] = severity

# Overwrite the default with the recorded severity for every patient that has
# an entry in patient_data; patients without an entry keep 0.
for col in rna_seq.columns:
    if col in patient_data.index:
        rna_seq.at['Severity', col] = patient_data.at[col, 'Severity']

# display patient data
display(rna_seq.tail())

Unnamed: 0,GSM4715941,GSM4715942,GSM4715943,GSM4715944,GSM4715945,GSM4715946,GSM4715947,GSM4715948,GSM4715949,GSM4715950,...,GSM4715995,GSM4715996,GSM4715997,GSM4715998,GSM4715999,GSM4716000,GSM4716001,GSM4716002,GSM4716003,GSM4716004
RP3-454G6.2,16,9,4,7,10,2,4,18,6,2,...,14,7,20,10,10,18,6,15,4,9
RP5-937E21.8,0,9,12,0,17,0,12,0,0,5,...,28,0,8,5,28,17,0,9,13,4
Group,0,1,1,2,0,0,1,1,0,2,...,1,1,1,2,1,2,2,1,1,1
Sex,0,1,1,0,1,0,1,0,0,1,...,1,1,1,1,0,0,0,0,1,0
Severity,0,mild,mild,0,0,0,severe,mild,0,0,...,mild,severe,mild,0,mild,0,0,severe,severe,mild


In [93]:
# Calculate sum of each column, convert to proportions. 

def logarithm(element):
    """Return the natural logarithm of ``element`` if it is positive, else 0.

    Note that inputs that are not greater than zero yield 0, which is also
    the logarithm of 1.
    """
    return math.log(element) if element > 0 else 0

# Labels of the rows that hold patient data rather than gene counts.
metadata_rows = ['Group', 'Sex', 'Severity']

# get rid of patient data rows and cast the remaining counts to int
gene_counts = rna_seq.drop(index=metadata_rows).astype(int)

# Normalize by total mRNA content per patient: divide every column by its own sum.
gene_proportions = gene_counts.div(gene_counts.sum(axis=0), axis='columns')

# Take log proportion to avoid really small numbers.
genes_rna_seq = gene_proportions.apply(lambda column: column.apply(logarithm))

# add patient data rows back
# Work on a copy so genes_rna_seq keeps only gene rows, and copy each patient
# row by label so nothing depends on the number of gene rows.
norm_rna_seq = genes_rna_seq.copy()
for row_label in metadata_rows:
    norm_rna_seq.loc[row_label] = rna_seq.loc[row_label]

norm_rna_seq.head()

Unnamed: 0,GSM4715941,GSM4715942,GSM4715943,GSM4715944,GSM4715945,GSM4715946,GSM4715947,GSM4715948,GSM4715949,GSM4715950,...,GSM4715995,GSM4715996,GSM4715997,GSM4715998,GSM4715999,GSM4716000,GSM4716001,GSM4716002,GSM4716003,GSM4716004
TSPAN6,-14.055245,-13.316598,-14.15154,-14.545247,-14.525813,-13.921265,-14.588879,-14.829556,-14.569697,-14.411967,...,-13.652072,-15.29741,-11.110669,-12.868854,-13.180301,-12.800547,-16.02726,-13.475689,-14.134371,-12.929009
TNMD,0.0,-15.801505,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,-12.260574,-16.395214,0.0,-15.718318,0.0,-16.366061,0.0,0.0
DPM1,-10.576206,-10.4982,-10.540622,-11.079511,-11.098299,-10.75018,-10.64924,-10.41474,-10.77796,-10.807829,...,-10.48332,-11.044165,-10.87428,-10.652211,-10.544495,-10.442757,-10.879766,-10.557918,-10.658273,-10.724151
SCYL3,-9.786547,-9.34788,-9.979353,-10.025012,-9.976156,-9.979683,-9.946413,-9.556557,-9.644862,-9.990119,...,-9.578308,-10.104454,-9.518038,-9.601748,-9.562039,-9.49277,-10.092366,-9.734059,-9.942195,-9.570072
C1orf112,-10.938561,-10.642449,-10.944158,-11.434911,-11.237411,-10.837827,-11.228503,-10.539097,-10.914719,-11.428814,...,-10.653566,-11.498183,-10.551053,-10.636312,-10.554315,-10.579582,-11.17523,-11.062756,-10.767076,-10.953971


Perfect! Let's write this into a CSV.

In [101]:
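# NOTE(review): `mRNA_data` is not defined in the visible cells (the normalized
# frame built above is `norm_rna_seq`), and `index = False` would leave the row
# labels (gene names) out of the CSV -- confirm both before enabling this line.
#mRNA_data.to_csv("normalized_mRNA_counts.csv", index = False)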