# Data Processing

In this notebook you will find code for CSV processing. The raw CSV is formatted like this:

<img src="mRNA_raw_csv.png" width="400">

Column labels represent <font color = 'red'> <b> patients </b> </font>: RSV-positive patients (with mild or severe disease), RSV-negative controls, and patients infected with another pathogen. Row labels represent <font color = 'red'> <b> genes </b> </font>. Each cell holds the <font color = 'red'> <b> non-normalized mRNA count </b> </font> of that gene in that patient.

Let's start by importing our packages, then loading in our data.

In [102]:
# Imports
import math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

'''
Loaded RNA seq data, reformatted the gene names, and replaced case numbers with GSM numbers to make it easier to compare
with patient data.
'''

# Raw count matrix: tab-separated, first column becomes the row index,
# first line supplies the column labels.
rna_seq = pd.read_csv(
    '../RAW_DATA/GSE155925_Raw_counts_matrix.txt.gz',
    sep = '\t',
    header = 0,
    index_col = 0,
)

# Patient table, keyed by its 'Accession' column (the column itself is kept).
patient_data = pd.read_csv("../RAW_DATA/patient_data.csv").set_index('Accession', drop = False)
# remove Ensembl ID from gene names
def strip_name(gene):
    """Return the part of a row label that precedes the first ':'.

    Labels without a ':' are returned unchanged. (Slicing with the result of
    `str.find` would drop the last character of such labels, because `find`
    returns -1 when the separator is absent.)

    Parameters
    ----------
    gene : str
        Row label from the raw counts matrix.

    Returns
    -------
    str
        The text before the first ':' in `gene`, or `gene` itself if it
        contains no ':'.
    """
    return gene.partition(':')[0]

# Shorten every row label with strip_name.
rna_seq = rna_seq.rename(index = strip_name)


# Relabel the columns with consecutive GSM accessions, starting at GSM4715941
# (raw counts data labels cases as "Case 1, Case 2, etc.").
gsm = ['GSM' + str(4715941 + offset) for offset in range(len(rna_seq.columns))]
rna_seq.columns = gsm

display(rna_seq.head())

Unnamed: 0,GSM4715941,GSM4715942,GSM4715943,GSM4715944,GSM4715945,GSM4715946,GSM4715947,GSM4715948,GSM4715949,GSM4715950,...,GSM4715995,GSM4715996,GSM4715997,GSM4715998,GSM4715999,GSM4716000,GSM4716001,GSM4716002,GSM4716003,GSM4716004
TSPAN6,7,12,7,7,5,6,5,3,3,4,...,18,3,60,34,22,37,1,18,6,29
TNMD,0,1,0,0,0,0,0,0,0,0,...,0,0,19,1,0,2,0,1,0,0
DPM1,227,201,259,224,154,143,257,248,133,147,...,428,211,76,312,307,391,172,333,194,263
SCYL3,500,635,454,643,473,309,519,585,413,333,...,1058,540,295,892,820,1011,378,759,397,834
C1orf112,158,174,173,157,134,131,144,219,116,79,...,361,134,105,317,304,341,128,201,174,209


In [103]:
'''
Add rows to include patient data
'''
# The three patient rows are written directly under their final labels
# ('Group', 'Sex', 'Severity'). Appending under positional integer labels and
# renaming hard-coded row numbers afterwards would silently fail if the number
# of genes changed, and would append duplicate rows when the cell is re-run.

# pathogen groups: 0 = negative, 1 = RSV, 2 = other
# pulled manually from site
groups = [0, 1, 1, 2, 0, 0, 1, 1, 0, 2, 1, 1, 1, 1, 2, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 0, 1, 2, 1, 2, 0, 1, 2, 0, 2, 0, 2, 1, 1, 1, 1, 1, 0, 0, 2, 1, 1, 1, 1, 2, 1, 2, 2, 1, 1, 1]

# sex: 0 = M, 1 = F
# pulled manually from site
sex = [0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0]

# The hand-entered lists must line up one-to-one with the patient columns.
n_patients = len(rna_seq.columns)
for row_name, row_values in (('Group', groups), ('Sex', sex)):
    if len(row_values) != n_patients:
        raise ValueError(
            f"{row_name} has {len(row_values)} entries but rna_seq has {n_patients} patient columns"
        )

rna_seq.loc['Group'] = groups
rna_seq.loc['Sex'] = sex

# severity of RSV: starts out missing (None) for every patient and is filled
# in below only for patients that appear in patient_data
rna_seq.loc['Severity'] = [None] * n_patients

# assign severity
for col in rna_seq.columns:
    if col in patient_data.index:
        rna_seq.at['Severity', col] = patient_data.at[col, 'Severity']

# display patient data
display(rna_seq.tail(3))

Unnamed: 0,GSM4715941,GSM4715942,GSM4715943,GSM4715944,GSM4715945,GSM4715946,GSM4715947,GSM4715948,GSM4715949,GSM4715950,...,GSM4715995,GSM4715996,GSM4715997,GSM4715998,GSM4715999,GSM4716000,GSM4716001,GSM4716002,GSM4716003,GSM4716004
Group,0.0,1,1,2.0,0.0,0.0,1,1,0.0,2.0,...,1,1,1,2.0,1,2.0,2.0,1,1,1
Sex,0.0,1,1,0.0,1.0,0.0,1,0,0.0,1.0,...,1,1,1,1.0,0,0.0,0.0,0,1,0
Severity,,mild,mild,,,,severe,mild,,,...,mild,severe,mild,,mild,,,severe,severe,mild


In [104]:
# Calculate sum of each column, convert to proportions. 

def logarithm_2(element):
    """Return the base-2 logarithm of `element`, or 0 for non-positive input.

    Uses `math.log2`, which is exact for powers of two; the quotient
    `math.log(x) / math.log(2)` can be off by one ulp (e.g. for 2**29).

    Parameters
    ----------
    element : int or float
        Value to transform.

    Returns
    -------
    float or int
        `log2(element)` when `element > 0`; otherwise the integer 0.
        NOTE(review): 0 is also the log2 of a proportion of exactly 1, so in
        log-proportion data the sentinel sorts *above* every real value —
        confirm downstream analyses treat 0.0 as "no reads".
    """
    if element > 0:
        return math.log2(element)
    return 0

# Patient rows that sit alongside the gene rows in rna_seq. They are addressed
# by label throughout, so nothing here depends on how many genes there are.
patient_rows = ['Group', 'Sex', 'Severity']

# get rid of patient data rows
norm_rna_seq = rna_seq.drop(index = patient_rows)

# Normalize by total mRNA content per patient: each count is divided by the
# patient's column total, then log2-transformed to avoid really small numbers.
for label in norm_rna_seq.columns.tolist():
    counts = norm_rna_seq[label].astype(int)
    norm_rna_seq[label] = counts.div(counts.sum()).apply(logarithm_2)

# add patient data rows back, copied by label from rna_seq
for row_label in patient_rows:
    norm_rna_seq.loc[row_label] = rna_seq.loc[row_label]

norm_rna_seq.tail(10)

Unnamed: 0,GSM4715941,GSM4715942,GSM4715943,GSM4715944,GSM4715945,GSM4715946,GSM4715947,GSM4715948,GSM4715949,GSM4715950,...,GSM4715995,GSM4715996,GSM4715997,GSM4715998,GSM4715999,GSM4716000,GSM4716001,GSM4716002,GSM4716003,GSM4716004
EXOC3L2,0.0,0.0,0.0,-23.79171,-20.278247,-22.669103,-23.369231,-21.394527,-22.604592,-22.792073,...,-22.280739,-19.654461,-17.68827,-21.068332,0.0,-21.354811,-20.800521,-22.026273,0.0,-20.510599
RP13-210D15.9,-23.084788,-22.796752,-23.223711,0.0,-22.278247,-22.669103,0.0,0.0,0.0,-22.792073,...,-21.865701,-23.654461,-18.766272,-20.653294,-21.889623,-21.091777,-21.122449,-20.80388,-20.654622,0.0
WDFY4,-13.870469,-13.415209,-13.442352,-12.973927,-12.753705,-12.528273,-13.835901,-13.156123,-12.591968,-12.830624,...,-13.099172,-12.022829,-13.452381,-12.996869,-13.713035,-13.053772,-13.224603,-13.534419,-12.397234,-13.045032
RP11-244E17.1,-21.084788,-19.989397,-22.223711,-20.469782,-21.278247,-20.669103,-20.047303,-19.809565,-19.145161,-19.091634,...,-20.165261,-22.654461,-17.351235,-19.952855,-20.304661,-20.091777,-21.122449,-20.026273,-20.391588,-21.188671
CTB-60B18.23,0.0,0.0,-23.223711,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
RP3-454G6.2,-19.084788,-19.626827,-21.223711,-20.984355,-19.956319,-21.669103,-21.369231,-18.809565,-20.01963,-21.792073,...,-20.058346,-20.847106,-17.614269,-20.331366,-20.152658,-19.506814,-20.537486,-19.704344,-20.97655,-20.340674
RP5-937E21.8,0.0,-19.626827,-19.638749,0.0,-19.190784,0.0,-19.784268,0.0,0.0,-20.470145,...,-19.058346,0.0,-18.936197,-21.331366,-18.667231,-19.589276,0.0,-20.44131,-19.27611,-21.510599
Group,0.0,1.0,1.0,2.0,0.0,0.0,1.0,1.0,0.0,2.0,...,1.0,1.0,1.0,2.0,1.0,2.0,2.0,1.0,1.0,1.0
Sex,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
Severity,,mild,mild,,,,severe,mild,,,...,mild,severe,mild,,mild,,,severe,severe,mild


Perfect! Let's write this into a CSV.

In [105]:
# Write the normalized table (gene rows plus the Group/Sex/Severity rows) to
# disk; index = True keeps the row labels as the first CSV column.
norm_rna_seq.to_csv("normalized_mRNA_counts.csv", index = True)

In [107]:
'''
Data frames for patient data groups
'''
def select_patients(frame, row_label, value, metadata_rows = ('Group', 'Sex', 'Severity')):
    """Return the gene rows of `frame` for patients whose `row_label` entry equals `value`.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table with one column per patient, containing the rows named in
        `metadata_rows` in addition to the gene rows.
    row_label : str
        Label of the row to filter on (e.g. 'Group' or 'Severity').
    value : object
        Patients are kept when their entry in `row_label` equals this value.
    metadata_rows : sequence of str
        Row labels dropped from the result so only gene rows remain.

    Returns
    -------
    pandas.DataFrame
        The selected patient columns without the metadata rows.
    """
    keep = frame.loc[row_label] == value
    return frame.loc[:, keep].drop(index = list(metadata_rows))


# RSV and control groups
control = select_patients(norm_rna_seq, 'Group', 0)
rsv = select_patients(norm_rna_seq, 'Group', 1)

# Severity
mild = select_patients(norm_rna_seq, 'Severity', 'mild')
severe = select_patients(norm_rna_seq, 'Severity', 'severe')

# one CSV per subset, row labels kept as the first column
for csv_name, subset in (
    ('control_norm.csv', control),
    ('rsv_norm.csv', rsv),
    ('mild_norm.csv', mild),
    ('severe_norm.csv', severe),
):
    subset.to_csv(csv_name, index = True)

# check number of samples in each group
print(len(control.columns), len(rsv.columns), len(mild.columns), len(severe.columns))
display(control.tail(3))

10 37 29 8


Unnamed: 0,GSM4715941,GSM4715945,GSM4715946,GSM4715949,GSM4715974,GSM4715979,GSM4715982,GSM4715984,GSM4715991,GSM4715992
CTB-60B18.23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
RP3-454G6.2,-19.084788,-19.956319,-21.669103,-20.01963,-19.963393,-20.056781,-19.619998,-20.75909,-19.995884,-18.502533
RP5-937E21.8,0.0,-19.190784,0.0,0.0,0.0,-21.641744,-21.093929,0.0,-20.803238,-20.961965
