# Data preprocessing


## Tomato dataset

In [1]:
import pandas as pd
import numpy as np

We load the data, keep only the samples from the two species with complete combinatorial experiments, and apply a log2(x+1) transformation to make the values suitable for linear modeling.

In [3]:
# Load the RPKM table; the first column of the file becomes the row index
data = pd.read_table("../data/GSE45774_rpkm_all.txt", index_col=0)
# Select columns 29:52 (1-based), i.e. positional index 28 onward in Python
data = data.iloc[:, 28:]
# Apply the log2(x+1) transformation to the whole frame at once.
# The vectorized NumPy call yields the same values as an element-wise
# `applymap(lambda x: np.log2(x+1))`, but avoids one Python call per cell and
# does not rely on `DataFrame.applymap`, which is deprecated since pandas 2.1.
data_log = np.log2(data + 1)


We transpose the data to have genes as columns and create potential targets from the row names.

In [6]:
# Flip the matrix so that every sample is a row and every gene is a column
data_transposed = data_log.T

# Row labels have the form "<species>.<position>.<tissue>"; split each one once
# and reuse the pieces for all derived target columns.
label_parts = [sample.split('.') for sample in data_transposed.index]
data_transposed['species'] = [parts[0] for parts in label_parts]
data_transposed['position'] = [parts[1] for parts in label_parts]
data_transposed['tissue'] = [parts[2] for parts in label_parts]
# Binary target: 1 for root samples, 0 for every other tissue
data_transposed['root'] = [int(parts[2] == 'root') for parts in label_parts]

# Report the dimensions of the final table (target columns included)
print(f"There are {data_transposed.shape[0]} rows and {data_transposed.shape[1]} columns")
# Write the table to disk so the other notebooks can load it
data_transposed.to_csv('../data/tomato_with_targets.txt')

There are 24 rows and 28302 columns


In [8]:
# Names of the columns that can be used as prediction targets
potential_targets = ['species', 'position', 'tissue', 'root']
# Show only those columns for every sample (all rows, selected columns)
data_transposed.loc[:, potential_targets]

Unnamed: 0,species,position,tissue,root
penn.Sh.floral,penn,Sh,floral,0
penn.Sh.leaf,penn,Sh,leaf,0
penn.Sh.root,penn,Sh,root,1
penn.Sh.sdling,penn,Sh,sdling,0
penn.Sh.stem,penn,Sh,stem,0
penn.Sh.veg,penn,Sh,veg,0
penn.Sun.floral,penn,Sun,floral,0
penn.Sun.leaf,penn,Sun,leaf,0
penn.Sun.root,penn,Sun,root,1
penn.Sun.sdling,penn,Sun,sdling,0
