# Création de windowed_dataset

Ce notebook sert à créer des jeux de données d'entraînement et de tests à partir des miliers de fichiers csv concernant les protéines.

In [1]:
import pandas as pd
import numpy as np
import os
import random

In [2]:
train_dir = "data_train/encode"
test_dir = "data_test/encode"

In [3]:
def get_sample_files(data_dir, n, seed=None):
    '''
    Renvoie une liste de n dataframe provenant de n fichiers tirés au hasard dans le dossier data_dir
    '''
    if seed is not None:
        random.seed(seed)
    sample = []
    for filename in random.sample(os.listdir(data_dir), n):
        f = os.path.join(data_dir, filename)
        sample.append(pd.read_csv(f))
    random.shuffle(sample)
    return sample

In [4]:
def new_format(df, win_size):
    '''
    Renvoie un 2d array correspondant a une version "windowed" de la séquence de protéine contenue dans df
    '''
    X = df.iloc[:win_size].values
    X = X.reshape(21 * win_size, order='F')
    X = np.expand_dims(X, axis=0)
    
    n_max = df.shape[0] - win_size + 1
    for i in range(1, n_max):
        b = df.iloc[i:i+win_size].values
        b = b.reshape(21 * win_size, order='F')
        b = np.expand_dims(b, axis=0)
        X = np.concatenate((X, b), axis=0)
    return X

In [5]:
def get_windowed_dataset(data_dir, n, win_size, seed = None):
    '''
    Enchaîne les fonctions 'get_sample_files', 'new_format' pour chaque dataframe, et concaténe le tout
    '''
    data = get_sample_files(data_dir, n, seed=seed)
    data = [new_format(df, win_size) for df in data]
    data = np.concatenate(data)
    data = pd.DataFrame(data)
    data.columns = [str(i) for i in range(21 * win_size)]
    return data

In [6]:
n = 5000
win_size = 10
seed = 42

train = get_windowed_dataset(train_dir, n, win_size, seed)

In [7]:
train.shape

(1390636, 210)

In [8]:
train.to_csv('data_train/windowed_dataset/encode_train_5000.csv', index=False)

In [9]:
n = 1000
win_size = 10

random.seed(42)
test = get_windowed_dataset(test_dir, n, win_size, seed)

In [10]:
test.shape

(276312, 210)

In [11]:
test.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,200,201,202,203,204,205,206,207,208,209
0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
test.to_csv('data_test/windowed_dataset/encode_test_1000.csv', index=False)