# Découpage définitif des données au format AutoML

pour le format AutoML cf. la page https://github.com/madclam/m2aic2019/blob/master/Starting_Kit_M2info.pdf

- extraire les données de Magali
- les enrichir avec SMOTE
- passer au format AutoML (train, valid, test), en découpant de manière à ce que les classes soient équilibrées à chaque fois

### PLAN

- 1) chargement des données clean et génération du dataset global (1000 de chaque classe avec SMOTE)
- 2) découpage auto_ML global (TOUTES nos données): train, valid, test (800,100,100 pour chaque classe)
- 3) découpage sample pour starting_kit dans le TRAIN (!!!) précédent

In [33]:
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
from imblearn.over_sampling import ADASYN, SMOTE

# Chargement des données clean

on charge les données et on utilise SMOTE pour créer l'ensemble "global" de données avec lequel on va travailler

In [None]:
# Chargement des données, clean_data étant les données nettoyées des NaN trop nombreux
df_data = pd.read_csv("clean_data.csv")
print("df_data shape:", df_data.shape)
df_metadata = pd.read_csv("metadata.csv")
print("df_metadata shape:", df_metadata.shape)
df_metadata[:5]

In [3]:
# Convertir les données en ndarray et supprimer les colonnes inutiles
D = df_data.loc[:, ~df_data.columns.str.contains('^Unnamed')].values
D = D.T

print(type(D))
print(D.shape)

# Générer les labels en fonction d'une colonne choisie
status = pd.Series(df_metadata["tissue_status"].values)
stage = pd.Series(df_metadata["tumor_stage"].values)

labelsBinary, valuesBinary = pd.factorize(status)
labelsStages, valuesStages = pd.factorize(stage)

yBinary = labelsBinary
yStage = labelsStages

print("labelsBinary :", labelsBinary)
print("valuesBinary :", valuesBinary)

print("labelsStages :", labelsStages)
print("valuesStages :", valuesStages)

<class 'numpy.ndarray'>
(685, 20103)
labelsBinary : [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0
 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0
 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 1 1 0 0 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 1 1 0 0 0 0 0 0 0
 1 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0

In [4]:
# IL FAUT ARRIVER À 300 MB après avoir agrandi le nombre d'observations
# ===>>> couper les features
# selection des k best features grâce au test chi2
chi2_selector = SelectKBest(chi2, k=1000)
D = chi2_selector.fit_transform(D, labelsBinary)
print(D.shape)

(685, 1000)


In [20]:
# 6 occurences nécessaire pour Smote ou Adasyn, 
# on enlève la classe 8 et on duplique une occurence de 2 pour passer à 6 (min nécessaire à SMOTE) 

X_train, X_test, y_train, y_test = train_test_split(D, yStage, test_size=0.2, random_state=42)
# stratify=y, mais une classe avec 1 occurences,on vire cette classe?

recounted = Counter(y_train)
print(recounted)

X_train_sans_8 = X_train[np.where(y_train!=8)]
X_2 = X_train[np.where(y_train==2)][0]
print(X_train_sans_8.shape)
print(X_2.shape)

# dédoublement d'un exemple de la classe 2
X_train_sans_8_double_2 = np.vstack([X_train_sans_8,X_2])
print(X_train_sans_8_double_2.shape)

# dédoublement d'un label de la classe 2
y_train_sans_8 = y_train[np.where(y_train!=8)]
print(y_train_sans_8)
y_train_sans_8 = np.append(y_train_sans_8, 2)
print(y_train_sans_8)

# retrait des instances potentielles de la classe 8 trop petite dans le test
X_test = X_test[np.where(y_test!=8)]
y_test = y_test[np.where(y_test!=8)]

Counter({0: 141, 1: 131, 5: 68, 3: 63, -1: 49, 7: 42, 4: 26, 9: 13, 6: 9, 2: 5, 8: 1})
(547, 1000)
(1000,)
(548, 1000)
[ 0  0  7  1  7  0  5  3  4 -1  0  0  5  0  7  0  6 -1  1  3  0  0  1  1
  3 -1  1  5  1  5  0  3  0  4  7  7  3  3  0  0  1  0  0  0  4 -1 -1 -1
  1  0  1  0 -1  0  9  6  5  0  0  0 -1  3  1  7  0  0  5  0  1  1  1  1
 -1  5  4  3  4  3 -1  0  7  0  5  1  1  6  1  9  1  0  7  5  1  1  0  5
  7  0  5  0  3  0  3  0  0  6  7  0  1  1  5  3  7  0  1  5  5  1  1  7
  0  1  0 -1 -1  0  1  0  0 -1  9  0 -1  0  3  1  0  7  5  1  3 -1  1  1
  7  0  1  1  0  1  1  6  5  7  9  1  1  0  7  3  1  1  0  7  4  4  3  5
  5  3  5 -1 -1  0  7  3  0  1  3  6  0  9  0  6  0  4  0  3 -1  1  7 -1
  3  1  4  0  1  4  1  1  1  5  0 -1  9  1  0  1  1 -1  5  5  0 -1  3  4
  0  1  7 -1  1  1  1  1  7  3  7  0  0  4  5  5  0  3  2  5  7 -1  1  7
  0  5  1  0  1  1  0  1  3  1  5  1 -1  0  3  0  7  1  5  3  9 -1  0  5
  1  1  5  1  9  9  3  3  0  0  1  1 -1 -1  5  1  1  1  0  0  0  2  0  0
  6  

In [6]:
X_train = X_train_sans_8_double_2
y_train = y_train_sans_8

# nombre d'occurences désirées par classe 
# # on a enlevé la classe 8 car qu'une occurence 
# impossible d'appliquer SMOTE ou ADASYN et génération d'une population à partir
# d'un seul exemple est absurde

dict= {0: 1000, 1: 1000, 5: 1000, 3: 1000, -1: 1000, 7: 1000, 4: 1000, 9: 1000, 6: 1000, 2: 1000}  
smote = SMOTE(random_state=42, sampling_strategy=dict)

X_resampled, y_resampled = smote.fit_resample(X_train, y_train)
y_resampled = np.array([y_1 if y_1 != -1 else 8 for y_1 in y_resampled])
print(set(y_resampled))
print(X_resampled.shape)
print(y_resampled.shape)

  n_samples_majority))
  n_samples_majority))
  n_samples_majority))
  n_samples_majority))
  n_samples_majority))
  n_samples_majority))
  n_samples_majority))
  n_samples_majority))
  n_samples_majority))
  n_samples_majority))


{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}
(10000, 1000)
(10000,)


# Découpage auto_ML global (TOUTES nos données): train, valid, test

chaque ensemble (train, valid, test) doit être équilibré

La dernière colonne contient les labels pour X_and_y

In [14]:
# on récupère un data_frame de chaque classe, dans lesquelles on va piocher pour redéfinir chaque ensemble

X_resampled_df = pd.DataFrame(X_resampled)
y_resampled_df = pd.DataFrame(y_resampled)
y_resampled_df = y_resampled_df.rename(columns={0: 'label'})


# La dernière colonne contient les labels
X_and_y = pd.concat([X_resampled_df, y_resampled_df], axis=1, sort=False)
#X_and_y.head(n=2)
print(X_and_y.shape)

(10000, 1001)


In [15]:
np.unique(X_and_y['label'].values) # vérifie la disparition de la classe -1

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [16]:
# définition d'un dataframe par classe pour découper par classe
# et obtenir des train/valid/test équilibrés

X_and_y_class0 = X_and_y[X_and_y["label"] == 0]
X_and_y_class1 = X_and_y[X_and_y["label"] == 1]
X_and_y_class2 = X_and_y[X_and_y["label"] == 2]
X_and_y_class3 = X_and_y[X_and_y["label"] == 3]
X_and_y_class4 = X_and_y[X_and_y["label"] == 4]
X_and_y_class5 = X_and_y[X_and_y["label"] == 5]
X_and_y_class6 = X_and_y[X_and_y["label"] == 6]
X_and_y_class7 = X_and_y[X_and_y["label"] == 7]
X_and_y_class8 = X_and_y[X_and_y["label"] == 8]
X_and_y_class9 = X_and_y[X_and_y["label"] == 9]

### TRAIN global de taille 800 (dans lequel on prendra tous les sets du starting kit)

La dernière colonne contient les labels

In [17]:
X_and_y_class0_train = X_and_y_class0[:800]
print(X_and_y_class0_train.shape)

X_and_y_class1_train = X_and_y_class1[:800]
X_and_y_class2_train = X_and_y_class2[:800]
X_and_y_class3_train = X_and_y_class3[:800]
X_and_y_class4_train = X_and_y_class4[:800]
X_and_y_class5_train = X_and_y_class5[:800]
X_and_y_class6_train = X_and_y_class6[:800]
X_and_y_class7_train = X_and_y_class7[:800]
X_and_y_class8_train = X_and_y_class8[:800]
X_and_y_class9_train = X_and_y_class9[:800]

# TRAIN global par concaténations des 800 premiers de chaque classe pour obtenir un train équilibré
X_and_y_train = pd.concat([X_and_y_class0_train,X_and_y_class1_train, X_and_y_class2_train,X_and_y_class3_train,X_and_y_class4_train,X_and_y_class5_train,X_and_y_class6_train,X_and_y_class7_train,X_and_y_class8_train,X_and_y_class9_train], axis=0, sort=False)
print(X_and_y_train.shape)

(800, 1001)
(8000, 1001)


### VALID global de taille 100 (dans lequel on ne prendra RIEN pour le starting kit)

La dernière colonne contient les labels

In [18]:
X_and_y_class0_valid = X_and_y_class0[800:900]
X_and_y_class1_valid = X_and_y_class1[800:900]
X_and_y_class2_valid = X_and_y_class2[800:900]
X_and_y_class3_valid = X_and_y_class3[800:900]
X_and_y_class4_valid = X_and_y_class4[800:900]
X_and_y_class5_valid = X_and_y_class5[800:900]
X_and_y_class6_valid = X_and_y_class6[800:900]
X_and_y_class7_valid = X_and_y_class7[800:900]
X_and_y_class8_valid = X_and_y_class8[800:900]
X_and_y_class9_valid = X_and_y_class9[800:900]

# TRAIN global par concaténations des 800 premiers de chaque classe pour obtenir un train équilibré
X_and_y_valid = pd.concat([X_and_y_class0_valid,X_and_y_class1_valid, X_and_y_class2_valid,X_and_y_class3_valid,X_and_y_class4_valid,X_and_y_class5_valid,X_and_y_class6_valid,X_and_y_class7_valid,X_and_y_class8_valid,X_and_y_class9_valid], axis=0, sort=False)
print(X_and_y_valid.shape)

(1000, 1001)


### TEST global de taille 100 (dans lequel on ne prendra RIEN pour le starting kit)

La dernière colonne contient les labels

In [19]:
X_and_y_class0_test = X_and_y_class0[800:900]
X_and_y_class1_test = X_and_y_class1[800:900]
X_and_y_class2_test = X_and_y_class2[800:900]
X_and_y_class3_test = X_and_y_class3[800:900]
X_and_y_class4_test = X_and_y_class4[800:900]
X_and_y_class5_test = X_and_y_class5[800:900]
X_and_y_class6_test = X_and_y_class6[800:900]
X_and_y_class7_test = X_and_y_class7[800:900]
X_and_y_class8_test = X_and_y_class8[800:900]
X_and_y_class9_test = X_and_y_class9[800:900]

# TRAIN global par concaténations des 800 premiers de chaque classe pour obtenir un train équilibré
X_and_y_test = pd.concat([X_and_y_class0_test,X_and_y_class1_test, X_and_y_class2_test,X_and_y_class3_test,X_and_y_class4_test,X_and_y_class5_test,X_and_y_class6_test,X_and_y_class7_test,X_and_y_class8_test,X_and_y_class9_test], axis=0, sort=False)
print(X_and_y_test.shape)

(1000, 1001)


# Découpage sample pour starting_kit dans le train global défini dans les cellules précédentes (sample train/valid/test provenant TOUS du train global !!!)




### 1) Il faut re-séparer les features des labels (contenus dans la dernière colonne de X_and_y_train)

In [29]:
X_sample_startingKit = X_and_y_train.iloc[:,:-1]
print(X_sample_startingKit.shape)
#X_sample_startingKit.head(n=2)

Y_sample_startingKit = X_and_y_train.iloc[:,-1]
print(Y_sample_startingKit.shape)

(8000, 1000)
(8000,)


In [31]:
X_sample_startingKit[50:100]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,990,991,992,993,994,995,996,997,998,999
180,0.341633,0.440993,0.509952,0.423177,0.338598,0.39047,0.634911,0.376345,0.329483,0.459987,...,0.381522,0.330865,0.342764,0.413373,0.472934,0.409854,0.45414,0.452469,0.307763,0.459593
182,0.214555,0.68486,0.570595,0.51957,0.090734,0.607159,0.445992,0.263203,0.418204,0.440554,...,0.155288,0.550634,0.31982,0.093627,0.317221,0.242133,0.685767,0.457543,0.376467,0.440837
184,0.566268,0.607626,0.643717,0.469905,0.45473,0.405132,0.717707,0.377301,0.539922,0.60838,...,0.600748,0.43237,0.507892,0.481603,0.599435,0.470757,0.600382,0.473444,0.331542,0.50975
186,0.368406,0.47286,0.573142,0.201227,0.170136,0.546219,0.698731,0.401563,0.476176,0.429403,...,0.205408,0.107961,0.391019,0.289498,0.565784,0.357936,0.41872,0.452516,0.216026,0.296441
195,0.173027,0.508731,0.426622,0.330334,0.1537,0.331619,0.646659,0.528968,0.300872,0.319803,...,0.502974,0.360215,0.315989,0.390461,0.567547,0.381435,0.470391,0.428021,0.207213,0.495934
202,0.633283,0.608561,0.617743,0.139081,0.476393,0.61349,0.860635,0.671405,0.587867,0.582469,...,0.233133,0.534024,0.613289,0.500379,0.61725,0.529457,0.585994,0.486035,0.379864,0.478976
206,0.676396,0.673611,0.522822,0.077804,0.610571,0.760658,0.502143,0.2183,0.279952,0.63762,...,0.201483,0.184115,0.671914,0.550878,0.151255,0.320293,0.565526,0.523423,0.52125,0.652261
212,0.065331,0.274792,0.206423,0.150882,0.068681,0.097633,0.921732,0.732361,0.2231,0.118001,...,0.122319,0.0901,0.077084,0.081269,0.190256,0.115764,0.120371,0.101721,0.076258,0.13295
216,0.537506,0.610978,0.536701,0.562739,0.51285,0.607453,0.557159,0.531162,0.522079,0.602304,...,0.692937,0.587156,0.466182,0.520763,0.179217,0.55114,0.612835,0.524011,0.378265,0.449729
227,0.116659,0.48431,0.395852,0.518796,0.357369,0.197398,0.686479,0.393502,0.376495,0.256521,...,0.552171,0.318906,0.383564,0.208579,0.462173,0.243219,0.297147,0.316627,0.314755,0.416386


### 2) puis écrire dans les fichiers AutoML

In [32]:
with open('../starting_kit/sample_data/hadaca_feat.name', 'w') as f:
    for i in range(X_sample_startingKit.shape[1]):
        f.write('methyl_{}\n'.format(i))

with open('../starting_kit/sample_data/hadaca_train.data', 'w') as f:
    for x in X_sample_startingKit[50:100]:
        for feat in x:
            f.write('{} '.format(feat))

        f.write('\n')

with open('../starting_kit/sample_data/hadaca_test.data', 'w') as f:
    for x in X_sample_startingKit[:50]:
        for feat in x:
            f.write('{} '.format(feat))

        f.write('\n')


with open('../starting_kit/sample_data/hadaca_valid.data', 'w') as f:
    for x in X_sample_startingKit[100:150]:
        for feat in x:
            f.write('{} '.format(feat))

        f.write('\n')

with open('../starting_kit/sample_data/hadaca_train.solution', 'w') as f:
    for x in Y_sample_startingKit[50:100]:
        f.write('{}'.format(x))

        f.write('\n')

with open('../starting_kit/sample_data/hadaca_test.solution', 'w') as f:
    for x in Y_sample_startingKit[:50]:
        f.write('{}'.format(x))

        f.write('\n')


with open('../starting_kit/sample_data/hadaca_valid.solution', 'w') as f:
    for x in Y_sample_startingKit[100:150]:
        f.write('{}'.format(x))

        f.write('\n')

TypeError: 'int' object is not iterable