In [21]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, mean_squared_error, log_loss
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
from imblearn.over_sampling import ADASYN, SMOTE

# 0) Chargement des données

In [16]:
# Load the inputs. clean_data is the data set after the entries with too many
# NaN values have been removed.
df_data = pd.read_csv("data/clean_data.csv")
print("df_data shape:", df_data.shape)

# Metadata table that accompanies the data.
df_metadata = pd.read_csv("data/metadata.csv")
print("df_metadata shape:", df_metadata.shape)

# Preview the first five metadata rows.
df_metadata.head(5)

df_data shape: (20103, 687)
df_metadata shape: (685, 14)


Unnamed: 0,dmprocr_ID,indiv,sample,trscr,cnv,meth,gender,days_to_birth,tumor_stage,da,fut,age_diag,days_to_death,tissue_status
0,97-7552-01,97-7552,1,1,1,1,male,-25578.0,stage ib,alive,1932.0,25578.0,,patho
1,44-7671-01,44-7671,1,0,1,1,male,-23538.0,stage ib,alive,889.0,23538.0,,patho
2,86-7953-01,86-7953,1,1,1,1,female,-25315.0,stage ia,alive,997.0,25315.0,,patho
3,L4-A4E5-01,L4-A4E5,1,1,1,1,female,-17680.0,stage i,alive,578.0,17680.0,,patho
4,NJ-A4YP-01,NJ-A4YP,1,1,1,1,male,-19106.0,stage ib,alive,50.0,19106.0,,patho


In [17]:
# Convert the data to an ndarray, dropping the useless "Unnamed..." columns.
is_unnamed_column = df_data.columns.str.contains('^Unnamed')
# Transpose so that every remaining column of df_data becomes one row of X.
X = df_data.loc[:, ~is_unnamed_column].values.T

print(type(X))
print(X.shape)

# The labels below are taken row by row from df_metadata, so both tables must
# describe the same number of samples.
# NOTE(review): this also assumes the columns of clean_data.csv are in the same
# order as the rows of metadata.csv - TODO confirm.
assert X.shape[0] == len(df_metadata), "X and df_metadata disagree on the number of samples"

# Build integer labels from the chosen metadata columns.
# pd.factorize numbers the categories in order of first appearance and encodes
# missing values as -1.
labelsBinary, valuesBinary = pd.factorize(df_metadata["tissue_status"])
labelsStages, valuesStages = pd.factorize(df_metadata["tumor_stage"])

yBinary = labelsBinary
yStage = labelsStages

# `y` used to be read here without ever being assigned in the notebook (it only
# existed as leftover kernel state), which raises NameError on a fresh kernel.
# NOTE(review): the stage labels are used because the rest of the notebook
# classifies by stage; switch to yBinary if the binary target was intended.
y = yStage

print("Shape of X :", X.shape)
print("Shape of y :", y.shape)

print("labelsBinary :", labelsBinary)
print("valuesBinary :", valuesBinary)

print("labelsStages :", labelsStages)
print("valuesStages :", valuesStages)

<class 'numpy.ndarray'>
(685, 20103)
Shape of X : (685, 20103)
Shape of y : (685,)
labelsBinary : [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0
 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0
 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 1 1 0 0 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 1 1 0 0 0 0 0 0 0
 1 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0

In [18]:
# Unified notation: from here on D is the data matrix (samples x features).

# The data must fit in about 300 MB once the number of observations has been
# enlarged ===>>> cut the number of features.
N_BEST_FEATURES = 1000

# Keep the k best features according to the chi2 test.
# The labels are passed explicitly: the former `y` was never assigned in the
# notebook, so this cell failed with NameError on a fresh kernel.
# NOTE(review): yStage is used because the train/test split further down uses
# the stage labels; confirm this is the intended selection target.
# NOTE(review): chi2 requires non-negative feature values, and the selector is
# fitted on all samples before the train/test split, so test samples influence
# which features are kept - consider fitting it on the training part only.
chi2_selector = SelectKBest(chi2, k=N_BEST_FEATURES)
D = chi2_selector.fit_transform(X, yStage)
print(D.shape)

(685, 1000)


## Enrichissement artificiel des données

In [23]:
# SMOTE / ADASYN need 6 occurrences per class (original author's note), so the
# training set is adjusted before oversampling:
#   - class 8 has a single training occurrence and is removed;
#   - class 2 has five training occurrences, so one of them is duplicated.
# The class codes come from pd.factorize and the counts from this particular
# split (random_state=42); re-check both if the data or the seed changes.
STAGE_TO_DROP = 8
STAGE_TO_DUPLICATE = 2

X_train, X_test, y_train, y_test = train_test_split(D, yStage, test_size=0.2, random_state=42)
# stratify=y is not used: one class has a single occurrence. Open question:
# should that class be dropped altogether?

recounted = Counter(y_train)
print(recounted)

# Remove every training sample of the dropped class.
keep_mask = y_train != STAGE_TO_DROP
X_train_sans_8 = X_train[keep_mask]

# First training sample of the class that needs one more occurrence.
X_2 = X_train[y_train == STAGE_TO_DUPLICATE][0]
print(X_train_sans_8.shape)
print(X_2.shape)

# Append the duplicated sample and its label so features and labels stay aligned.
X_train_sans_8_1 = np.vstack([X_train_sans_8, X_2])
y_train_sans_8 = np.append(y_train[keep_mask], STAGE_TO_DUPLICATE)

assert X_train_sans_8_1.shape[0] == y_train_sans_8.shape[0], "features and labels are misaligned"
# The shape used to be a bare mid-cell expression, which displays nothing.
print(X_train_sans_8_1.shape)

Counter({0: 141, 1: 131, 5: 68, 3: 63, -1: 49, 7: 42, 4: 26, 9: 13, 6: 9, 2: 5, 8: 1})
(547, 1000)
(1000,)


# 1) Classification par stage sur D

# 2) Factorisation de D en A*T

# 3) Classification par stage sur A