In [3]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, mean_squared_error, log_loss
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
from imblearn.over_sampling import ADASYN, SMOTE
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.decomposition import TruncatedSVD

# 0) Chargement des données

In [6]:
# Load the input tables. clean_data.csv is the data already cleaned of its
# overly numerous NaNs; metadata.csv is the accompanying metadata table.
df_data = pd.read_csv("clean_data.csv")
print("df_data shape:", df_data.shape)
df_metadata = pd.read_csv("metadata.csv")
print("df_metadata shape:", df_metadata.shape)
# Preview the first five metadata rows (rendered as the cell output).
df_metadata.head(5)

FileNotFoundError: File b'clean_data.csv' does not exist

In [4]:
# Convert the data to an ndarray, dropping the useless "Unnamed*" columns,
# then transpose so that the former columns become the rows of D.
# NOTE(review): the rows of D are presumably in the same order as the rows of
# df_metadata -- nothing here checks it; confirm.
keep_columns = ~df_data.columns.str.contains('^Unnamed')
D = df_data.loc[:, keep_columns].values.T

print(type(D))
print(D.shape)

# Generate the labels from the chosen metadata columns.
status = pd.Series(df_metadata["tissue_status"].values)
stage = pd.Series(df_metadata["tumor_stage"].values)

# pd.factorize returns (integer codes, unique values); missing values are
# encoded with the sentinel code -1.
yBinary, valuesBinary = pd.factorize(status)
yStage, valuesStages = pd.factorize(stage)

# Aliases of the same code arrays, under the names used for display.
labelsBinary = yBinary
labelsStages = yStage

print("labelsBinary :", labelsBinary)
print("valuesBinary :", valuesBinary)

print("labelsStages :", labelsStages)
print("valuesStages :", valuesStages)

<class 'numpy.ndarray'>
(685, 20103)
labelsBinary : [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0
 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0
 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 1 1 0 0 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 1 1 0 0 0 0 0 0 0
 1 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0

In [7]:
# Target: reach 300 MB once the number of observations has been enlarged,
# hence the features are cut down.
# Select the k best features according to the chi2 test.
# NOTE(review): the selector is fitted on the whole of D, i.e. before the
# train/test split performed later in the notebook, and it is scored against
# the binary labels whereas the later classifiers predict yStage -- confirm
# that both choices are intended.
chi2_selector = SelectKBest(score_func=chi2, k=1000)
chi2_selector.fit(D, labelsBinary)
D = chi2_selector.transform(D)
print(D.shape)

(685, 1000)


## Enrichissement artificiel des données

Problème potentiel : des exemples de la classe 8 peuvent se retrouver dans le jeu de test ; ils en sont retirés à la fin de la cellule suivante.

In [8]:
# SMOTE / ADASYN need 6 occurrences per class: class 8 is removed and one
# occurrence of class 2 is duplicated to bring it up to 6 (the SMOTE minimum).

X_train, X_test, y_train, y_test = train_test_split(D, yStage, test_size=0.2, random_state=42)
# Open question: use stratify=y? One class has a single occurrence -- drop it?

recounted = Counter(y_train)
print(recounted)

# Training rows outside class 8, and the first training row of class 2.
train_not_8 = y_train != 8
X_train_sans_8 = X_train[train_not_8]
X_2 = X_train[y_train == 2][0]
print(X_train_sans_8.shape)
print(X_2.shape)

# Duplicate one example of class 2 (appended as the last row).
X_train_sans_8_double_2 = np.vstack((X_train_sans_8, X_2))
print(X_train_sans_8_double_2.shape)

# Matching labels: drop class 8, then append the label of the duplicated row.
y_train_sans_8 = y_train[train_not_8]
print(y_train_sans_8)
y_train_sans_8 = np.append(y_train_sans_8, 2)
print(y_train_sans_8)

# Remove any instance of the too-small class 8 from the test set as well.
test_not_8 = y_test != 8
X_test = X_test[test_not_8]
y_test = y_test[test_not_8]

Counter({0: 141, 1: 131, 5: 68, 3: 63, -1: 49, 7: 42, 4: 26, 9: 13, 6: 9, 2: 5, 8: 1})
(547, 1000)
(1000,)
(548, 1000)
[ 0  0  7  1  7  0  5  3  4 -1  0  0  5  0  7  0  6 -1  1  3  0  0  1  1
  3 -1  1  5  1  5  0  3  0  4  7  7  3  3  0  0  1  0  0  0  4 -1 -1 -1
  1  0  1  0 -1  0  9  6  5  0  0  0 -1  3  1  7  0  0  5  0  1  1  1  1
 -1  5  4  3  4  3 -1  0  7  0  5  1  1  6  1  9  1  0  7  5  1  1  0  5
  7  0  5  0  3  0  3  0  0  6  7  0  1  1  5  3  7  0  1  5  5  1  1  7
  0  1  0 -1 -1  0  1  0  0 -1  9  0 -1  0  3  1  0  7  5  1  3 -1  1  1
  7  0  1  1  0  1  1  6  5  7  9  1  1  0  7  3  1  1  0  7  4  4  3  5
  5  3  5 -1 -1  0  7  3  0  1  3  6  0  9  0  6  0  4  0  3 -1  1  7 -1
  3  1  4  0  1  4  1  1  1  5  0 -1  9  1  0  1  1 -1  5  5  0 -1  3  4
  0  1  7 -1  1  1  1  1  7  3  7  0  0  4  5  5  0  3  2  5  7 -1  1  7
  0  5  1  0  1  1  0  1  3  1  5  1 -1  0  3  0  7  1  5  3  9 -1  0  5
  1  1  5  1  9  9  3  3  0  0  1  1 -1 -1  5  1  1  1  0  0  0  2  0  0
  6  

In [9]:
# From here on, the training set is the version without class 8 and with the
# duplicated class-2 example.
X_train, y_train = X_train_sans_8_double_2, y_train_sans_8

In [10]:
# Desired number of occurrences per class after oversampling.
# Class 8 was removed because it had a single occurrence: SMOTE / ADASYN cannot
# be applied to it, and generating a population from one example is meaningless.
# The mapping is deliberately NOT called `dict`, which would shadow the builtin
# for every cell executed afterwards.
sampling_strategy = {0: 1000, 1: 1000, 5: 1000, 3: 1000, -1: 1000, 7: 1000, 4: 1000, 9: 1000, 6: 1000, 2: 1000}
smote = SMOTE(random_state=42, sampling_strategy=sampling_strategy)

X_resampled, y_resampled = smote.fit_resample(X_train, y_train)
print(X_resampled.shape)
print(X_train.shape)

  n_samples_majority))
  n_samples_majority))
  n_samples_majority))
  n_samples_majority))
  n_samples_majority))
  n_samples_majority))
  n_samples_majority))
  n_samples_majority))
  n_samples_majority))
  n_samples_majority))


(10000, 1000)
(548, 1000)


# 1) Classification par stage sur D

NB : les données enrichies (sur-échantillonnées par SMOTE) utilisées pour l'entraînement sont dans `X_resampled`, `y_resampled` ; l'évaluation se fait sur `X_test`, `y_test`.

In [52]:
# XGBoost

# The stage labels are not consecutive codes (they contain -1 and skip 8),
# whereas current XGBoost releases require class labels 0..K-1. Encode them to
# consecutive integers for training and decode the predictions afterwards, so
# pred_train / pred_test stay in the original label space.
# Note: XGBclass.classes_ therefore refers to the encoded labels.
stage_classes, y_resampled_codes = np.unique(y_resampled, return_inverse=True)

# Multi-class target, so use a multi-class objective ('binary:logistic' only
# describes a two-class problem).
XGBclass = XGBClassifier(objective='multi:softprob', max_depth=3, n_estimators=100, learning_rate=0.1, n_jobs=3)

XGBclass.fit(X_resampled, y_resampled_codes)
pred_train = stage_classes[XGBclass.predict(X_resampled)]  # sanity check on the training set

print("Train:")
print(classification_report(y_resampled, pred_train))

pred_test = stage_classes[XGBclass.predict(X_test)]
print("Test:")
print(classification_report(y_test, pred_test))

Train:
              precision    recall  f1-score   support

          -1       1.00      1.00      1.00      1000
           0       1.00      0.99      0.99      1000
           1       1.00      1.00      1.00      1000
           2       1.00      1.00      1.00      1000
           3       1.00      1.00      1.00      1000
           4       1.00      1.00      1.00      1000
           5       1.00      1.00      1.00      1000
           6       1.00      1.00      1.00      1000
           7       1.00      1.00      1.00      1000
           9       1.00      1.00      1.00      1000

   micro avg       1.00      1.00      1.00     10000
   macro avg       1.00      1.00      1.00     10000
weighted avg       1.00      1.00      1.00     10000

Test:
              precision    recall  f1-score   support

          -1       0.86      1.00      0.92        12
           0       0.15      0.20      0.17        30
           1       0.28      0.30      0.29        27
           

  'precision', 'predicted', average, warn_for)


In [11]:
# Random forest

# random_state is fixed so that the forest (bootstrap samples and feature
# sub-sampling) and therefore the reported scores are reproducible on re-run.
RFclass = RandomForestClassifier(n_estimators = 200, max_depth = 20, n_jobs = 3, random_state = 42)
RFclass.fit(X_resampled,y_resampled)

pred_train = RFclass.predict(X_resampled) # sanity check on the training set

print("Train:")
print(classification_report(y_resampled,pred_train))

pred_test = RFclass.predict(X_test)
print("Test:")
print(classification_report(y_test,pred_test))

Train:
              precision    recall  f1-score   support

          -1       1.00      1.00      1.00      1000
           0       1.00      1.00      1.00      1000
           1       1.00      1.00      1.00      1000
           2       1.00      1.00      1.00      1000
           3       1.00      1.00      1.00      1000
           4       1.00      1.00      1.00      1000
           5       1.00      1.00      1.00      1000
           6       1.00      1.00      1.00      1000
           7       1.00      1.00      1.00      1000
           9       1.00      1.00      1.00      1000

   micro avg       1.00      1.00      1.00     10000
   macro avg       1.00      1.00      1.00     10000
weighted avg       1.00      1.00      1.00     10000

Test:
              precision    recall  f1-score   support

          -1       0.55      1.00      0.71        12
           0       0.19      0.37      0.25        30
           1       0.38      0.44      0.41        27
           

  'precision', 'predicted', average, warn_for)


In [73]:
# SVM

# Support-vector classifier with C=1 and otherwise default settings, trained
# on the oversampled training set.
SVCclass = SVC(C=1)
SVCclass.fit(X_resampled, y_resampled)

# Report on the training data first ...
pred_train = SVCclass.predict(X_resampled)
print("Train:")
print(classification_report(y_true=y_resampled, y_pred=pred_train))

# ... then on the held-out test set.
pred_test = SVCclass.predict(X_test)
print("Test:")
print(classification_report(y_true=y_test, y_pred=pred_test))



Train:
              precision    recall  f1-score   support

          -1       0.85      0.98      0.91      1000
           0       0.34      0.48      0.39      1000
           1       0.28      0.61      0.38      1000
           2       1.00      1.00      1.00      1000
           3       0.91      0.14      0.25      1000
           4       0.74      0.60      0.66      1000
           5       0.98      0.10      0.17      1000
           6       1.00      0.83      0.91      1000
           7       0.49      0.73      0.58      1000
           9       0.97      0.80      0.88      1000

   micro avg       0.63      0.63      0.63     10000
   macro avg       0.76      0.63      0.61     10000
weighted avg       0.76      0.63      0.61     10000

Test:
              precision    recall  f1-score   support

          -1       0.57      1.00      0.73        12
           0       0.25      0.27      0.26        30
           1       0.27      0.30      0.28        27
           

  'precision', 'predicted', average, warn_for)


# 2) Factorisation de D en A*T

Question ouverte : comparer les résultats obtenus après une factorisation NMF et après une factorisation TruncatedSVD ?

In [72]:
# Factorise D (samples x features) as A @ T with a rank-100 truncated SVD.
# random_state is fixed because the default solver is randomized.
# NOTE(review): the SVD is fitted on the whole of D, which includes the rows
# later used as the test set -- confirm this is acceptable.
svd = TruncatedSVD(n_components = 100, random_state = 42)

A = svd.fit_transform(D)
T = svd.components_
print(A.shape)
print(T.shape)

# Quality of the factorisation: root-mean-square reconstruction error.
# mean_squared_error returns the MSE, so the square root is needed for the
# value to match the name `rmse`.
rmse = np.sqrt(mean_squared_error(D, np.dot(A, T)))
print("RMSE:", rmse)

(685, 100)
(100, 1000)


# 3) Classification par stage sur A

In [None]:
# Random forest on the SVD-reduced representation

# This section classifies in the factor space, so project the oversampled
# training set and the test set onto the components of the fitted `svd`
# instead of reusing the full-feature matrices.
A_resampled = svd.transform(X_resampled)
A_test = svd.transform(X_test)

# random_state is fixed so the scores are reproducible on re-run.
RFclass = RandomForestClassifier(n_estimators = 200, max_depth = 20, n_jobs = 3, random_state = 42)
RFclass.fit(A_resampled,y_resampled)

pred_train = RFclass.predict(A_resampled) # sanity check on the training set

print("Train:")
print(classification_report(y_resampled,pred_train))

pred_test = RFclass.predict(A_test)
print("Test:")
print(classification_report(y_test,pred_test))

In [None]:
# SVM on the SVD-reduced representation

# Project both sets onto the components of the fitted `svd`, so the model is
# trained and evaluated in the same 100-dimensional space. Training on the
# full-feature matrix and predicting on the reduced one cannot work, and
# A_test has to be defined before it is used.
A_resampled = svd.transform(X_resampled)
A_test = svd.transform(X_test)

SVCclass = SVC(C=1)
SVCclass.fit(A_resampled,y_resampled)
pred_train = SVCclass.predict(A_resampled)

print("Train:")
print(classification_report(y_resampled,pred_train))
pred_test = SVCclass.predict(A_test)

print("Test:")
print(classification_report(y_test,pred_test))

# 4) Comparaison des classifications en partant de D et de A