In [36]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.metrics import log_loss
import pandas as pd
import numpy as np


In [37]:
# Chargement des données
df_data = pd.read_csv("data/clean_data.csv")
df_metadata = pd.read_csv("data/metadata.csv")

# Convertir les données en ndarray et supprimer les colonnes inutiles
X = df_data.loc[:, ~df_data.columns.str.contains('^Unnamed')].values
X = X.T

# Générer les labels en fonction d'une colonne choisie
s = pd.Series(df_metadata["tumor_stage"].values)
labels,values = pd.factorize(s)
y = labels

print("Shape of X :", X.shape)
print("Shape of y :", y.shape)

print("labels :", values)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)   


Shape of X : (685, 20103)
Shape of y : (685,)
labels : Index(['stage ib', 'stage ia', 'stage i', 'stage iib', 'stage iv',
       'stage iiia', 'not reported', 'stage iia', 'stage ii', 'stage iiib'],
      dtype='object')


In [41]:
df_data_t = df_data.transpose()
df_data_t.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20093,20094,20095,20096,20097,20098,20099,20100,20101,20102
count,687.0,687.0,687.0,687.0,687.0,687.0,687.0,687.0,687.0,687.0,...,687.0,687.0,687.0,687.0,687.0,687.0,687.0,687.0,687.0,687.0
unique,687.0,687.0,687.0,687.0,687.0,687.0,687.0,687.0,687.0,687.0,...,687.0,687.0,687.0,687.0,687.0,687.0,687.0,687.0,687.0,687.0
top,0.660569,0.509137,0.072333,0.191283,0.99105,0.118721,0.011137,0.092511,0.073411,0.110074,...,0.05117,0.079595,0.046512,0.847541,0.111704,0.030079,0.061064,0.326466,0.093704,0.07159
freq,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [30]:
from sklearn.decomposition import TruncatedSVD

model = TruncatedSVD(n_components = 50)
A = model.fit_transform(X)
T = model.components_
D = np.dot(A,T)

In [31]:
mean_squared_error(X,D)

0.0025490126675077876

In [28]:
from sklearn.svm import SVC

model = SVC(C=10.)
model.fit(X_train,y_train)
pred = model.predict(X_test)
print(classification_report(y_test,pred))

             precision    recall  f1-score   support

         -1       0.46      1.00      0.63        12
          0       0.24      0.60      0.35        30
          1       0.30      0.41      0.34        27
          3       0.00      0.00      0.00        28
          4       0.00      0.00      0.00         5
          5       0.00      0.00      0.00        21
          6       0.00      0.00      0.00         1
          7       0.00      0.00      0.00        11
          9       0.00      0.00      0.00         2

avg / total       0.15      0.30      0.20       137



  'precision', 'predicted', average, warn_for)


In [29]:
chi2_selector = SelectKBest(chi2, k=1)
X_kbest = chi2_selector.fit_transform(X, y)
X_train, X_test, y_train, y_test = train_test_split(X_kbest, y, test_size=0.2, random_state=42)   


In [35]:
model = SVC(C=10.)
model.fit(X_train,y_train)
pred = model.predict(X_test)
# print("log loass:", log_loss(y_test,pred))
print(classification_report(y_test,pred))

             precision    recall  f1-score   support

         -1       0.90      0.75      0.82        12
          0       0.24      1.00      0.38        30
          1       0.00      0.00      0.00        27
          3       0.00      0.00      0.00        28
          4       0.00      0.00      0.00         5
          5       0.00      0.00      0.00        21
          6       0.00      0.00      0.00         1
          7       0.00      0.00      0.00        11
          9       0.00      0.00      0.00         2

avg / total       0.13      0.28      0.16       137



  'precision', 'predicted', average, warn_for)


In [24]:
print(list(zip(X_train,y_train)))

[(array([0.84089555]), 0), (array([0.57640103]), 0), (array([0.66398585]), 0), (array([0.67277344]), 0), (array([0.60502405]), 0), (array([0.61642684]), 0), (array([0.04111567]), 1), (array([0.10300795]), 1), (array([0.64098739]), 0), (array([0.19455798]), 0), (array([0.76831346]), 0), (array([0.58581236]), 0), (array([0.48485486]), 0), (array([0.71270767]), 0), (array([0.10144172]), 0), (array([0.03770736]), 1), (array([0.37755857]), 0), (array([0.5884909]), 0), (array([0.79074771]), 0), (array([0.60450463]), 0), (array([0.89080295]), 0), (array([0.63324632]), 0), (array([0.10078853]), 1), (array([0.47341552]), 0), (array([0.56303475]), 0), (array([0.89449985]), 0), (array([0.16192081]), 0), (array([0.19658556]), 0), (array([0.70556838]), 0), (array([0.5733945]), 0), (array([0.06238439]), 1), (array([0.69972305]), 0), (array([0.54438825]), 0), (array([0.60958904]), 0), (array([0.05688243]), 0), (array([0.43855204]), 0), (array([0.05186854]), 1), (array([0.12085269]), 1), (array([0.265