In [1]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, mean_squared_error, log_loss
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
import numpy as np


In [2]:
# Chargement des données
df_data = pd.read_csv("data/clean_data.csv")
df_metadata = pd.read_csv("data/metadata.csv")
# df_data_t = df_data.transpose()
# df_metadata

# Convertir les données en ndarray et supprimer les colonnes inutiles
X = df_data.loc[:, ~df_data.columns.str.contains('^Unnamed')].values
X = X.T

# Générer les labels en fonction d'une colonne choisie
s = pd.Series(df_metadata["tumor_stage"].values)
labels,values = pd.factorize(s)
y = labels

print("Shape of X :", X.shape)
print("Shape of y :", y.shape)

print("labels :", values)


Shape of X : (685, 20103)
Shape of y : (685,)
labels : Index(['stage ib', 'stage ia', 'stage i', 'stage iib', 'stage iv',
       'stage iiia', 'not reported', 'stage iia', 'stage ii', 'stage iiib'],
      dtype='object')


In [3]:
# selection des k best features grâce au test chi2

chi2_selector = SelectKBest(chi2, k=1000)
X = chi2_selector.fit_transform(X, y)
print(X.shape)

(685, 1000)


In [4]:
def generate_artificial_data(data, lbls, nb_of_new, noise = True):
    
    sigma_noise = 10e-2
    new_X = np.zeros((data.shape[1],0))
    new_y = np.zeros((0))

    for label in np.unique(y):

        X_cond = data[np.where(lbls==label)] # X correspondant à un label    
        # estimation des paramètres de la normale
        mean_X = np.mean(X_cond, axis=0)
        std_X = np.std(X_cond, axis=0) 

        # nombre de nouveaux patiends à ajouter

        nb_new_patients_cond = int(nb_of_new*(len(lbls[np.where(lbls==label)])/lbls.shape[0]))
        

        # generer les labels
        
        y_cond = np.empty(nb_new_patients_cond)
        y_cond.fill(label)

        
        # boucle qui genere des nombres aléatoires suivant
        # une loi normale avec les paramètres calculés

        new_X_cond = np.zeros((0,nb_new_patients_cond))

        for i in range(X.shape[1]):
            new_feature = np.random.normal(mean_X[i], std_X[i], (1,nb_new_patients_cond))
            if(noise):
                new_feature += np.random.normal(0,1, new_feature.shape)*sigma_noise
            new_X_cond = np.append(new_X_cond,new_feature, axis=0)

        new_X = np.concatenate((new_X,new_X_cond),axis=1)
        new_y = np.append(new_y, y_cond)

    new_X = MinMaxScaler().fit_transform(new_X.T)
    
    
    print(new_X.shape)
    print(new_y.shape)
    
    return (new_X,new_y)



In [5]:
X_art,y_art = generate_artificial_data(X,y,15001)

X_train, X_test, y_train, y_test = train_test_split(X_art, y_art, test_size=0.2, random_state=42)

(14994, 1000)
(14994,)


In [8]:
from sklearn.decomposition import TruncatedSVD

svd = TruncatedSVD(n_components = 5)
A = svd.fit_transform(X_art)
T = svd.components_
print(A.shape)
print(T.shape)
D = np.dot(A,T)
print(D.shape)
################ WARNING ###############
# pearson = np.corrcoef(D,X_art)
# print("Pearson =", pearson) 
########################################

rmse = mean_squared_error(D,X_art)
print("RMSE =", rmse)


(14994, 5)
(5, 1000)
(14994, 1000)
RMSE = 0.01402773798497853


In [14]:
X_train, X_test, y_train, y_test = train_test_split(A, y_art, test_size=0.2, random_state=42)

In [15]:
from sklearn.svm import SVC

model = SVC(C=10.)
model.fit(X_train,y_train)
pred_train = model.predict(X_train)

print("Train:")
print(classification_report(y_train,pred_train))
pred_test = model.predict(X_test)

print("Test:")
print(classification_report(y_test,pred_test))

Train:
             precision    recall  f1-score   support

       -1.0       1.00      1.00      1.00      1072
        0.0       0.69      0.79      0.74      2962
        1.0       0.86      0.91      0.88      2804
        2.0       1.00      1.00      1.00        87
        3.0       0.66      0.65      0.65      1584
        4.0       0.74      0.64      0.69       539
        5.0       0.63      0.48      0.54      1564
        6.0       0.99      0.97      0.98       183
        7.0       0.98      0.98      0.98       939
        8.0       1.00      0.27      0.42        15
        9.0       0.90      0.80      0.85       246

avg / total       0.78      0.79      0.78     11995

Test:
             precision    recall  f1-score   support

       -1.0       1.00      1.00      1.00       263
        0.0       0.70      0.78      0.74       782
        1.0       0.85      0.86      0.86       656
        2.0       1.00      1.00      1.00        22
        3.0       0.69      0

In [12]:
X_train_art,y_train_art = generate_artificial_data(X,y,15001)
X_test_art, y_test_art = generate_artificial_data(X,y,5001)

df_train_X = pd.DataFrame(X_train_art)
df_train_X.to_csv("data/challenge/train/data.csv")
df_train_y = pd.DataFrame(y_train_art)
df_train_y.to_csv("data/challenge/train/metadata.csv")

df_test_X = pd.DataFrame(X_test_art)
df_test_X.to_csv("data/challenge/test/data.csv")
df_test_y = pd.DataFrame(y_test_art)
df_test_y.to_csv("data/challenge/test/metadata.csv")

(14994, 1000)
(14994,)
(4996, 1000)
(4996,)


In [14]:
ar1 = np.zeros((1,5))
print(ar1)
print(ar1 + np.random.normal(0,1, ar1.shape)*1e-2)

[[0. 0. 0. 0. 0.]]
[[ 0.00163597  0.0052493   0.00546016 -0.00536443  0.0025316 ]]
