# __Mini Projet Apprentissage Artificiel__ 

We start by importing all the modules we will use in this notebook.
We will use __sklearn__ as our main framework for ML.

In [25]:
import numpy as np
import pandas as pd 
import scipy.io
import sklearn
import matplotlib as plt
from urllib.request import urlretrieve
import zipfile

We then load our data and visualize it.

In [13]:
# Download / extraction steps, left disabled: the load below expects the
# archive to be already unpacked under ./Projet_Expression.
#   path = "./Projet_Expression.zip"
#   urlretrieve("https://moodle.psl.eu/pluginfile.php/614743/mod_resource/content/14/Projet_Expression.zip",path)
#   with zipfile.ZipFile(path,'r') as zip_ref:
#       zip_ref.extractall("./Projet_Expression")

# Read the MATLAB file; its variables are looked up by name below.
data_matrix = scipy.io.loadmat("./Projet_Expression/gliome.mat")

# 'X' is used as the input matrix; the first column of 'Y' is used as the
# 1-D label vector.
X, y = data_matrix['X'], data_matrix['Y'][:, 0]

print(f"The data has the dimension : \n • Input size : {X.shape} \n • Output size : {y.shape}")

The data has the dimension : 
 • Input size : (50, 4434) 
 • Output size : (50,)


In [24]:
# Tabular preview of the inputs: one column per input feature, with the label
# vector appended as a trailing 'y' column.
df = pd.DataFrame(X)
df.insert(df.shape[1], 'y', y)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4425,4426,4427,4428,4429,4430,4431,4432,4433,y
0,1.877947,1.30103,1.950365,2.765147,2.383456,1.935003,1.805501,1.489958,1.949878,1.984977,...,3.593485,1.686636,1.30103,1.30963,2.062958,1.332438,2.113275,2.609381,3.687966,1
1,2.651036,2.290211,2.436589,2.42753,2.81869,2.22849,2.174424,2.106459,2.09149,2.137432,...,3.498401,2.493015,1.76626,1.869179,2.238474,1.30103,1.929475,2.519645,3.951704,1
2,2.221108,2.302321,3.045137,2.438125,2.462685,1.971209,2.288998,1.436299,2.041398,2.232764,...,3.586848,2.300823,1.886583,1.806535,1.30103,1.861657,1.30103,1.30103,3.705766,1
3,2.16883,2.125105,2.993723,2.253585,2.675738,2.120541,2.464906,1.484127,1.978128,2.364278,...,3.498593,2.439544,1.881537,1.934864,1.30103,1.30103,1.30103,1.838071,3.826369,1
4,2.575785,1.705988,2.513637,2.906606,2.578367,2.20021,2.367455,1.959123,1.8975,1.956192,...,3.437832,2.22466,1.59309,1.761575,1.30103,1.602045,1.410159,1.847608,4.082294,1


This problem is a supervised classification problem. 
We will try 2 different models to classify our data into 4 different classes (glioma types).

First, we need to split our data into 3 sets: Train, Validation and Test.
In order to compare performance across different training sets, we will generate 2 triplets of sets.

In [27]:
from sklearn.model_selection import train_test_split

# Two stratified 70/30 train/test splits of the same data, so that models can
# be compared on different training sets. Each split needs its own seed: with
# the same random_state and the same arguments, both calls return exactly the
# same partition and the comparison is meaningless.
# No separate validation set is carved out here; the grid search cell below
# uses 5-fold cross-validation on the training set.
SPLIT_SEED_1 = 84
SPLIT_SEED_2 = 42

X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(
    X, y, test_size=0.3, random_state=SPLIT_SEED_1, stratify=y)

X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(
    X, y, test_size=0.3, random_state=SPLIT_SEED_2, stratify=y)

In [None]:
# The import cell binds `plt` to the top-level `matplotlib` package, which has
# no `subplot` / `hist` / `xlabel`; rebind it to the pyplot interface so this
# cell can actually draw.
import matplotlib.pyplot as plt

# Label distribution of every train/test subset, one panel per subset, laid
# out row by row in a 2x2 grid.
label_sets = [
    (y_test_1, 'Test Labels 1'),
    (y_train_1, 'Train Labels 1'),
    (y_test_2, 'Test Labels 2'),
    (y_train_2, 'Train Labels 2'),
]

fig, axes = plt.subplots(2, 2)
for ax, (labels, caption) in zip(axes.flat, label_sets):
    ax.hist(labels)
    ax.set_xlabel(caption)

# Keep the x-labels of the top row from overlapping the bottom row.
fig.tight_layout()
plt.show()


### Random Forest Decision Tree Classifier (RFD)

In [29]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

# Baseline model: a single decision tree (depth limited to 5) fitted on the
# first training split.
# random_state is pinned because scikit-learn permutes the candidate features
# at every split, even with splitter='best' and max_features=None: when
# several splits are equally good, an unseeded tree can pick a different one
# on each run, and the metrics printed below would not be reproducible.
clf = tree.DecisionTreeClassifier(criterion='gini', splitter='best', max_depth=5,
                                  min_samples_split=2, min_samples_leaf=1,
                                  min_weight_fraction_leaf=0.0, max_features=None,
                                  random_state=84, max_leaf_nodes=None,
                                  min_impurity_decrease=1e-07, class_weight=None)
clf = clf.fit(X_train_1, y_train_1)


# Evaluate accuracy on test data
print(clf)
score = clf.score(X_test_1, y_test_1)
print("Acuracy (on test set) = ", score)

# Per-class precision / recall and the confusion matrix on the same test set.
y_true, y_pred = y_test_1, clf.predict(X_test_1)
print( classification_report(y_true, y_pred) )
print("\n CONFUSION MATRIX")
print( confusion_matrix(y_true, y_pred) )


DecisionTreeClassifier(max_depth=5, min_impurity_decrease=1e-07,
                       min_samples_split=4)
Acuracy (on test set) =  0.6
              precision    recall  f1-score   support

           1       0.50      1.00      0.67         4
           2       1.00      0.25      0.40         4
           3       0.50      0.33      0.40         3
           4       0.75      0.75      0.75         4

    accuracy                           0.60        15
   macro avg       0.69      0.58      0.55        15
weighted avg       0.70      0.60      0.56        15


 CONFUSION MATRIX
[[4 0 0 0]
 [3 1 0 0]
 [1 0 1 1]
 [0 0 1 3]]


In [30]:
from sklearn import model_selection

# Hyper-parameter grid for the decision tree. Every candidate must lie inside
# the range scikit-learn accepts, otherwise the corresponding fits fail and
# only waste search time:
#   * min_samples_split must be an integer >= 2 (1 is rejected);
#   * min_weight_fraction_leaf must lie in [0, 0.5].
params = {'criterion' : ['entropy','gini'],
          'splitter' : ['best','random'],
          'max_depth' : np.arange(1,10,1,dtype = int),
          'min_samples_split' : np.arange(2,4,1,dtype = int),
          'min_samples_leaf' : np.arange(1,4,1,dtype = int),
          'min_weight_fraction_leaf' : [0.0, 0.1, 0.2, 0.3, 0.4, 0.5],
          'min_impurity_decrease' : np.logspace(-10,-3,7)
        }

# Seeded base estimator so that the search (splitter='random' in particular)
# gives the same ranking of candidates on every run.
DTC = tree.DecisionTreeClassifier(random_state=84)

# Exhaustive search scored by accuracy with 5-fold cross-validation on the
# first training split; the test split is not touched here.
grid_cv = model_selection.GridSearchCV(DTC,param_grid= params,cv = 5,scoring='accuracy')
grid_cv.fit(X_train_1,y_train_1)



In [None]:
# GridSearchCV refits the winning configuration on the whole of X_train_1 by
# default (refit=True), so best_estimator_ is used as is. Fitting it a second
# time is redundant and, for an unseeded tree, could produce a different model
# from the one the search returned.
best_DTC = grid_cv.best_estimator_
best_params = grid_cv.best_params_
print(best_params)

# Evaluate accuracy on test data
print(best_DTC)
score = best_DTC.score(X_test_1, y_test_1)
print("Acuracy (on test set) = ", score)

# Per-class precision / recall and the confusion matrix on the same test set.
y_true, y_pred = y_test_1, best_DTC.predict(X_test_1)
print( classification_report(y_true, y_pred) )
print("\n CONFUSION MATRIX")
print( confusion_matrix(y_true, y_pred) )