# I. Import the libraries/dataset

In [1]:
import pandas as pd
import plotly.express as px
import numpy as np

#Scikit learn models
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score
from sklearn.ensemble import GradientBoostingClassifier

# Evaluate the model
from sklearn.metrics import classification_report,f1_score,precision_recall_fscore_support


In [2]:
df_train = pd.read_csv("dataset/sat.trn", sep = " ",header=None)
df_test = pd.read_csv("dataset/sat.tst", sep = " ",header=None)

In [3]:
df_train_central = pd.concat([df_train.iloc[:,17:21], df_train.iloc[:,36]],axis=1)
df_test_central = pd.concat([df_test.iloc[:,17:21], df_test.iloc[:,36]],axis=1)

# II. Data exploration

## 1. Contenu du dataset

- La base de donnée consiste a des  valeurs multi spectrale de pixels dans une image satellite. La classe à prédire et le pixel central dans chaque groupe de voisins. La classe à prédire à été transformé et pour des raisons de facilité à des valeurs entre 1 et 7.
Nous avons 4435 valeurs dans le training set et 2000 dans le testing set.

- Chaque image prise par le satellite landsat consiste en 4 différentes images de la meme scène mais prise dans des valeurs spectrales différentes (2 dans le domaine du visible et 2 dans l'infrarouge). Chaque image à une résolution de 2340 x 3380 

- Dans notre dataset, on à une petite partie de l'image (82 x 100 pixels) 

Sur le schéma ci-dessous j'ai représenté 1 ligne du dataframe. La ligne répresente en fait 1 pixel (rouge) qu'on cherche à prédire, ainsi que ces voisins. Comme on peut le voir les pixels sont mis à la ligne les uns a la suite des autres, et chaque pixels est représenté par 4 valeurs entre 0 et 255.

![Getting Started](pixels.png)

C'est un problème de multi classififcation, le but étant de savoir si le pixel de l'image, correspond à de la terre grise, rouge ou des vegetaux etc...

## 2. Classe à prédire

Tout d'abord en regardant les labels, et le nombre dans chaque classe, on se rend compte que elles ne sont pas bien répartis ce qui peut poser plusieurs problèmes
- L'accuracy n'est pas une bonne metrics pour vérifier notre classification 

Nous allons donc utiliser d'autres metrics pour vérifier la prédiction de notre algortihme 
- confusion matrix 
- score F1 (qui est une moyenne de la précision et du rappel) 
- ROC curve

In [28]:
dicto = dict()
def get_dict(x):
    #if dicto[x]:
    dicto[x] +=1
    #else:
     #   dicto["x"] =1
    return x

df_train.apply(lambda x : get_dict(x))
dicto

TypeError: 'Series' objects are mutable, thus they cannot be hashed

In [25]:
def plot_distribution(df):
    index =  [1,2,3,4]
    count = [1,2,3,4]
    d = dict()
    d["col1"] = count
    d["col2"] = index
    df = pd.DataFrame(d)
    fig = px.bar(df, x="col1", y="col2")
    fig.show()

plot_distribution(df_train)

In [11]:
def plot(df):
    index =  df.iloc[:,36].value_counts().index.to_list()
    count = df.iloc[:,36].value_counts().to_list()
    count.append(0)
    index.append(0)
    count_df = df.iloc[:,36].value_counts().to_frame()
    count_df.loc[6]= 0
    count_df.sort_index(inplace = True)
    bins_label = ["1. red soil","2. cotton crop","3. grey soil","4. damp grey soil","5. soil with vegetation stubble","mixture class (all types present)","7. very damp grey soil"]
    count_df["labels"] = bins_label

    fig = px.bar(count_df, x=bins_label, y=36)
    fig.show()

plot(df_train)

In [5]:
def line_plot(x,y,title):
    fig = px.line( x=x, y=y, title=title)
    fig.show()

# III. Models

On définit le la matrice test x pour les valeurs et y pour les labels

In [52]:
df_train[df_train.iloc[:,36] == 7] = 6 
df_test[df_test.iloc[:,36] == 7] = 6 


df_train_central[df_train_central.iloc[:,4] == 7] = 6 
df_test_central[df_test_central.iloc[:,4] == 7] = 6 

In [53]:
df_test.iloc[:,36].value_counts()

6    470
1    461
3    397
5    237
2    224
4    211
Name: 36, dtype: int64

In [32]:
x_train = df_train.iloc[:,:36]
y_train = df_train.iloc[:,36]

x_test = df_test.iloc[:,:36]
y_test = df_test.iloc[:,36]

In [33]:
x_train_central = df_train_central.iloc[:,:4]
y_train_central = df_train_central.iloc[:,4]

x_test_central = df_test_central.iloc[:,:4]
y_test_central = df_test_central.iloc[:,4]

Nous allons tester plusieurs modèles pour répondre à ce problème, et compararer les résultats entre eux.

In [30]:
f1_score_weighted_avg = []
algo_used = []

## 1. KNN

In [108]:
knn = KNeighborsClassifier(n_neighbors=5, metric='euclidean')
#knn.fit(x_train,y_train)
knn.fit(x_train_central,y_train_central)

KNeighborsClassifier(metric='euclidean')

In [109]:
#y_pred = knn.predict(x_test)
y_pred = knn.predict(x_test_central)

In [27]:
#print(classification_report(y_test,y_pred))
print(classification_report(y_test_central,y_pred))

              precision    recall  f1-score   support

           1       0.95      0.97      0.96       461
           2       0.96      0.93      0.94       224
           3       0.89      0.93      0.91       397
           4       0.83      0.79      0.81       211
           5       0.90      0.87      0.89       237
           6       1.00      1.00      1.00       470

    accuracy                           0.93      2000
   macro avg       0.92      0.91      0.92      2000
weighted avg       0.93      0.93      0.93      2000



In [110]:
algo_used.append("KNN")


score = f1_score(y_test_central,y_pred, average='weighted')
f1_score_weighted_avg.append(round(score,3))
f1_score_weighted_avg

[0.931]

## 2. Decision tree

In [111]:
clf = DecisionTreeClassifier(criterion = "gini")
#clf.fit(x_train,y_train)
clf.fit(x_train_central,y_train_central)

DecisionTreeClassifier()

In [112]:
#y_pred = clf.predict(x_test)
y_pred = clf.predict(x_test_central)

In [33]:
#print(classification_report(y_test,y_pred))
print(classification_report(y_test_central,y_pred))

              precision    recall  f1-score   support

           1       0.93      0.93      0.93       461
           2       0.91      0.92      0.92       224
           3       0.85      0.88      0.86       397
           4       0.74      0.71      0.72       211
           5       0.86      0.83      0.84       237
           6       1.00      1.00      1.00       470

    accuracy                           0.90      2000
   macro avg       0.88      0.88      0.88      2000
weighted avg       0.90      0.90      0.90      2000



In [113]:
algo_used.append("Decision tree")

score = f1_score(y_test,y_pred, average='weighted')
f1_score_weighted_avg.append(round(score,3))
f1_score_weighted_avg

[0.931, 0.902]

## 2. Random Forest

On va maintenant utiliser le random Forst pour essayer d'augmenter la classification. 

In [34]:
clf = RandomForestClassifier(max_depth= 40, min_samples_split= 10, n_estimators= 25)
#clf.fit(x_train, y_train)
clf.fit(x_train_central, y_train_central)

RandomForestClassifier(max_depth=40, min_samples_split=10, n_estimators=25)

In [36]:
#y_pred = clf.predict(x_test)
y_pred = clf.predict(x_test_central)

In [37]:
#print(classification_report(y_test,y_pred))
print(classification_report(y_test_central,y_pred))

              precision    recall  f1-score   support

           1       0.94      0.96      0.95       461
           2       0.97      0.92      0.94       224
           3       0.87      0.93      0.90       397
           4       0.84      0.75      0.79       211
           5       0.88      0.87      0.88       237
           6       1.00      1.00      1.00       470

    accuracy                           0.93      2000
   macro avg       0.92      0.90      0.91      2000
weighted avg       0.93      0.93      0.93      2000



In [38]:
#test 1 : n_estimators(1,200) opt = 50
list_scores = []

n_estimators = 200
for i in range(1,n_estimators):
    clf = RandomForestClassifier(n_estimators = i)
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)

    score = f1_score(y_test,y_pred, average='weighted')
    list_scores.append(score)

In [43]:
line_plot(range(1,200),list_scores,"Numbers of tree")

In [44]:
#test 1 : max_depth(0,200) opt 30
list_scores = []

max_depth = 200
for i in range(1,max_depth):
    clf = RandomForestClassifier(n_estimators = 50,max_depth=i)
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)

    score = f1_score(y_test,y_pred, average='weighted')
    list_scores.append(score)

In [46]:
line_plot(range(1,200),list_scores,"Max depth of each tree")

In [47]:
#test 1 : min split(0,200) opt 3

list_scores = []

min_samples_split = 50
for i in range(2,min_samples_split):
    clf = RandomForestClassifier(n_estimators = 50,max_depth=30,min_samples_split = i)
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)

    score = f1_score(y_test,y_pred, average='weighted')
    list_scores.append(score)

In [49]:
line_plot(range(2,50),list_scores,"Minimu number of sample to split")

In [114]:
algo_used.append("Random Forest")

clf = RandomForestClassifier(n_estimators = 50,max_depth=30,min_samples_split = 3)
#clf.fit(x_train, y_train)
clf.fit(x_train_central, y_train_central)

#y_pred = clf.predict(x_test)
y_pred = clf.predict(x_test_central)

#score = f1_score(y_test,y_pred, average='weighted')
score = f1_score(y_test_central,y_pred, average='weighted')

f1_score_weighted_avg.append(round(score,3))
f1_score_weighted_avg

[0.931, 0.902, 0.922]

### a voir si on garde

In [91]:
rf = RandomForestClassifier()#max_features='auto', oob_score=True, random_state=1, n_jobs=-1

#Create the parameter grid based on the results of random search 
param_grid = {
    'n_estimators': [15,25,35],
    'min_samples_split': [10,15,20],
    "max_depth" : [20,30,40]
}

In [None]:
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 2, n_jobs = -1, verbose = 2)
grid_search = grid_search.fit(x_train, y_train)
print(grid_search.best_params_)

### 5. Gradient boosting

In [None]:
#test 1 : n_estimators(1,200) opt = 71
list_scores = []

n_estimators = 1000
for i in range(1,n_estimators,10):
    clf = GradientBoostingClassifier(n_estimators=i).fit(x_train, y_train)
    y_pred = clf.predict(x_test)

    score = f1_score(y_test,y_pred, average='weighted')
    list_scores.append(score)

In [82]:
line_plot(range(1,592,10),list_scores,"Numbers of tree")

In [None]:
#test 1 : n_estimators(1,200) opt = 71
list_scores = []

n_estimators = 200
for i in range(1,200):
    clf = GradientBoostingClassifier(n_estimators=i, learning_rate=1.0,
    max_depth=1, random_state=0).fit(x_train, y_train)
    y_pred = clf.predict(x_test)

    score = f1_score(y_test,y_pred, average='weighted')
    list_scores.append(score)

In [98]:
grid = dict()
grid['n_estimators'] = [10, 50, 100, 500]
grid['learning_rate'] = [0.0001, 0.001, 0.01, 0.1, 1.0]
grid['subsample'] = [0.5, 0.7, 1.0]
grid['max_depth'] = [3, 7, 9]


In [99]:
model = GradientBoostingClassifier()
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, scoring='accuracy')

In [100]:
grid_result = grid_search.fit(x_train, y_train)
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
#Best: 0.922210 using {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 100, 'subsample': 0.7}

Best: 0.922210 using {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 100, 'subsample': 0.7}


In [34]:
algo_used.append("Gradient Boosting")

clf = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1,
    max_depth=7, random_state=0,subsample=0.7).fit(x_train, y_train)
y_pred = clf.predict(x_test)
score = f1_score(y_test,y_pred, average='weighted')

f1_score_weighted_avg.append(round(score,3))
f1_score_weighted_avg

[0.906]

# Results of the models

In [124]:
#dictonary = {}
df = pd.DataFrame(list(zip(f1_score_weighted_avg,algo_used)),columns =["F1 score","Models"])
df

Unnamed: 0,F1 score,Models
0,0.931,KNN
1,0.902,Decision tree
2,0.922,Random Forest
3,0.956,Gradient Boosting


In [133]:
algo_used

['KNN', 'Decision tree', 'Random Forest', 'Gradient Boosting']

fig,(ax1) = plt.subplots(figsize=(10,5))
fig.tight_layout()

#fig, axs = plt.subplots(1, 2,figsize=(10,5))
ax1.bar(["real","predictions"],pos,color='#c3d5e8',width=0.5)
ax1.set_xlabel("positives")
ax1.set(frame_on=False)

#ax1.set_ylim(0,1100)
#ax1.set_xticklabels(labels =["real","predictions"], rotation=0)

#for i in range(len(pos)):
  #ax1.text(x = i -0.05, y = pos[i]+15, s = pos[i], size = 8)

In [130]:
fig = px.histogram(df, x="Models",y="F1 score",title = "F1 score of all the models")
fig.show()

In [134]:
line_plot(algo_used,list_scores,"F1 score")

ValueError: All arguments should have the same length. The length of argument `y` is 16, whereas the length of  previously-processed arguments ['x'] is 4

## Save and re load the model

In [40]:
import pickle
import joblib
# from sklearn.externals import joblib

ImportError: cannot import name 'joblib' from 'sklearn.externals' (C:\Users\nicol\AppData\Local\Programs\Python\Python37\lib\site-packages\sklearn\externals\__init__.py)

In [41]:
filename = "model.pkl"
joblib.dump(clf,filename)

['model.pkl']

In [42]:
model=joblib.load(filename)

In [43]:
y_pred = model.predict(x_test)
f1_score(y_test,y_pred, average='weighted')

0.9059445335985037