# Scikit-learn - Machine Learning models

Do wykonania zadania użyłem algorytmu *Linear Support Vector Classification* oraz *Linear Discriminant Analysis*


**Import danych**

In [1]:
import pandas as ps
import numpy as np
from warnings import simplefilter

simplefilter(action='ignore', category=FutureWarning)

Wilderness_Area = [
    "Rawah",
    "Neota",
    "Comanche Peak",
    "Cache la Poudre"
]

Soil_Type = np.arange(1, 41).astype(str)

Soil_Type = np.char.add("Soil_type_", Soil_Type)

names = ["Elevation", 
         "Aspect", 
         "Slope", 
         "Horizontal_Distance_To_Hydrology", 
         "Vertical_Distance_To_Hydrology", 
         "Horizontal_Distance_To_Roadways", 
         "Hillshade_9am",
         "Hillshade_Noon",
         "Hillshade_3pm",
         "Horizontal_Distance_To_Fire_Points"]

names = np.concatenate([names, Wilderness_Area, Soil_Type, ["Cover_Type"]])
data = ps.read_csv("covtype.data", sep=",", header=0, names=names)
data

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,...,Soil_type_32,Soil_type_33,Soil_type_34,Soil_type_35,Soil_type_36,Soil_type_37,Soil_type_38,Soil_type_39,Soil_type_40,Cover_Type
0,2590,56,2,212,-6,390,220,235,151,6225,...,0,0,0,0,0,0,0,0,0,5
1,2804,139,9,268,65,3180,234,238,135,6121,...,0,0,0,0,0,0,0,0,0,2
2,2785,155,18,242,118,3090,238,238,122,6211,...,0,0,0,0,0,0,0,0,0,2
3,2595,45,2,153,-1,391,220,234,150,6172,...,0,0,0,0,0,0,0,0,0,5
4,2579,132,6,300,-15,67,230,237,140,6031,...,0,0,0,0,0,0,0,0,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
581006,2396,153,20,85,17,108,240,237,118,837,...,0,0,0,0,0,0,0,0,0,3
581007,2391,152,19,67,12,95,240,237,119,845,...,0,0,0,0,0,0,0,0,0,3
581008,2386,159,17,60,7,90,236,241,130,854,...,0,0,0,0,0,0,0,0,0,3
581009,2384,170,15,60,5,90,230,245,143,864,...,0,0,0,0,0,0,0,0,0,3


***
**Podział danych na wejście i wyjście**

In [2]:
from sklearn.model_selection import train_test_split
x = data.drop(["Cover_Type"], axis=1)
y = data["Cover_Type"]

***
**Utworzenie modeli. Dla algorytmu SVC nie używałem k-foldów, aby zaoszczędzić czas**

In [3]:
from sklearn import model_selection
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC

seed=123
kfold = model_selection.KFold(n_splits=10, random_state=seed, shuffle= True)

modelLinearDiscriminant = GridSearchCV(LinearDiscriminantAnalysis(solver = 'svd'),
                    param_grid = {},
                    cv=kfold,
                    refit=True)

modelSVC = make_pipeline(StandardScaler(), LinearSVC(random_state=0, tol=1e-5))

# LinearDiscriminant
**Trenowanie**

In [4]:
modelLinearDiscriminant.fit(x, y)

GridSearchCV(cv=KFold(n_splits=10, random_state=123, shuffle=True),
             estimator=LinearDiscriminantAnalysis(), param_grid={})

***
**Testy modelu**

In [15]:
from sklearn import  metrics

def printScore(y, y_pred):
    accuracy_score = metrics.accuracy_score(y, y_pred)
    precision_scores = metrics.precision_score(y, y_pred, average=None)
    recall_scores = metrics.recall_score(y, y_pred, average=None)
    f1_scores = metrics.f1_score(y, y_pred, average=None)

    print("SVC train set ")
    print("Accuracy score: %.3f" % accuracy_score)
    for i, label in enumerate(range(1, 8)):
        print("Label: %i" % label)
        print("Precision: %.3f" % precision_scores[i])
        print("Recall: %.3f" % recall_scores[i])
        print("F1: %.3f" % f1_scores[i])
        print()
        
modelLinearDiscriminant.cv_results_

{'mean_fit_time': array([3.54732494]),
 'std_fit_time': array([0.09423226]),
 'mean_score_time': array([0.02291708]),
 'std_score_time': array([0.00772012]),
 'params': [{}],
 'split0_test_score': array([0.67732608]),
 'split1_test_score': array([0.68055627]),
 'split2_test_score': array([0.68016041]),
 'split3_test_score': array([0.67675255]),
 'split4_test_score': array([0.67959243]),
 'split5_test_score': array([0.68336173]),
 'split6_test_score': array([0.67568544]),
 'split7_test_score': array([0.67539285]),
 'split8_test_score': array([0.6840674]),
 'split9_test_score': array([0.68410182]),
 'mean_test_score': array([0.6796997]),
 'std_test_score': array([0.00319071]),
 'rank_test_score': array([1])}

In [9]:
printScore(y, modelLinearDiscriminant.predict(x))

SVC train set 
Accuracy score: 0.680
Label: 1
Precision: 0.703
Recall: 0.618
F1: 0.658

Label: 2
Precision: 0.757
Recall: 0.762
F1: 0.759

Label: 3
Precision: 0.629
Recall: 0.538
F1: 0.580

Label: 4
Precision: 0.260
Recall: 0.588
F1: 0.360

Label: 5
Precision: 0.292
Recall: 0.222
F1: 0.252

Label: 6
Precision: 0.349
Recall: 0.504
F1: 0.412

Label: 7
Precision: 0.410
Recall: 0.808
F1: 0.544



# SVC
**Trening**

In [10]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1, random_state=0)
modelSVC.fit(x_train,y_train)



Pipeline(steps=[('standardscaler', StandardScaler()),
                ('linearsvc', LinearSVC(random_state=0, tol=1e-05))])

**Testy modelu**

In [11]:
printScore(y_test, modelSVC.predict(x_test))

SVC train set 
Accuracy score: 0.714
Label: 1
Precision: 0.706
Recall: 0.687
F1: 0.696

Label: 2
Precision: 0.740
Recall: 0.796
F1: 0.767

Label: 3
Precision: 0.613
Recall: 0.862
F1: 0.716

Label: 4
Precision: 0.607
Recall: 0.272
F1: 0.376

Label: 5
Precision: 0.358
Recall: 0.031
F1: 0.056

Label: 6
Precision: 0.419
Recall: 0.051
F1: 0.092

Label: 7
Precision: 0.696
Recall: 0.518
F1: 0.594



In [12]:
printScore(y_train, modelSVC.predict(x_train))

SVC train set 
Accuracy score: 0.713
Label: 1
Precision: 0.708
Recall: 0.686
F1: 0.697

Label: 2
Precision: 0.737
Recall: 0.796
F1: 0.765

Label: 3
Precision: 0.613
Recall: 0.870
F1: 0.719

Label: 4
Precision: 0.597
Recall: 0.265
F1: 0.367

Label: 5
Precision: 0.349
Recall: 0.030
F1: 0.055

Label: 6
Precision: 0.404
Recall: 0.046
F1: 0.083

Label: 7
Precision: 0.693
Recall: 0.513
F1: 0.590



**Zapis obu modeli do pliku**

In [13]:
import joblib
joblib.dump(modelLinearDiscriminant, 'ModelLinearDiscriminant.sav')
joblib.dump(modelSVC, 'ModelSVC.sav')

['ModelSVC.sav']

# Podsumowanie

Oba modele dały zbliżone *accuaracy* na poziomie 70% (SVC - 71%, LinearDisciminant - 68%). Niepokoją wskaźniki *precision* i *recall* dla rzadziej występujących labeli - szczególnie dla SVC. Dla obu modeli próba detekcji labelu nr 5 i 6 jest bezsensowne 