In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.pipeline import make_pipeline,Pipeline
from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import KBinsDiscretizer,OneHotEncoder,StandardScaler,FunctionTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

import warnings
warnings.filterwarnings('ignore')

In [3]:
path ='https://frenzy86.s3.eu-west-2.amazonaws.com/fav/iris.data'
df = pd.read_csv(path)
df.columns = ['sepal length', 'sepal width', 'petal length', 'petal width', 'class']

In [13]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
sepal length,149.0,5.848322,0.828594,4.3,5.1,5.8,6.4,7.9
sepal width,149.0,3.051007,0.433499,2.0,2.8,3.0,3.3,4.4
petal length,149.0,3.774497,1.759651,1.0,1.6,4.4,5.1,6.9
petal width,149.0,1.205369,0.761292,0.1,0.3,1.3,1.8,2.5


In [4]:
X = df.drop(['class'],axis=1)
y = df['class']

In [5]:
scaler = StandardScaler()

classifier = RandomForestClassifier(bootstrap=True,
                                    max_depth=80,
                                    max_features=2,
                                    min_samples_leaf=3,
                                    min_samples_split=8,
                                    n_estimators=100
                                    )

classifier = LogisticRegression()

In [6]:
model_pipe = Pipeline([
                       ('Scaling and standadize data', scaler),
                       ('Classifier', classifier)
                      ])

In [7]:
cross_val_scores = cross_val_score(model_pipe, X, y, cv=5)
print(cross_val_scores)

# Calcola la media e la deviazione standard dei punteggi
mean_score = np.mean(cross_val_scores)
std_score = np.std(cross_val_scores)

print(f"Mean cross-validation score:{mean_score:.2f}")
print(f"Standard deviation of cross-validation scores:{std_score:.2f}")

[0.96666667 1.         0.93333333 0.9        1.        ]
Mean cross-validation score:0.96
Standard deviation of cross-validation scores:0.04


In [8]:
model_pipe.fit(X, y)

In [9]:
y_pred_tot = model_pipe.predict(X)

In [10]:
classification_report(y,y_pred_tot )

classification_report_result = classification_report(y, y_pred_tot)
print(classification_report_result)

                 precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00        49
Iris-versicolor       0.98      0.94      0.96        50
 Iris-virginica       0.94      0.98      0.96        50

       accuracy                           0.97       149
      macro avg       0.97      0.97      0.97       149
   weighted avg       0.97      0.97      0.97       149



In [12]:
import joblib

joblib.dump(model_pipe,"logistic_reg_iris.pkl")

['logistic_reg_iris.pkl']

In [33]:
loaded_model = joblib.load("logistic_reg_iris.pkl")
path_test ="test.xlsx"
df_test = pd.read_excel(path_test)
df_test.columns = ['sepal length', 'sepal width', 'petal length', 'petal width']

test_pred =pd.DataFrame( loaded_model.predict(df_test))
test_pred.columns = ['class']
test_pred

Unnamed: 0,class
0,Iris-virginica
1,Iris-setosa
2,Iris-setosa
3,Iris-virginica
4,Iris-virginica
5,Iris-setosa
6,Iris-setosa
7,Iris-virginica
8,Iris-setosa
9,Iris-virginica


In [34]:
df_ris = pd.concat([df_test,test_pred],axis=1)
df_ris

Unnamed: 0,sepal length,sepal width,petal length,petal width,class
0,6.9,6.4,5.0,8.8,Iris-virginica
1,0.5,6.9,1.0,3.5,Iris-setosa
2,4.3,3.7,4.1,0.5,Iris-setosa
3,6.0,5.0,8.6,1.5,Iris-virginica
4,4.6,6.3,5.2,9.0,Iris-virginica
5,5.2,6.7,1.3,4.5,Iris-setosa
6,0.0,5.8,2.2,0.6,Iris-setosa
7,5.8,7.2,3.6,5.0,Iris-virginica
8,5.0,7.9,1.1,1.5,Iris-setosa
9,5.8,9.8,5.8,6.0,Iris-virginica
