In [None]:
# Data exploration: load the Iris dataset as pandas objects and peek at the features.
from sklearn import datasets

data = datasets.load_iris(as_frame=True)
features_name = data.feature_names
print(data.data.head())

In [None]:
# Statistical analysis: combine features and labels in one frame and summarise it.
features = data.data
target = data.target
classes = data.target_names
# assign() returns a new frame, so `features` itself stays free of the label column.
Iris = features.assign(target=target)
Iris.describe()

In [None]:
# Show the names of the 3 target classes (taken from data.target_names).
classes

In [None]:
# Show the feature names reported by the dataset (data.feature_names).
features_name

In [None]:
# Summarise the combined features + target frame.
Iris.info()

In [None]:
# Correlation of every column with the target BEFORE adding engineered features.
# It is printed explicitly: only the last expression of a cell is displayed, so
# this result was previously computed and then silently discarded.
print(Iris.corr()['target'].sort_values(ascending=False))

# Engineered ratio features.
# NOTE: despite the "petal_sepal" prefix, both columns are computed as
# sepal / petal. The column names are kept unchanged so that any existing
# reference to them keeps working.
Iris["petal_sepal__length_ratio"] = Iris["sepal length (cm)"] / Iris["petal length (cm)"]
Iris["petal_sepal__width_ratio"] = Iris["sepal width (cm)"] / Iris["petal width (cm)"]

# Correlation with the target again, now including the two ratio columns.
Iris.corr()['target'].sort_values(ascending=False)


In [None]:
# Box plot of each original feature, grouped by the 3 target classes.
from matplotlib import pyplot as plt

for feature in features_name:
    Iris.boxplot(column=feature, by='target', figsize=(6, 6))
    plt.title(feature)
    plt.show()

In [None]:
# Check whether the data is imbalanced: per-class count of values in every column.
Iris.groupby('target').count()

In [None]:
# Check whether the data is imbalanced: number of samples for each target value.
Iris['target'].value_counts()

In [None]:
# Check the number of samples and check for null values.
Iris.info()

In [None]:
# Split the data: 20% is held out for testing, the rest is used for training.
from sklearn.model_selection import train_test_split

X, y = features, target
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)
Xtrain.shape


In [None]:
from sklearn.pipeline import make_pipeline,FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
def column_ratio(X):
    """Return column 0 of ``X`` divided by column 1, as a single-column 2-D result.

    ``X`` is indexed with 2-D array indexing (``X[:, [i]]``), so each operand
    keeps its column shape and the division is element-wise per row.
    """
    numerator = X[:, [0]]
    denominator = X[:, [1]]
    return numerator / denominator
def ratio_name(function_transformer, feature_names_in):
    """Return the output feature names of the ratio step: always ``["ratio"]``.

    Both arguments are accepted but not used.
    """
    names_out = ["ratio"]
    return names_out

def ratio_pipeline():
    """Build a new pipeline: median imputation followed by the column-ratio step."""
    impute_step = SimpleImputer(strategy="median")
    ratio_step = FunctionTransformer(column_ratio, feature_names_out=ratio_name)
    # No scaling step is appended (StandardScaler is left disabled here).
    return make_pipeline(impute_step, ratio_step)
# Pipeline for the raw numeric features: median imputation only (no scaling step).
num_pipeline = make_pipeline(SimpleImputer(strategy="median"))
# Column-wise preprocessing. Transformers, in order:
#   1. "widthratio":  petal width / sepal width   (first listed column over second)
#   2. "lengthratio": petal length / sepal length
#   3. "features":    the four raw measurements, median-imputed
preprocessing = ColumnTransformer(
    transformers=[
        (
            "widthratio",
            ratio_pipeline(),
            ["petal width (cm)", "sepal width (cm)"],
        ),
        (
            "lengthratio",
            ratio_pipeline(),
            ["petal length (cm)", "sepal length (cm)"],
        ),
        (
            "features",
            num_pipeline,
            [
                "petal width (cm)",
                "sepal width (cm)",
                "petal length (cm)",
                "sepal length (cm)",
            ],
        ),
    ]
)


In [None]:

from sklearn.linear_model import SGDClassifier,LogisticRegressionCV,LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
# Candidate classifiers, keyed by the name used when reporting their scores.
# Insertion order is the order in which they are evaluated.
clf = {
    'SGDClassifier': SGDClassifier(
        random_state=42,
        alpha=0.1,
        max_iter=30000,
        tol=0.0001,
        loss='squared_hinge',
    ),
    'KNeighborsClassifier': KNeighborsClassifier(n_neighbors=3),
    'DecisionTreeClassifier': DecisionTreeClassifier(
        random_state=42,
        max_features=3,
    ),
    'RandomForestClassifier': RandomForestClassifier(
        random_state=42,
        max_features=3,
    ),
    'LinearSVC': LinearSVC(
        random_state=42,
        max_iter=30000,
        class_weight='balanced',
        multi_class="crammer_singer",
        C=1,
        tol=0.001,
    ),
    'SVC': SVC(C=1, max_iter=1000, tol=0.001),
    'logistic_reg': LogisticRegression(
        l1_ratio=1,
        C=1,
        penalty="elasticnet",
        max_iter=1000,
        solver="saga",
        random_state=42,
        tol=0.0004,
    ),
}

In [None]:
# Calculate the cross-validated score of multiple models and find the best estimator.
from sklearn.model_selection import cross_val_score
import numpy as np

results = []
for key, estimator in clf.items():
    # Every candidate is evaluated behind the same preprocessing step.
    full_pipeline = make_pipeline(preprocessing, estimator)
    score = cross_val_score(full_pipeline, Xtrain, ytrain, scoring="accuracy", cv=3)
    # Store (model name, mean accuracy in percent).
    results.append((key, score.mean() * 100))
print('models scores:', results)

# Pick the winner from the numeric scores only. Building a single array from the
# (name, score) tuples would coerce the scores to strings, so the comparison
# would be textual instead of numeric (e.g. "100.0" sorts before "96.7").
best_model_idx = int(np.argmax([mean_score for _, mean_score in results]))
best_name, best_score = results[best_model_idx]
print('best model:', best_name, best_score.round(1))

In [None]:
# 3-fold cross-validated accuracy of the LinearSVC on its own (no preprocessing
# step), measured on the training split.
score = cross_val_score(
    estimator=clf['LinearSVC'],
    X=Xtrain,
    y=ytrain,
    scoring="accuracy",
    cv=3,
)
score.mean()


In [None]:
# 3-fold cross-validated accuracy of the logistic regression on its own (no
# preprocessing step), measured on the training split.
score = cross_val_score(
    estimator=clf['logistic_reg'],
    X=Xtrain,
    y=ytrain,
    scoring="accuracy",
    cv=3,
)
score.mean()

In [None]:
# Try a hard-voting ensemble of two of the candidate models:
# the LinearSVC and the logistic regression.
# NOTE(review): with only two voters, any disagreement is a tie - check
# scikit-learn's tie-breaking rule for voting='hard' before relying on this.
voters = [
    ("lsvc", clf['LinearSVC']),
    ("log_reg", clf['logistic_reg']),
]
eclf = VotingClassifier(estimators=voters, voting='hard')
eclf.fit(Xtrain, ytrain)


In [None]:
# Inspect the parameters of the voting ensemble.
eclf.get_params()

In [None]:
# Find the score: 3-fold cross-validated accuracy of the ensemble on the training split.
score = cross_val_score(eclf, Xtrain, ytrain, scoring='accuracy', cv=3)
# The mean fold accuracy is a fraction; scale it by 100 so the value matches the
# "percentage" label, consistent with how the test-set accuracy is reported.
accuracy = score.mean() * 100
print('accuracy percentage:', accuracy)



In [None]:
# Confusion matrix for the three classes, built from 3-fold cross-validated
# predictions on the training split.
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix

ytrain_pred = cross_val_predict(estimator=eclf, X=Xtrain, y=ytrain, cv=3)
cm = confusion_matrix(y_true=ytrain, y_pred=ytrain_pred)
cm

In [None]:
# Plot the confusion matrix for the three classes (training split, cross-validated predictions).
from sklearn.metrics import ConfusionMatrixDisplay

ConfusionMatrixDisplay.from_predictions(y_true=ytrain, y_pred=ytrain_pred)
plt.show()

In [None]:
# Macro-averaged precision of the cross-validated training predictions.
from sklearn.metrics import precision_score, recall_score

precision_score(y_true=ytrain, y_pred=ytrain_pred, average='macro')


In [None]:
# Macro-averaged recall of the cross-validated training predictions.
recall_score(y_true=ytrain, y_pred=ytrain_pred, average='macro')

In [None]:
# Macro-averaged F1 score of the cross-validated training predictions.
from sklearn.metrics import f1_score

f1_score(y_true=ytrain, y_pred=ytrain_pred, average='macro')

In [None]:
# Classification report with the scoring details for each class (training split),
# shown as a table.
from sklearn.metrics import classification_report,confusion_matrix,multilabel_confusion_matrix
import pandas as pd

report = classification_report(
    ytrain,
    ytrain_pred,
    target_names=data.target_names,
    output_dict=True,
)
df = pd.DataFrame(report)
df

In [None]:
# Test the model on the held-out test split.
from sklearn.metrics import accuracy_score

ytest_pred = eclf.predict(Xtest)

# accuracy_score already returns a single scalar, so it is used directly:
# calling .mean() on it is redundant and fails when the scalar is a plain
# Python float.
score = accuracy_score(ytest, ytest_pred)
accuracy = score * 100
print('accuracy percentage:', accuracy)

In [None]:
# Confusion matrix for the three classes on the test split: compute it, then plot it.
from sklearn.metrics import ConfusionMatrixDisplay

cm = confusion_matrix(y_true=ytest, y_pred=ytest_pred)
ConfusionMatrixDisplay.from_predictions(y_true=ytest, y_pred=ytest_pred)
plt.show()

In [None]:
# Classification report with the scoring details for each class (test split),
# shown as a table.
report = classification_report(
    ytest,
    ytest_pred,
    target_names=data.target_names,
    output_dict=True,
)
df = pd.DataFrame(report)
df

In [None]:
# Train the chosen model on the whole dataset and save the final model to disk.
import joblib

# NOTE: fit() is called on `eclf` itself, so the ensemble fitted on the training
# split is refitted here on all samples.
final_model = eclf.fit(X=features, y=target)

joblib.dump(value=final_model, filename='clf_final_model.pkl')

In [None]:
# Load the saved model back and predict the first five samples.
# NOTE: joblib.load unpickles the file - only load model files from a trusted source.
final_model = joblib.load('clf_final_model.pkl')
new_data = features.head(5)
predictions = final_model.predict(new_data)
predictions

In [None]:
# True labels of the same first five samples, for comparison with the predictions.
target.iloc[:5]

In [None]:
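# Perfect match on these rows :) - note that they are part of the data the final
# model was fitted on, so this only sanity-checks the save/load round trip and
# is not an evaluation of the model.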

In [None]:
# Learning curve of the voting ensemble: error rate versus training-set size.
from sklearn.model_selection import learning_curve

train_sizes, train_scores, valid_scores = learning_curve(
    eclf, X, y,
    # Ten evenly spaced fractions of the available training folds. Starting at
    # 10% avoids fitting a classifier on only a handful of samples.
    train_sizes=np.linspace(0.1, 1.0, 10),
    cv=5,
    # Accuracy is the metric used for this classifier everywhere else in the
    # notebook; RMSE would treat the class labels as numeric quantities.
    scoring="accuracy",
    # X and y come straight from the loader and have not been shuffled, so
    # shuffle before taking the leading subset of each training fold.
    shuffle=True,
    random_state=42,
)
# Convert accuracy into an error rate so that lower is still better on the plot.
train_errors = 1 - train_scores.mean(axis=1)
valid_errors = 1 - valid_scores.mean(axis=1)

plt.figure(figsize=(6, 4))
plt.plot(train_sizes, train_errors, "r-+", linewidth=2, label="train")
plt.plot(train_sizes, valid_errors, "b-", linewidth=3, label="valid")

plt.xlabel("Training set size")
plt.ylabel("Error rate (1 - accuracy)")
plt.grid()
plt.legend(loc="upper right")
# Axis limits are left to matplotlib so the whole curve is visible.

plt.show()