In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import datasets

In [None]:
# Load the Iris dataset through sklearn's `datasets` module.
iris_datasets = datasets.load_iris()

In [None]:
# Feature values exactly as exposed by the loader.
x = iris_datasets.data

In [None]:
# Target values exactly as exposed by the loader; they are used further down
# as keys into the enumerated `label_names`.
y = iris_datasets.target

In [None]:
# Names of the feature columns and of the target classes.
feature_names = iris_datasets.feature_names
label_names = iris_datasets.target_names

In [None]:
y_list = list()
label_dict = {i:j for i,j in enumerate(label_names)}
for i in range(len(y)):
    y_list.append(label_dict[y[i]])

In [None]:
x_df = pd.DataFrame(x,columns=feature_names)
y_df = pd.DataFrame(y_list,columns=['label'])
full_df = pd.concat([x_df,y_df],axis=1)

In [None]:
# (rows, columns) of the combined frame.
full_df.shape

In [None]:
# Preview of the first rows.
full_df.head()

In [None]:
# Number of missing values per column.
full_df.isna().sum()

In [None]:
# Summary statistics, transposed so that each described column becomes a row.
full_df.describe().transpose()

In [None]:
# Number of rows that are exact duplicates of an earlier row.
full_df.duplicated().sum()

# Visualization

In [None]:
# Class balance: one bar per target value.
# The values are passed by keyword because recent seaborn releases interpret
# the first positional argument as `data`, not as the variable to count.
sns.countplot(x=y);

In [None]:
# One figure per feature: histogram with a kernel-density overlay.
# `sns.distplot` is deprecated; `histplot(kde=True, stat='density')` is the
# replacement and keeps the density-scaled y axis.
for feature in feature_names:
    plt.figure(figsize=(12,10));
    sns.histplot(full_df[feature], kde=True, stat='density');
    plt.title(f'Distribution plot for {feature}');

In [None]:
# Skewness of the numeric columns only: `full_df` also holds the string
# `label` column, which makes a plain `.skew()` raise on pandas >= 2.0.
full_df.skew(numeric_only=True)

In [None]:
def scatter_plot(feature_1,feature_2,label=None):
    """Draw a three-panel comparison of two features on the current figure.

    Panels (top row of a 2x3 subplot grid):
      1. scatter plot of `feature_1` (x) against `feature_2` (y),
      2. bar plot of `feature_1` per class,
      3. bar plot of `feature_2` per class.

    Reads the module-level `full_df`; the caller is expected to create the
    figure beforehand.

    Parameters
    ----------
    feature_1, feature_2 : str
        Column names in `full_df`.
    label : str or None, default None
        Column of `full_df` used to colour the scatter points. With None the
        points are drawn in a single colour.

    Returns
    -------
    None
    """
    # Passing hue=None is the same as omitting it, so one code path serves
    # both the labelled and the unlabelled case.
    hue = None if label is None else full_df[label]

    plt.subplot(231);
    plt.title('scatter plot');
    sns.scatterplot(x=full_df[feature_1],y=full_df[feature_2],hue=hue);

    # The bar plots are always grouped by the 'label' column, whatever `label`
    # was passed for the scatter hue.
    for position, feature in ((232, feature_1), (233, feature_2)):
        plt.subplot(position);
        plt.title('bar plot');
        sns.barplot(x=full_df['label'],y=full_df[feature]);

In [None]:
# Plot every unordered pair of features exactly once, in the order
# (0,1), (0,2), (0,3), (1,2), (1,3), (2,3). A single nested loop replaces
# three copy-pasted cells with hard-coded index ranges and adapts to any
# number of features.
for first in range(len(feature_names)):
    for second in range(first + 1, len(feature_names)):
        plt.figure(figsize=(9,5));
        scatter_plot(feature_names[first],feature_names[second],label='label')

In [None]:
def groupby(features,label,data=None):
    """Print per-group summary tables for the selected feature columns.

    Prints, in order: mean, median, max, min and the within-group cumulative
    sum. The cumulative sum is not an aggregation, so it prints one row per
    input row.

    Parameters
    ----------
    features : str or sequence of str
        Column(s) to summarise.
    label : str
        Column to group by.
    data : pandas.DataFrame, optional
        Frame to summarise. Defaults to the module-level `full_df`.

    Returns
    -------
    None
    """
    frame = full_df if data is None else data
    # Always select with a list so every table is a DataFrame, including when
    # a single column name is passed.
    columns = [features] if isinstance(features, str) else list(features)
    # Honour the arguments instead of the global `feature_names` and a
    # hard-coded 'label' column.
    grouped = frame.groupby(label)[columns]

    print('------------------------mean------------------------------')
    print(grouped.mean())
    print()
    print('-----------------------------median------------------------')
    print(grouped.median())
    print()
    print('-------------------------------max---------------------------')
    print(grouped.max())
    print()
    print('-------------------------------min---------------------------')
    print(grouped.min())
    print()
    print('-----------------------------cummulative sum----------------------')
    print(grouped.cumsum())
    print()

In [None]:
# Print the grouped summary tables, passing the feature columns and the
# 'label' column name.
groupby(feature_names,'label')

In [None]:
# One box plot per feature to eyeball outliers.
# The series is bound to `x` explicitly: the meaning of a positional first
# argument differs between seaborn versions (`x` in old releases, `data` in
# recent ones), which changes the orientation of the plot.
for feature in feature_names:
    plt.figure(figsize=(9,5))
    sns.boxplot(x=full_df[feature])
    plt.title(f'Box plot for {feature}')

# Preprocessing

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

In [None]:
# Drop exact duplicate rows, then separate the features from the target.
full_df = full_df.drop_duplicates()
X, y = full_df[feature_names], full_df['label']

# Hold out 30% of the rows for testing; the seed is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=0,
)

In [None]:
# Report the shape of each part of the split.
for split_name, split in (
    ('X_train', X_train),
    ('X_test', X_test),
    ('y_train', y_train),
    ('y_test', y_test),
):
    print(f'{split_name} shape = {split.shape}')

In [None]:
def outlier(data,feature_name):
    """Cap the IQR outliers of one column in place and report the count.

    Values below `Q1 - 1.5 * IQR` are raised to that bound and values above
    `Q3 + 1.5 * IQR` are lowered to that bound. A one-line summary is printed.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the column; `data[feature_name]` is overwritten when at
        least one outlier is found.
    feature_name : str
        Name of the numeric column to treat.

    Returns
    -------
    None
    """
    column = data[feature_name]
    # Both quartiles in a single call.
    percentile_25,percentile_75 = np.percentile(column,[25,75])
    cut_off = (percentile_75 - percentile_25) * 1.5
    lower,upper = percentile_25-cut_off,percentile_75+cut_off

    # Strict comparisons: values sitting exactly on a bound are not outliers.
    is_outlier = (column < lower) | (column > upper)
    n_outliers = int(is_outlier.sum())

    if n_outliers >= 1:
        # One vectorized clip instead of one full-column pass per outlier value.
        data[feature_name] = column.clip(lower=lower,upper=upper)
        print(f'{n_outliers} outliers was observed in {feature_name} column and was replaced')
    else:
        print('no outlier was observed')

In [None]:
# Cap outliers on the training features only.
# `outlier` overwrites columns in place, so work on an independent copy:
# the frame returned by `train_test_split` is a selection of `X`, and writing
# into it triggers pandas' SettingWithCopyWarning.
X_train = X_train.copy()
for feature in feature_names:
    outlier(X_train,feature)

In [None]:
# Encoder for the class labels of the target column.
label_encoder = LabelEncoder()

In [None]:
# Fit the encoder on the training labels and encode them.
# NOTE: this rebinds `y_train`; the un-encoded labels are no longer
# reachable under this name afterwards.
y_train = label_encoder.fit_transform(y_train)

In [None]:
# Fit the scaler on the training features and keep the transformed result.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_train)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Classifier constructed with no arguments, i.e. library defaults.
knn_clf = KNeighborsClassifier()

In [None]:
# Train on the scaled training features and the encoded labels.
knn_clf.fit(X_scaled,y_train)

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
# sklearn's signature is confusion_matrix(y_true, y_pred): rows are the true
# classes and columns the predicted ones. Passing the predictions first
# returns the transposed matrix.
# This is computed on the training data, so it is an optimistic view.
confusion_matrix(y_train,knn_clf.predict(X_scaled))

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
prediction = knn_clf.predict(X_scaled)

In [None]:
accuracy_score(prediction,y_train)

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
scores = cross_val_score(knn_clf,X_scaled,y_train,scoring='accuracy',cv=10)

In [None]:
scores

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
rand_clf = RandomForestClassifier()

In [None]:
rand_scores = cross_val_score(rand_clf,X_scaled,y_train,scoring='accuracy',cv=10)

In [None]:
rand_scores

In [None]:
from sklearn.metrics import classification_report

In [None]:
# classification_report expects (y_true, y_pred). With the arguments swapped,
# the precision and recall columns trade places and `support` counts the
# predictions instead of the true labels.
print(classification_report(y_train,prediction))

In [None]:
rand_clf.fit(X_scaled,y_train)

In [None]:
rand_prediction = rand_clf.predict(X_scaled)

In [None]:
print(classification_report(rand_prediction,y_train))

In [None]:
rand_scores = cross_val_score(rand_clf,X_scaled,y_train,scoring='accuracy',cv=10)

In [None]:
rand_scores

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
tree_clf = DecisionTreeClassifier()

In [None]:
tree_clf.fit(X_scaled,y_train)

In [None]:
tree_prediction = tree_clf.predict(X_scaled)

In [None]:
print(classification_report(tree_prediction,y_train))

In [None]:
tree_scores = cross_val_score(tree_clf,X_scaled,y_train,scoring='accuracy',cv=10)

In [None]:
tree_scores

In [None]:
# Scale the held-out features with the scaler fitted on the training data
# (transform only, no refit).
# NOTE(review): `X_test` is not passed through `outlier` the way `X_train`
# was; confirm this asymmetry is intended.
X_prepared = scaler.transform(X_test)

In [None]:
# Encode the held-out labels with the encoder fitted on the training labels.
# NOTE: this rebinds `y_test`, so re-running this cell alone would feed the
# already-encoded values back into the encoder.
y_test = label_encoder.transform(y_test)

In [None]:
tree = knn_clf.predict(X_prepared)

In [None]:
print(classification_report(tree,y_test))

In [None]:
accuracy_score(tree,y_test)

In [None]:
accuracy_score(rand_clf.predict(X_prepared),y_test)

In [None]:
accuracy_score(knn_clf.predict(X_prepared),y_test)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import Normalizer
# Bundle preprocessing and the classifier so the saved artefact accepts raw
# (unscaled) feature values.
# The 'std_scaler' step uses StandardScaler, the same per-feature
# standardisation applied to every model evaluated above. Normalizer rescales
# each *row* to unit norm, which is a different transformation.
# The tree is seeded so that refitting reproduces the same model.
predictors_pipeline = Pipeline([
    ('std_scaler',StandardScaler()),
    ('classifier',DecisionTreeClassifier(random_state=0))
])
predictors_pipeline.fit(X_train,y_train)

In [None]:
# Held-out accuracy of the full pipeline, fed the unscaled test features.
accuracy_score(y_true=y_test, y_pred=predictors_pipeline.predict(X_test))

In [None]:
import joblib

In [None]:
# Persist the fitted pipeline to 'iris_model.joblib' (path is relative, so it
# resolves against the current working directory).
# NOTE(review): `saved_model` holds whatever `joblib.dump` returns —
# presumably the written file name(s) rather than the model; confirm.
saved_model = joblib.dump(predictors_pipeline,'iris_model.joblib')