# iris_classification.py

# -*- coding: utf-8 -*-
"""Iris Classification.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/10V121p_DFVcPRhqkgoZ37GLLcodxAUFi

#Iris Classification

####***Load The Data***
"""

# sys - access to variables used or maintained by the interpreter and to functions that interact closely with the interpreter.
# SciPy - used for scientific and technical computing.
# Seaborn - provides a high-level interface for drawing attractive and informative statistical graphics.

import sys, numpy, scipy, pandas as pd, matplotlib, sklearn
import seaborn as sns

import warnings
# Install a global "ignore" filter so no warning is shown for the rest of the script.
# NOTE(review): this also hides any warnings emitted by the libraries used
# below — confirm that silencing all of them is intended.
warnings.filterwarnings("ignore")  # This will ignore any warnings

"""####***Import Library***"""

# Loading required Libraries
from pandas.plotting import scatter_matrix
import matplotlib.pyplot as plt

from sklearn import model_selection
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
#for the training of the dataset
from sklearn.model_selection import train_test_split #80-20

# These ML algorithms are used for classification tasks
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
#XGBoost is an efficient and scalable implementation of gradient boosting. It is known for its speed and performance.
import xgboost as xgb

# Location of the Iris data file in the UCI machine-learning repository.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"

# Column labels to attach to the data, in file order.
names = [
    'sepal_length',
    'sepal_width',
    'petal_length',
    'petal_width',
    'class',
]

# Read the CSV into a DataFrame. header=None spells out what pandas already
# assumes whenever `names` is passed: the first line is data, not a header.
dataset = pd.read_csv(url, header=None, names=names)

"""###***Data Exploration***"""

# First 5 rows of the dataset. A bare expression is only echoed inside a
# notebook cell, so the result is printed explicitly to stay visible when this
# file runs as a script.
print(dataset.head(5))

# Dimensions of the dataset: (number of rows, number of columns).
r, c = dataset.shape
print('This dataset has ',r,' rows and ' ,c,' columns.')

# Number of rows for each distinct value of the 'class' column.
print(dataset.groupby('class').size())

"""####***Statistical Summary***"""

# Descriptive statistics of the DataFrame.
print(dataset.describe())

# Concise summary of the DataFrame; info() writes its report itself, so no
# print() wrapper is needed.
dataset.info()

"""###***Data Visualization of Iris dataset***

###***Univariate Plots - For better understanding of the each attributes***
"""

# Box-and-whisker plots, one subplot per attribute on a 2x2 grid, with
# independent x and y axes for every subplot.
dataset.plot.box(
    subplots=True,
    layout=(2, 2),
    sharex=False,
    sharey=False,
    figsize=(10, 8),
)
plt.show()

# Histograms with the same subplot arrangement, on a slightly larger figure.
dataset.plot.hist(
    subplots=True,
    layout=(2, 2),
    sharex=False,
    sharey=False,
    figsize=(12, 10),
)
plt.show()

"""###***Multivariate Plots - To understand the relationships between attributes.***

"""

# Pairwise scatter plots of the attributes, to inspect how they relate to each other.
pd.plotting.scatter_matrix(dataset, figsize=(15, 10))
plt.show()

"""###***Evaluate Algorithms - Implementing the tools and techniques for the model***

###***Create a Validation Dataset***
"""

# Split the dataset 80/20 into a training set and a held-out test set.
from sklearn.model_selection import train_test_split

array = dataset.values
# `dataset.values` mixes the numeric measurements with the string 'class'
# column, which makes it an object-dtype array. Cast the four measurement
# columns to float so the feature matrix is genuinely numeric.
X = array[:, 0:4].astype(float)   # features: first four columns
y = array[:, 4]                   # target: fifth column ('class')
test = 0.2    # fraction of rows reserved for testing
seed = 53     # fixed seed so the shuffle/split is repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test, random_state=seed)

# Metric name handed to cross_val_score further down.
scoring = 'accuracy'

"""####***Model***

"""

from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
import lightgbm as lgb

# Candidate classifiers to compare, stored as (short label, unfitted estimator)
# pairs. The tree-based estimators are randomised, so they receive the same
# `seed` as the train/test split; without it the comparison changes from run
# to run.
models = [
    ('LR', LogisticRegression()),
    ('LDA', LinearDiscriminantAnalysis()),
    ('KNN', KNeighborsClassifier()),
    ('CART', DecisionTreeClassifier(random_state=seed)),
    ('GNB', GaussianNB()),   # Gaussian Naive Bayes
    ('SVM', SVC()),
    ('RF', RandomForestClassifier(random_state=seed)),
]

## Evaluate each model using k-fold cross-validation
from sklearn.model_selection import KFold, cross_val_score

results = []   # one array of per-fold accuracy scores per model
# NOTE: this rebinds `names`, which held the dataset column labels until now;
# from here on it holds the model labels, in the same order as `results`.
names = []

# The splitter does not depend on the model, so it is built once instead of on
# every loop iteration. With shuffle=True and an integer random_state the same
# 10 folds are produced for every model.
kfold = KFold(n_splits=10, random_state=seed, shuffle=True)

for name, model in models:
    # Run cross-validation on the training data only, scoring each fold with
    # the metric named by `scoring`.
    cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    # Label, mean score and standard deviation across the folds.
    print(name, ': ', cv_results.mean(), cv_results.std())

# Compare the algorithms: one box per model, built from its per-fold scores.
import matplotlib.pyplot as plt

ax = plt.gca()   # the current axes, i.e. what the pyplot helpers draw on
ax.boxplot(results, labels=names)
ax.set_xlabel('Algorithms')
ax.set_ylabel('Accuracy')
ax.set_title('Algorithm Comparison')
plt.show()

"""####***Dimensionality Reduction of the dataset***

####***ETC(ExtraTreesClassifier)***
"""

# Feature importance according to an ExtraTreesClassifier.
from sklearn.ensemble import ExtraTreesClassifier

# Fixed random_state so the importances are repeatable.
model = ExtraTreesClassifier(random_state = 53)

# Features: the first four columns (kept as a DataFrame so the column names
# can label the importances below).
X = dataset.iloc[:, 0:4]
# Target: the last column, selected as a 1-D Series. The earlier `-1:` slice
# produced a one-column DataFrame, i.e. a column vector rather than the 1-D
# target the estimators are given elsewhere in this script.
y = dataset.iloc[:, -1]

# Fit on the full dataset.
model.fit(X, y)

# One importance value per feature, sorted from most to least important.
ETC_feature_importances = pd.DataFrame(model.feature_importances_, index = X.columns, columns=['ETC']).sort_values('ETC', ascending=False)

# Drop the reference to the fitted model.
model = None

# Print explicitly: a bare expression shows nothing when run as a script.
print(ETC_feature_importances)

"""####***RFC***"""

# Feature importance according to a RandomForestClassifier (seeded with 53).
model = RandomForestClassifier(random_state=53)
model.fit(X, y)

# Build a one-column frame keyed by feature name, then order it from the most
# to the least important feature.
RFC_feature_importances = pd.DataFrame(
    {'RFC': model.feature_importances_},
    index=X.columns,
).sort_values(by='RFC', ascending=False)

# Drop the reference to the fitted model.
model = None

# Show at most the five highest-ranked features.
print("Five Features:", RFC_feature_importances.head(5))

"""####***ADBC***"""

# Feature importance according to an AdaBoostClassifier.
from sklearn.ensemble import AdaBoostClassifier

# Fixed random_state so the importances are repeatable.
model = AdaBoostClassifier(random_state = 53)

# Fit on the full feature frame and target.
model.fit(X, y)

# One importance value per feature, sorted from most to least important.
ADB_feature_importances = pd.DataFrame(model.feature_importances_, index = X.columns, columns=['ADB']).sort_values('ADB', ascending=False)

# Drop the reference to the fitted model.
model = None

# Print explicitly: a bare expression shows nothing when run as a script.
print(ADB_feature_importances)

"""####***GBC***"""

# Feature importance according to scikit-learn's GradientBoostingClassifier
# (imported from sklearn.ensemble; this is not the xgboost implementation).
from sklearn.ensemble import GradientBoostingClassifier

# Fixed random_state so the importances are repeatable.
model = GradientBoostingClassifier(random_state = 53)

# Fit on the full feature frame and target.
model.fit(X, y)

# One importance value per feature, sorted from most to least important.
GBC_feature_importances = pd.DataFrame(model.feature_importances_, index = X.columns, columns=['GBC']).sort_values('GBC', ascending=False)

# Drop the reference to the fitted model.
model = None

# Show at most the five highest-ranked features. Printed explicitly because a
# bare expression shows nothing when run as a script.
print(GBC_feature_importances.head(5))

"""####***K Best Classifier***"""

from sklearn.feature_selection import SelectKBest

# Keep the 3 highest-scoring features, using SelectKBest's default score function.
kbest = SelectKBest(k = 3).fit(X,y)
# Boolean mask over the columns of X: True for each selected feature.
mask = kbest.get_support()
new_features = X.columns[mask]

# Print explicitly: a bare expression shows nothing when run as a script.
print(new_features)

"""###Prediction"""

# Metrics used to evaluate the predictions on the held-out test set.
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# NOTE(review): the variable is named `lda`, yet it holds a
# DecisionTreeClassifier, not LinearDiscriminantAnalysis. The name is kept so
# nothing that refers to it breaks — confirm which model was intended.
# The tree is seeded with `seed` so the reported scores are repeatable.
lda =  DecisionTreeClassifier(random_state=seed)

# Fit on the training split only.
lda.fit(X_train, y_train)

# Predict the classes of the test split.
predict = lda.predict(X_test)

# Accuracy, confusion matrix and per-class report on the test split.
print(accuracy_score(y_test, predict))
print(confusion_matrix(y_test, predict))
print(classification_report(y_test, predict))

# Drop the reference to the fitted model.
lda = None