In [75]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn import tree

In [76]:
iris_data = pd.read_csv('Iris.csv')

# dimensions (no. of rows & columns)
print(iris_data.shape)
# list of columns/features
print(iris_data.columns) 
# peek some data
print(iris_data.head(10))
# statistical summary
print(iris_data.describe())


(150, 6)
Index(['Id', 'SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm',
       'Species'],
      dtype='object')
   Id  SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm      Species
0   1            5.1           3.5            1.4           0.2  Iris-setosa
1   2            4.9           3.0            1.4           0.2  Iris-setosa
2   3            4.7           3.2            1.3           0.2  Iris-setosa
3   4            4.6           3.1            1.5           0.2  Iris-setosa
4   5            5.0           3.6            1.4           0.2  Iris-setosa
5   6            5.4           3.9            1.7           0.4  Iris-setosa
6   7            4.6           3.4            1.4           0.3  Iris-setosa
7   8            5.0           3.4            1.5           0.2  Iris-setosa
8   9            4.4           2.9            1.4           0.2  Iris-setosa
9  10            4.9           3.1            1.5           0.1  Iris-setosa
               Id  Sepal

In [77]:
# target variable
target = iris_data['Species']
# distribution of class labels or categories
print(pd.value_counts(target))
# alternative of finding class distribution
print(iris_data.groupby('Species').size())

Species
Iris-setosa        50
Iris-versicolor    50
Iris-virginica     50
Name: count, dtype: int64
Species
Iris-setosa        50
Iris-versicolor    50
Iris-virginica     50
dtype: int64


In [78]:
seed = 7
train_data, test_data = train_test_split(iris_data, test_size=0.3, random_state=seed, stratify=target)
# shape of the datasets
print('\nShape of training data :',train_data.shape)
print('\nShape of testing data :',test_data.shape)
# class distribution of the training data
print(pd.value_counts(train_data['Species']))
# class distribution of the test data
print(pd.value_counts(test_data['Species']))


Shape of training data : (105, 6)

Shape of testing data : (45, 6)
Species
Iris-versicolor    35
Iris-setosa        35
Iris-virginica     35
Name: count, dtype: int64
Species
Iris-versicolor    15
Iris-virginica     15
Iris-setosa        15
Name: count, dtype: int64


In [79]:
# separate the independent and target variables from training data
train_x = train_data.drop(columns=['Species'],axis=1)
train_y = train_data['Species']
# separate the independent and target variables from test data
test_x = test_data.drop(columns=['Species'],axis=1)
test_y = test_data['Species']

# DecisionTreeClassifier

In [80]:
# create a classifier object/model 
model = tree.DecisionTreeClassifier()
# train the model with fit function
model.fit(train_x, train_y)
# make predictions on training data
predictions_train = model.predict(train_x)
print('\nTraining Accuracy (Decision Tree):', accuracy_score(train_y, predictions_train))
# make predictions on test data
predictions_test = model.predict(test_x)
print('Test Accuracy (Decision Tree):', accuracy_score(test_y, predictions_test))


Training Accuracy (Decision Tree): 1.0
Test Accuracy (Decision Tree): 0.9555555555555556


# LR

In [81]:
# create a classifier object/model 
model = LogisticRegression(max_iter=1000)
# train the model with fit function
model.fit(train_x , train_y)
# make predictions on training data
predictions_train = model.predict(train_x)
print('\nTraining Accuracy (Logistic Regression):', accuracy_score(train_y, predictions_train))
# make predictions on test data
predictions_test = model.predict(test_x)
print('Test Accuracy (Logistic Regression):', accuracy_score(test_y, predictions_test))



Training Accuracy (Logistic Regression): 1.0
Test Accuracy (Logistic Regression): 0.9777777777777777


# KNN

In [82]:
# create a classifier object/model 
model = KNeighborsClassifier()
# train the model with fit function
model.fit(train_x, train_y)
# make predictions on training data
predictions_train = model.predict(train_x)
print('\nTraining Accuracy (K-Nearest Neighbors):', accuracy_score(train_y, predictions_train))
# make predictions on test data
predictions_test = model.predict(test_x)
print('Test Accuracy (K-Nearest Neighbors):', accuracy_score(test_y, predictions_test))


Training Accuracy (K-Nearest Neighbors): 1.0
Test Accuracy (K-Nearest Neighbors): 0.9333333333333333


# normalization 

In [83]:
# Initialize MinMaxScaler 
scaler = MinMaxScaler()

# Fit and transform the scaler on the training data
train_x_scaled = scaler.fit_transform(train_x)

# Transform the test data using the same scaler
test_x_scaled = scaler.transform(test_x)

# After normalization 

In [84]:
# Initialize results DataFrame outside the loop
results_df = pd.DataFrame(columns=['Classifier', 'Training Accuracy', 'Test Accuracy'])

model = tree.DecisionTreeClassifier()
# train the model with fit function
model.fit(train_x_scaled, train_y)
# make predictions on training data
predictions_train = model.predict(train_x_scaled)
#print('\nTraining Accuracy (Decision Tree - with normalization):', accuracy_score(train_y, predictions_train))
# make predictions on test data
predictions_test = model.predict(test_x_scaled)
#print('Test Accuracy (Decision Tree - with normalization):', accuracy_score(test_y, predictions_test))
new_results_df = pd.DataFrame({
    'Classifier': ['Decision Tree '],
    'Training Accuracy': [accuracy_score(train_y, predictions_train)],
    'Test Accuracy': [accuracy_score(test_y, predictions_test)]
})

# Concatenate results to the existing DataFrame
results_df = pd.concat([results_df, new_results_df], ignore_index=True)

# Display the results DataFrame
print(results_df)

       Classifier  Training Accuracy  Test Accuracy
0  Decision Tree                 1.0       0.955556


# LR

In [85]:
# create a classifier object/model 
model = LogisticRegression()
# train the model with fit function
model.fit(train_x_scaled, train_y)
# make predictions on training data
predictions_train = model.predict(train_x_scaled)
#print('\nTraining Accuracy (Logistic Regression - with normalization):', accuracy_score(train_y, predictions_train))
# make predictions on test data
predictions_test = model.predict(test_x_scaled)
#print('Test Accuracy (Logistic Regression - with normalization):', accuracy_score(test_y, predictions_test))
# Create a DataFrame for results
new_results_df = pd.DataFrame({
    'Classifier': ['Logistic Regression'],
    'Training Accuracy': [accuracy_score(train_y, predictions_train)],
    'Test Accuracy': [accuracy_score(test_y, predictions_test)]
})

# Concatenate results to the existing DataFrame
results_df = pd.concat([results_df, new_results_df], ignore_index=True)

# Display the results DataFrame
print(results_df)


            Classifier  Training Accuracy  Test Accuracy
0       Decision Tree            1.000000       0.955556
1  Logistic Regression           0.990476       0.977778


# KNN

In [86]:
# create a classifier object/model 
model = KNeighborsClassifier()
# train the model with fit function
model.fit(train_x_scaled, train_y)
# make predictions on training data
predictions_train = model.predict(train_x_scaled)
#print('\nTraining Accuracy (K-Nearest Neighbors - with normalization):', accuracy_score(train_y, predictions_train))
# make predictions on test data
predictions_test = model.predict(test_x_scaled)
#print('Test Accuracy (K-Nearest Neighbors - with normalization):', accuracy_score(test_y, predictions_test))

# Create a DataFrame for results
new_results_df = pd.DataFrame({
    'Classifier': ['K-Nearest Neighbors'],
    'Training Accuracy': [accuracy_score(train_y, predictions_train)],
    'Test Accuracy': [accuracy_score(test_y, predictions_test)]
})

# Concatenate results to the existing DataFrame
results_df = pd.concat([results_df, new_results_df], ignore_index=True)

# Display the results DataFrame
print(results_df)


            Classifier  Training Accuracy  Test Accuracy
0       Decision Tree            1.000000       0.955556
1  Logistic Regression           0.990476       0.977778
2  K-Nearest Neighbors           1.000000       0.977778
