In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(root, name)
    for root, _dirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

In [2]:
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [3]:
# Load the dataset; the first CSV row holds the column names.
IRIS_CSV = "../input/iris/Iris.csv"
dataset = pd.read_csv(IRIS_CSV, header=0)
dataset

In [4]:
# Drop the 'Id' column so it is not picked up as a feature later on.
# errors='ignore' keeps this cell re-runnable: `del dataset['Id']` raised
# KeyError when the cell was executed a second time.
dataset = dataset.drop(columns=['Id'], errors='ignore')

In [5]:
# Preprocessing: replace the species labels with integer class codes.
from sklearn.preprocessing import LabelEncoder

LE = LabelEncoder()
species_codes = LE.fit_transform(dataset['Species'])
dataset['Species'] = species_codes

In [6]:
# Show which class codes are present after encoding.
species_values = dataset['Species'].unique()
print('Unique of Species_data', species_values)

In [7]:
# Merge class code 2 into class code 1, leaving a binary target (0 vs 1).
binary_species = dataset['Species'].replace(to_replace=2, value=1)
dataset['Species'] = binary_species
dataset

In [8]:
# Summary statistics for every numeric column.
dataset.describe()

In [9]:
# Count the missing values in each column.
null_mask = dataset.isnull()
null_mask.sum()

In [10]:
# Correlation of every column with the target, strongest positive first.
dataset_corr = dataset.corr()
species_corr = dataset_corr['Species']
species_corr.sort_values(ascending=False)

In [11]:
from sklearn.utils import shuffle

# Shuffle the rows with a fixed seed so that the row order (and everything
# derived from it) is identical on every Restart & Run All.
dataset = shuffle(dataset, random_state=42)

In [12]:
# One histogram per column (50 bins each) to inspect the distributions.
hist_axes = dataset.hist(figsize=(10, 8), bins=50)
plt.show()

In [13]:
# Column names remaining in the frame at this point.
dataset.columns

In [14]:
# Features: every column except the target. Target: the 'Species' column.
feature_frame = dataset.drop('Species', axis=1)
target_series = dataset['Species']
x = np.array(feature_frame)
y = np.array(target_series)

In [15]:
# Splitting the dataset: 80% train / 20% test.
from sklearn.model_selection import train_test_split

# random_state makes the split reproducible across runs; stratify=y keeps
# the class proportions of y the same in the train and test parts.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42, stratify=y
)

In [16]:
# Feature scaling: the scaler is fitted on the training split only and the
# same fitted transformation is then applied to the test split.
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
x_train, x_test = sc.fit_transform(x_train), sc.transform(x_test)

In [17]:
def plot_confusion_matrix(y, y_predict):
    """Plot the confusion matrix of ``y`` versus ``y_predict`` as a heatmap.

    Parameters
    ----------
    y : array-like
        True class codes (0 or 1).
    y_predict : array-like
        Predicted class codes, one per entry of ``y``.

    Returns
    -------
    None
        The heatmap is drawn on a newly created matplotlib figure.
    """
    # Imported locally so the function does not depend on a later notebook
    # cell having already executed `import seaborn as sns`.
    import seaborn as sns
    from sklearn.metrics import confusion_matrix

    class_codes = [0, 1]
    # NOTE(review): assumes the label encoding mapped Iris-setosa to 0 and that
    # the remaining species were merged into code 1 -- confirm against the data.
    class_names = ['setosa', 'not setosa']

    # Passing labels= pins the matrix to 2x2 even when one class is missing
    # from y / y_predict, so the two tick labels below always line up.
    cm = confusion_matrix(y, y_predict, labels=class_codes)

    # A fresh figure per call avoids drawing over whatever axes are current.
    _, ax = plt.subplots()
    sns.heatmap(cm, annot=True, fmt='d', ax=ax)  # annot=True writes the counts in the cells

    ax.set_xlabel('Predicted labels')
    ax.set_ylabel('True labels')
    ax.set_title('Confusion Matrix')
    ax.xaxis.set_ticklabels(class_names)
    ax.yaxis.set_ticklabels(class_names)

In [21]:
# GridSearchCV evaluates every parameter combination with cross-validation
# and keeps the best-scoring one.
from sklearn.model_selection import GridSearchCV
# Logistic Regression classification algorithm
from sklearn.linear_model import LogisticRegression

# Candidate hyperparameters (l1 = lasso, l2 = ridge; only l2 is searched).
parameters = {
    "C": [0.01, 0.1, 1],
    'penalty': ['l2'],
    'solver': ['lbfgs'],
}
lr = LogisticRegression()
logreg_cv = GridSearchCV(lr, parameters, cv=10)
logreg_cv.fit(x_train, y_train)

In [24]:
# Best parameter combination and its mean cross-validated score.
logreg_best_params = logreg_cv.best_params_
logreg_best_score = logreg_cv.best_score_
print("tuned hpyerparameters :(best parameters) ", logreg_best_params)
print("accuracy :", logreg_best_score)

In [35]:
# Score of the fitted logistic-regression grid search on the held-out test split.
logreg_cv.score(x_test, y_test)

In [29]:
import seaborn as sns

# Predict the test split with the tuned logistic regression and plot the result.
yhat = logreg_cv.predict(x_test)
plot_confusion_matrix(y=y_test, y_predict=yhat)

In [31]:
# Support Vector Machine classification algorithm
from sklearn.svm import SVC

# Hyperparameter grid. Each kernel is listed exactly once: the original tuple
# contained 'rbf' twice, which made the grid search refit 25 duplicate
# candidates (x10 folds) without any possible effect on the selected model.
parameters = {'kernel': ('linear', 'rbf', 'poly', 'sigmoid'),
              'C': np.logspace(-3, 3, 5),
              'gamma': np.logspace(-3, 3, 5)}
# Create the SVM estimator that the grid search will clone and tune
svm = SVC()

In [32]:
# 10-fold cross-validated grid search over the SVM parameter grid.
svm_cv = GridSearchCV(svm, parameters, cv=10)
svm_cv.fit(x_train, y_train)

In [33]:
# Best parameter combination and its mean cross-validated score.
svm_best_params = svm_cv.best_params_
svm_best_score = svm_cv.best_score_
print("tuned hpyerparameters :(best parameters) ", svm_best_params)
print("accuracy :", svm_best_score)

In [34]:
# Score of the fitted SVM grid search on the held-out test split.
svm_cv.score(x_test, y_test)

In [37]:
# Predict the test split with the tuned SVM and plot the confusion matrix.
yhat = svm_cv.predict(x_test)
plot_confusion_matrix(y=y_test, y_predict=yhat)

In [39]:
# Decision Tree classification algorithm
from sklearn.tree import DecisionTreeClassifier

# Hyperparameter grid.
# 'auto' is no longer accepted for max_features by DecisionTreeClassifier
# (removed in scikit-learn 1.3) and was only an alias of 'sqrt' before that,
# so it is replaced with None (consider all features) to keep two distinct
# options in the search.
parameters = {'criterion': ['gini', 'entropy'],
     'splitter': ['best', 'random'],
     'max_depth': [2*n for n in range(1,10)],
     'max_features': ['sqrt', None],
     'min_samples_leaf': [1, 2, 4],
     'min_samples_split': [2, 5, 10]}
# Create the tree estimator; the seed makes the random splitter and the
# feature subsampling reproducible between runs.
tree = DecisionTreeClassifier(random_state=42)

In [42]:
# 10-fold cross-validated grid search over the decision-tree parameter grid.
tree_cv = GridSearchCV(tree, parameters, cv=10)
tree_cv.fit(x_train, y_train)

In [43]:
# Best parameter combination and its mean cross-validated score.
tree_best_params = tree_cv.best_params_
tree_best_score = tree_cv.best_score_
print("tuned hpyerparameters :(best parameters) ", tree_best_params)
print("accuracy :", tree_best_score)

In [44]:
# Score of the fitted decision-tree grid search on the held-out test split.
tree_cv.score(x_test, y_test)

In [45]:
# Predict the test split with the tuned decision tree and plot the confusion matrix.
yhat = tree_cv.predict(x_test)
plot_confusion_matrix(y=y_test, y_predict=yhat)

In [46]:
# K Nearest Neighbors classification algorithm
from sklearn.neighbors import KNeighborsClassifier

# Hyperparameter grid: k from 1 to 10, every neighbour-search algorithm,
# and Minkowski power p = 1 or 2.
parameters = {
    'n_neighbors': list(range(1, 11)),
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'p': [1, 2],
}

KNN = KNeighborsClassifier()

In [48]:
# 10-fold cross-validated grid search over the KNN parameter grid.
knn_cv = GridSearchCV(KNN, parameters, cv=10)
knn_cv.fit(x_train, y_train)

In [49]:
# Best parameter combination and its mean cross-validated score.
knn_best_params = knn_cv.best_params_
knn_best_score = knn_cv.best_score_
print("tuned hpyerparameters :(best parameters) ", knn_best_params)
print("accuracy :", knn_best_score)

In [51]:
# Score of the fitted KNN grid search on the held-out test split.
knn_cv.score(x_test, y_test)

In [52]:
# Predict the test split with the tuned KNN model and plot the confusion matrix.
yhat = knn_cv.predict(x_test)
plot_confusion_matrix(y=y_test, y_predict=yhat)