In [2]:
#Import the required libraries
import pandas as pd
import numpy as np


# read the dataset 
df = pd.read_csv("./data/Iris.csv")

# view the head
print(df.head())

X = df.iloc[:,:-1].values # means select all columns except the last one
y = df.iloc[:,5].values #selects all the values from the 5th column of the DataFrame

   Id  SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm      Species
0   1            5.1           3.5            1.4           0.2  Iris-setosa
1   2            4.9           3.0            1.4           0.2  Iris-setosa
2   3            4.7           3.2            1.3           0.2  Iris-setosa
3   4            4.6           3.1            1.5           0.2  Iris-setosa
4   5            5.0           3.6            1.4           0.2  Iris-setosa


### K-Nearest Neighboor (KNN)

In [30]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2)

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# this should be added in the preprocessing stage
from sklearn.preprocessing import LabelEncoder

#Encode labels to integers
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)
y_train = label_encoder.fit_transform(y_train)
y_test = label_encoder.fit_transform(y_test)


from sklearn.neighbors import KNeighborsClassifier , KNeighborsTransformer
classifier = KNeighborsClassifier(n_neighbors=5)
classifier.fit(X_train, y_train)

# Predict and evaluate
y_pred = classifier.predict(X_test)

from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(y_test,y_pred))
print (confusion_matrix (y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        12
           1       1.00      1.00      1.00        11
           2       1.00      1.00      1.00         7

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30

[[12  0  0]
 [ 0 11  0]
 [ 0  0  7]]


### Preprocessing without Pipeline

In [4]:
# from sklearn.preprocessing import OneHotEncoder

# # Make a transformer
# ohe = OneHotEncoder(categories="auto", handle_unknown="ignore", sparse=False)

# # Create transformed dataframe
# category_encoded = ohe.fit_transform(df[["category"]])
# category_encoded = pd.DataFrame(
#     category_encoded,
#     columns=ohe.categories_[0],
#     index=df.index
    
# )

# # Replace categorical data with encoded data
# df.drop("Species", axis=1, inplace=True)
# df = pd.concat([category_encoded, df], axis=1)

# # Visually inspect dataframe
# df

### Feature Engeeneering 

In [5]:
# from sklearn.preprocessing import FunctionTransformer

# def is_odd(data):
#     """
#     Helper function that returns 1 if odd, 0 if even
#     """
#     return data % 2

# # Instantiate transformer
# func_transformer = FunctionTransformer(is_odd)

# # Create transformed column
# number_odd = func_transformer.fit_transform(example_X["number"])

# # Add engineered column
# example_X["number_odd"] = number_odd
# example_X

### scaling

In [6]:
# from sklearn.preprocessing import StandardScaler

# # Instantiate transformer
# scaler = StandardScaler()

# # Create transformed dataset
# data_scaled = scaler.fit_transform(example_X)

# # Replace dataset with transformed one
# example_X = pd.DataFrame(
#     data_scaled,
#     columns=example_X.columns,
#     index=example_X.index
# )
# example_X

### Pipeline

In [7]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.tree import DecisionTreeClassifier

# create a pipeline 
pipe = Pipeline([("mms", MinMaxScaler()), 
                ("tree", DecisionTreeClassifier())])

# fit to the training data
pipe.fit(X_train, y_train)

# calculate the Score on test data
pipe.score(X_test, y_test)


1.0

### Column transformer

In [None]:
cat_cols = ["SepalLengthCm", "SepalWidthCm", "PetalLengthCm", "PetalWidthCm"]
num_cols = ["Species"]

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from  sklearn.preprocessing import OneHotEncoder

# create a transformer
transformer = ColumnTransformer([
    ("cat",Pipeline([
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("ohe",OneHotEncoder())
    ]),cat_cols),
    ("num", Pipeline([
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", MinMaxScaler())
    ]),num_cols)
])

# base estimator for ensemble methods
base = DecisionTreeClassifier(class_weight="balanced")

# create a pipeline
pipe = Pipeline([
    ("pre_pro",transformer),
    ("model", DecisionTreeClassifier(class_weight="balanced"))
])

# # fit and predict
# pipe.fit(X_train, y_train)

# y_pred = pipe.predict(X_test)

# print report
print(classification_report(y_test,y_pred))
print (confusion_matrix (y_test, y_pred))


# set up a BaggingClassifier 
from sklearn.ensemble import BaggingClassifier

pipe.set_params(model=BaggingClassifier(base_estimator=base))
# fit and predict
pipe.fit(X_train,y_train)
y_pred = pipe.predict(X_test)
# print report
print(classification_report(y_test,y_pred))


### GridSearch in Pipeline

In [33]:
# create a pipeline
pipe = Pipeline([("mms", MinMaxScaler()),
                 ("tree", DecisionTreeClassifier(random_state=123))])

# create the grid parameter
grid = [{"n_estimators":[100,200,300,400],
         "max_depth": [None, 2,6,10],
         "min_samples_splits":[5,10],
         "min_samples_leaf": [1,2,4]}]

from sklearn.model_selection import GridSearchCV

# create the grid, with "Pipe" as the estimator 
gridsearch = GridSearchCV(estimator=pipe,
                          param_grid=grid,
                          scoring="accuracy",
                          cv=5)

# fit using grid search
gridsearch.fit(X_train, y_train)

# Predict and evaluate
y_pred = gridsearch.predict(X_test) 

#calculate the test score
test_score = gridsearch.score(y_pred, y_test)
print("Test Score:", test_score)

# Display the best parameters
print("Best Parameters:", gridsearch.best_params_)
print(classification_report(y_test,y_pred))


AttributeError: 'GridSearchCV' object has no attribute 'best_params_'

### Random Forest model
