In [1]:
#Import the required libraries
import pandas as pd
import numpy as np


# read the dataset 
df = pd.read_csv("./data/Iris.csv")
df.drop(columns=["Id"], inplace=True)


X = df.iloc[:,:-1].values # means select all columns except the last one
y = df.iloc[:,4].values #selects all the values from the 5th column of the DataFrame

### K-Nearest Neighboor (KNN)

In [2]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2)

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# this should be added in the preprocessing stage
from sklearn.preprocessing import LabelEncoder

#Encode labels to integers
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)
# y_train = label_encoder.fit_transform(y_train)
# y_test = label_encoder.fit_transform(y_test)


from sklearn.neighbors import KNeighborsClassifier , KNeighborsTransformer
classifier = KNeighborsClassifier(n_neighbors=5)
classifier.fit(X_train_scaled, y_train)

# Predict and evaluate
y_pred = classifier.predict(X_test_scaled)




In [3]:
from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(y_test,y_pred))
print (confusion_matrix (y_test, y_pred))

                 precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00         6
Iris-versicolor       0.94      1.00      0.97        15
 Iris-virginica       1.00      0.89      0.94         9

       accuracy                           0.97        30
      macro avg       0.98      0.96      0.97        30
   weighted avg       0.97      0.97      0.97        30

[[ 6  0  0]
 [ 0 15  0]
 [ 0  1  8]]


### Preprocessing without Pipeline

In [4]:
# from sklearn.preprocessing import OneHotEncoder

# # Make a transformer
# ohe = OneHotEncoder(categories="auto", handle_unknown="ignore", sparse=False)

# # Create transformed dataframe
# category_encoded = ohe.fit_transform(df[["category"]])
# category_encoded = pd.DataFrame(
#     category_encoded,
#     columns=ohe.categories_[0],
#     index=df.index
    
# )

# # Replace categorical data with encoded data
# df.drop("Species", axis=1, inplace=True)
# df = pd.concat([category_encoded, df], axis=1)

# # Visually inspect dataframe
# df

### Feature Engeeneering 

In [5]:
# from sklearn.preprocessing import FunctionTransformer

# def is_odd(data):
#     """
#     Helper function that returns 1 if odd, 0 if even
#     """
#     return data % 2

# # Instantiate transformer
# func_transformer = FunctionTransformer(is_odd)

# # Create transformed column
# number_odd = func_transformer.fit_transform(example_X["number"])

# # Add engineered column
# example_X["number_odd"] = number_odd
# example_X

### scaling

In [6]:
# from sklearn.preprocessing import StandardScaler

# # Instantiate transformer
# scaler = StandardScaler()

# # Create transformed dataset
# data_scaled = scaler.fit_transform(example_X)

# # Replace dataset with transformed one
# example_X = pd.DataFrame(
#     data_scaled,
#     columns=example_X.columns,
#     index=example_X.index
# )
# example_X

### Pipeline

In [7]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.tree import DecisionTreeClassifier

# create a pipeline 
pipe = Pipeline([("mms", MinMaxScaler()), 
                ("tree", DecisionTreeClassifier())])

# fit to the training data
pipe.fit(X_train, y_train)

# calculate the Score on test data
pipe.score(X_test, y_test)


0.8666666666666667

### Column transformer

In [8]:
X_train = pd.DataFrame(X_train, columns=["SepalLengthCm", "SepalWidthCm", "PetalLengthCm", "PetalWidthCm"])
X_test = pd.DataFrame(X_test, columns=["SepalLengthCm", "SepalWidthCm", "PetalLengthCm", "PetalWidthCm"])

num_cols = ["SepalLengthCm", "SepalWidthCm", "PetalLengthCm", "PetalWidthCm"]
cat_cols = []

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from  sklearn.preprocessing import OneHotEncoder

# create a transformer
transformer = ColumnTransformer([
    ("cat",Pipeline([
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("ohe",OneHotEncoder())
    ]),cat_cols),
    ("num", Pipeline([
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", MinMaxScaler())
    ]),num_cols)
])

# base estimator for ensemble methods
base = DecisionTreeClassifier(class_weight="balanced")

# create a pipeline
pipe = Pipeline([
    ("pre_pro",transformer),
    ("model", DecisionTreeClassifier(class_weight="balanced"))
])

# fit and predict
pipe.fit(X_train, y_train)

y_pred = pipe.predict(X_test)

# print report
print(classification_report(y_test,y_pred))
print (confusion_matrix (y_test, y_pred))


# set up a BaggingClassifier 
from sklearn.ensemble import BaggingClassifier

pipe.set_params(model=BaggingClassifier(base_estimator=base))
# fit and predict
pipe.fit(X_train,y_train)
y_pred = pipe.predict(X_test)
# print report
print(classification_report(y_test,y_pred))


                 precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00         6
Iris-versicolor       0.92      0.80      0.86        15
 Iris-virginica       0.73      0.89      0.80         9

       accuracy                           0.87        30
      macro avg       0.88      0.90      0.89        30
   weighted avg       0.88      0.87      0.87        30

[[ 6  0  0]
 [ 0 12  3]
 [ 0  1  8]]
                 precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00         6
Iris-versicolor       0.94      1.00      0.97        15
 Iris-virginica       1.00      0.89      0.94         9

       accuracy                           0.97        30
      macro avg       0.98      0.96      0.97        30
   weighted avg       0.97      0.97      0.97        30



### GridSearch in Pipeline

In [9]:
# create a pipeline
pipe = Pipeline([("mms", MinMaxScaler()),
                 ("tree", DecisionTreeClassifier(random_state=123))])

# create the grid parameter
grid = [{"max_depth": [None, 2,6,10],
         "min_samples_split":[5,10],
         "min_samples_leaf": [1,2,4]}]

from sklearn.model_selection import GridSearchCV

# create the grid, with "Pipe" as the estimator 
# gridsearch = GridSearchCV(estimator=pipe,
#                           param_grid=grid,
#                           scoring="accuracy",
#                           cv=5)
grid_search = GridSearchCV(pipe, grid, cv=5, n_jobs=-1)

# fit using grid search
grid_search.fit(X_train, y_train)

# # Predict and evaluate
# y_pred = gridsearch.predict(X_test) 

# #calculate the test score
# test_score = gridsearch.score(y_pred, y_test)
# print("Test Score:", test_score)

# # Display the best parameters
# print("Best Parameters:", gridsearch.best_params_)
# print(classification_report(y_test,y_pred))


ValueError: Invalid parameter max_depth for estimator Pipeline(steps=[('mms', MinMaxScaler()),
                ('tree', DecisionTreeClassifier(random_state=123))]). Check the list of available parameters with `estimator.get_params().keys()`.

In [79]:
estimator.get_params().keys()

NameError: name 'estimator' is not defined

### Random Forest model


Just simmilar

In [51]:
import pandas as pd
import numpy as np 

data = pd.read_csv("./data/salaries_final.csv")
data.head()

Unnamed: 0.1,Unnamed: 0,Age,Education,Occupation,Relationship,Race,Sex,Target
0,0,39,Bachelors,Adm-clerical,Not-in-family,White,Male,<=50K
1,1,50,Bachelors,Exec-managerial,Husband,White,Male,<=50K
2,2,38,HS-grad,Handlers-cleaners,Not-in-family,White,Male,<=50K
3,3,53,11th,Handlers-cleaners,Husband,Black,Male,<=50K
4,4,28,Bachelors,Prof-specialty,Wife,Black,Female,<=50K


In [52]:
# encode to numeric 
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Initialize LabelEncoder for binary categorical columns
label_encoder = LabelEncoder()

# Encode binary categorical columns: 'Sex' and 'Target'
data['Sex'] = label_encoder.fit_transform(data['Sex'])
data['Education'] = label_encoder.fit_transform(data['Education'])
data['Occupation'] = label_encoder.fit_transform(data['Occupation'])
data['Relationship'] = label_encoder.fit_transform(data['Relationship'])
data['Race'] = label_encoder.fit_transform(data['Race'])
data['Target'] = label_encoder.fit_transform(data['Target'])

# # One-Hot Encode the other categorical features
# data = pd.get_dummies(data, columns=['Education', 'Occupation', 'Relationship', 'Race'], drop_first=True)


In [53]:
data.head()

Unnamed: 0.1,Unnamed: 0,Age,Education,Occupation,Relationship,Race,Sex,Target
0,0,39,9,1,1,4,1,0
1,1,50,9,4,0,4,1,0
2,2,38,11,6,1,4,1,0
3,3,53,1,6,0,2,1,0
4,4,28,9,10,5,2,0,0


In [54]:
X = data[["Age", "Education", "Occupation", "Relationship", "Race", "Sex"]] # the predictor
y = data[["Target"]] # income as the predicted

print(X.shape)
print(y.shape)

(32561, 6)
(32561, 1)


In [55]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=42)

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(26048, 6)
(6513, 6)
(26048, 1)
(6513, 1)


In [56]:
from sklearn.impute import SimpleImputer
imp_mean = SimpleImputer(strategy='mean')

from sklearn.linear_model import LogisticRegression
log = LogisticRegression()

from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
pipe = make_pipeline(imp_mean, log)



# # Define the pipeline steps
# pipe = Pipeline(steps=[
#     ('scaler', StandardScaler()),  # Example of a transformer step
#     ('classifier', LogisticRegression())  # Example of an estimator step
# ])


In [57]:
pipe.fit(X_train, y_train)

  return f(**kwargs)


Pipeline(steps=[('simpleimputer', SimpleImputer()),
                ('logisticregression', LogisticRegression())])

In [58]:
pipe.score(X_train, y_train)

0.7539542383292384

In [59]:
pipe.score(X_test, y_test)

0.7541839398126823