In [14]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt

In [15]:
# Load the labelled training data from the working directory.
df_train = pd.read_csv(filepath_or_buffer="train.csv")

In [16]:
# Preview the first rows (head() defaults to five) to check columns and dtypes.
df_train.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,2.5,3.0,1.1,versicolor
1,6.2,2.2,4.5,1.5,versicolor
2,5.1,3.8,1.5,0.3,setosa
3,6.8,3.2,5.9,2.3,virginica
4,5.7,2.8,4.1,1.3,versicolor


In [24]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder



# Separate the features from the target.
# "Unnamed: 0" is the row index that was written into train.csv (it shows up
# in the head() preview), not a measurement. Left in, it would be picked up by
# num_cols below and trained on as a feature, so drop it when present.
X = (
    df_train
    .drop(columns=["species"])
    .drop(columns=["Unnamed: 0"], errors="ignore")
)

# Encode the species names as integers. `le` is kept so that predictions can
# be mapped back to species names with le.inverse_transform().
le = LabelEncoder()
y = le.fit_transform(df_train["species"])

# Columns that the numeric preprocessing pipeline will operate on.
num_cols = X.select_dtypes(include=["int64", "float64"]).columns

# Hold out 20% of the rows for validation. Stratifying on y keeps the class
# proportions the same in both parts, which matters on a dataset this small.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)



[2 2 0 2 0 2 2 2 2 2 0 1 2 1 0 2 2 0 0 0 0 2 0 0 0 2 1 0 1 0 0 2 2 1 1 2 0
 2 0 2 0 0 1 1 2 1 1 1 0 0 1 1 1 0 0 1 1 1 1 1 2 1 1 2 2 1 1 1 1 2 0 2 2 0
 2 1 2 0 1 2 1 1 0 0]


In [18]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import ParameterGrid
from sklearn.metrics import f1_score

In [19]:
# Numeric features: fill missing values with the column mean, then apply
# min-max scaling.
numerical_pipeline = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", MinMaxScaler()),
    ]
)

# Send only the numeric columns through that pipeline; every other column is
# discarded (remainder="drop").
preprocessor = ColumnTransformer(
    [("num", numerical_pipeline, num_cols)],
    remainder="drop",
)



In [23]:
# Random-forest settings to try; ParameterGrid expands this into every
# combination of the listed values.
prams = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 10, 20],
    max_features=[None, "sqrt", "log2"],
    n_jobs=[-1],
)

# scores[i] is the hold-out F1 of the i-th combination, in ParameterGrid order.
scores = []
for config in ParameterGrid(prams):
    forest = RandomForestClassifier(random_state=42, **config)
    model = Pipeline(steps=[("preprocessor", preprocessor), ("model", forest)])

    # Fit on the training split, then score on the held-out split.
    y_pred = model.fit(X_train, y_train).predict(X_test)
    score = f1_score(y_test, y_pred, average="weighted")
    print(config, "F1 Score:", score)
    scores.append(score)

{'max_depth': None, 'max_features': None, 'n_estimators': 100, 'n_jobs': -1} F1 Score: 1.0
{'max_depth': None, 'max_features': None, 'n_estimators': 200, 'n_jobs': -1} F1 Score: 1.0
{'max_depth': None, 'max_features': None, 'n_estimators': 300, 'n_jobs': -1} F1 Score: 1.0
{'max_depth': None, 'max_features': 'sqrt', 'n_estimators': 100, 'n_jobs': -1} F1 Score: 1.0
{'max_depth': None, 'max_features': 'sqrt', 'n_estimators': 200, 'n_jobs': -1} F1 Score: 1.0
{'max_depth': None, 'max_features': 'sqrt', 'n_estimators': 300, 'n_jobs': -1} F1 Score: 1.0
{'max_depth': None, 'max_features': 'log2', 'n_estimators': 100, 'n_jobs': -1} F1 Score: 1.0
{'max_depth': None, 'max_features': 'log2', 'n_estimators': 200, 'n_jobs': -1} F1 Score: 1.0
{'max_depth': None, 'max_features': 'log2', 'n_estimators': 300, 'n_jobs': -1} F1 Score: 1.0
{'max_depth': 10, 'max_features': None, 'n_estimators': 100, 'n_jobs': -1} F1 Score: 1.0
{'max_depth': 10, 'max_features': None, 'n_estimators': 200, 'n_jobs': -1} F1 Sc

In [26]:
# Pick the first combination that reached the highest hold-out score.
# scores is parallel to the ParameterGrid iteration order, so its position
# indexes straight into the expanded grid.
best_index, best_score = max(enumerate(scores), key=lambda pair: pair[1])
candidate_configs = list(ParameterGrid(prams))
best_config = candidate_configs[best_index]

# Assemble (but do not fit yet) the pipeline that uses the winning settings.
best_forest = RandomForestClassifier(random_state=42, **best_config)
final_model = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("model", best_forest),
    ]
)

In [47]:
# Refit the selected pipeline on every labelled row before predicting.
# Fitting `model` here would train whichever configuration the search loop
# happened to try last, not the one chosen as best_config.
final_model.fit(X, y)

df_test = pd.read_csv("test.csv")
# NOTE(review): this rename implies test.csv spells the header "seepal_width";
# align it with the training column name so the preprocessor can select it.
df_test = df_test.rename(columns={"seepal_width": "sepal_width"})

# "id" identifies the row in the submission; it is not a feature.
X_test_submit = df_test.drop(columns=["id"])
y_pred_test = final_model.predict(X_test_submit)

# Map the integer predictions back to species names and write the submission.
submission = pd.DataFrame({
    "id": df_test["id"],
    "species": le.inverse_transform(y_pred_test),
})
submission.to_csv("submission.csv", index=False)