In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [7]:
# Load the raw training data. A forward slash is used as the separator because
# it is accepted on Windows and POSIX alike; the previous back-slash literal
# only resolved to the "data" folder on Windows.
df = pd.read_csv("data/train.csv")


In [None]:

# Quick sanity check: frame dimensions first, then the leading rows.
n_rows_cols = df.shape
print(f"Kształt danych: {n_rows_cols}")
display(df.head(5))

🔢 Kształt danych: (0, 3)


Unnamed: 0,dummy,data,1


In [None]:
# Column dtypes, non-null counts and memory footprint of the loaded frame.
info_header = "Informacje o zbiorze danych:"
print(info_header)
df.info()

ℹ️ Informacje o zbiorze danych:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 0 entries
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   dummy   0 non-null      object
 1   data    0 non-null      object
 2   1       0 non-null      object
dtypes: object(3)
memory usage: 132.0+ bytes


In [None]:
# Summary statistics produced by DataFrame.describe() with its default settings.
print("Opis statystyczny danych numerycznych:")
summary_stats = df.describe()
display(summary_stats)

📊 Opis statystyczny danych numerycznych:


Unnamed: 0,dummy,data,1
count,0.0,0.0,0.0
unique,0.0,0.0,0.0
top,,,
freq,,,


In [None]:
# Per-column NaN counts, restricted to columns that actually have gaps and
# ordered from the most to the least affected.
null_counts = df.isna().sum()
missing = null_counts.loc[null_counts > 0].sort_values(ascending=False)
print("Kolumny z brakami danych:")
display(missing)

🧱 Kolumny z brakami danych:


Series([], dtype: int64)

In [None]:
# Split the columns by dtype family: object/category vs. number/bool.
categorical_frame = df.select_dtypes(include=["object", "category"])
numeric_frame = df.select_dtypes(include=["number", "bool"])
cat_cols = categorical_frame.columns.tolist()
num_cols = numeric_frame.columns.tolist()

print(f" Kolumny kategoryczne: {cat_cols}")
print(f" Kolumny numeryczne: {num_cols}")

In [None]:
# Cardinality and the most frequent values (NaN counted as its own value)
# for every categorical column.
for col in cat_cols:
    column = df[col]
    n_unique = column.nunique()
    print(f"{col}: {n_unique} unikalnych wartości")
    top_values = column.value_counts(dropna=False).head()
    print(top_values)
    print("")

In [None]:
# Count rows that are exact repeats of an earlier row.
duplicate_mask = df.duplicated()
duplicates = duplicate_mask.sum()
print(f" Liczba duplikatów: {duplicates}")

In [None]:
# Name of the label column; the summary and plot are skipped when the frame
# has no column with that name.
target_col = "target"
has_target = target_col in df.columns
if has_target:
    print(f"Rozkład zmiennej docelowej: {target_col}")
    class_shares = df[target_col].value_counts(normalize=True)
    print(class_shares)
    sns.countplot(x=df[target_col])
    plt.title("Rozkład targetu")
    plt.show()

In [None]:
# Boxplots for spotting outliers. Columns with at most 10 distinct values are
# skipped so that only reasonably continuous variables are drawn.
continuous_cols = [name for name in num_cols if df[name].nunique() > 10]
for col in continuous_cols:
    fig, ax = plt.subplots(figsize=(6, 1.5))
    sns.boxplot(data=df, x=col, ax=ax)
    ax.set_title(f"Outliers in {col}")
    plt.show()

In [None]:
# Pairwise correlations between the numeric features.
corr = df[num_cols].corr()
if corr.empty:
    # With no numeric columns the correlation matrix is empty and seaborn's
    # heatmap raises while computing its colour limits, so skip the plot.
    print("Brak kolumn numerycznych – pomijam macierz korelacji.")
else:
    plt.figure(figsize=(10, 6))
    sns.heatmap(corr, annot=True, fmt=".2f", cmap="coolwarm", square=True)
    plt.title(" Korelacje między cechami numerycznymi")
    plt.show()

## Pipeline - encoding

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# 1. Column split.
# Both lists are derived from the feature frame (target removed first). The
# previous version dropped the target only from the numeric selection, which
# raised KeyError for a non-numeric target and let a categorical target slip
# into cat_cols.
feature_frame = df.drop(columns=[target_col])
cat_cols = feature_frame.select_dtypes(include=["object", "category"]).columns.tolist()
num_cols = feature_frame.select_dtypes(include=["number", "bool"]).columns.tolist()

print(f" Kolumny kategoryczne: {cat_cols}")
print(f" Kolumny numeryczne: {num_cols}")

# 2. Categorical branch: fill gaps with the most frequent value, then one-hot
#    encode (categories unseen during fit are ignored at transform time).
cat_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
])

# 3. Numeric branch: fill gaps with the median, then standardise.
num_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

# 4. ColumnTransformer - applies each branch to its own column list.
preprocessor = ColumnTransformer([
    ("num", num_pipeline, num_cols),
    ("cat", cat_pipeline, cat_cols)
])

## Encoding with column names

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Separate the label from the features first.
X = df.drop(columns=[target_col])
y = df[target_col]

# Categorical columns are listed from X (not df): a categorical target would
# otherwise be included here and the transformer would fail to find it in X.
cat_cols = X.select_dtypes(include=["object", "category"]).columns.tolist()

# One-hot encode the categorical columns; every other column is passed
# through unchanged and placed after the encoded block.
encoder = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(sparse_output=False, handle_unknown="ignore"), cat_cols)
    ],
    remainder="passthrough"
)

X_encoded = encoder.fit_transform(X)

print("Kształt po kodowaniu:", X_encoded.shape)


In [None]:
# Uses the ColumnTransformer `encoder` that was already fitted in the encoding
# cell. It must not be re-instantiated here: a fresh, unfitted instance has no
# `named_transformers_` attribute, so the lookup below would raise.
ohe = encoder.named_transformers_["cat"]
encoded_feature_names = ohe.get_feature_names_out(cat_cols)

# Remaining (non-categorical) columns were passed through without encoding.
non_cat_cols = X.drop(columns=cat_cols).columns

# Full list of output names: encoded block first, passthrough columns after it,
# matching the column order produced by the transformer.
all_feature_names = list(encoded_feature_names) + list(non_cat_cols)

# Wrap the encoded array in a DataFrame with readable column names.
import pandas as pd
X_encoded_df = pd.DataFrame(X_encoded, columns=all_feature_names)


## Walidacja

In [None]:
# Separate label and features, then push the features through `preprocessor`.
y = df[target_col]
X = df.drop(columns=[target_col])

X_transformed = preprocessor.fit_transform(X)

# With OneHotEncoder in the pipeline the result is a NumPy array - check shapes.
print("Po przetworzeniu:")
for label, obj in (("X shape:", X_transformed), ("y shape:", y)):
    print(label, obj.shape)

### Train Model Classification

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, f1_score

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

# --- Classification setup ---
target_col = "target"  # <- adjust to the dataset
RANDOM_STATE = 42  # one seed for the split and every estimator so reruns are comparable

# --- Data ---
# NOTE(review): the notebook's first cell loads the file from the "data" folder,
# while this cell reads "train.csv" from the working directory - confirm the path.
df = pd.read_csv("train.csv")
X = df.drop(columns=[target_col])
y = df[target_col]

# --- Train/validation split (stratified on the label) ---
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=RANDOM_STATE
)
# Work on explicit copies: the split returns selections of X, and assigning
# columns on them below would otherwise trigger pandas' SettingWithCopyWarning.
X_train = X_train.copy()
X_val = X_val.copy()

# --- Column groups ---
cat_cols = X_train.select_dtypes(include=["object", "category"]).columns.tolist()
num_cols = X_train.select_dtypes(include=["number", "bool"]).columns.tolist()

# --- Missing-value imputation ---
# Categorical gaps become an explicit "missing" level.
X_train[cat_cols] = X_train[cat_cols].fillna("missing")
X_val[cat_cols] = X_val[cat_cols].fillna("missing")

# Numeric gaps are filled with the TRAIN median in both splits, so nothing
# computed on the validation data influences preprocessing.
for col in num_cols:
    median = X_train[col].median()
    X_train[col] = X_train[col].fillna(median)
    X_val[col] = X_val[col].fillna(median)

# --- Categorical encoding (fitted on train only) ---
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
ohe.fit(X_train[cat_cols])

X_train_cat = pd.DataFrame(ohe.transform(X_train[cat_cols]), columns=ohe.get_feature_names_out(cat_cols), index=X_train.index)
X_val_cat = pd.DataFrame(ohe.transform(X_val[cat_cols]), columns=ohe.get_feature_names_out(cat_cols), index=X_val.index)

# --- Numeric scaling (fitted on train only) ---
scaler = StandardScaler()
scaler.fit(X_train[num_cols])

X_train_num = pd.DataFrame(scaler.transform(X_train[num_cols]), columns=num_cols, index=X_train.index)
X_val_num = pd.DataFrame(scaler.transform(X_val[num_cols]), columns=num_cols, index=X_val.index)

# --- Final design matrices: scaled numerics followed by one-hot columns ---
X_train_full = pd.concat([X_train_num, X_train_cat], axis=1)
X_val_full = pd.concat([X_val_num, X_val_cat], axis=1)

# --- Candidate classifiers (all seeded) ---
models = {
    "LogisticRegression": LogisticRegression(max_iter=1000, random_state=RANDOM_STATE),
    "DecisionTree": DecisionTreeClassifier(random_state=RANDOM_STATE),
    "RandomForest": RandomForestClassifier(random_state=RANDOM_STATE),
    "GradientBoosting": GradientBoostingClassifier(random_state=RANDOM_STATE),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=RANDOM_STATE)
}

# --- Fit each model and report accuracy / weighted F1 on the validation split ---
for name, model in models.items():
    print(f"\n\U0001F4C8 Trenowanie modelu: {name}")
    model.fit(X_train_full, y_train)
    y_pred = model.predict(X_val_full)

    acc = accuracy_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred, average='weighted')
    print(f"Accuracy: {acc:.4f} | F1-score: {f1:.4f}")


### Optuna Classification

In [None]:
import pandas as pd
import numpy as np
import optuna
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import f1_score, make_scorer

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

# --- Data ---
df = pd.read_csv("train.csv")
target_col = "target"  # <- adjust
y = df[target_col]
X = df.drop(columns=[target_col])

# Stratified 80/20 split with a fixed seed.
split_options = dict(test_size=0.2, stratify=y, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X, y, **split_options)

# --- Column groups ---
num_cols = X.select_dtypes(include=["number", "bool"]).columns.tolist()
cat_cols = X.select_dtypes(include=["object", "category"]).columns.tolist()

# --- Preprocessing ---
# Numeric branch: median imputation followed by standardisation.
numeric_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
]
num_pipeline = Pipeline(numeric_steps)

# Categorical branch: most-frequent imputation followed by one-hot encoding.
categorical_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
]
cat_pipeline = Pipeline(categorical_steps)

# Each branch is applied to its own column list.
branches = [
    ("num", num_pipeline, num_cols),
    ("cat", cat_pipeline, cat_cols),
]
preprocessor = ColumnTransformer(branches)

# --- Optuna objective ---
def objective(trial):
    """Score one Optuna trial.

    Picks a classifier family, samples the hyper-parameters that belong to
    that family only, wraps the model together with ``preprocessor`` in a
    pipeline and returns the mean weighted F1 of a 3-fold cross-validation
    on ``X_train`` / ``y_train``.
    """
    # One builder per family; a builder samples its parameters only when it
    # is actually called, so unselected families add nothing to the trial.
    def _logistic():
        return LogisticRegression(
            C=trial.suggest_float("lr_c", 1e-3, 10.0, log=True),
            max_iter=1000,
        )

    def _tree():
        depth = trial.suggest_int("tree_max_depth", 2, 20)
        min_split = trial.suggest_int("tree_min_samples", 2, 20)
        return DecisionTreeClassifier(max_depth=depth, min_samples_split=min_split)

    def _forest():
        n_trees = trial.suggest_int("rf_n", 50, 300)
        depth = trial.suggest_int("rf_max_depth", 3, 20)
        return RandomForestClassifier(n_estimators=n_trees, max_depth=depth)

    def _gboost():
        n_stages = trial.suggest_int("gb_n", 50, 300)
        rate = trial.suggest_float("gb_lr", 0.01, 0.3)
        depth = trial.suggest_int("gb_max_depth", 2, 10)
        return GradientBoostingClassifier(
            n_estimators=n_stages, learning_rate=rate, max_depth=depth
        )

    def _xgb():
        n_rounds = trial.suggest_int("xgb_n", 50, 300)
        rate = trial.suggest_float("xgb_lr", 0.01, 0.3)
        depth = trial.suggest_int("xgb_max_depth", 2, 10)
        return XGBClassifier(
            use_label_encoder=False,
            eval_metric="logloss",
            n_estimators=n_rounds,
            learning_rate=rate,
            max_depth=depth,
        )

    builders = {
        "logistic": _logistic,
        "tree": _tree,
        "forest": _forest,
        "gboost": _gboost,
        "xgb": _xgb,
    }

    model_name = trial.suggest_categorical(
        "model", ["logistic", "tree", "forest", "gboost", "xgb"]
    )
    model = builders[model_name]()

    # Preprocessing sits inside the pipeline, so it is refitted on every fold.
    pipe = Pipeline([
        ("preprocessor", preprocessor),
        ("model", model),
    ])

    weighted_f1 = make_scorer(f1_score, average="weighted")
    fold_scores = cross_val_score(pipe, X_train, y_train, cv=3, scoring=weighted_f1)
    return fold_scores.mean()

# --- Optuna run ---
# Maximise the objective over 30 trials, then report the best trial.
study = optuna.create_study(direction="maximize")
study.optimize(func=objective, n_trials=30)

header = "\n\U0001F947 Najlepszy wynik:"
print(header)
print(study.best_trial)


### Clean optuna for hackathon

In [None]:
import optuna
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score, make_scorer

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier


# X_train_full, y_train

def objective(trial):
    """Score one Optuna trial on the already-encoded training matrix.

    Chooses a classifier family, samples that family's hyper-parameters and
    returns the mean weighted F1 of a 3-fold cross-validation on
    ``X_train_full`` / ``y_train``.
    """
    families = ["logistic", "tree", "forest", "gboost", "xgb"]
    model_name = trial.suggest_categorical("model", families)

    # Each branch collects its keyword arguments in a dict (sampled in the
    # order written) and then instantiates the estimator from it.
    if model_name == "logistic":
        hyper = {
            "C": trial.suggest_float("lr_c", 1e-3, 10.0, log=True),
            "max_iter": 1000,
        }
        model = LogisticRegression(**hyper)
    elif model_name == "tree":
        hyper = {
            "max_depth": trial.suggest_int("tree_max_depth", 2, 20),
            "min_samples_split": trial.suggest_int("tree_min_samples", 2, 20),
        }
        model = DecisionTreeClassifier(**hyper)
    elif model_name == "forest":
        hyper = {
            "n_estimators": trial.suggest_int("rf_n", 50, 300),
            "max_depth": trial.suggest_int("rf_max_depth", 3, 20),
        }
        model = RandomForestClassifier(**hyper)
    elif model_name == "gboost":
        hyper = {
            "n_estimators": trial.suggest_int("gb_n", 50, 300),
            "learning_rate": trial.suggest_float("gb_lr", 0.01, 0.3),
            "max_depth": trial.suggest_int("gb_max_depth", 2, 10),
        }
        model = GradientBoostingClassifier(**hyper)
    elif model_name == "xgb":
        hyper = {
            "use_label_encoder": False,
            "eval_metric": "logloss",
            "n_estimators": trial.suggest_int("xgb_n", 50, 300),
            "learning_rate": trial.suggest_float("xgb_lr", 0.01, 0.3),
            "max_depth": trial.suggest_int("xgb_max_depth", 2, 10),
        }
        model = XGBClassifier(**hyper)

    # Cross-validation
    weighted_f1 = make_scorer(f1_score, average="weighted")
    fold_scores = cross_val_score(
        model, X_train_full, y_train, cv=3, scoring=weighted_f1
    )
    score = fold_scores.mean()

    return score

# Run Optuna: maximise the objective over 30 trials, then report the best trial.
study = optuna.create_study(direction="maximize")
study.optimize(func=objective, n_trials=30)

header = "🏆 Najlepszy wynik:"
print(header)
print(study.best_trial)


### After EDA: X_train, y_train, X_val, y_val and X_test are ready for training (after encoding and feature engineering)

In [None]:
# X_test has to come from the same encoding / feature-engineering steps as
# X_train and X_val, so it is deliberately NOT assigned here. The former
# placeholder (an empty string) overwrote any real test set and has no
# `.shape`, which made the first assert below raise AttributeError.
if not hasattr(globals().get("X_test"), "shape"):
    raise RuntimeError(
        "X_test is not prepared - build it with the same preprocessing as "
        "X_train / X_val before running this cell"
    )

# All three splits must expose the same number of feature columns.
assert X_train.shape[1] == X_val.shape[1] == X_test.shape[1], "Mismatch in features!"
# The training matrix must be fully imputed at this point.
assert not X_train.isna().any().any(), "Missing values in train!"

#### imports 

In [None]:
import pandas as pd
import numpy as np
import mlflow
import mlflow.sklearn
import shap
import optuna

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import f1_score, classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import os

### ===== SETUP =====

In [None]:
# Fixed seed reused by the estimators defined in the following sections.
SEED = 42

# Runs started from here on are recorded under this MLflow experiment.
mlflow.set_experiment("hackathon_classification")

### ===== MODELS TO EVALUATE =====

In [None]:
# Baseline candidates, all seeded with SEED. The dict keys are reused later as
# MLflow run-name prefixes and for selecting the best model.
models = {}
models["LogisticRegression"] = LogisticRegression(max_iter=1000, random_state=SEED)
models["RandomForest"] = RandomForestClassifier(n_estimators=100, random_state=SEED)
models["XGBoost"] = XGBClassifier(
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=SEED,
)


### ===== TRAIN + VALIDATE =====

In [None]:


# Baseline results keyed by model name: {"model": fitted estimator, "f1": score}.
results = {}

# f1_score defaults to average="binary", which raises on targets with more
# than two classes. Keep the binary behaviour for two-class problems and fall
# back to the weighted average otherwise.
n_classes = pd.concat([y_train, y_val]).nunique()
f1_average = "binary" if n_classes == 2 else "weighted"

for name, model in models.items():
    with mlflow.start_run(run_name=f"{name}_baseline"):
        model.fit(X_train, y_train)
        preds = model.predict(X_val)
        f1 = f1_score(y_val, preds, average=f1_average)

        # Log params and metrics
        mlflow.log_param("model", name)
        mlflow.log_metric("val_f1", f1)

        # === Confusion Matrix ===
        cm = confusion_matrix(y_val, preds)
        disp = ConfusionMatrixDisplay(confusion_matrix=cm)
        disp.plot(cmap="Blues", values_format='d')

        # Save the plot as PNG, attach it to the MLflow run, and always clean
        # up the figure and the temporary file - even if saving/logging fails.
        cm_path = f"{name}_confusion_matrix.png"
        try:
            disp.figure_.savefig(cm_path, bbox_inches='tight')
            mlflow.log_artifact(cm_path)
        finally:
            plt.close(disp.figure_)
            if os.path.exists(cm_path):
                os.remove(cm_path)

        # Save results
        results[name] = {
            "model": model,
            "f1": f1
        }

        print(f"{name} - F1: {f1:.4f}")


### ===== SELECT BEST MODEL =====

In [None]:
# Pick the baseline with the highest validation F1; on ties the model that
# was evaluated first wins.
best_model_name = max(results.items(), key=lambda entry: entry[1]["f1"])[0]
print(f"\n✅ Best baseline model: {best_model_name}")

### ===== OPTUNA TUNING (for best model) =====

In [None]:
def objective(trial):
    """Tune the best baseline family.

    Samples hyper-parameters for the family named by ``best_model_name``
    (anything other than "XGBoost" / "RandomForest" is treated as
    LogisticRegression), fits the model on ``X_train`` / ``y_train`` and
    returns its F1 score on ``X_val`` / ``y_val``.
    """
    if best_model_name == "XGBoost":
        params = {
            "max_depth": trial.suggest_int("max_depth", 3, 10),
            "n_estimators": trial.suggest_int("n_estimators", 100, 500),
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
            "subsample": trial.suggest_float("subsample", 0.5, 1.0),
            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0)
        }
        model = XGBClassifier(**params, use_label_encoder=False, eval_metric='logloss', random_state=SEED)
    elif best_model_name == "RandomForest":
        params = {
            "n_estimators": trial.suggest_int("n_estimators", 100, 500),
            "max_depth": trial.suggest_int("max_depth", 3, 15),
            "min_samples_split": trial.suggest_int("min_samples_split", 2, 10)
        }
        model = RandomForestClassifier(**params, random_state=SEED)
    else:  # LogisticRegression
        params = {
            "C": trial.suggest_float("C", 0.01, 10, log=True)
        }
        model = LogisticRegression(**params, max_iter=1000, random_state=SEED)

    model.fit(X_train, y_train)
    preds = model.predict(X_val)

    # f1_score defaults to average="binary", which raises on targets with more
    # than two classes; keep binary for two-class problems, weighted otherwise.
    n_classes = pd.concat([y_train, y_val]).nunique()
    f1_average = "binary" if n_classes == 2 else "weighted"
    return f1_score(y_val, preds, average=f1_average)

# Seed the sampler so the sequence of suggested hyper-parameters is repeatable,
# in line with the SEED already applied to the estimators. TPESampler is
# Optuna's default sampler, so only the seed is new here.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=SEED),
)
study.optimize(objective, n_trials=30)

# Hyper-parameters of the best trial, reused for the final fit.
best_params = study.best_params
print(f"\n🎯 Best Optuna Params for {best_model_name}: {best_params}")

### ===== FINAL TRAIN ON TRAIN+VAL =====

In [None]:
# Refit the tuned configuration on train and validation data combined.
X_full = pd.concat([X_train, X_val])
y_full = pd.concat([y_train, y_val])

# Estimator class and fixed keyword arguments per model family; any name not
# listed here is treated as LogisticRegression.
final_model_specs = {
    "XGBoost": (
        XGBClassifier,
        {"use_label_encoder": False, "eval_metric": 'logloss', "random_state": SEED},
    ),
    "RandomForest": (RandomForestClassifier, {"random_state": SEED}),
}
logistic_spec = (LogisticRegression, {"max_iter": 1000, "random_state": SEED})
estimator_cls, fixed_kwargs = final_model_specs.get(best_model_name, logistic_spec)
final_model = estimator_cls(**best_params, **fixed_kwargs)

# Fit inside a dedicated MLflow run, then store the model and its tuned params.
final_run_name = f"{best_model_name}_final"
with mlflow.start_run(run_name=final_run_name):
    final_model.fit(X_full, y_full)
    mlflow.sklearn.log_model(final_model, "model")
    mlflow.log_params(best_params)

### ===== Top Features =====

In [None]:
# Tree ensembles expose `feature_importances_`; the LogisticRegression branch
# does not, so fall back to the mean absolute coefficient per feature
# (averaged over the rows of `coef_`).
if hasattr(final_model, "feature_importances_"):
    importances = final_model.feature_importances_
else:
    importances = np.abs(final_model.coef_).mean(axis=0)

features = X_full.columns
feat_imp_df = pd.DataFrame({"Feature": features, "Importance": importances})

# Ten highest-ranked features (displayed as the cell output).
feat_imp_df.sort_values("Importance", ascending=False).head(10)


### ===== SHAP EXPLANATION =====

In [None]:
# Build the explainer with the full training matrix as background data and
# explain those same rows.
background = X_full
explainer = shap.Explainer(final_model, background)
shap_values = explainer(background)

# Optional summary plot (comment out when running headless)
shap.plots.beeswarm(shap_values)

### ===== FINAL PREDICTIONS =====

In [None]:
# X_test has to be the preprocessed test matrix (same columns as X_full), so
# it is deliberately NOT assigned here. The former placeholder (an empty
# string) overwrote any real test set and cannot be passed to `predict`.
if not hasattr(globals().get("X_test"), "shape"):
    raise RuntimeError(
        "X_test is not prepared - build it with the same preprocessing as "
        "X_train / X_val before running this cell"
    )

y_test_pred = final_model.predict(X_test)