## Setup

In [None]:
import sys

# Append the parent directory to the module search path.
# NOTE(review): presumably this lets the notebook import project-local modules
# from the repository root; no cell visible here imports one — confirm it is still needed.
sys.path.append("../")

In [None]:
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

## Load data

In [None]:
# Load the training CSV (path is relative to this notebook) into a DataFrame
csv_path = "../data/pg15training.csv"
data = pd.read_csv(csv_path)

# Preview the first rows as the cell output
data.head()

## Prep data

In [None]:
# Binary target: 1 when "Numtppd" is non-zero, 0 otherwise.
# The vectorized comparison yields the same labels as a per-row lambda
# without calling a Python function for every row.
data["target"] = (data["Numtppd"] != 0).astype(int)

# Features: everything except the four Num*/Ind* columns listed below and the
# target itself, so the label cannot leak into the model inputs.
X = data.drop(columns=["Numtppd", "Numtpbi", "Indtppd", "Indtpbi", "target"])
y = data["target"]

# One-hot encode the string-typed feature columns; all other columns are
# passed through unchanged. The column list is taken from X (not from data)
# so that it can only name columns the pipeline will actually receive —
# a dropped column of object dtype would otherwise break the transformer at fit time.
categorical_columns = X.select_dtypes(include=["object"]).columns.tolist()
preprocessor = ColumnTransformer(
    transformers=[
        # handle_unknown="ignore": categories unseen during fit encode as all zeros
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_columns),
    ],
    remainder="passthrough",
)

## Split data

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## Train model

In [None]:
# Gradient-boosted binary classifier: 100 trees, depth capped at 5, learning rate 0.1
model = LGBMClassifier(
    objective="binary",
    n_estimators=100,
    learning_rate=0.1,
    max_depth=5,
)

# Chain the preprocessing and the classifier into one scikit-learn pipeline,
# so the encoder is fitted on the training data only
pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("model", model),
    ]
)

# Fit the whole pipeline on the training split
pipeline.fit(X_train, y_train)

## Evaluate model

In [None]:
# Predict class labels for the held-out rows
y_pred = pipeline.predict(X_test)

# Score the predictions against the true labels
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Report each metric on its own line
for label, value in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1 Score", f1),
):
    print(f"{label}: {value}")