Приступаем к созданию первой модели

In [2]:
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
import numpy as np

def predict_parking_purchase_probability_catboost(filepath, client_id_col='client_id', target_col='target', test_size=0.2, random_state=42, iterations=4819, learning_rate=0.1, early_stopping_rounds=None):
    """Train a CatBoost classifier on a CSV file and score it on a hold-out split.

    The CSV is loaded with pandas, the id and target columns are removed from
    the feature matrix, object/category columns are cleaned up and passed to
    CatBoost as categorical features, and the data is split with
    ``train_test_split``.

    Args:
        filepath: Path of the CSV file to read.
        client_id_col: Name of the identifier column (dropped from features).
        target_col: Name of the label column.
        test_size: Forwarded to ``train_test_split``.
        random_state: Seed used for both the split and the CatBoost model.
        iterations: Maximum number of boosting iterations.
        learning_rate: CatBoost learning rate.
        early_stopping_rounds: Patience of the overfitting detector. ``None``
            uses ``iterations`` as the patience, which reproduces the
            historical configuration; with that value the detector can never
            fire, so pass a smaller number to actually stop early.

    Returns:
        ``(model, probabilities, evaluation_metrics)`` on success, where
        ``probabilities`` is column 1 of ``predict_proba`` on the hold-out
        split and ``evaluation_metrics`` is a dict with the keys
        ``"accuracy"``, ``"roc_auc"`` and ``"classification_report"``.
        ``(None, None, None)`` if the file is missing, a required column is
        absent, or any other exception is raised (the error is printed).

    NOTE: the hold-out split is passed to ``fit`` as ``eval_set`` and is also
    the data the metrics are computed on, so the reported metrics are not
    measured on data that was fully unseen during fitting.
    """
    try:
        df = pd.read_csv(filepath)

        # Report exactly which of the required columns are absent.
        missing = [name for name in (client_id_col, target_col) if name not in df.columns]
        if missing:
            missing_text = ', '.join(repr(name) for name in missing)
            print(f"Error: Required column(s) {missing_text} not found in '{filepath}'.")
            return None, None, None

        # Separate features and target.
        X = df.drop(columns=[client_id_col, target_col])
        y = df[target_col]

        # Clean the categorical columns. Missing values are replaced BEFORE
        # stringifying: converting first would turn NaN into the literal
        # string 'nan', which a later fill could no longer detect.
        categorical_columns = X.select_dtypes(include=['object', 'category']).columns
        for col in categorical_columns:
            as_objects = X[col].astype(object)
            X[col] = as_objects.where(as_objects.notna(), 'Unknown').astype(str)

        # Positional indices of the categorical features for CatBoost.
        categorical_features_indices = np.flatnonzero(X.columns.isin(categorical_columns))

        # Split data.
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

        if early_stopping_rounds is None:
            # Historical behaviour: patience equal to the iteration budget.
            early_stopping_rounds = iterations

        # Initialize and train the CatBoost model.
        model = CatBoostClassifier(iterations=iterations,
                                   learning_rate=learning_rate,
                                   loss_function='Logloss',
                                   random_seed=random_state,
                                   verbose=100,
                                   early_stopping_rounds=early_stopping_rounds)

        model.fit(X_train, y_train, cat_features=categorical_features_indices, eval_set=(X_test, y_test))

        # Predict once and reuse the labels for every label-based metric.
        probabilities = model.predict_proba(X_test)[:, 1]
        predicted_labels = model.predict(X_test)

        evaluation_metrics = {
            "accuracy": accuracy_score(y_test, predicted_labels),
            "roc_auc": roc_auc_score(y_test, probabilities),
            "classification_report": classification_report(y_test, predicted_labels)
        }

        return model, probabilities, evaluation_metrics

    except FileNotFoundError:
        print(f"Error: File '{filepath}' not found.")
        return None, None, None
    except Exception as e:
        # Notebook-level boundary: report the problem instead of raising.
        print(f"An unexpected error occurred: {e}")
        return None, None, None

# Example usage: train on the cleaned training file. The function prints its
# own error message and returns (None, None, None) on failure, so only the
# success case needs handling here.
filepath = 'train_cleaned.csv'
model, probabilities, evaluation_metrics = predict_parking_purchase_probability_catboost(filepath)

# Compare with None explicitly instead of relying on the object's truthiness.
if model is not None:
    print("Model trained successfully.")
    print("Evaluation Metrics:", evaluation_metrics)
    print("Probabilities:", probabilities)

0:	learn: 0.5450158	test: 0.5443958	best: 0.5443958 (0)	total: 201ms	remaining: 16m 9s
100:	learn: 0.0501137	test: 0.0343604	best: 0.0342792 (99)	total: 7.22s	remaining: 5m 37s
200:	learn: 0.0363466	test: 0.0315170	best: 0.0315170 (200)	total: 14.9s	remaining: 5m 41s
300:	learn: 0.0272262	test: 0.0305597	best: 0.0305346 (298)	total: 22.2s	remaining: 5m 33s
400:	learn: 0.0208506	test: 0.0304533	best: 0.0299653 (338)	total: 28.7s	remaining: 5m 16s
500:	learn: 0.0162035	test: 0.0308115	best: 0.0299653 (338)	total: 35.3s	remaining: 5m 4s
600:	learn: 0.0130501	test: 0.0315959	best: 0.0299653 (338)	total: 42.4s	remaining: 4m 57s
700:	learn: 0.0105049	test: 0.0319029	best: 0.0299653 (338)	total: 50.1s	remaining: 4m 54s
800:	learn: 0.0087593	test: 0.0323730	best: 0.0299653 (338)	total: 58.1s	remaining: 4m 51s
900:	learn: 0.0072705	test: 0.0331628	best: 0.0299653 (338)	total: 1m 5s	remaining: 4m 44s
1000:	learn: 0.0063583	test: 0.0334985	best: 0.0299653 (338)	total: 1m 12s	remaining: 4m 36s
110

Приступаем к созданию второй модели

In [2]:
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
import numpy as np

def predict_parking_purchase_probability_catboost(filepath, client_id_col='client_id', target_col='target', test_size=0.2, random_state=42, iterations=2934, learning_rate=0.1, early_stopping_rounds=None):
    """Train a CatBoost classifier on a CSV file and score it on a hold-out split.

    The CSV is loaded with pandas, the id and target columns are removed from
    the feature matrix, object/category columns are cleaned up and passed to
    CatBoost as categorical features, and the data is split with
    ``train_test_split``.

    Args:
        filepath: Path of the CSV file to read.
        client_id_col: Name of the identifier column (dropped from features).
        target_col: Name of the label column.
        test_size: Forwarded to ``train_test_split``.
        random_state: Seed used for both the split and the CatBoost model.
        iterations: Maximum number of boosting iterations.
        learning_rate: CatBoost learning rate.
        early_stopping_rounds: Patience of the overfitting detector. ``None``
            uses ``iterations`` as the patience, which reproduces the
            historical configuration; with that value the detector can never
            fire, so pass a smaller number to actually stop early.

    Returns:
        ``(model, probabilities, evaluation_metrics)`` on success, where
        ``probabilities`` is column 1 of ``predict_proba`` on the hold-out
        split and ``evaluation_metrics`` is a dict with the keys
        ``"accuracy"``, ``"roc_auc"`` and ``"classification_report"``.
        ``(None, None, None)`` if the file is missing, a required column is
        absent, or any other exception is raised (the error is printed).

    NOTE: the hold-out split is passed to ``fit`` as ``eval_set`` and is also
    the data the metrics are computed on, so the reported metrics are not
    measured on data that was fully unseen during fitting.
    """
    try:
        df = pd.read_csv(filepath)

        # Report exactly which of the required columns are absent.
        missing = [name for name in (client_id_col, target_col) if name not in df.columns]
        if missing:
            missing_text = ', '.join(repr(name) for name in missing)
            print(f"Error: Required column(s) {missing_text} not found in '{filepath}'.")
            return None, None, None

        # Separate features and target.
        X = df.drop(columns=[client_id_col, target_col])
        y = df[target_col]

        # Clean the categorical columns. Missing values are replaced BEFORE
        # stringifying: converting first would turn NaN into the literal
        # string 'nan', which a later fill could no longer detect.
        categorical_columns = X.select_dtypes(include=['object', 'category']).columns
        for col in categorical_columns:
            as_objects = X[col].astype(object)
            X[col] = as_objects.where(as_objects.notna(), 'Unknown').astype(str)

        # Positional indices of the categorical features for CatBoost.
        categorical_features_indices = np.flatnonzero(X.columns.isin(categorical_columns))

        # Split data.
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

        if early_stopping_rounds is None:
            # Historical behaviour: patience equal to the iteration budget.
            early_stopping_rounds = iterations

        # Initialize and train the CatBoost model.
        model = CatBoostClassifier(iterations=iterations,
                                   learning_rate=learning_rate,
                                   loss_function='Logloss',
                                   random_seed=random_state,
                                   verbose=100,
                                   early_stopping_rounds=early_stopping_rounds)

        model.fit(X_train, y_train, cat_features=categorical_features_indices, eval_set=(X_test, y_test))

        # Predict once and reuse the labels for every label-based metric.
        probabilities = model.predict_proba(X_test)[:, 1]
        predicted_labels = model.predict(X_test)

        evaluation_metrics = {
            "accuracy": accuracy_score(y_test, predicted_labels),
            "roc_auc": roc_auc_score(y_test, probabilities),
            "classification_report": classification_report(y_test, predicted_labels)
        }

        return model, probabilities, evaluation_metrics

    except FileNotFoundError:
        print(f"Error: File '{filepath}' not found.")
        return None, None, None
    except Exception as e:
        # Notebook-level boundary: report the problem instead of raising.
        print(f"An unexpected error occurred: {e}")
        return None, None, None

# Example usage: run the same pipeline on the cleaned validation file. The
# function prints its own error message and returns (None, None, None) on
# failure, so only the success case needs handling here.
filepath = 'valid_cleaned.csv'
model, probabilities, evaluation_metrics = predict_parking_purchase_probability_catboost(filepath)

# Compare with None explicitly instead of relying on the object's truthiness.
if model is not None:
    print("Model trained successfully.")
    print("Evaluation Metrics:", evaluation_metrics)
    print("Probabilities:", probabilities)

0:	learn: 0.5450158	test: 0.5443958	best: 0.5443958 (0)	total: 200ms	remaining: 9m 46s
100:	learn: 0.0501137	test: 0.0343604	best: 0.0342792 (99)	total: 7.43s	remaining: 3m 28s
200:	learn: 0.0363466	test: 0.0315170	best: 0.0315170 (200)	total: 15.7s	remaining: 3m 34s
300:	learn: 0.0272262	test: 0.0305597	best: 0.0305346 (298)	total: 23.1s	remaining: 3m 22s
400:	learn: 0.0208506	test: 0.0304533	best: 0.0299653 (338)	total: 30.5s	remaining: 3m 12s
500:	learn: 0.0162035	test: 0.0308115	best: 0.0299653 (338)	total: 37.6s	remaining: 3m 2s
600:	learn: 0.0130501	test: 0.0315959	best: 0.0299653 (338)	total: 45s	remaining: 2m 54s
700:	learn: 0.0105049	test: 0.0319029	best: 0.0299653 (338)	total: 52.3s	remaining: 2m 46s
800:	learn: 0.0087593	test: 0.0323730	best: 0.0299653 (338)	total: 59.6s	remaining: 2m 38s
900:	learn: 0.0072705	test: 0.0331628	best: 0.0299653 (338)	total: 1m 6s	remaining: 2m 30s
1000:	learn: 0.0063583	test: 0.0334985	best: 0.0299653 (338)	total: 1m 14s	remaining: 2m 23s
1100: