In [61]:
# packages
import pandas as pd
from mod02_build_bot_predictor import train_model


### Define a function to extract predictions from the model

In [62]:
def predict_bot(df, model=None):
    """
    Label every row of ``df`` as a bot (1) or a human (0).

    Parameters
    ----------
    df : DataFrame-like
        Feature rows to score. Its ``index`` is reused for the result.
    model : object exposing ``predict``, optional
        Classifier used for scoring. When omitted, ``train_model()`` is
        called with no arguments to obtain one.

    Returns
    -------
    pandas.Series
        One prediction per row of ``df``, carrying ``df.index``.
    """
    # Fall back to a freshly built model only when the caller supplied none.
    classifier = train_model() if model is None else model
    return pd.Series(classifier.predict(df), index=df.index)

### Define a function to evaluate model error

In [63]:
def confusion_matrix_and_metrics(y_true, y_pred):
    """
    Computes confusion matrix and common error rates for binary classification.

    Assumes labels:
      0 = negative class
      1 = positive class

    Labels are paired by position (first with first, second with second, ...),
    not by any index the inputs may carry.

    Args:
      y_true: iterable of actual labels (0 or 1).
      y_pred: iterable of predicted labels (0 or 1), same length as y_true.

    Returns:
      dict with:
        tp, tn, fp, fn
        misclassification_rate  -- (fp + fn) / total, 0.0 when there are no samples
        false_positive_rate     -- fp / (fp + tn), 0.0 when there are no actual negatives
        false_negative_rate     -- fn / (fn + tp), 0.0 when there are no actual positives

    Raises:
      ValueError: if the inputs differ in length, or if any label is not 0 or 1.
    """
    # Materialize both inputs so their lengths can be compared, even for
    # generators. A bare zip() would silently drop the unmatched tail and
    # report metrics for only part of the data.
    true_labels = list(y_true)
    pred_labels = list(y_pred)
    if len(true_labels) != len(pred_labels):
        raise ValueError(
            "y_true and y_pred must have the same length (got {} and {})".format(
                len(true_labels), len(pred_labels)
            )
        )

    tn = fp = fn = tp = 0

    for yt, yp in zip(true_labels, pred_labels):
        if yt == 0 and yp == 0:
            tn += 1
        elif yt == 0 and yp == 1:
            fp += 1
        elif yt == 1 and yp == 0:
            fn += 1
        elif yt == 1 and yp == 1:
            tp += 1
        else:
            raise ValueError("Labels must be 0 or 1")

    total = tn + fp + fn + tp

    # Each rate falls back to 0.0 when its denominator is empty, to avoid
    # a ZeroDivisionError.
    misclassification_rate = (fp + fn) / total if total > 0 else 0.0
    false_positive_rate = fp / (fp + tn) if (fp + tn) > 0 else 0.0
    false_negative_rate = fn / (fn + tp) if (fn + tp) > 0 else 0.0

    return {
        "tp": tp,
        "tn": tn,
        "fp": fp,
        "fn": fn,
        "misclassification_rate": misclassification_rate,
        "false_positive_rate": false_positive_rate,
        "false_negative_rate": false_negative_rate,
    }


### Load the data

In [64]:
# Relative locations of the training and test CSV files.
TRAIN_PATH = "mod02_data/train.csv"
TEST_PATH = "mod02_data/test.csv"

# Read each split into its own DataFrame (training first, then test).
train, test = pd.read_csv(TRAIN_PATH), pd.read_csv(TEST_PATH)

### Format the data by independent vs. dependent variables

In [65]:
# Features are every column except the "is_bot" label; the target is "is_bot" itself.
X_train, y_train = train.drop(columns=["is_bot"]), train["is_bot"]
X_test, y_test = test.drop(columns=["is_bot"]), test["is_bot"]

### Build the model on training data

In [66]:
# Build the model from the training split only; the test split is not passed in.
# NOTE(review): train_model is imported from mod02_build_bot_predictor and is
# presumably expected to return an object with a .predict method, since
# predict_bot calls model.predict — confirm.
model = train_model(X_train, y_train)

### Get the model predictions on training and test data

In [67]:
# Score both splits with the same already-built model so no retraining happens.
y_pred_train = predict_bot(X_train, model=model)
y_pred_test = predict_bot(X_test, model=model)

### Check results on the training set (data used to build the model)

In [68]:
# Error metrics on the rows the model was built from; the bare expression is
# shown as the cell's output.
confusion_matrix_and_metrics(y_train, y_pred_train)

{'tp': 126,
 'tn': 874,
 'fp': 0,
 'fn': 0,
 'misclassification_rate': 0.0,
 'false_positive_rate': 0.0,
 'false_negative_rate': 0.0}

### Check results on the test set (new data not yet seen by the model)

In [69]:
# Error metrics on the test split, which was not passed to train_model.
# NOTE(review): the saved output for this cell is identical to the training-set
# output (same counts, zero errors), and the discussion text cites a different
# misclassification rate — re-run from a fresh kernel to confirm this output
# is not stale.
confusion_matrix_and_metrics(y_test, y_pred_test)

{'tp': 126,
 'tn': 874,
 'fp': 0,
 'fn': 0,
 'misclassification_rate': 0.0,
 'false_positive_rate': 0.0,
 'false_negative_rate': 0.0}

# Discussion Questions

### Based on the misclassification rate of your model, discuss your confidence in the ability to predict a bot.

Type your answer here.


The misclassification rate was around 13%, which is still high when you consider how it would play out at a large scale.

### What are potential ramifications of false positives from the model?

Well, if you falsely identify regular humans as bots, then that is pretty bad. It is like how ICE is taking a bunch of people whom it is misclassifying as illegal. People can get hurt by false positives.


### What are potential ramifications of false negatives from the model?

The same goes for this one: if there are a lot of false negatives, then the model itself is flawed and cannot really be used to determine whether someone is a bot or not.