In [11]:
# packages
import pandas as pd
from mod02_build_bot_predictor import train_model

### Define a function to extract predictions from the model

In [12]:
def predict_bot(df, model=None):
    """
    Label every row of ``df`` as bot (1) or human (0).

    Args:
        df: DataFrame of account features, passed unchanged to
            ``model.predict``.
        model: Object exposing ``predict(df)``. When omitted, a model is
            obtained by calling ``train_model()`` with no arguments.
            NOTE(review): confirm ``train_model`` can be called without
            arguments; its signature is not visible from this notebook.

    Returns:
        pandas Series of predictions that shares ``df``'s index, so each
        prediction stays aligned with its source row.
    """
    # Only fall back to training when the caller did not hand us a model.
    fitted = train_model() if model is None else model

    labels = fitted.predict(df)
    return pd.Series(labels, index=df.index)

### Define a function to evaluate model error

In [13]:
def confusion_matrix_and_metrics(y_true, y_pred):
    """
    Compute the confusion matrix and common error rates for binary labels.

    Labels are compared with ``==``, so every label must equal 0 (negative
    class) or 1 (positive class). Pairs are matched by position, not by
    index.

    Parameters
    ----------
    y_true : iterable
        Ground-truth labels.
    y_pred : iterable
        Predicted labels; must contain as many items as ``y_true``.

    Returns
    -------
    dict
        tp, tn, fp, fn : int
            Counts of each confusion-matrix cell.
        misclassification_rate : float
            (fp + fn) / total
        false_positive_rate : float
            fp / (fp + tn)
        false_negative_rate : float
            fn / (fn + tp)
        A rate whose denominator is zero is reported as 0.0.

    Raises
    ------
    ValueError
        If the two inputs differ in length, or if any label is not 0 or 1.
    """
    # Materialize both inputs so that generators are supported and lengths can
    # be compared. zip() alone would silently stop at the shorter input and
    # report metrics computed on only part of the data.
    true_labels = list(y_true)
    pred_labels = list(y_pred)
    if len(true_labels) != len(pred_labels):
        raise ValueError(
            "y_true and y_pred must have the same length "
            f"(got {len(true_labels)} and {len(pred_labels)})"
        )

    tn = fp = fn = tp = 0

    for yt, yp in zip(true_labels, pred_labels):
        if yt == 0 and yp == 0:
            tn += 1
        elif yt == 0 and yp == 1:
            fp += 1
        elif yt == 1 and yp == 0:
            fn += 1
        elif yt == 1 and yp == 1:
            tp += 1
        else:
            raise ValueError("Labels must be 0 or 1")

    total = tn + fp + fn + tp

    # Each rate falls back to 0.0 when its denominator is empty (no samples,
    # no actual negatives, or no actual positives respectively).
    misclassification_rate = (fp + fn) / total if total > 0 else 0.0
    false_positive_rate = fp / (fp + tn) if (fp + tn) > 0 else 0.0
    false_negative_rate = fn / (fn + tp) if (fn + tp) > 0 else 0.0

    return {
        "tp": tp,
        "tn": tn,
        "fp": fp,
        "fn": fn,
        "misclassification_rate": misclassification_rate,
        "false_positive_rate": false_positive_rate,
        "false_negative_rate": false_negative_rate,
    }


### Load the data

In [14]:
# Locations of the two CSV splits. These are relative paths, so they are
# resolved against the kernel's current working directory.
TRAIN_PATH = "mod03_data/train.csv"
TEST_PATH = "mod03_data/test.csv"

# Read each split into its own DataFrame.
train = pd.read_csv(TRAIN_PATH)
test = pd.read_csv(TEST_PATH)

FileNotFoundError: [Errno 2] No such file or directory: 'mod03_data/train.csv'

### Format the data by independent vs. dependent variables

In [None]:
# Dependent variable: the "is_bot" column of each split.
y_train = train["is_bot"]
y_test = test["is_bot"]

# Independent variables: every remaining column.
X_train = train.drop(columns="is_bot")
X_test = test.drop(columns="is_bot")

### Build the model on training data

In [None]:
# Build the model from the training split only (features X_train, labels
# y_train); the test split is not passed to train_model.
model = train_model(X_train, y_train)

### Get the model predictions on training and test data

In [None]:
# Score both splits with the same fitted model so that training and test
# error can be compared side by side.
y_pred_train = predict_bot(X_train, model=model)
y_pred_test = predict_bot(X_test, model=model)

### Check results on the training set (data used to build the model)

In [None]:
# Error on the data the model was built from; the bare expression is left last
# so the notebook displays the returned dict.
confusion_matrix_and_metrics(y_true=y_train, y_pred=y_pred_train)

{'tp': 174,
 'tn': 2625,
 'fp': 12,
 'fn': 189,
 'misclassification_rate': 0.067,
 'false_positive_rate': 0.004550625711035267,
 'false_negative_rate': 0.5206611570247934}

### Check results on the test set (new data not yet seen by the model)

In [None]:
# Error on the held-out test split; the bare expression is left last so the
# notebook displays the returned dict.
confusion_matrix_and_metrics(y_true=y_test, y_pred=y_pred_test)

{'tp': 32,
 'tn': 845,
 'fp': 29,
 'fn': 94,
 'misclassification_rate': 0.123,
 'false_positive_rate': 0.03318077803203661,
 'false_negative_rate': 0.746031746031746}

# Discussion Questions

### Based on the misclassification rate of your model, discuss your confidence in the ability to predict a bot. 

I have fairly low confidence in my model's ability to predict a bot, even though the test misclassification rate is only 12.3%. That number doesn't tell the full story, because the dataset has many more humans than bots: only 126 of the 1,000 test accounts are bots, so a model that labeled every account as human would have a 12.6% misclassification rate. The main issue is the false negative rate of 74.6%. The model therefore misses roughly three quarters of the bots in the test set.

### What are potential ramifications of false positives from the model?

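False positives are human accounts that the model incorrectly labels as bots (29 of the 874 human accounts in the test set, about 3.3%). Any action taken against accounts flagged as bots would therefore also affect these legitimate users.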

### What are potential ramifications of false negatives from the model?

Type your answer here.