# Mapper3.py

In [3]:
#!/usr/bin/env python3

import sys
import pickle
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, accuracy_score

def train_decision_tree(X_train, y_train, model_path='decision_tree_model.pkl'):
    """Fit a decision tree, print its training metrics, and pickle the model.

    Args:
        X_train: Feature rows handed to ``DecisionTreeClassifier.fit``.
        y_train: Labels aligned with ``X_train``. Precision and recall are
            computed with ``average='binary'``, so the labels must be binary.
        model_path: Destination of the pickled model. Defaults to
            ``'decision_tree_model.pkl'``, the path used before this
            parameter existed.

    Returns:
        A tuple ``(decision_tree, accuracy, precision, recall)``. The three
        metrics are measured on ``X_train`` itself, i.e. on the same rows the
        tree was fitted to, so they say nothing about generalisation.
    """
    decision_tree = DecisionTreeClassifier()
    decision_tree.fit(X_train, y_train)

    # Score the model against the data it was just trained on.
    y_pred_train = decision_tree.predict(X_train)

    accuracy = accuracy_score(y_train, y_pred_train)
    precision = precision_score(y_train, y_pred_train, average='binary')
    recall = recall_score(y_train, y_pred_train, average='binary')

    print(f"Training Accuracy: {accuracy}")
    print(f"Training Precision: {precision}")
    print(f"Training Recall: {recall}")

    # NOTE(review): protocol=2 is kept as-is; presumably chosen so older
    # readers can load the file -- confirm before raising it.
    with open(model_path, 'wb') as model_file:
        pickle.dump(decision_tree, model_file, protocol=2)

    return decision_tree, accuracy, precision, recall

def main():
    """Read "flights_train.txt" and train the decision tree on its rows.

    The file is parsed as tab-separated values. The "Features" column is
    turned into feature rows and the "delay" column into binary labels
    (1 when the delay is greater than 0, otherwise 0).
    """
    train_data = pd.read_csv("flights_train.txt", sep='\t')

    # NOTE(review): eval() executes whatever expression each "Features" cell
    # contains. That is only acceptable if the input file is fully trusted;
    # if the cells are plain Python literals, ast.literal_eval is the safe
    # replacement -- confirm the cell format before switching.
    X_train = train_data["Features"].apply(eval).tolist()

    # Vectorised comparison instead of a per-row Python lambda.
    y_train = (train_data["delay"] > 0).astype(int).tolist()

    # The function prints its metrics and saves the model itself, so the
    # returned values are not needed here.
    train_decision_tree(X_train, y_train)

if __name__ == "__main__":
    main()


Training Accuracy: 0.9999951604551108
Training Precision: 1.0
Training Recall: 0.9999923837377569


# Reducer3.py

In [4]:
#!/usr/bin/env python3

import sys
import pickle
import pandas as pd
from sklearn.metrics import precision_score, recall_score, accuracy_score

def test_decision_tree(X_test, y_test, model_path='decision_tree_model.pkl'):
    """Load a pickled model, evaluate it on the test set, and print metrics.

    Args:
        X_test: Feature rows handed to the loaded model's ``predict``.
        y_test: Raw delay values aligned with ``X_test``; each is mapped to
            1 when greater than 0 and to 0 otherwise before scoring.
        model_path: Path of the pickled model to load. Defaults to
            ``'decision_tree_model.pkl'``, the path used before this
            parameter existed.

    Returns:
        A tuple ``(test_accuracy, test_precision, test_recall)``. Earlier
        versions returned nothing; callers that ignore the result are
        unaffected.
    """
    # NOTE(review): pickle.load can run arbitrary code embedded in the file;
    # only point model_path at a model file you produced yourself.
    with open(model_path, 'rb') as model_file:
        decision_tree = pickle.load(model_file)

    # Convert the raw delay values to binary labels (1 = delayed).
    y_test_binary = [1 if x > 0 else 0 for x in y_test]

    y_pred_test = decision_tree.predict(X_test)

    test_accuracy = accuracy_score(y_test_binary, y_pred_test)
    test_precision = precision_score(y_test_binary, y_pred_test)
    test_recall = recall_score(y_test_binary, y_pred_test)

    print(f"Test Accuracy: {test_accuracy}")
    print(f"Test Precision: {test_precision}")
    print(f"Test Recall: {test_recall}")

    return test_accuracy, test_precision, test_recall

def main():
    """Read "flights_test.txt" and evaluate the saved model on its rows.

    The file is parsed as tab-separated values; the "Features" column
    supplies the feature rows and the "delay" column the raw delay values,
    which are passed on unchanged to test_decision_tree.
    """
    frame = pd.read_csv("flights_test.txt", sep='\t')

    # NOTE(review): eval() executes whatever expression each "Features" cell
    # contains, so the input file must be fully trusted.
    feature_column = frame["Features"]
    feature_rows = feature_column.apply(eval).tolist()

    raw_delays = frame["delay"].tolist()

    test_decision_tree(feature_rows, raw_delays)

if __name__ == "__main__":
    main()


Test Accuracy: 0.6608269774284719
Test Precision: 0.7308975182839133
Test Recall: 0.7323991168894886
