# Mapper3.py

In [3]:
#!/usr/bin/env python3

import sys
import pickle
import pandas as pd
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import precision_score, recall_score, accuracy_score

def train_naive_bayes(X_train, y_train):
    """Fit a bag-of-words Multinomial Naive Bayes model and persist it.

    Every entry of ``X_train`` is an iterable of feature values.  The values
    are stringified and joined with single spaces into one document per
    sample; a ``CountVectorizer`` then turns the documents into token counts
    on which the classifier is fitted.

    Accuracy, precision and recall are computed on the very same data the
    model was fitted on (training metrics, not a held-out estimate) and are
    printed to standard output.

    Side effects: writes ``naive_bayes_model.pkl`` and ``vectorizer.pkl``
    into the current working directory using pickle protocol 2.

    Args:
        X_train: Sequence of per-sample feature iterables.
        y_train: Labels aligned with ``X_train``; precision and recall are
            computed with ``average='binary'``.

    Returns:
        Tuple ``(model, accuracy, precision, recall)``.
    """
    # One whitespace-separated document per sample.
    documents = []
    for row in X_train:
        documents.append(' '.join(str(value) for value in row))

    # Learn the vocabulary and build the count matrix in a single pass.
    count_vectorizer = CountVectorizer()
    counts = count_vectorizer.fit_transform(documents)

    classifier = MultinomialNB()
    classifier.fit(counts, y_train)

    # Score the fitted model against the data it has just seen.
    predicted = classifier.predict(counts)
    accuracy = accuracy_score(y_train, predicted)
    precision = precision_score(y_train, predicted, average='binary')
    recall = recall_score(y_train, predicted, average='binary')

    report = (
        ("Accuracy", accuracy),
        ("Precision", precision),
        ("Recall", recall),
    )
    for metric_name, metric_value in report:
        print(f"Training {metric_name}: {metric_value}")

    # Persist the model first, then the vectorizer needed to reuse it.
    artifacts = (
        ('naive_bayes_model.pkl', classifier),
        ('vectorizer.pkl', count_vectorizer),
    )
    for filename, artifact in artifacts:
        with open(filename, 'wb') as handle:
            pickle.dump(artifact, handle, protocol=2)

    return classifier, accuracy, precision, recall

def main():
    """Train the Naive Bayes classifier from ``flights_train.txt``.

    The file is read as tab-separated text.  Each cell of its ``Features``
    column is parsed as a Python literal (presumably a list of feature
    values -- confirm against the producer of the file), and each value of
    its ``delay`` column is turned into a binary label: 1 when the delay is
    greater than 0, otherwise 0.

    Training, metric printing and pickling of the artifacts all happen inside
    ``train_naive_bayes``.
    """
    # Imported locally so this function carries its own dependency.
    import ast

    train_data = pd.read_csv("flights_train.txt", sep='\t')

    # SECURITY: this used to call eval() on every cell, which would execute
    # arbitrary code embedded in the data file.  ast.literal_eval accepts
    # only Python literals (lists, tuples, numbers, strings, ...) and raises
    # ValueError/SyntaxError on anything else.
    X_train = train_data["Features"].apply(ast.literal_eval).tolist()
    y_train = train_data["delay"].apply(lambda x: 1 if x > 0 else 0).tolist()

    # The returned model and metrics are not used here: the metrics are
    # printed and the model is saved by train_naive_bayes itself.
    train_naive_bayes(X_train, y_train)

# Run the training step only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()


Training Accuracy: 0.6482521983632659
Training Precision: 0.6709419655876349
Training Recall: 0.8761214946153026


# Reducer3.py

In [4]:
#!/usr/bin/env python3

import sys
import pickle
import pandas as pd
from sklearn.metrics import precision_score, recall_score, accuracy_score

def test_naive_bayes(X_test, y_test):
    """Score the pickled Naive Bayes model on a test set and print metrics.

    Loads ``naive_bayes_model.pkl`` and ``vectorizer.pkl`` from the current
    working directory, converts every entry of ``X_test`` into a
    space-joined document, vectorizes the documents with the loaded
    vectorizer and compares the predictions with the binarized ``y_test``.

    Args:
        X_test: Sequence of per-sample feature iterables.
        y_test: Raw delay values aligned with ``X_test``; a value greater
            than 0 is treated as class 1 (delayed), anything else as class 0.

    Returns:
        None.  Accuracy, precision and recall are printed to standard output.
    """
    # NOTE: unpickling can execute arbitrary code -- only load artifact files
    # that come from a trusted training run.
    with open('naive_bayes_model.pkl', 'rb') as handle:
        classifier = pickle.load(handle)
    with open('vectorizer.pkl', 'rb') as handle:
        count_vectorizer = pickle.load(handle)

    # One whitespace-separated document per sample.
    documents = []
    for row in X_test:
        documents.append(' '.join(str(value) for value in row))

    # transform(), not fit_transform(): reuse the vocabulary that was loaded.
    counts = count_vectorizer.transform(documents)

    # Binarize the raw delays: 1 = delayed, 0 = not delayed.
    expected = []
    for delay in y_test:
        expected.append(1 if delay > 0 else 0)

    predicted = classifier.predict(counts)

    report = (
        ("Accuracy", accuracy_score(expected, predicted)),
        ("Precision", precision_score(expected, predicted)),
        ("Recall", recall_score(expected, predicted)),
    )
    for metric_name, metric_value in report:
        print(f"Test {metric_name}: {metric_value}")

def main():
    """Evaluate the saved Naive Bayes classifier on ``flights_test.txt``.

    The file is read as tab-separated text.  Each cell of its ``Features``
    column is parsed as a Python literal (presumably a list of feature
    values -- confirm against the producer of the file).  The ``delay``
    column is passed on unchanged; ``test_naive_bayes`` converts it to
    binary labels and prints the resulting metrics.
    """
    # Imported locally so this function carries its own dependency.
    import ast

    test_data = pd.read_csv("flights_test.txt", sep='\t')

    # SECURITY: this used to call eval() on every cell, which would execute
    # arbitrary code embedded in the data file.  ast.literal_eval accepts
    # only Python literals (lists, tuples, numbers, strings, ...) and raises
    # ValueError/SyntaxError on anything else.
    X_test = test_data["Features"].apply(ast.literal_eval).tolist()
    y_test = test_data["delay"].tolist()

    test_naive_bayes(X_test, y_test)

# Run the evaluation step only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()


Test Accuracy: 0.6435595648302296
Test Precision: 0.6658955485453908
Test Recall: 0.8738194529620998
