# Mapper3.py

In [23]:
#!/usr/bin/env python3

import sys
import pickle
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_score, recall_score, accuracy_score

def train_logistic_regression(X_train, y_train,
                              model_path='logistic_model.pkl',
                              scaler_path='vectorizer.pkl'):
    """Fit a logistic-regression model on standardized features and save it.

    The features are standardized with a freshly fitted ``StandardScaler``,
    a ``LogisticRegression`` is trained on the scaled data, metrics are
    computed on that same training data and printed, and both the model and
    the scaler are pickled to disk.

    Args:
        X_train: Feature rows accepted by ``StandardScaler.fit_transform``
            (``main`` passes a list of per-row feature lists).
        y_train: Labels for each row. Precision and recall are computed with
            scikit-learn's default settings, so binary labels are expected
            (``main`` passes 0/1 values).
        model_path: Destination file for the pickled model.
        scaler_path: Destination file for the pickled scaler. The default
            name says "vectorizer" for historical reasons; the object stored
            is the ``StandardScaler``.

    Returns:
        Tuple ``(logistic, accuracy, precision, recall)`` where the metrics
        are measured on the training data itself.
    """
    # Fit the scaler on the training data; the same fitted scaler must be
    # reused at prediction time, which is why it is persisted below.
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X_train)

    # max_iter is raised so the solver has more iterations to converge.
    logistic = LogisticRegression(max_iter=1000)
    logistic.fit(X_scaled, y_train)

    # These are training-set metrics (no held-out data is used here).
    y_pred_train = logistic.predict(X_scaled)

    accuracy = accuracy_score(y_train, y_pred_train)
    precision = precision_score(y_train, y_pred_train)
    recall = recall_score(y_train, y_pred_train)

    print(f"Training Accuracy: {accuracy}")
    print(f"Training Precision: {precision}")
    print(f"Training Recall: {recall}")

    # Persist the trained model (pickle protocol 2, as in the original).
    with open(model_path, 'wb') as model_file:
        pickle.dump(logistic, model_file, protocol=2)

    # Persist the fitted scaler so evaluation applies the same transform.
    with open(scaler_path, 'wb') as scaler_file:
        pickle.dump(scaler, scaler_file, protocol=2)

    return logistic, accuracy, precision, recall

def main():
    """Load the training set and train the logistic-regression model.

    Reads the tab-separated file ``flights_train.txt``, which must contain a
    ``Features`` column (a Python-literal string per row) and a numeric
    ``delay`` column, then hands the parsed data to
    ``train_logistic_regression``.
    """
    train_data = pd.read_csv("flights_train.txt", sep='\t')

    # NOTE(security): eval() executes whatever expression is stored in the
    # "Features" column. Only run this on files you produced yourself; if the
    # column always holds plain list literals, ast.literal_eval is the safe
    # replacement.
    X_train = train_data["Features"].apply(eval).tolist()

    # Binarize the label: 1 when delay > 0, otherwise 0 (missing values
    # compare as not-greater and therefore become 0).
    y_train = (train_data["delay"] > 0).astype(int).tolist()

    # The return value (model and training metrics) is not needed here; the
    # metrics are printed and the model is saved by the callee.
    train_logistic_regression(X_train, y_train)

    
# Run the training entry point only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()


Training Accuracy: 0.6475117479952185
Training Precision: 0.6561920588188147
Training Recall: 0.935330317293485


# Reducer.py

In [24]:
#!/usr/bin/env python3

import sys
import pickle
import pandas as pd
from sklearn.metrics import precision_score, recall_score, accuracy_score

def test_logistic_regression(X_test, y_test,
                             model_path='logistic_model.pkl',
                             scaler_path='vectorizer.pkl'):
    """Evaluate the pickled logistic-regression model on a test set.

    Loads the model and the fitted scaler from disk, applies the scaler to
    the test features, binarizes the raw delay values, and prints accuracy,
    precision and recall of the model's predictions.

    Args:
        X_test: Feature rows accepted by the stored scaler's ``transform``
            (``main`` passes a list of per-row feature lists).
        y_test: Raw delay values; each is mapped to 1 when greater than 0
            and to 0 otherwise before scoring.
        model_path: File holding the pickled model.
        scaler_path: File holding the pickled scaler. The default name says
            "vectorizer" for historical reasons; the object loaded is used
            as a feature scaler.

    Returns:
        Tuple ``(test_accuracy, test_precision, test_recall)``.
    """
    # NOTE(security): pickle.load can execute arbitrary code embedded in the
    # file. Only load pickles that were written by the training script.
    with open(model_path, 'rb') as model_file:
        logistic = pickle.load(model_file)

    with open(scaler_path, 'rb') as scaler_file:
        scaler = pickle.load(scaler_file)

    # Apply the transform fitted at training time (no re-fitting here).
    X_scaled = scaler.transform(X_test)

    # Convert raw delays to binary labels (0 for not delayed, 1 for delayed).
    y_test_binary = [1 if x > 0 else 0 for x in y_test]

    y_pred_test = logistic.predict(X_scaled)

    test_accuracy = accuracy_score(y_test_binary, y_pred_test)
    test_precision = precision_score(y_test_binary, y_pred_test)
    test_recall = recall_score(y_test_binary, y_pred_test)

    print(f"Test Accuracy: {test_accuracy}")
    print(f"Test Precision: {test_precision}")
    print(f"Test Recall: {test_recall}")

    # Return the metrics as well as printing them, so callers can use the
    # numbers programmatically.
    return test_accuracy, test_precision, test_recall

def main():
    """Load the test set and evaluate the saved model on it.

    Reads the tab-separated file ``flights_test.txt`` and passes its parsed
    ``Features`` column and raw ``delay`` column to
    ``test_logistic_regression``.
    """
    frame = pd.read_csv("flights_test.txt", sep='\t')

    # NOTE(security): eval() runs whatever expression each "Features" cell
    # contains; only use this on files you produced yourself.
    parsed_features = frame["Features"].apply(eval)
    feature_rows = parsed_features.tolist()

    # Raw delay values; binarization happens inside the evaluation function.
    delay_values = frame["delay"].tolist()

    test_logistic_regression(feature_rows, delay_values)

# Run the evaluation entry point only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()


Test Accuracy: 0.6465600681404623
Test Precision: 0.6537159991432855
Test Recall: 0.9359131607997057
