# Mapper3.py

In [4]:
#!/usr/bin/env python3

import sys
import pickle
import pandas as pd
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, accuracy_score
from sklearn.preprocessing import StandardScaler

def train_svm(X_train, y_train):
    # Scale the features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)

    # Create and train Linear Support Vector Machine (SVM) classifier with parallelization
    svm_classifier = LinearSVC(dual=False)  # Set dual=False for better performance with large datasets
    svm_classifier.fit(X_train_scaled, y_train)

    # Evaluate the model
    y_pred_train = svm_classifier.predict(X_train_scaled)

    accuracy = accuracy_score(y_train, y_pred_train)
    precision = precision_score(y_train, y_pred_train, average='binary')
    recall = recall_score(y_train, y_pred_train, average='binary')

    print(f"Training Accuracy: {accuracy}")
    print(f"Training Precision: {precision}")
    print(f"Training Recall: {recall}")

    # Save the trained model as a pickle file
    with open('svm_model.pkl', 'wb') as model_file:
        pickle.dump(svm_classifier, model_file, protocol=2)

    return svm_classifier, accuracy, precision, recall

def main():
    # Specify the number of rows to include for training
    #num_rows_train = 30000  # You can change this number as needed

    # Read training data from "flights_train.txt"
    train_data = pd.read_csv("flights_train.txt", sep='\t')

    # Extract features and labels
    X_train = train_data["Features"].apply(eval).tolist()
    y_train = train_data["delay"].apply(lambda x: 1 if x > 0 else 0).tolist()

    # Train Linear Support Vector Machine (SVM) classifier
    svm_model, training_accuracy, training_precision, training_recall = train_svm(X_train, y_train)

if __name__ == "__main__":
    main()


Training Accuracy: 0.6474246361872129
Training Precision: 0.6543444618740526
Training Recall: 0.9435711130405642


# Reducer3.py

In [5]:
#!/usr/bin/env python3

import sys
import pickle
import pandas as pd
from sklearn.metrics import precision_score, recall_score, accuracy_score
from sklearn.preprocessing import StandardScaler

def test_svm(X_test, y_test):
    # Load the trained model from pickle file
    with open('svm_model.pkl', 'rb') as model_file:
        svm_classifier = pickle.load(model_file)

    # Scale the features
    scaler = StandardScaler()
    X_test_scaled = scaler.fit_transform(X_test)

    # Convert the "delay" column to binary (0 for not delayed, 1 for delayed)
    y_test_binary = [1 if x > 0 else 0 for x in y_test]

    # Evaluate the model on the test set
    y_pred_test = svm_classifier.predict(X_test_scaled)

    test_accuracy = accuracy_score(y_test_binary, y_pred_test)
    test_precision = precision_score(y_test_binary, y_pred_test)
    test_recall = recall_score(y_test_binary, y_pred_test)

    print(f"Test Accuracy: {test_accuracy}")
    print(f"Test Precision: {test_precision}")
    print(f"Test Recall: {test_recall}")

def main():
    # Specify the number of rows to include for testing
    #num_rows_test = 500  # You can change this number as needed

    # Read testing data from "flights_test.txt"
    test_data = pd.read_csv("flights_test.txt", sep='\t')

    # Extract features and labels
    X_test = test_data["Features"].apply(eval).tolist()
    y_test = test_data["delay"].tolist()

    # Test Linear Support Vector Machine (SVM) classifier
    test_svm(X_test, y_test)

if __name__ == "__main__":
    main()


Test Accuracy: 0.6462890549382477
Test Precision: 0.6516625772061935
Test Recall: 0.9446829387955353
