# Necessary Imports

In [1]:
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
import pandas as pd

# Loading the cora dataset and extracting the features and labels

In [2]:
def load_data(path='downloads/cora/cora.content'):
    """
    Load the Cora features and class labels from a ``.content`` file.

    Every non-blank line is split on whitespace. The first token is skipped,
    the last token is kept as the class label, and every token in between is
    parsed as an integer feature value.

    Args:
    path (str): Location of the ``.content`` file. Defaults to the path the
        notebook originally hard-coded.

    Returns:
    X (numpy array): Features representing word attributes, one row per
        non-blank line.
    y (numpy array): Class labels (strings), aligned with the rows of X.

    Raises:
    ValueError: If a feature token cannot be parsed as an integer.
    """
    rows = []
    with open(path, 'r') as file:
        # Iterate the handle directly instead of materialising every line
        # with readlines() first.
        for line in file:
            tokens = line.split()
            # A blank (e.g. trailing) line has no tokens; indexing it with
            # [-1] below would raise IndexError, so it is skipped here.
            if tokens:
                rows.append(tokens)
    X = np.array([[int(token) for token in row[1:-1]] for row in rows])
    y = np.array([row[-1] for row in rows])
    return X, y

# Loading the citation information in the form of a graph

In [3]:
def load_citation_graph(path='downloads/cora/cora.cites'):
    """
    Load the citation graph from a ``.cites`` file.

    Every non-blank line is split on whitespace and kept as a list of string
    tokens. Presumably each line is a pair of paper ids; which one is the
    citing paper is not established here -- confirm against the dataset
    README.

    Args:
    path (str): Location of the ``.cites`` file. Defaults to the path the
        notebook originally hard-coded.

    Returns:
    citation_graph (list): One list of string tokens per non-blank line, in
        file order.
    """
    with open(path, 'r') as file:
        tokenized = (line.split() for line in file)
        # Drop blank lines so the result never contains empty [] entries.
        citation_graph = [tokens for tokens in tokenized if tokens]
    return citation_graph

# Splitting the dataset into 10 folds

In [4]:
def split_dataset(X, y, n_splits=10, shuffle=True, random_state=42):
    """
    Split the dataset into train and test sets using Stratified K-Fold cross-validation.

    Args:
    X (numpy array): Features representing word attributes.
    y (numpy array): Class labels.
    n_splits (int): Number of folds. Defaults to 10.
    shuffle (bool): Whether to shuffle samples before splitting. Defaults to True.
    random_state (int): Seed used when ``shuffle`` is True. Defaults to 42.

    Returns:
    folds: Generator object yielding train and test indices for each fold.
        Being a generator, it can only be iterated once.
    """
    # StratifiedKFold rejects a seed when shuffling is disabled, so only
    # forward it when it is actually used.
    seed = random_state if shuffle else None
    skf = StratifiedKFold(n_splits=n_splits, shuffle=shuffle, random_state=seed)
    return skf.split(X, y)

# A Naive Bayes classifier, a common model for text-based classification, is used

In [5]:
def train_and_predict(X_train, y_train, X_test):
    """
    Fit a Multinomial Naive Bayes model and predict labels for the test set.

    The classifier is created with its default settings, fitted on the
    training data, and then applied to the test features.

    Args:
    X_train (numpy array): Features of the training set.
    y_train (numpy array): Labels of the training set.
    X_test (numpy array): Features of the test set.

    Returns:
    predictions (numpy array): Predicted class labels for the test set.
    """
    model = MultinomialNB()
    # fit() returns the fitted estimator, so the calls can be chained.
    return model.fit(X_train, y_train).predict(X_test)

# Saving the predictions in tab-separated values (TSV) format, as given in the challenge

In [6]:
def save_predictions(predictions, test_indices, y_true, output_path='predictions.tsv'):
    """
    Save the predictions to a TSV file.

    The file has a header row and three tab-separated columns:
    ``paper_id`` (taken from ``test_indices``), ``class_label`` (taken from
    ``predictions``) and ``true_label`` (taken from ``y_true``). An existing
    file at ``output_path`` is overwritten, so calling this once per fold
    with the same path keeps only the last fold.

    Args:
    predictions (numpy array): Predicted class labels.
    test_indices (numpy array): Indices of the test set samples.
    y_true (numpy array): True class labels of the test set.
    output_path (str): Destination file. Defaults to 'predictions.tsv'.
    """
    predictions_df = pd.DataFrame({'paper_id': test_indices, 'class_label': predictions, 'true_label': y_true})
    predictions_df.to_csv(output_path, sep='\t', index=False)


# Evaluating the predictions by accuracy score

In [7]:
def evaluate_accuracy(y_true, predictions):
    """
    Score the predictions against the true labels.

    Thin wrapper that delegates to scikit-learn's ``accuracy_score``.

    Args:
    y_true (numpy array): True class labels.
    predictions (numpy array): Predicted class labels.

    Returns:
    accuracy (float): Accuracy of the predictions.
    """
    return accuracy_score(y_true, predictions)

# Main script to carry out the training and evaluation process

In [8]:
if __name__ == "__main__":
    # Load data
    X, y = load_data()
    # NOTE: the citation graph is loaded but not used by the word-attribute
    # Naive Bayes model trained below.
    citation_graph = load_citation_graph()

    accuracies = []

    # Per-fold results are collected here and written once after the loop.
    # Saving inside the loop would overwrite predictions.tsv on every fold,
    # leaving only the last fold's predictions on disk.
    all_test_indices = []
    all_predictions = []
    all_true_labels = []

    # Split dataset
    folds = split_dataset(X, y)

    # Perform cross-validation
    for fold, (train_indices, test_indices) in enumerate(folds):
        print(f"Fold {fold+1}")

        # Split data into train and test sets
        X_train, X_test = X[train_indices], X[test_indices]
        y_train, y_test = y[train_indices], y[test_indices]

        # Train model and make predictions
        predictions = train_and_predict(X_train, y_train, X_test)

        # Keep this fold's out-of-fold predictions for the final file
        all_test_indices.append(test_indices)
        all_predictions.append(predictions)
        all_true_labels.append(y_test)

        # Evaluate accuracy
        accuracy = evaluate_accuracy(y_test, predictions)
        accuracies.append(accuracy)
        print(f"Accuracy: {accuracy}")

    # Save the predictions of every fold in a single file, ordered by row
    # index. The saved ids are row positions in X, not the paper ids from
    # the first column of cora.content (load_data does not return those).
    merged_indices = np.concatenate(all_test_indices)
    row_order = np.argsort(merged_indices)
    save_predictions(
        np.concatenate(all_predictions)[row_order],
        merged_indices[row_order],
        np.concatenate(all_true_labels)[row_order],
    )

    # Overall performance
    mean_accuracy = np.mean(accuracies)
    print(f"Mean Accuracy across all folds: {mean_accuracy}")

Fold 1
Accuracy: 0.7675276752767528
Fold 2
Accuracy: 0.7822878228782287
Fold 3
Accuracy: 0.7749077490774908
Fold 4
Accuracy: 0.8044280442804428
Fold 5
Accuracy: 0.7749077490774908
Fold 6
Accuracy: 0.7933579335793358
Fold 7
Accuracy: 0.7527675276752768
Fold 8
Accuracy: 0.7675276752767528
Fold 9
Accuracy: 0.7666666666666667
Fold 10
Accuracy: 0.7481481481481481
Mean Accuracy across all folds: 0.7732526991936586
