In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Read the three tab-separated vector files (no header row) and keep every
# column except the first one, which is not used as a feature.
vec_1, vec_2, vec_3 = (
    pd.read_csv(tsv_path, sep='\t', header=None).iloc[:, 1:]
    for tsv_path in ("corpus/space_vec.tsv", "corpus/elec_vec.tsv", "corpus/med_vec.tsv")
)

# Tag every row with the integer class of the file it came from:
# space -> 0, elec -> 1, med -> 2. These are the classification labels.
for class_label, class_frame in enumerate((vec_1, vec_2, vec_3)):
    class_frame['target'] = class_label

In [7]:
# Hold out 20% of each class on its own, always with the same seed, so the
# split is reproducible and every class keeps the same train/test proportion.
(train_1, test_1), (train_2, test_2), (train_3, test_3) = (
    train_test_split(class_frame, test_size=0.2, random_state=42)
    for class_frame in (vec_1, vec_2, vec_3)
)

# Stack the per-class pieces into one training frame and one testing frame,
# renumbering the rows from zero.
train_set = pd.concat((train_1, train_2, train_3), ignore_index=True)
test_set = pd.concat((test_1, test_2, test_3), ignore_index=True)

# Peel the label column off: y is the 'target' column, X is everything else.
y_train = train_set['target']
X_train = train_set.drop(columns='target')
y_test = test_set['target']
X_test = test_set.drop(columns='target')

In [9]:
from sklearn.metrics import confusion_matrix, multilabel_confusion_matrix

def calculate_metrics(y_true, y_pred):
    """
    Calculate macro-averaged precision, recall and F1 score, plus overall accuracy.

    Per-class counts are read off the confusion matrix. Precision, recall and F1 are
    computed for each class and then averaged with equal weight per class; a class
    whose denominator is zero contributes 0 to that average. Accuracy is the fraction
    of all samples that were classified correctly.

    :param y_true: The ground truth (correct) target values.
    :param y_pred: The estimated targets as returned by a classifier.
    :return: A tuple of (precision, recall, f1_score, accuracy)
    """
    # Rows of the confusion matrix are true classes, columns are predicted classes.
    cm = confusion_matrix(y_true, y_pred)

    TP = np.diag(cm).astype(float)  # Correctly classified samples per class
    FP = cm.sum(axis=0) - TP        # Samples wrongly predicted as each class
    FN = cm.sum(axis=1) - TP        # Samples of each class that were predicted as something else

    def _safe_ratio(numerator, denominator):
        # Element-wise division that yields 0 where the denominator is 0,
        # without triggering NumPy's divide-by-zero / invalid-value warnings.
        return np.divide(
            numerator,
            denominator,
            out=np.zeros_like(numerator, dtype=float),
            where=denominator != 0,
        )

    precision = _safe_ratio(TP, TP + FP)
    recall = _safe_ratio(TP, TP + FN)
    f1 = _safe_ratio(2 * precision * recall, precision + recall)

    # Overall accuracy = correct predictions / all predictions.
    # Averaging the one-vs-rest (TP + TN) / total over k classes would instead give
    # 1 - 2 * error_rate / k, which overstates accuracy whenever k > 2.
    total = cm.sum()
    accuracy = TP.sum() / total if total else 0.0

    return precision.mean(), recall.mean(), f1.mean(), accuracy

In [None]:
from sklearn.svm import SVC
import time
def train_svm_with_kernels(X_train, X_test, y_train, y_test,
                           kernels=('linear', 'poly', 'rbf', 'sigmoid'), max_iter=1000):
    """
    Fit one SVC per kernel and score each on the test set with calculate_metrics.

    :param X_train: Training features.
    :param X_test: Testing features.
    :param y_train: Training labels.
    :param y_test: Testing labels.
    :param kernels: Iterable of SVC kernel names to try; defaults to the four built-in kernels.
    :param max_iter: Value passed to SVC(max_iter=...); defaults to 1000.
    :return: A list with one dict per kernel holding 'kernel', 'precision', 'recall',
             'f1_score', 'accuracy' and 'training_time' (seconds spent in fit).
    """
    results = []

    for kernel in kernels:
        model = SVC(kernel=kernel, max_iter=max_iter)

        # perf_counter is a monotonic, high-resolution clock meant for measuring
        # intervals; time.time() can jump if the system clock is adjusted.
        start_time = time.perf_counter()
        model.fit(X_train, y_train)
        training_time = time.perf_counter() - start_time

        y_pred = model.predict(X_test)
        precision, recall, f1, accuracy = calculate_metrics(y_test, y_pred)

        results.append({
            'kernel': kernel,
            'precision': precision,
            'recall': recall,
            'f1_score': f1,
            'accuracy': accuracy,
            'training_time': training_time
        })
    return results
# Baseline run on the untouched feature vectors: one result row per kernel.
results = train_svm_with_kernels(
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)

# Left as the cell's final expression so the notebook renders it as a table.
pd.DataFrame(data=results)

In [None]:
import numpy as np
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.svm import SVC
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
import time
from sklearn.preprocessing import FunctionTransformer
import pandas as pd

def reduce_dimensions_pca(X_train, X_test, n_components, random_state=42):
    """
    Project both feature sets onto the principal components of the training set.

    The PCA is fitted on X_train only and then applied unchanged to X_test, so no
    information from the test set influences the projection.

    :param X_train: Training features; used to fit the PCA.
    :param X_test: Testing features; only transformed.
    :param n_components: Passed to PCA(n_components=...).
    :param random_state: Seed passed to PCA so that solvers which use randomness
                         give the same projection on every run.
    :return: A tuple of (X_train_pca, X_test_pca)
    """
    pca = PCA(n_components=n_components, random_state=random_state)
    X_train_pca = pca.fit_transform(X_train)
    X_test_pca = pca.transform(X_test)
    return X_train_pca, X_test_pca

def add_features(X_train, X_test):
    """
    Append element-wise log1p and sin copies of every feature to both sets.

    The result has three times as many columns as the input: the original
    features, then their log1p values, then their sin values.

    log1p is undefined for inputs below -1 and produces NaN there; at exactly -1
    it produces -inf, which is converted to NaN here so that the output contains
    only finite values and NaN. Callers are expected to impute those NaNs
    afterwards (run_experiment does so via preprocess_data).

    :param X_train: Training features.
    :param X_test: Testing features.
    :return: A tuple of (X_train_new, X_test_new) as horizontally stacked arrays.
    """
    transformer_log = FunctionTransformer(np.log1p, validate=True)
    transformer_sin = FunctionTransformer(np.sin, validate=True)

    # Fit on the training data first so that transforming the test data is
    # validated against the training feature count.
    X_train_log = transformer_log.fit_transform(X_train)
    X_test_log = transformer_log.transform(X_test)
    X_train_sin = transformer_sin.fit_transform(X_train)
    X_test_sin = transformer_sin.transform(X_test)

    # Treat infinities from log1p(-1) as missing, like the NaNs from inputs < -1.
    X_train_log = np.where(np.isinf(X_train_log), np.nan, X_train_log)
    X_test_log = np.where(np.isinf(X_test_log), np.nan, X_test_log)

    # Combine original and new features
    X_train_new = np.hstack([X_train, X_train_log, X_train_sin])
    X_test_new = np.hstack([X_test, X_test_log, X_test_sin])

    return X_train_new, X_test_new

def preprocess_data(X_train, X_test):
    """
    Fill missing values in both sets with per-column means learned from X_train.

    :param X_train: Training features; the column means are computed from these.
    :param X_test: Testing features; filled using the training means.
    :return: A tuple of (X_train_imputed, X_test_imputed)
    """
    mean_imputer = SimpleImputer(strategy='mean')
    mean_imputer.fit(X_train)
    return mean_imputer.transform(X_train), mean_imputer.transform(X_test)

def train_svm_with_kernels(X_train, X_test, y_train, y_test,
                           kernels=('linear', 'poly', 'rbf', 'sigmoid'),
                           max_iter=1000, average='weighted'):
    """
    Fit one SVC per kernel and score each on the test set with sklearn's metrics.

    :param X_train: Training features.
    :param X_test: Testing features.
    :param y_train: Training labels.
    :param y_test: Testing labels.
    :param kernels: Iterable of SVC kernel names to try; defaults to the four built-in kernels.
    :param max_iter: Value passed to SVC(max_iter=...); defaults to 1000.
    :param average: Averaging mode passed to precision_score, recall_score and
                    f1_score; defaults to 'weighted'. Pass 'macro' to weight every
                    class equally instead of by its number of test samples.
    :return: A list with one dict per kernel holding 'kernel', 'precision', 'recall',
             'f1_score', 'accuracy' and 'training_time' (seconds spent in fit).
    """
    results = []

    for kernel in kernels:
        model = SVC(kernel=kernel, max_iter=max_iter)

        # perf_counter is a monotonic, high-resolution clock meant for measuring
        # intervals; time.time() can jump if the system clock is adjusted.
        start_time = time.perf_counter()
        model.fit(X_train, y_train)
        training_time = time.perf_counter() - start_time

        y_pred = model.predict(X_test)
        precision = precision_score(y_test, y_pred, average=average)
        recall = recall_score(y_test, y_pred, average=average)
        f1 = f1_score(y_test, y_pred, average=average)
        accuracy = accuracy_score(y_test, y_pred)

        results.append({
            'kernel': kernel,
            'precision': precision,
            'recall': recall,
            'f1_score': f1,
            'accuracy': accuracy,
            'training_time': training_time
        })
    return results

def run_experiment(X_train, X_test, y_train, y_test, pca_components):
    """
    Evaluate the SVM kernels on two alternative feature representations.

    First the features are reduced with PCA and the kernels are trained on the
    reduced data. Then the original features are extended with add_features,
    passed through preprocess_data, and the kernels are trained again.

    :param X_train: Training features.
    :param X_test: Testing features.
    :param y_train: Training labels.
    :param y_test: Testing labels.
    :param pca_components: Number of components handed to reduce_dimensions_pca.
    :return: A tuple of (pca_results, feature_results), each a list of per-kernel result dicts.
    """
    # Variant 1: PCA-reduced features.
    reduced_train, reduced_test = reduce_dimensions_pca(X_train, X_test, pca_components)
    pca_results = train_svm_with_kernels(reduced_train, reduced_test, y_train, y_test)

    # Variant 2: original features plus transformed copies, then imputation.
    augmented_train, augmented_test = add_features(X_train, X_test)
    imputed_train, imputed_test = preprocess_data(augmented_train, augmented_test)
    feature_results = train_svm_with_kernels(imputed_train, imputed_test, y_train, y_test)

    return pca_results, feature_results

# Number of principal components kept in the PCA variant of the experiment.
pca_components = 90

# Train and score the SVM kernels on both feature representations.
pca_results, feature_results = run_experiment(
    X_train, X_test, y_train, y_test, pca_components
)

# Print each result set as a table under its own heading.
for heading, result_rows in (
    ("Results with PCA:", pca_results),
    ("Results with new features:", feature_results),
):
    print(heading)
    print(pd.DataFrame(result_rows))