In [23]:
import ps3_implementation
from ps3_implementation import cv, krr

## Load data

In [14]:
import numpy as np
datasets = ['banana', 'diabetis', 'flare-solar', 'image', 'ringnorm']
for d in datasets:
    Xtrain = np.loadtxt('/Users/yanqingluo/Desktop/LabML/git/problem_set3/data/U04_%s-xtrain.dat'%d, unpack=True)
    ytrain = np.loadtxt('/Users/yanqingluo/Desktop/LabML/git/problem_set3/data/U04_%s-ytrain.dat'%d)
    Xtest = np.loadtxt('/Users/yanqingluo/Desktop/LabML/git/problem_set3/data/U04_%s-xtest.dat'%d, unpack=True)
    Ytest = np.loadtxt('/Users/yanqingluo/Desktop/LabML/git/problem_set3/data/U04_%s-ytest.dat'%d)

In [53]:
import numpy as np
import pickle
from sklearn.metrics import zero_one_loss
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.kernel_ridge import KernelRidge

# Define paths to your data
base_path = '/Users/yanqingluo/Desktop/LabML/git/problem_set3/data/'
datasets = ['banana', 'diabetis', 'flare-solar', 'image', 'ringnorm']

# Define kernels and their respective parameter grids for GridSearchCV
param_grids = {
    'linear': {'alpha': [0.01, 0.1, 1.0], 'kernel': ['linear']},
    'poly': {'alpha': [0.01, 0.1, 1.0], 'kernel': ['poly'], 'degree': [2, 3, 4]},
    'rbf': {'alpha': [0.01, 0.1, 1.0], 'kernel': ['rbf'], 'gamma': [0.1, 1.0, 10.0]}
}

results = {}

def cv(X, y, method, params, loss_function, nfolds=10, nrepetitions=5):
    '''Performs cross-validation to find the best model parameters.'''
    best_loss = float('inf') 
    best_params = None
    best_model = None
    kf = KFold(n_splits=nfolds, shuffle=True, random_state=42)
    
    # Iterate through all parameter combinations
    for kernel in params:
        grid_search = GridSearchCV(method(kernel=kernel), params[kernel], cv=kf, scoring='neg_mean_squared_error')
        grid_search.fit(X, y)
        avg_loss = -grid_search.best_score_
        
        if avg_loss < best_loss:
            best_loss = avg_loss
            best_params = grid_search.best_params_
            best_model = grid_search.best_estimator_
    
    return best_model, best_loss

# Iterate over datasets and perform KRR for each
for d in datasets:
    # Load data for current dataset
    X_train = np.loadtxt(base_path + f'U04_{d}-xtrain.dat', unpack=True)
    y_train = np.loadtxt(base_path + f'U04_{d}-ytrain.dat')
    X_test = np.loadtxt(base_path + f'U04_{d}-xtest.dat', unpack=True)
    y_test = np.loadtxt(base_path + f'U04_{d}-ytest.dat')
    
    # Perform cross-validation to find best parameters
    best_cv_model, best_cv_loss = cv(X_train, y_train, KernelRidge, param_grids, loss_function=zero_one_loss, nfolds=10, nrepetitions=5)
    
    # Extract best parameters
    best_params = {
        'alpha': best_cv_model.alpha,
        'kernel': best_cv_model.kernel,
        'kernelparameter': getattr(best_cv_model, 'degree', getattr(best_cv_model, 'gamma', None))
    }
    
    # Predict on test set
    # Predict on test set
    y_pred = best_cv_model.predict(X_test)

    # Convert predictions to binary by thresholding at 0.5
    y_pred_binary = (y_pred > 0.5).astype(int)

    # Calculate zero-one loss on test set, using binary predictions
    test_loss = zero_one_loss(y_test, y_pred_binary)

    # Store results for the current dataset
    results[d] = {
        'cvloss': best_cv_loss,
        'kernel': best_params['kernel'],
        'kernelparameter': best_params['kernelparameter'],
        'regularization': best_params['alpha'],
        'y_pred': y_pred_binary.tolist(),  # Convert numpy array to list for serialization
        'test_loss': test_loss
    }


# Save results dictionary using pickle
with open('results.p', 'wb') as f:
    pickle.dump(results, f)

print("Results saved to results.p")


Results saved to results.p


In [57]:
import pickle

# Load the results from results.p
with open('results.p', 'rb') as f:
    results = pickle.load(f)

datasets = ['banana', 'diabetis', 'flare-solar', 'image', 'ringnorm']
for d in datasets:
    d_result = results[d]
    print(f"Results for {d} Dataset:")
    print(d_result)



Results for banana Dataset:
{'cvloss': 0.32906511641018193, 'kernel': 'rbf', 'kernelparameter': 3, 'regularization': 0.1, 'y_pred': [1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 

In [46]:
from sklearn.metrics import roc_curve
from sklearn.model_selection import KFold
from sklearn.kernel_ridge import KernelRidge
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler


def roc_fun(X, y, model, biases):
    TPRs = []
    FPRs = []
    y_pred_base = model.predict(X)
    
    for bias in biases:
        # 应用阈值将连续预测转换为二进制预测
        y_pred = (y_pred_base + bias > 0.5).astype(int)
        fpr, tpr, _ = roc_curve(y, y_pred)
        TPRs.append(tpr)
        FPRs.append(fpr)
    
    return TPRs, FPRs





def cv_with_roc(X, y, params, nfolds=5):
    kf = KFold(n_splits=nfolds, shuffle=True, random_state=42)
    biases = np.linspace(-2, 2, 20)  # Bias values range from -2 to 2
    all_TPRs = []
    all_FPRs = []
    
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        
        model = KernelRidge(alpha=params['alpha'], kernel=params['kernel'])
        if 'degree' in params:
            model.degree = params['degree']
        if 'gamma' in params:
            model.gamma = params['gamma']
        
        model.fit(X_train, y_train)
        TPRs, FPRs = roc_fun(X_test, y_test, model, biases)
        
        all_TPRs.append(TPRs)
        all_FPRs.append(FPRs)

    return all_TPRs, all_FPRs  # Return aggregated results from all folds



In [48]:
mat = scipy.io.loadmat('/Users/yanqingluo/Desktop/LabML/git/problem_set3/data/qm7.mat')
X = mat['X'].reshape(mat['X'].shape[0], -1)  # Reshape if X is 3D
y = mat['T'][0]  # Assuming 'T' is the correct key for target variable
print(y)
# Define optimal parameters (adjust these as necessary)
optimal_params = {'alpha': 1.0, 'kernel': 'rbf', 'gamma': 0.5}  # Example parameters

# Execute cross-validation with ROC analysis
all_TPRs, all_FPRs = cv_with_roc(X, y, optimal_params)

# Plot the average ROC curves for different biases (example)
biases = np.linspace(-2, 2, 20)
for i in range(len(biases)):
    avg_TPR = np.mean([tpr[i] for tpr in all_TPRs], axis=0)
    avg_FPR = np.mean([fpr[i] for fpr in all_FPRs], axis=0)
    plt.plot(avg_FPR, avg_TPR, label=f'Bias = {biases[i]:.2f}')

plt.title('Average ROC Curves for Different Biases')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()


[ -417.96  -712.42  -564.21 ... -1662.1  -1782.01 -1919.  ]


ValueError: continuous format is not supported