In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import precision_score
X_train = pd.read_csv("data/x_train.txt",header=None,sep=" ")
y_train = pd.read_csv("data/y_train.txt",header=None,sep=" ")
X_train, y_train = np.array(X_train), np.array(y_train).T[0]
def eval_proba(probas,y,n_features, num_target = 1000):
    sorted_probas = np.sort(probas)[::-1]  
    threshold = sorted_probas[num_target - 1]

    y_pred = (probas >= threshold).astype(int)
    gain = 10*precision_score(y, y_pred)*num_target
    return np.round(gain - 200*n_features)

Test kerneli

In [29]:
from sklearn.model_selection import ParameterGrid, cross_val_predict, StratifiedKFold
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

best_gain = -np.inf
best_params = None
history = []
param_grid = {
    'C': [0.1, 1, 5],
    'kernel':['linear', 'poly', 'rbf', 'sigmoid'],
    'gamma': ['auto'],
    'n_features': [1,2]
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_train)
mi = mutual_info_classif(X_scaled, y_train, random_state=42)
top_idx = np.argsort(mi)[::-1][:6]
best_score = 0
for params in ParameterGrid(param_grid):
    clf = SVC(
            C=params['C'],
            kernel=params['kernel'],
            gamma=params['gamma'],
            probability=True,    
            random_state=42
        )
    selected =top_idx[:params['n_features']]
    probas = cross_val_predict(clf, X_scaled[:,selected], y_train, cv=cv, method='predict_proba')[:,1]
    score = eval_proba(probas, y_train,params['n_features'])
    print(f"Params: {params}, score:{score}, features:{selected}")

    history.append({**params,
                    "score": score,
                    "features":selected})
    if score > best_score:
        best_score = score
        best_params = params
        best_features = selected

print(f"Best parameters: {best_params}")
print(f"Best features: {best_features}")
print(f"Max gain: {best_score}")

Params: {'C': 0.1, 'gamma': 'auto', 'kernel': 'linear', 'n_features': 1}, score:7090.0, features:[2]
Params: {'C': 0.1, 'gamma': 'auto', 'kernel': 'linear', 'n_features': 2}, score:6860.0, features:[2 6]
Params: {'C': 0.1, 'gamma': 'auto', 'kernel': 'poly', 'n_features': 1}, score:7081.0, features:[2]
Params: {'C': 0.1, 'gamma': 'auto', 'kernel': 'poly', 'n_features': 2}, score:6810.0, features:[2 6]
Params: {'C': 0.1, 'gamma': 'auto', 'kernel': 'rbf', 'n_features': 1}, score:7120.0, features:[2]
Params: {'C': 0.1, 'gamma': 'auto', 'kernel': 'rbf', 'n_features': 2}, score:7010.0, features:[2 6]
Params: {'C': 0.1, 'gamma': 'auto', 'kernel': 'sigmoid', 'n_features': 1}, score:6950.0, features:[2]
Params: {'C': 0.1, 'gamma': 'auto', 'kernel': 'sigmoid', 'n_features': 2}, score:6570.0, features:[2 6]
Params: {'C': 1, 'gamma': 'auto', 'kernel': 'linear', 'n_features': 1}, score:7090.0, features:[2]
Params: {'C': 1, 'gamma': 'auto', 'kernel': 'linear', 'n_features': 2}, score:6880.0, feature

Konkretny grid search

In [None]:
from sklearn.model_selection import ParameterGrid, cross_val_predict, StratifiedKFold
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

best_gain = -np.inf
best_params = None
history = []
param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf'],
    'gamma': ['scale', 'auto'],
    'n_features': [1, 2, 3, 4, 5, 6]
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_train)
mi = mutual_info_classif(X_scaled, y_train, random_state=42)
top_idx = np.argsort(mi)[::-1][:6]
best_score = 0
for params in ParameterGrid(param_grid):
    clf = SVC(
            C=params['C'],
            kernel=params['kernel'],
            gamma=params['gamma'],
            probability=True,    
            random_state=42
        )
    selected =top_idx[:params['n_features']]
    probas = cross_val_predict(clf, X_scaled[:,selected], y_train, cv=cv, method='predict_proba')[:,1]
    score = eval_proba(probas, y_train,params['n_features'])
    print(f"Params: {params}, score:{score}, features:{selected}")

    history.append({**params,
                    "score": score,
                    "features":selected})
    if score > best_score:
        best_score = score
        best_params = params
        best_features = selected

print(f"Best parameters: {best_params}")
print(f"Best features: {best_features}")
print(f"Max gain: {best_score}")

Params: {'C': 0.1, 'gamma': 'scale', 'kernel': 'linear', 'n_features': 1}, score:7090.0, features:[2]
Params: {'C': 0.1, 'gamma': 'scale', 'kernel': 'linear', 'n_features': 2}, score:6860.0, features:[2 6]
Params: {'C': 0.1, 'gamma': 'scale', 'kernel': 'linear', 'n_features': 3}, score:6630.0, features:[2 6 3]
Params: {'C': 0.1, 'gamma': 'scale', 'kernel': 'linear', 'n_features': 4}, score:6470.0, features:[2 6 3 4]
Params: {'C': 0.1, 'gamma': 'scale', 'kernel': 'linear', 'n_features': 5}, score:6260.0, features:[2 6 3 4 5]
Params: {'C': 0.1, 'gamma': 'scale', 'kernel': 'linear', 'n_features': 6}, score:6100.0, features:[2 6 3 4 5 8]
Params: {'C': 0.1, 'gamma': 'scale', 'kernel': 'rbf', 'n_features': 1}, score:7120.0, features:[2]
Params: {'C': 0.1, 'gamma': 'scale', 'kernel': 'rbf', 'n_features': 2}, score:7020.0, features:[2 6]
Params: {'C': 0.1, 'gamma': 'scale', 'kernel': 'rbf', 'n_features': 3}, score:6850.0, features:[2 6 3]
Params: {'C': 0.1, 'gamma': 'scale', 'kernel': 'rbf', '

Jeszcze raz ale tylko dla innych C

In [18]:
from sklearn.model_selection import ParameterGrid, cross_val_predict, StratifiedKFold
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import StandardScaler

best_gain = -np.inf
best_params = None
history = []
param_grid = {
    'C': [2,3,4,5,6,7,8,9,10,12,13,14,15],
    'gamma': ['scale', 'auto'],
    'n_features': [1]
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_train)
mi = mutual_info_classif(X_scaled, y_train, random_state=42)
top_idx = np.argsort(mi)[::-1][:6]
best_score = 0
for params in ParameterGrid(param_grid):
    clf = SVC(
            C=params['C'],
            kernel='rbf',
            gamma=params['gamma'],
            probability=True,    
            random_state=42
        )
    #faster
    selected =[2]
    probas = cross_val_predict(clf, X_scaled[:,selected], y_train, cv=cv, method='predict_proba')[:,1]
    score = eval_proba(probas, y_train,params['n_features'])
    print(f"Params: {params}, score:{score}, features:{selected}")

    history.append({**params,
                    "score": score,
                    "features":selected})
    if score > best_score:
        best_score = score
        best_params = params
        best_features = selected

print(f"Best parameters: {best_params}")
print(f"Best features: {best_features}")
print(f"Max gain: {best_score}")

Params: {'C': 2, 'gamma': 'scale', 'n_features': 1}, score:7230.0, features:[2]
Params: {'C': 2, 'gamma': 'auto', 'n_features': 1}, score:7230.0, features:[2]
Params: {'C': 3, 'gamma': 'scale', 'n_features': 1}, score:7270.0, features:[2]
Params: {'C': 3, 'gamma': 'auto', 'n_features': 1}, score:7210.0, features:[2]
Params: {'C': 4, 'gamma': 'scale', 'n_features': 1}, score:7250.0, features:[2]
Params: {'C': 4, 'gamma': 'auto', 'n_features': 1}, score:7280.0, features:[2]
Params: {'C': 5, 'gamma': 'scale', 'n_features': 1}, score:7280.0, features:[2]
Params: {'C': 5, 'gamma': 'auto', 'n_features': 1}, score:7250.0, features:[2]
Params: {'C': 6, 'gamma': 'scale', 'n_features': 1}, score:7280.0, features:[2]
Params: {'C': 6, 'gamma': 'auto', 'n_features': 1}, score:7250.0, features:[2]
Params: {'C': 7, 'gamma': 'scale', 'n_features': 1}, score:7260.0, features:[2]
Params: {'C': 7, 'gamma': 'auto', 'n_features': 1}, score:7250.0, features:[2]
Params: {'C': 8, 'gamma': 'scale', 'n_features

Kilka miało taki sam więc zrobi się kilka razy to samo i wyciągnie srednia

In [36]:
from sklearn.model_selection import ParameterGrid, cross_val_predict, StratifiedKFold
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import StandardScaler

history = []
param_grid = {
    'C': [4,5,6],
    'gamma': ['scale', 'auto'],
    'n_features': [1]
}
random_states = np.random.randint(0, 10000, size=25)
print("All unique",len(np.unique(random_states)) == 25)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_train)
mi = mutual_info_classif(X_scaled, y_train, random_state=42)
top_idx = np.argsort(mi)[::-1][:6]
best_score = 0
for params in ParameterGrid(param_grid):
    clf = SVC(
            C=params['C'],
            kernel='rbf',
            gamma=params['gamma'],
            probability=True,    
            random_state=42
        )
    #faster
    selected =[2]
    scores = []
    for rs in random_states:
        cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=rs)
        probas = cross_val_predict(clf, X_scaled[:,selected], y_train, cv = cv, method='predict_proba')[:,1]
        score = eval_proba(probas, y_train,params['n_features'])
        scores.append(score)
    score = np.mean(scores)
    print(f"Params: {params}, avg_score:{score}, min_score{min(scores)}, std = {np.std(scores)}, features:{selected}")

    history.append({**params,
                    "score": score,
                    "features":selected})
    if score > best_score:
        best_score = score
        best_params = params
        best_features = selected

print(f"Best parameters: {best_params}")
print(f"Best features: {best_features}")
print(f"Max gain: {best_score}")

All unique True
Params: {'C': 4, 'gamma': 'scale', 'n_features': 1}, avg_score:7162.0, min_score7040.0, std = 63.30876716537765, features:[2]
Params: {'C': 4, 'gamma': 'auto', 'n_features': 1}, avg_score:7160.0, min_score7060.0, std = 54.99090833947008, features:[2]
Params: {'C': 5, 'gamma': 'scale', 'n_features': 1}, avg_score:7171.6, min_score7060.0, std = 60.77367851298784, features:[2]
Params: {'C': 5, 'gamma': 'auto', 'n_features': 1}, avg_score:7166.0, min_score7070.0, std = 55.78530272392541, features:[2]
Params: {'C': 6, 'gamma': 'scale', 'n_features': 1}, avg_score:7179.6, min_score7050.0, std = 65.93815284037004, features:[2]
Params: {'C': 6, 'gamma': 'auto', 'n_features': 1}, avg_score:7175.6, min_score7040.0, std = 63.754529250869695, features:[2]
Best parameters: {'C': 6, 'gamma': 'scale', 'n_features': 1}
Best features: [2]
Max gain: 7179.6


In [38]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(max_iter=1000)
for rs in random_states:
        cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=rs)
        probas = cross_val_predict(clf, X_scaled[:,[3]], y_train, cv = cv, method='predict_proba')[:,1]
        score = eval_proba(probas, y_train,params['n_features'])
        scores.append(score)
score = np.mean(scores)
print(f"Params: {params}, avg_score:{score}, min_score{min(scores)}, std = {np.std(scores)}, features:{[3]}")

Params: {'C': 6, 'gamma': 'auto', 'n_features': 1}, avg_score:7098.266666666666, min_score6710.0, std = 127.93095359954481, features:[3]
