In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the three dataset splits. The paths are relative to the notebook's
# working directory, so the notebook must be run from its own folder.
train = pd.read_csv("../data/train.csv")
validation = pd.read_csv("../data/valid.csv")
test = pd.read_csv("../data/test.csv")

In [3]:
# Impute missing label_2 values with the mean of the same split.
# The result is assigned back explicitly: calling fillna(..., inplace=True) on a
# column selection operates on an intermediate object, which pandas warns about
# and which leaves the parent frame unchanged when Copy-on-Write is enabled.
train['label_2'] = train['label_2'].fillna(train['label_2'].mean())
validation['label_2'] = validation['label_2'].fillna(validation['label_2'].mean())

In [4]:
# Cast label_2 to integers (presumably so it can be used as a class label).
# Note: the mean used for imputation in the previous cell is generally fractional,
# and astype(int) discards the fractional part rather than rounding.
train['label_2'] = train['label_2'].astype(int)
validation['label_2'] = validation['label_2'].astype(int)

In [5]:
from scipy import stats

def remove_outliers(df, threshold, exclude_cols=("label_1", "label_2", "label_3", "label_4")):
    """Return the rows of ``df`` whose features are all within ``threshold`` z-scores.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding numeric feature columns plus the columns in ``exclude_cols``.
    threshold : float
        A row is dropped when the absolute z-score of any feature is greater
        than or equal to this value.
    exclude_cols : iterable of str, optional
        Columns left out of the z-score computation (they are still returned).
        Defaults to the four label columns.

    Returns
    -------
    pandas.DataFrame
        The retained rows of ``df`` with all original columns and index labels.

    Raises
    ------
    KeyError
        If a column named in ``exclude_cols`` is not present in ``df``.
    """
    # Only the feature columns take part in the outlier test.
    features = df.drop(columns=list(exclude_cols))
    values = features.to_numpy(dtype=float)

    # Column-wise z-scores; a zero-variance column yields NaN here.
    with np.errstate(divide="ignore", invalid="ignore"):
        z_scores = stats.zscore(values, axis=0)

    # Flag a row when any feature is at or beyond the threshold. NaN z-scores
    # compare False, so an uninformative (constant) column can no longer cause
    # every row to be discarded.
    is_outlier = (np.abs(z_scores) >= threshold).any(axis=1)

    # Apply the mask to the original frame so excluded columns are kept.
    return df[~is_outlier]

# Usage:
# The four label columns are left out of the z-score computation automatically.
# train = remove_outliers(train, 3)


In [6]:
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

In [7]:
def separate_dataset(df, label):
    """Split a labelled frame into its feature matrix and one target column.

    All four label columns are removed from the features; ``label`` names the
    column returned as the target.
    """
    label_columns = ["label_1", "label_2", "label_3", "label_4"]
    features = df.drop(columns=label_columns)
    target = df[label]
    return features, target

def train_model(X_train, y_train, C=1.0, kernel='rbf', degree=3, gamma='scale'):
    """Fit a support vector classifier on the given data and return it.

    The hyperparameters are forwarded to ``svm.SVC`` unchanged; the estimator's
    ``random_state`` is fixed at 42.
    """
    hyperparams = dict(C=C, kernel=kernel, degree=degree, gamma=gamma, random_state=42)
    classifier = svm.SVC(**hyperparams)
    classifier.fit(X_train, y_train)
    return classifier

def evaluate_model(model, X_train, X_test, y_train, y_test):
    """Print ``model.score`` for the training data and then for the held-out data."""
    train_score = model.score(X_train, y_train)
    print(f"Training score: {train_score}")
    test_score = model.score(X_test, y_test)
    print(f"Testing score: {test_score}")

def evaluate_model_detailed(model, X_test, y_test):
    """Print the classification report and confusion matrix for ``model`` on ``X_test``."""
    y_pred = model.predict(X_test)

    # Each section is a heading line followed by the metric computed on the
    # same predictions.
    sections = (
        ("Classification Report:", classification_report),
        ("Confusion Matrix:", confusion_matrix),
    )
    for heading, metric in sections:
        print(heading)
        print(metric(y_test, y_pred))

In [8]:
# Preview the first rows of the test split to check which columns it contains.
test.head()

Unnamed: 0,ID,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,...,feature_759,feature_760,feature_761,feature_762,feature_763,feature_764,feature_765,feature_766,feature_767,feature_768
0,1,0.124623,0.196628,0.257004,-0.156045,-0.054916,0.006071,-0.035149,-0.092019,-0.196302,...,-0.221466,0.140292,0.123622,-0.175572,-0.10703,-0.087621,-0.026501,0.139337,-0.08303,0.059507
1,2,0.109655,0.170158,0.227644,-0.127088,-0.044476,-0.046852,-0.090026,-0.061321,-0.227288,...,-0.20493,0.110203,0.085665,-0.286787,-0.113195,-0.057312,-0.05568,0.143939,-0.04576,0.106113
2,3,0.014854,0.030051,0.115092,-0.017179,0.00272,-0.011692,-0.078855,-0.042991,-0.096283,...,-0.032937,0.075821,0.030987,-0.14985,-0.003155,-0.010207,-0.001427,0.000934,-0.017069,0.048123
3,4,0.196893,0.113314,0.352175,-0.108499,-0.064472,-0.073239,-0.086402,0.008671,-0.342217,...,-0.255167,0.096579,0.069413,-0.215386,-0.075168,-0.035071,-0.023375,0.067768,-0.18153,0.174444
4,5,0.033004,0.013373,0.124001,-0.016143,0.01012,0.010635,-0.055789,-0.036282,-0.059422,...,-0.035814,0.093764,0.027321,-0.116009,0.010096,-0.042293,0.005347,0.007722,-0.007731,0.058799


In [9]:
# label_4 is the target for this notebook; the other label columns are dropped
# from the features by separate_dataset.
X_train, y_train = separate_dataset(df=train, label="label_4")
X_valid, y_valid = separate_dataset(df=validation, label="label_4")
# The test split is not passed through separate_dataset; only its ID column is removed.
test_md = test.drop(columns="ID")

In [10]:
from sklearn.decomposition import PCA

# Create a PCA that keeps enough components to retain 98% of the variance
# (n_components=0.98); whiten=True additionally rescales the projected components.
pca = PCA(n_components=0.98, whiten=True)

# Fit PCA on the training set only, so validation/test data do not influence the projection
X_train_pca = pca.fit_transform(X_train)

# Apply the already-fitted transformation to the validation and test sets
X_valid_pca = pca.transform(X_valid)
X_test_pca = pca.transform(test_md)

# Report how many dimensions the projection removed
print('Original number of features:', X_train.shape[1])
print('Reduced number of features:', X_train_pca.shape[1])


Original number of features: 768
Reduced number of features: 203


In [11]:
from sklearn.preprocessing import StandardScaler

def scale_features(X_train, X_test, X_val):
    """Standardize three feature frames with a scaler fitted on ``X_train`` only.

    Returns the scaled train, test and validation data (in that order) as new
    DataFrames that carry the column names of ``X_train``.
    """
    standardizer = StandardScaler()
    feature_names = X_train.columns

    # Fit on the training data first, then reuse the fitted statistics for the
    # test and validation data.
    scaled_arrays = (
        standardizer.fit_transform(X_train),
        standardizer.transform(X_test),
        standardizer.transform(X_val),
    )

    # Wrap each scaled array back into a DataFrame with the training column names.
    train_frame, test_frame, val_frame = (
        pd.DataFrame(array, columns=feature_names) for array in scaled_arrays
    )
    return train_frame, test_frame, val_frame


In [12]:
# scale_features reads .columns from its first argument, so the PCA arrays are
# wrapped in DataFrames before scaling.
X_train_df, X_test_df, X_valid_df = (
    pd.DataFrame(projected) for projected in (X_train_pca, X_test_pca, X_valid_pca)
)

In [13]:
# Standardize all three splits with statistics from the training split.
X_train_scaled, X_test_scaled, X_valid_scaled = scale_features(
    X_train=X_train_df,
    X_test=X_test_df,
    X_val=X_valid_df,
)

In [14]:
# Baseline: RBF-kernel SVM trained before any hyperparameter search.
model = train_model(
    X_train_scaled,
    y_train,
    C=1.0,
    kernel="rbf",
    degree=3,
    gamma="auto",
)

In [15]:
# Score the baseline on the training and validation splits, then print the
# per-class breakdown for the validation split.
evaluate_model(
    model,
    X_train=X_train_scaled,
    X_test=X_valid_scaled,
    y_train=y_train,
    y_test=y_valid,
)
evaluate_model_detailed(model, X_test=X_valid_scaled, y_test=y_valid)

Training score: 0.9649018232819074
Testing score: 0.9253333333333333
Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.90      0.93        21
           1       1.00      0.82      0.90        11
           2       1.00      0.70      0.83        27
           3       1.00      0.75      0.86         8
           4       1.00      0.47      0.64        15
           5       1.00      0.91      0.95        11
           6       0.91      1.00      0.95       532
           7       1.00      0.78      0.88        32
           8       1.00      0.58      0.73        19
           9       1.00      0.71      0.83        17
          10       1.00      0.90      0.95        10
          11       1.00      0.73      0.84        11
          12       1.00      0.73      0.84        26
          13       1.00      0.80      0.89        10

    accuracy                           0.93       750
   macro avg       0.99      0.77      0.8

In [17]:
from sklearn.experimental import enable_halving_search_cv  # explicitly require this experimental feature
from sklearn.model_selection import HalvingRandomSearchCV  # noqa

# Search space over all four kernels. `degree` is not searched here, so SVC
# keeps its default degree for the 'poly' kernel.
param_dist = {
    'C': np.logspace(-4, 3, 8),
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'gamma': ['scale', 'auto'] + list(np.logspace(-4, 3, 8)),
}

# Successive-halving random search on the PCA-reduced, standardized training data.
# random_state is fixed so that the sampled candidates and the per-iteration
# subsamples are reproducible between runs.
# NOTE(review): PCA and the scaler were fitted on the full training split before
# this cross-validation, so the fold scores are likely slightly optimistic.
halving_random_search = HalvingRandomSearchCV(
    svm.SVC(random_state=42),
    param_dist,
    cv=3,
    verbose=10,
    n_jobs=-1,
    n_candidates=250,
    random_state=42,
)
halving_random_search.fit(X_train_scaled, y_train)

# Print the best parameters
print(halving_random_search.best_params_)

n_iterations: 6
n_required_iterations: 6
n_possible_iterations: 6
min_resources_: 84
max_resources_: 28520
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 250
n_resources: 84
Fitting 3 folds for each of 250 candidates, totalling 750 fits
----------
iter: 1
n_candidates: 84
n_resources: 252
Fitting 3 folds for each of 84 candidates, totalling 252 fits
----------
iter: 2
n_candidates: 28
n_resources: 756
Fitting 3 folds for each of 28 candidates, totalling 84 fits
----------
iter: 3
n_candidates: 10
n_resources: 2268
Fitting 3 folds for each of 10 candidates, totalling 30 fits
----------
iter: 4
n_candidates: 4
n_resources: 6804
Fitting 3 folds for each of 4 candidates, totalling 12 fits
----------
iter: 5
n_candidates: 2
n_resources: 20412
Fitting 3 folds for each of 2 candidates, totalling 6 fits


In [None]:
# Search space restricted to the polynomial kernel, this time including its degree.
param_dist = {
    'C': np.logspace(-4, 3, 8),
    'kernel': ['poly'],
    'degree': [2, 3, 4, 5],
    'gamma': ['scale', 'auto'] + list(np.logspace(-4, 3, 8)),
}

# Successive-halving random search on the PCA-reduced, standardized training data.
# random_state is fixed so that the sampled candidates and the per-iteration
# subsamples are reproducible between runs.
# Note: this rebinds `param_dist` and `halving_random_search` from the previous search.
halving_random_search = HalvingRandomSearchCV(
    svm.SVC(random_state=42),
    param_dist,
    cv=3,
    verbose=10,
    n_jobs=-1,
    n_candidates=250,
    random_state=42,
)
halving_random_search.fit(X_train_scaled, y_train)

# Print the best parameters
print(halving_random_search.best_params_)

n_iterations: 4
n_required_iterations: 6
n_possible_iterations: 4
min_resources_: 360
max_resources_: 28456
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 320
n_resources: 360
Fitting 3 folds for each of 320 candidates, totalling 960 fits
----------
iter: 1
n_candidates: 107
n_resources: 1080
Fitting 3 folds for each of 107 candidates, totalling 321 fits
----------
iter: 2
n_candidates: 36
n_resources: 3240
Fitting 3 folds for each of 36 candidates, totalling 108 fits
----------
iter: 3
n_candidates: 12
n_resources: 9720
Fitting 3 folds for each of 12 candidates, totalling 36 fits
{'kernel': 'poly', 'gamma': 0.01, 'degree': 2, 'C': 10.0}


In [None]:
# RBF candidate with a larger C than the baseline.
# NOTE(review): presumably taken from the all-kernel search, whose best_params_
# output is not shown — confirm. Per scikit-learn's SVC docs, `degree` is only
# used by the 'poly' kernel, so degree=4 should not affect this model.
model_tuned = train_model(
    X_train_scaled,
    y_train,
    C=10.0,
    kernel="rbf",
    degree=4,
    gamma="auto",
)

In [None]:
# Score the tuned RBF model on the training and validation splits, then print
# the per-class breakdown for the validation split.
evaluate_model(
    model_tuned,
    X_train=X_train_scaled,
    X_test=X_valid_scaled,
    y_train=y_train,
    y_test=y_valid,
)
evaluate_model_detailed(model_tuned, X_test=X_valid_scaled, y_test=y_valid)

Training score: 0.9984186111892044
Testing score: 0.9344919786096256
Classification Report:
              precision    recall  f1-score   support

           1       0.76      1.00      0.87        13
           2       0.80      0.89      0.84         9
           3       0.92      1.00      0.96        12
           4       1.00      0.81      0.90        16
           5       1.00      0.83      0.91        18
           6       1.00      0.89      0.94         9
           7       0.94      1.00      0.97        16
           8       0.86      0.86      0.86        14
           9       1.00      0.91      0.95        11
          10       0.89      1.00      0.94         8
          11       1.00      0.95      0.97        19
          12       1.00      1.00      1.00         7
          13       1.00      0.91      0.95        11
          14       1.00      0.87      0.93        15
          15       1.00      0.88      0.93        16
          16       1.00      0.93      0.96

In [None]:
# Polynomial-kernel candidate using the parameters printed by the poly-only search.
model_tuned1 = train_model(
    X_train_scaled,
    y_train,
    C=10.0,
    kernel="poly",
    degree=2,
    gamma=0.01,
)

In [None]:
# Score the polynomial model on the training and validation splits, then print
# the per-class breakdown for the validation split.
evaluate_model(
    model_tuned1,
    X_train=X_train_scaled,
    X_test=X_valid_scaled,
    y_train=y_train,
    y_test=y_valid,
)
evaluate_model_detailed(model_tuned1, X_test=X_valid_scaled, y_test=y_valid)

Training score: 0.9993323025021085
Testing score: 0.9224598930481284
Classification Report:
              precision    recall  f1-score   support

           1       0.75      0.92      0.83        13
           2       0.90      1.00      0.95         9
           3       0.92      0.92      0.92        12
           4       1.00      0.88      0.93        16
           5       0.88      0.78      0.82        18
           6       0.89      0.89      0.89         9
           7       0.76      1.00      0.86        16
           8       0.67      0.86      0.75        14
           9       0.91      0.91      0.91        11
          10       0.80      1.00      0.89         8
          11       0.94      0.89      0.92        19
          12       1.00      1.00      1.00         7
          13       0.90      0.82      0.86        11
          14       0.93      0.93      0.93        15
          15       0.93      0.88      0.90        16
          16       1.00      1.00      1.00