In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the three CSV splits in a single statement; the order of the paths
# matches the order of the names being assigned.
train, validation, test = map(
    pd.read_csv,
    ("../data/train.csv", "../data/valid.csv", "../data/test.csv"),
)

In [3]:
# Impute missing ``label_2`` values with the mean of the same split.
#
# The filled column is assigned back to the frame instead of calling
# ``fillna(..., inplace=True)`` on the ``frame['label_2']`` selection: that
# chained in-place form raises a FutureWarning on pandas 2.2+ and no longer
# updates the parent frame once Copy-on-Write is the default behaviour.
train['label_2'] = train['label_2'].fillna(train['label_2'].mean())
validation['label_2'] = validation['label_2'].fillna(validation['label_2'].mean())

In [4]:
# Cast label_2 to integers in both splits.
# NOTE(review): astype(int) truncates, so a mean-filled entry that is not a
# whole number is cut down to its integer part — confirm that is intended.
train['label_2'] = train['label_2'].astype(int)
validation['label_2'] = validation['label_2'].astype(int)

In [5]:
# Number of training rows per distinct label_2 value.
train["label_2"].value_counts()

label_2
26    4762
27    3326
25    2849
23    2842
31    2385
24    1906
28    1899
30    1894
22    1432
29    1424
33     945
36     481
35     480
34     478
32     476
41     474
61     467
Name: count, dtype: int64

In [6]:
from scipy import stats

def remove_outliers(df, threshold, exclude_cols=("label_1", "label_2", "label_3", "label_4")):
    """Drop rows in which any feature is an outlier by z-score.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified.
    threshold : float
        A row is removed when at least one feature has an absolute z-score
        greater than or equal to this value.
    exclude_cols : str or iterable of str, optional
        Columns left out of the z-score computation; they are still present
        in the returned frame. Defaults to the four label columns. Every
        name must exist in ``df``, otherwise ``KeyError`` is raised.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` that pass the filter, with all original columns
        and their original index labels.
    """
    # A bare string would otherwise be split into single characters.
    if isinstance(exclude_cols, str):
        exclude_cols = [exclude_cols]

    features = df.drop(columns=list(exclude_cols))

    # Nothing to score: no rows or no feature columns left.
    if features.empty:
        return df.copy()

    # Column-wise z-scores on a plain float array.
    z_scores = np.asarray(stats.zscore(features.to_numpy(dtype=float), axis=0))

    # A constant column (or one containing NaN) yields NaN z-scores. NaN
    # compares False against any threshold, so the test is phrased as
    # "is an outlier" and then negated. The earlier "all(|z| < threshold)"
    # form treated NaN as an outlier and therefore dropped every row.
    with np.errstate(invalid="ignore"):
        is_outlier = (np.abs(z_scores) >= threshold).any(axis=1)

    return df.loc[~is_outlier]

# Usage:
# train_df = remove_outliers(train_df, 3)                # label columns are excluded by default
# train_df = remove_outliers(train_df, 3, exclude_cols)  # exclude_cols is a list of column names to exclude


In [7]:
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

In [8]:
def separate_dataset(df, label):
    """Split a labelled frame into a feature frame and one target column.

    The four ``label_*`` columns are removed to form the features; ``label``
    names the column returned as the target.
    """
    label_columns = ["label_1", "label_2", "label_3", "label_4"]
    features = df.drop(columns=label_columns)
    target = df[label]
    return features, target

def train_model(X_train, y_train, C=1.0, kernel='rbf', degree=3, gamma='scale'):
    """Fit a support-vector classifier on the given data and return it.

    ``C``, ``kernel``, ``degree`` and ``gamma`` are forwarded to ``svm.SVC``;
    ``random_state`` is always 42.
    """
    svc_params = dict(C=C, kernel=kernel, degree=degree, gamma=gamma, random_state=42)
    # fit() hands back the estimator itself, so it can be returned directly.
    return svm.SVC(**svc_params).fit(X_train, y_train)

def evaluate_model(model, X_train, X_test, y_train, y_test):
    """Print ``model.score`` for the training data and then for the test data."""
    train_score = model.score(X_train, y_train)
    print(f"Training score: {train_score}")

    test_score = model.score(X_test, y_test)
    print(f"Testing score: {test_score}")

def evaluate_model_detailed(model, X_test, y_test):
    """Print a classification report and a confusion matrix for ``model`` on the given data."""
    predicted = model.predict(X_test)

    # Each section is a heading line followed by the printed metric.
    sections = (
        ("Classification Report:", classification_report),
        ("Confusion Matrix:", confusion_matrix),
    )
    for heading, metric in sections:
        print(heading)
        print(metric(y_test, predicted))

In [9]:
# Preview the first rows of the test split.
test.head()

Unnamed: 0,ID,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,...,feature_759,feature_760,feature_761,feature_762,feature_763,feature_764,feature_765,feature_766,feature_767,feature_768
0,1,0.124623,0.196628,0.257004,-0.156045,-0.054916,0.006071,-0.035149,-0.092019,-0.196302,...,-0.221466,0.140292,0.123622,-0.175572,-0.10703,-0.087621,-0.026501,0.139337,-0.08303,0.059507
1,2,0.109655,0.170158,0.227644,-0.127088,-0.044476,-0.046852,-0.090026,-0.061321,-0.227288,...,-0.20493,0.110203,0.085665,-0.286787,-0.113195,-0.057312,-0.05568,0.143939,-0.04576,0.106113
2,3,0.014854,0.030051,0.115092,-0.017179,0.00272,-0.011692,-0.078855,-0.042991,-0.096283,...,-0.032937,0.075821,0.030987,-0.14985,-0.003155,-0.010207,-0.001427,0.000934,-0.017069,0.048123
3,4,0.196893,0.113314,0.352175,-0.108499,-0.064472,-0.073239,-0.086402,0.008671,-0.342217,...,-0.255167,0.096579,0.069413,-0.215386,-0.075168,-0.035071,-0.023375,0.067768,-0.18153,0.174444
4,5,0.033004,0.013373,0.124001,-0.016143,0.01012,0.010635,-0.055789,-0.036282,-0.059422,...,-0.035814,0.093764,0.027321,-0.116009,0.010096,-0.042293,0.005347,0.007722,-0.007731,0.058799


In [10]:
# Shapes before outlier removal.
print(train.shape, validation.shape)

# Drop every row that has a feature 8 or more standard deviations away from
# its column mean. Each split is filtered against its own statistics, and
# re-running this cell filters the already-filtered frames again.
# NOTE(review): filtering the validation split changes the rows the model is
# scored on — confirm this is intended.
train = remove_outliers(train, 8)
validation = remove_outliers(validation, 8)

# Shapes after outlier removal.
print(train.shape, validation.shape)

(28520, 772) (750, 772)
(28456, 772) (748, 772)


In [11]:
# Feature frames and label_2 targets for the training and validation splits.
X_train, y_train = separate_dataset(train, "label_2")
X_valid, y_valid = separate_dataset(validation, "label_2")
# The test split is not passed through separate_dataset; only its ID column
# is dropped to obtain the feature frame.
# X_test, y_test = separate_dataset(test, "label_2")
test_md = test.drop("ID", axis=1)

In [12]:
from sklearn.preprocessing import StandardScaler

def scale_features(X_train, X_test, X_val):
    """Standardize three feature sets with statistics fitted on the training set.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training features; the scaler is fitted on this frame only.
    X_test, X_val : pandas.DataFrame or array-like
        Feature sets transformed with the scaler fitted on ``X_train``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, test, val)`` scaled frames, in that order. Each carries the
        column names of ``X_train`` and, when its input is a pandas object,
        that input's row index.
    """
    scaler = StandardScaler()
    columns = X_train.columns

    def _as_frame(values, source):
        # Keep the source's row labels so the scaled rows still align with
        # target Series taken from the same frame (for example after
        # outlier filtering has left gaps in the index). Inputs without a
        # pandas index fall back to the default RangeIndex.
        index = source.index if isinstance(source, (pd.DataFrame, pd.Series)) else None
        return pd.DataFrame(values, columns=columns, index=index)

    # Fit on the training data only, then reuse the same statistics for the
    # other two sets.
    X_train_scaled_df = _as_frame(scaler.fit_transform(X_train), X_train)
    X_test_scaled_df = _as_frame(scaler.transform(X_test), X_test)
    X_val_scaled_df = _as_frame(scaler.transform(X_val), X_val)

    return X_train_scaled_df, X_test_scaled_df, X_val_scaled_df


In [13]:
# Standardize all three feature sets with a scaler fitted on X_train.
# Argument and return order is (train, test, validation).
X_train_scaled, X_test_scaled, X_valid_scaled = scale_features(X_train, test_md, X_valid)

In [14]:
from sklearn.decomposition import PCA

# Create a PCA that keeps enough components to retain 97% of the variance;
# whiten=True additionally rescales the projected components.
pca = PCA(n_components=0.97, whiten=True)

# Fit PCA on the (scaled) training set only
X_train_pca = pca.fit_transform(X_train_scaled)

# Apply the fitted transformation to the validation and test sets
X_valid_pca = pca.transform(X_valid_scaled)
X_test_pca = pca.transform(X_test_scaled)

# Report the dimensionality before and after the reduction
print('Original number of features:', X_train.shape[1])
print('Reduced number of features:', X_train_pca.shape[1])


Original number of features: 768
Reduced number of features: 221


In [15]:
# Wrap each PCA output in a DataFrame (default integer row and column labels).
X_train_df, X_test_df, X_valid_df = (
    pd.DataFrame(components)
    for components in (X_train_pca, X_test_pca, X_valid_pca)
)

In [16]:
# Standardize the PCA outputs with a scaler fitted on the PCA-transformed
# training set. This overwrites the X_*_scaled names produced by the earlier
# scaling step.
# NOTE(review): the PCA above uses whiten=True, so this second scaling looks
# redundant — confirm it is needed.
X_train_scaled, X_test_scaled, X_valid_scaled = scale_features(X_train_df, X_test_df, X_valid_df)

In [17]:
# Baseline classifier: RBF-kernel SVC fitted on the scaled PCA features.
model = train_model(X_train_scaled, y_train, degree=3, C=1.0, gamma="auto", kernel="rbf")

In [18]:
# Training and validation scores, then the per-class report and confusion
# matrix on the validation split. The "Testing score" line printed by
# evaluate_model is computed on the validation data passed here.
evaluate_model(model, X_train_scaled, X_valid_scaled, y_train, y_valid)
evaluate_model_detailed(model, X_valid_scaled, y_valid)

Training score: 0.9470410458251335
Testing score: 0.8449197860962567
Classification Report:
              precision    recall  f1-score   support

          22       0.93      0.75      0.83        36
          23       0.89      0.79      0.84        71
          24       0.98      0.87      0.92        46
          25       0.86      0.80      0.83        79
          26       0.72      0.96      0.82       115
          27       0.66      0.91      0.77        80
          28       0.95      0.68      0.79        59
          29       1.00      0.84      0.92        45
          30       0.98      0.85      0.91        48
          31       0.84      0.89      0.87        65
          32       1.00      0.82      0.90        11
          33       0.96      0.87      0.91        30
          34       0.91      0.91      0.91        11
          35       1.00      0.82      0.90        11
          36       1.00      1.00      1.00         8
          41       0.91      0.71      0.80

In [19]:
from sklearn.experimental import enable_halving_search_cv  # explicitly require this experimental feature
from sklearn.model_selection import HalvingRandomSearchCV  # noqa

# Search space for the SVC hyper-parameters: eight values of C spaced
# logarithmically from 1e-4 to 1e3, four kernels, and gamma as either one of
# the built-in heuristics or one of eight log-spaced values. The polynomial
# degree is not searched in this cell.
param_dist = {
    'C': np.logspace(-4, 3, 8),
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'gamma': ['scale', 'auto'] + list(np.logspace(-4, 3, 8)),
}

# Successive-halving random search over the scaled PCA features.
# random_state is fixed on the search itself (not only on the SVC) so that the
# sampled candidates and the per-iteration subsamples, and therefore the
# reported best parameters, are reproducible across runs.
halving_random_search = HalvingRandomSearchCV(
    svm.SVC(random_state=42),
    param_dist,
    cv=3,
    verbose=10,
    n_jobs=-1,
    n_candidates=250,
    random_state=42,
)
halving_random_search.fit(X_train_scaled, y_train)

# Print the best parameters
print(halving_random_search.best_params_)

n_iterations: 6
n_required_iterations: 6
n_possible_iterations: 6
min_resources_: 102
max_resources_: 28456
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 250
n_resources: 102
Fitting 3 folds for each of 250 candidates, totalling 750 fits
----------
iter: 1
n_candidates: 84
n_resources: 306
Fitting 3 folds for each of 84 candidates, totalling 252 fits
----------
iter: 2
n_candidates: 28
n_resources: 918
Fitting 3 folds for each of 28 candidates, totalling 84 fits
----------
iter: 3
n_candidates: 10
n_resources: 2754
Fitting 3 folds for each of 10 candidates, totalling 30 fits
----------
iter: 4
n_candidates: 4
n_resources: 8262
Fitting 3 folds for each of 4 candidates, totalling 12 fits
----------
iter: 5
n_candidates: 2
n_resources: 24786
Fitting 3 folds for each of 2 candidates, totalling 6 fits
{'kernel': 'rbf', 'gamma': 'auto', 'C': 1.0}


In [20]:
# Search space restricted to the polynomial kernel, now including its degree.
# C and gamma cover the same log-spaced ranges as the search over all kernels.
param_dist = {
    'C': np.logspace(-4, 3, 8),
    'kernel': ['poly'],
    'degree': [2, 3, 4, 5],
    'gamma': ['scale', 'auto'] + list(np.logspace(-4, 3, 8)),
}

# Successive-halving random search over the scaled PCA features.
# random_state is fixed on the search itself (not only on the SVC) so that the
# sampled candidates and the per-iteration subsamples, and therefore the
# reported best parameters, are reproducible across runs.
halving_random_search = HalvingRandomSearchCV(
    svm.SVC(random_state=42),
    param_dist,
    cv=3,
    verbose=10,
    n_jobs=-1,
    n_candidates=250,
    random_state=42,
)
halving_random_search.fit(X_train_scaled, y_train)

# Print the best parameters
print(halving_random_search.best_params_)

n_iterations: 6
n_required_iterations: 6
n_possible_iterations: 6
min_resources_: 102
max_resources_: 28456
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 250
n_resources: 102
Fitting 3 folds for each of 250 candidates, totalling 750 fits
----------
iter: 1
n_candidates: 84
n_resources: 306
Fitting 3 folds for each of 84 candidates, totalling 252 fits
----------
iter: 2
n_candidates: 28
n_resources: 918
Fitting 3 folds for each of 28 candidates, totalling 84 fits
----------
iter: 3
n_candidates: 10
n_resources: 2754
Fitting 3 folds for each of 10 candidates, totalling 30 fits
----------
iter: 4
n_candidates: 4
n_resources: 8262
Fitting 3 folds for each of 4 candidates, totalling 12 fits
----------
iter: 5
n_candidates: 2
n_resources: 24786
Fitting 3 folds for each of 2 candidates, totalling 6 fits
{'kernel': 'poly', 'gamma': 1.0, 'degree': 3, 'C': 10.0}


In [23]:
# Polynomial-kernel SVC with hand-picked parameters.
# NOTE(review): gamma=0.1 and degree=2 differ from the parameters printed by
# the poly-kernel search (gamma=1.0, degree=3), and the saved run of this cell
# ended in KeyboardInterrupt — confirm this cell is still wanted.
model_tuned = train_model(X_train_scaled, y_train, kernel="poly", gamma=0.1, degree=2, C=10.0)

KeyboardInterrupt: 

In [24]:
# Scores and per-class report for model_tuned on the validation split.
# NOTE(review): the cell that fits model_tuned was interrupted in the saved
# run, so the model evaluated here may come from an earlier execution —
# re-run from a fresh kernel to confirm these numbers.
evaluate_model(model_tuned, X_train_scaled, X_valid_scaled, y_train, y_valid)
evaluate_model_detailed(model_tuned, X_valid_scaled, y_valid)

Training score: 0.16697054698457223
Testing score: 0.15333333333333332
Classification Report:
              precision    recall  f1-score   support

          22       0.00      0.00      0.00        36
          23       0.00      0.00      0.00        71
          24       0.00      0.00      0.00        46
          25       0.00      0.00      0.00        79
          26       0.15      1.00      0.27       115
          27       0.00      0.00      0.00        81
          28       0.00      0.00      0.00        60
          29       0.00      0.00      0.00        45
          30       0.00      0.00      0.00        48
          31       0.00      0.00      0.00        65
          32       0.00      0.00      0.00        11
          33       0.00      0.00      0.00        30
          34       0.00      0.00      0.00        11
          35       0.00      0.00      0.00        11
          36       0.00      0.00      0.00         8
          41       0.00      0.00      0.

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [21]:
# Refit with the parameters printed by the poly-kernel search
# (kernel='poly', gamma=1.0, degree=3, C=10.0).
model_tuned1 = train_model(X_train_scaled, y_train, kernel="poly", gamma=1.0, degree=3, C=10.0)

In [22]:
# Scores and per-class report for model_tuned1 on the validation split.
# NOTE(review): the saved output shows a training score of 1.0 against a
# lower validation score, which may indicate overfitting — worth checking.
evaluate_model(model_tuned1, X_train_scaled, X_valid_scaled, y_train, y_valid)
evaluate_model_detailed(model_tuned1, X_valid_scaled, y_valid)

Training score: 1.0
Testing score: 0.8609625668449198
Classification Report:
              precision    recall  f1-score   support

          22       1.00      0.89      0.94        36
          23       0.78      0.79      0.78        71
          24       0.95      0.91      0.93        46
          25       0.93      0.81      0.86        79
          26       0.73      0.97      0.83       115
          27       0.80      0.91      0.85        80
          28       0.91      0.66      0.76        59
          29       0.93      0.82      0.87        45
          30       0.98      0.92      0.95        48
          31       0.91      0.92      0.92        65
          32       0.90      0.82      0.86        11
          33       0.96      0.87      0.91        30
          34       0.90      0.82      0.86        11
          35       0.90      0.82      0.86        11
          36       1.00      1.00      1.00         8
          41       0.71      0.71      0.71        14
    