In [10]:
from pathlib import Path

import numpy as np
import pandas as pd
from imblearn.over_sampling import RandomOverSampler
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import SVC
from tqdm import tqdm  # Import tqdm for progress bars

# Data loading and categorical encoding
target = 'Default'

df = pd.read_csv('./data/train.csv')
test_df = pd.read_csv('./data/test.csv')

# Keep the test identifiers for the submission file, then remove the ID
# column from both frames so it is not used as a feature.
ids = test_df['LoanID']
df = df.drop('LoanID', axis=1)
test_df = test_df.drop('LoanID', axis=1)

# Partition the training columns by dtype: object columns are treated as
# categorical, everything else except the target is treated as numerical.
categorical_cols = []
numerical_cols = []
for col, col_dtype in df.dtypes.items():
    if col_dtype == 'object':
        categorical_cols.append(col)
    elif col != target:
        numerical_cols.append(col)

# Integer-encode each categorical column with an encoder fitted on the
# training values and reuse the same mapping for the test frame.
# NOTE(review): LabelEncoder.transform raises on values it was not fitted on,
# so a category that only occurs in test.csv will stop the run here.
for col in categorical_cols:
    le = LabelEncoder()
    le.fit(df[col])
    df[col] = le.transform(df[col])
    test_df[col] = le.transform(test_df[col])

# Split, scale and reduce dimensionality.
#
# The hold-out split is made BEFORE any data-dependent transform is fitted.
# Fitting the scaler on the full labelled frame (as was done previously) lets
# the validation rows influence the scaling statistics, which leaks
# information into the validation score. Here the scaler and PCA are both
# fitted on the training split only and then applied to validation and test.
x_train, x_validate, y_train, y_validate = train_test_split(
    df.drop(columns=[target]),
    df[target],
    test_size=0.2,
    random_state=17,
    stratify=df[target],  # keep the class ratio identical in both splits
)

# Work on explicit copies so the column assignments below do not write
# through to (or warn about) slices of `df`. `df` itself is left unscaled.
x_train = x_train.copy()
x_validate = x_validate.copy()

# Apply standard scaling (statistics come from the training split only)
scaler = StandardScaler()
x_train[numerical_cols] = scaler.fit_transform(x_train[numerical_cols])
x_validate[numerical_cols] = scaler.transform(x_validate[numerical_cols])
test_df[numerical_cols] = scaler.transform(test_df[numerical_cols])

# Oversampling using RandomOverSampler (disabled). If re-enabled it must be
# applied to the training split only, never to validation or test data.
# oversample = RandomOverSampler(sampling_strategy='minority')
# x_train, y_train = oversample.fit_resample(x_train, y_train)

# Apply PCA, keeping as many components as needed to explain 95% of the
# variance of the training split.
pca = PCA(n_components=0.95)
x_train = pca.fit_transform(x_train)
x_validate = pca.transform(x_validate)
# From here on `test_df` holds the PCA-projected array, not a DataFrame.
test_df = pca.transform(test_df)

# RBF-kernel SVM (note: this is NOT a linear SVM; use kernel='linear' or
# LinearSVC if training time matters more than a non-linear boundary).
svm = SVC(random_state=17, kernel='rbf')

# Hyperparameter tuning with RandomizedSearchCV.
# `params` is assumed to contain only explicit lists of candidate values; the
# candidate count below relies on len() of each entry.
params = {"C": [1]}

# Upper bound on the number of sampled settings. RandomizedSearchCV warns and
# silently runs fewer iterations when n_iter exceeds the number of distinct
# candidates, so the budget is capped at the size of the search space.
search_budget = 3
n_candidates = int(np.prod([len(values) for values in params.values()]))
rs = RandomizedSearchCV(
    svm,
    param_distributions=params,
    random_state=17,
    n_jobs=-1,
    n_iter=min(search_budget, n_candidates),
)

# fit() returns the fitted search object itself, not an iterable, so it cannot
# be wrapped in tqdm; pass verbose=... to RandomizedSearchCV for progress logs.
rs = rs.fit(x_train, y_train)

# Get the best model from RandomizedSearchCV
model = rs.best_estimator_

# Predictions on the data the model was fitted on and on the hold-out split
y_train_pred = model.predict(x_train)
y_validate_pred = model.predict(x_validate)

# Evaluation
# NOTE(review): the pipeline refers to class imbalance; if the target is
# skewed, accuracy alone can look good for a majority-class predictor.
# Consider also reporting f1_score (already imported) - confirm label values.
print('Training accuracy: ', accuracy_score(y_train, y_train_pred))
print('Validation accuracy: ', accuracy_score(y_validate, y_validate_pred))

# Predicting on the test set
y_pred = model.predict(test_df)
new_df = pd.DataFrame({'LoanID': ids, 'Default': y_pred})

# Create the output directory if needed so a missing folder does not discard
# the result of the (slow) model fit at the very last step.
submission_path = Path('./csv_submissions/svm.csv')
submission_path.parent.mkdir(parents=True, exist_ok=True)
new_df.to_csv(submission_path, index=False)

print(rs.best_params_)
# Report the kernel of the selected, fitted model rather than of the unfitted
# template estimator that was handed to the search.
print(model.kernel)




Training accuracy:  0.8837236340494795
Validation accuracy:  0.8837135304484042
{'C': 1}
rbf
