In [10]:
import pandas as pd
import matplotlib.pyplot as plt
import os
import zipfile
from kaggle.api.kaggle_api_extended import KaggleApi
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, auc, precision_recall_curve
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
import time

In [11]:
api = KaggleApi()
api.authenticate()
api.competition_download_files('digit-recognizer', path='./data')

dataset_path = './data'
dataset_zip = os.path.join(dataset_path, 'digit-recognizer.zip')

with zipfile.ZipFile(dataset_zip, 'r') as zip_ref:
    zip_ref.extractall(dataset_path)

In [12]:
start_time = time.time()

train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')

print("Data loaded in {:.2f} seconds".format(time.time() - start_time))

X = train.drop('label', axis=1)
y = train['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Data split in {:.2f} seconds".format(time.time() - start_time))

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Data scaled in {:.2f} seconds".format(time.time() - start_time))

pca = PCA(n_components=0.95)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

print("PCA transformation done in {:.2f} seconds".format(time.time() - start_time))

param_grid = {
    'C': [1, 10, 100],
    'gamma': [0.1, 1, 10]
}

random_search = RandomizedSearchCV(SVC(kernel='linear', probability=True), param_grid, cv=2, n_iter=2, verbose=2)
random_search.fit(X_train_pca, y_train)

print("Grid search done in {:.2f} seconds".format(time.time() - start_time))

best_model = random_search.best_estimator_

pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=0.95)),
    ('svm', best_model)
])

print("Pipeline created in {:.2f} seconds".format(time.time() - start_time))

Data loaded in 7.98 seconds
Data split in 9.24 seconds
Data scaled in 10.96 seconds
PCA transformation done in 13.04 seconds
Fitting 2 folds for each of 2 candidates, totalling 4 fits
[CV] END ....................................C=100, gamma=10; total time=17.6min
[CV] END ....................................C=100, gamma=10; total time=21.9min
[CV] END ....................................C=10, gamma=0.1; total time= 5.1min
[CV] END ....................................C=10, gamma=0.1; total time= 5.5min
Grid search done in 5369.45 seconds
Pipeline created in 5369.46 seconds


In [13]:
def train_model_svm():
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    y_pred_proba = pipeline.predict_proba(X_test)[:, 1]
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Classification Report:")
    print(classification_report(y_test, y_pred))

    plt.figure(figsize=(10, 8))
    plt.imshow(confusion_matrix(y_test, y_pred), interpolation='nearest', cmap='Blues')
    plt.xlabel('True Labels')
    plt.ylabel('Predicted Labels')
    plt.title('Confusion Matrix')
    plt.colorbar()
    plt.show()

    fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
    auc_value = auc(fpr, tpr)
    plt.figure(figsize=(10, 8))
    plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % auc_value)
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0]) 
    plt.ylim([0.0, 1.05]) 
    plt.xlabel('False Positive Rate') 
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic')
    plt.legend(loc="lower right") 
    plt.show()

    precision, recall, thresholds = precision_recall_curve(y_test, y_pred_proba)
    auc_value = auc(recall, precision)
    plt.figure(figsize=(10, 8))
    plt.plot(recall, precision, color='darkorange', lw=2, label='Precision-Recall curve (area = %0.2f)' % auc_value)
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision-Recall Curve')
    plt.legend(loc="lower right")
    plt.show()

In [14]:
def test_model_svm():
    print("Testing kernel SVM Model...")
    y_pred = pipeline.predict(X_test)
    y_pred_proba = pipeline.predict_proba(X_test)[:, 1]
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Kernel SVM model's Accuracy: {accuracy * 100:.2f}%")

    plt.figure(figsize=(10, 8))
    plt.imshow(confusion_matrix(y_test, y_pred), interpolation='nearest', cmap='Blues')
    plt.xlabel('True Labels')
    plt.ylabel('Predicted Labels')
    plt.title('Confusion Matrix')
    plt.colorbar()
    plt.show()

    fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
    auc_value = auc(fpr, tpr)
    plt.figure(figsize=(10, 8))
    plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % auc_value)
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic')
    plt.legend(loc="lower right")
    plt.show()

    precision, recall, thresholds = precision_recall_curve(y_test, y_pred_proba)
    auc_value = auc(recall, precision)
    plt.figure(figsize=(10, 8))
    plt.plot(recall, precision, color='darkorange', lw=2, label='Precision-Recall curve (area = %0.2f)' % auc_value)
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision-Recall Curve')
    plt.legend(loc="lower right")
    plt.show()

In [19]:
def main():
    train_model_svm()
    print("Best Parameters:", random_search.best_params_)
    print("Best Score:", random_search.best_score_)

if __name__ == "__main__" : 
    main()

In [18]:
def main2():
    test_model_svm()

if __name__ == "__main__" : 
    main2()