In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler


In [None]:
keep_data = pd.read_csv(r"C:\Users\mjcanudo\Documents\Nova\ML\Project2\datasets\keep_data.csv", index_col='Cust_ID')

In [None]:
keep_data

In [None]:
from sklearn.svm import SVC

In [None]:
X = keep_data.drop(columns="Buy_product").copy()
y = keep_data["Buy_product"].copy()

In [None]:
from sklearn.decomposition import PCA


In [None]:
scaler = MinMaxScaler()
scaled = scaler.fit_transform(X)

In [None]:
df_scaled = pd.DataFrame(scaled, columns=X.columns, index=X.index)

In [None]:
pca_model = PCA(n_components=2)
df_reduced = pca_model.fit_transform(scaled)
plt.scatter(df_reduced[:,0], df_reduced[:,1])

In [None]:
df_x = pd.DataFrame(df_reduced)

In [None]:
svm_linear_model = SVC(kernel='linear', C=100)
svm_linear_model.fit(df_x, y)

Plot the maximum margin separating hyperplane within a two-class separable dataset using a Support Vector Machine classifier with linear kernel.

Source: https://scikit-learn.org/stable/auto_examples/svm/plot_separating_hyperplane.html

In [None]:
def plot_svm_boundary(model,X,y):
    
    X = X.values
    y = y.values
    
    # Scatter Plot
    plt.scatter(X[:, 0], X[:, 1], c=y, s=30,cmap='seismic')

    
    # plot the decision function
    ax = plt.gca()
    xlim = ax.get_xlim()
    ylim = ax.get_ylim()

    # create grid to evaluate model
    xx = np.linspace(xlim[0], xlim[1], 30)
    yy = np.linspace(ylim[0], ylim[1], 30)
    YY, XX = np.meshgrid(yy, xx)
    xy = np.vstack([XX.ravel(), YY.ravel()]).T
    Z = model.decision_function(xy).reshape(XX.shape)

    # plot decision boundary and margins
    ax.contour(XX, YY, Z, colors='k', levels=[-1, 0, 1], alpha=0.5,
               linestyles=['--', '-', '--'])
    # plot support vectors
    ax.scatter(model.support_vectors_[:, 0], model.support_vectors_[:, 1], s=100,
               linewidth=1, facecolors='none', edgecolors='k')
    plt.show()

Since the data is not linearly separable, the linear kernel is not the appropriate option.

We will try non-linear kernels on SVC like rbf and poly

In [None]:
plot_svm_boundary(svm_linear_model, X=df_x, y=y)

In [None]:
# Gamma auto = 1/n of features
svm_rbf_model = SVC(kernel='rbf', C=0.001, gamma=5)
svm_rbf_model.fit(df_x, y)

LAB 2

In [None]:
from sklearn.metrics import f1_score
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.metrics import precision_recall_curve

################################### MODEL SELECTION & OPTIMIZATION ########################################
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

In [None]:
def select_best_models(data, model):
    skf = StratifiedKFold(n_splits = 5, random_state = 99, shuffle = True)
    X = data.drop('Buy_product', axis = 1)
    y = data['Buy_product'].copy()

    score_train, score_val = [],[]

    # perform the cross-validation
    for train_index, val_index in skf.split(X,y):
        X_train, X_val = X.iloc[train_index], X.iloc[val_index]
        y_train, y_val = y.iloc[train_index], y.iloc[val_index]

        # Scale the data
        scaler = MinMaxScaler().fit(X_train)
        X_train = scaler.transform(X_train)
        X_val = scaler.transform(X_val)

        # Apply model
        model.fit(X_train, y_train)
        predictions_train = model.predict(X_train)
        predictions_val = model.predict(X_val)
        score_train.append(f1_score(y_train, predictions_train))
        score_val.append(f1_score(y_val, predictions_val))

    avg_train = round(np.mean(score_train),3)
    avg_val = round(np.mean(score_val),3)
    std_train = round(np.std(score_train),2)
    std_val = round(np.std(score_val),2)

    return avg_train, std_train, avg_val, std_val

In [None]:
def show_results(df, data, *args):
    count = 0
    # for each instance of model passed as argument
    for arg in args:
        avg_train, std_train, avg_val, std_val = select_best_models(data, arg)
        # store the results in the right row
        df.iloc[count] = str(avg_train) + '+/-' + str(std_train), str(avg_val) + '+/-' + str(std_val)
        count+=1
    return df

In [122]:
svm_rbf_model = SVC(kernel='rbf')
svm_rbf_model_tunned = SVC(kernel='rbf', C=1, gamma=5)
svm_poly_model = SVC(kernel='poly')

In [123]:
df_all = pd.DataFrame(columns = ['Train','Validation'], index = ['RBF'])

In [None]:
show_results(df_all, keep_data, svm_rbf_model, svm_poly_model)

In [124]:
show_results(df_all, all_data, svm_rbf_model_tunned)

Unnamed: 0,Train,Validation
RBF,0.949+/-0.0,0.914+/-0.0


In [None]:
def point_plot(train, validation, values_try):
    sns.pointplot(x=values_try, y=train, color = 'teal', label = 'Train')
    sns.pointplot(x=values_try, y=validation, color = 'goldenrod', label = 'Validation')
    plt.legend()

Documentation:

When training an SVM with the Radial Basis Function (RBF) kernel, two parameters must be considered: C and gamma. The parameter C, common to all SVM kernels, trades off misclassification of training examples against simplicity of the decision surface. A low C makes the decision surface smooth, while a high C aims at classifying all training examples correctly. gamma defines how much influence a single training example has. The larger gamma is, the closer other examples must be to be affected.

Proper choice of C and gamma is critical to the SVM’s performance. One is advised to use GridSearchCV with C and gamma spaced exponentially far apart to choose good values.

In [None]:
param_grid_svm_rbf = {'C': [0.1, 1, 10, 100, 1000],  
              'gamma': [7, 5, 3, 1, 0.1, 0.01, 0.001, 0.0001], 
              'kernel': ['rbf']}

grid_search = GridSearchCV(SVC(),
                        param_grid_svm_rbf,
                        scoring = 'f1',
                        return_train_score = True,
                        cv = 5,
                        verbose=1)

In [None]:
grid_search.fit(df_scaled, y)

# Print the best hyperparameters and corresponding score
print("Best Hyperparameters: ", grid_search.best_params_)
print("Best Score: ", grid_search.best_score_)

In [None]:
all_data = pd.read_csv(r"C:\Users\mjcanudo\Documents\Nova\ML\Project2\datasets\all_data.csv", index_col='Cust_ID')

In [None]:
all_data_X = all_data.drop(columns="Buy_product").copy()
all_data_y = all_data["Buy_product"].copy()

In [None]:
scaler = MinMaxScaler()
scaled = scaler.fit_transform(all_data_X)

In [None]:
all_data_X_scaled = pd.DataFrame(scaled, columns=all_data_X.columns, index=all_data_X.index)

In [128]:
param_grid_svm_rbf = {'C': [0.1, 0.3, 0.5, 1, 10, 100, 1000],  
              'gamma': [7, 6, 5, 4, 3, 1, 0.1, 0.01], 
              'kernel': ['rbf']}

grid_search = GridSearchCV(SVC(),
                        param_grid_svm_rbf,
                        scoring = 'f1',
                        return_train_score = True,
                        cv = 5,
                        verbose=1)

In [129]:
grid_search.fit(all_data_X_scaled, all_data_y)

# Print the best hyperparameters and corresponding score
print("Best Hyperparameters: ", grid_search.best_params_)
print("Best Score: ", grid_search.best_score_)

Fitting 5 folds for each of 56 candidates, totalling 280 fits
Best Hyperparameters:  {'C': 1, 'gamma': 5, 'kernel': 'rbf'}
Best Score:  0.9148676589466327


In [135]:
svm_rbf_model_tunned = SVC(kernel='rbf', C=0.3, gamma=5)

show_results(df_all, all_data, svm_rbf_model_tunned)

Unnamed: 0,Train,Validation
RBF,0.928+/-0.0,0.91+/-0.0
