In [8]:
import pandas as pd
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from joblib import dump, load

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline


In [None]:
# Read the preprocessed dataset; the CSV is decoded as ISO-8859-1.
df = pd.read_csv('../global_preprocessed.csv', encoding='ISO-8859-1')

# Report the dimensions of the loaded frame (shape is a (rows, columns) pair).
row_count, column_count = df.shape
print(f'The DataFrame has {row_count} rows.')
print(f'The DataFrame has {column_count} columns.')

In [10]:
# Target is the 'extended' column; every other column is used as a feature.
y = df['extended']
x = df.drop(columns="extended")

# Shuffled hold-out split with a fixed seed (test size left at the library default).
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    shuffle=True,
    random_state=0,
)

In [11]:


# Hyper-parameter grid explored by the search (RBF kernel only).
param_grid = {
    'C': [0.1, 1],
    'kernel': ['rbf'],
    'gamma': ['auto', 'scale'],
}

# Grid search over SVC; candidates are ranked by macro-averaged precision.
# n_jobs=-1 runs the fits in parallel, verbose=3 logs progress per fit.
model = GridSearchCV(SVC(), param_grid, scoring='precision_macro', n_jobs=-1, verbose=3)

# Resamplers used to balance the training classes. Both use
# sampling_strategy='auto' (no explicit ratio is configured).
# random_state is pinned to the same seed as the train/test split so the
# resampled training set -- and therefore the fitted model -- is reproducible.
over = SMOTE(sampling_strategy='auto', random_state=0)
under = RandomUnderSampler(sampling_strategy='auto', random_state=0)

# NOTE(review): the under-sampler runs before SMOTE, so the classes are
# presumably already balanced when SMOTE runs and it has nothing left to
# synthesise -- confirm this ordering is intended.
steps = [('u', under), ('o', over)]
pipeline = Pipeline(steps=steps)

# Resample the training split only; the test split is left untouched.
# NOTE: this rebinds x_train / y_train, so re-running this cell resamples
# already-resampled data -- re-run the split cell first.
x_train, y_train = pipeline.fit_resample(x_train, y_train)


In [23]:
# Per-class sample counts of the (resampled) training labels, shown as the
# cell output to check the class balance.
df1 = y_train.value_counts()
df1

0    5951
1    5951
Name: extended, dtype: int64

In [13]:
# Run the grid search on the resampled training data and display the winning
# hyper-parameter combination (fit() returns the fitted search object).
model.fit(x_train, y_train).best_params_

Fitting 5 folds for each of 4 candidates, totalling 20 fits


{'C': 1, 'gamma': 'auto', 'kernel': 'rbf'}

In [14]:
# Persist the fitted grid-search object to disk with joblib; the file is
# written relative to the notebook's working directory.
dump(model, "svm_model.joblib")

['svm_model.joblib']