In [31]:
import pandas as pd
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from joblib import dump, load

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline


In [32]:
df = pd.read_csv('global_preprocessed.csv', encoding='ISO-8859-1')

row_count = df.shape[0]
column_count = df.shape[1]
print(f'The DataFrame has {row_count} rows.')
print(f'The DataFrame has {column_count} columns.')

The DataFrame has 171280 rows.
The DataFrame has 26 columns.


In [33]:
x = df.drop("extended", axis=1)
y = df['extended']

x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=0, shuffle=True)

In [34]:


# Define the parameter grid
param_grid = {
    'C': [10],
    'kernel': ['rbf'],
    'gamma': [10],
}

# Create the grid search
model = GridSearchCV(SVC(), param_grid, cv=5, scoring='precision_macro', n_jobs=-1, verbose=3)

over = SMOTE(sampling_strategy=0.05)   # oversample minority class to have 20% the number of majority class
under = RandomUnderSampler(sampling_strategy=0.7) # undersample majority class to be 0.5 the size of minority class
steps = [  ('o', over), ('u', under)]
pipeline = Pipeline(steps=steps)

# transform the dataset
x_train, y_train = pipeline.fit_resample(x_train, y_train)


In [35]:
df1 = y_train[y_train>0]
row_count = y_train.shape[0]
print(f'The DataFrame (extended == 0) has {row_count} rows.')
df0 = y_train[y_train==0]
row_count = df0.shape[0]
print(f'The DataFrame (extended > 0) has {row_count} rows.')

The DataFrame (extended == 0) has 14875 rows.
The DataFrame (extended > 0) has 8750 rows.


In [36]:
model.fit(x_train, y_train)
model.best_params_

Fitting 5 folds for each of 1 candidates, totalling 5 fits


{'C': 10, 'gamma': 10, 'kernel': 'rbf'}

In [37]:
dump(model, "svm_model.joblib")

['svm_model.joblib']