In [None]:
import re

import pandas
from sklearn.feature_extraction.text import TfidfVectorizer, strip_accents_ascii
from sklearn.model_selection import GridSearchCV, ParameterGrid
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

In [None]:
train_dir = 'vacatures_train.csv'

# Read the training CSV (its first row holds the column names) and clean it in
# one chained expression instead of mutating the frame in place, so re-running
# this cell always starts from the file contents.
#  * dropna: rows without a description or a label cannot be used --
#    TfidfVectorizer rejects NaN documents and the classifier cannot be fitted
#    on missing targets.
#  * drop_duplicates: keep only the first row for each distinct description.
df = (
    pandas.read_csv(train_dir, header = 0)
    .dropna(subset = ['description', 'type'])
    .drop_duplicates(subset = ['description'])
)

X = df['description']  # text column used as model input
y = df['type']         # target column to predict

In [None]:
# One "punctuation" character: anything that is neither a word character nor
# whitespace, plus the underscore (which `\w` would otherwise treat as a
# letter). Using `\w` instead of `A-Za-z0-9` keeps accented letters such as
# "é" from being mistaken for punctuation and deleted.
_PUNCT = r'(?:[^\w\s]|_)'

# A run of punctuation is removed when it starts at the beginning of the string
# or at a non-word-boundary, or when it ends at the end of the string or at a
# non-word-boundary. Punctuation enclosed by word characters on both sides
# (e.g. the apostrophe in "don't") matches neither alternative and is kept.
_EDGE_PUNCTUATION = re.compile(
    r'(?:\A|\B)' + _PUNCT + r'+' + r'|' + _PUNCT + r'+(?:\Z|\B)'
)


def preprocessor(application):
    """Strip punctuation from the edges of tokens and of the whole string.

    Removes punctuation at the beginning and/or end of a token and/or of the
    string, as well as tokens that consist of punctuation only. Letters and
    digits (including non-ASCII ones) and all whitespace are left untouched,
    so removing a punctuation-only token leaves its surrounding spaces behind.

    Parameters
    ----------
    application : str
        The text to clean.

    Returns
    -------
    str
        The text with edge punctuation removed.
    """
    return _EDGE_PUNCTUATION.sub('', application)

In [None]:
def normalizing_preprocessor(application):
    """Lowercase and ASCII-fold a text, then strip its edge punctuation.

    scikit-learn skips its built-in preprocessing stage (lowercasing and
    `strip_accents`) whenever a custom `preprocessor` is supplied. Passing the
    bare `preprocessor` to the grid would therefore switch off case folding and
    accent stripping together with enabling punctuation removal. This wrapper
    applies the same normalisation as the default stage first (lowercase, then
    accent folding), so the grid option isolates the effect of punctuation
    removal.

    Parameters
    ----------
    application : str
        The raw text of one document.

    Returns
    -------
    str
        The normalised text with edge punctuation removed.
    """
    return preprocessor(strip_accents_ascii(application.lower()))


# Character n-gram TF-IDF features; accents are folded to ASCII by the
# vectorizer's default preprocessing stage.
tfidf = TfidfVectorizer(strip_accents = 'ascii', analyzer = 'char')

# Fixed seed so repeated runs give the same model.
svm = LinearSVC(random_state = 0)

model = Pipeline(steps = [('vectorizer', tfidf), ('classifier', svm)])

# 3 n-gram ranges x 2 preprocessors x 2 class weightings x 3 values of C
# = 36 candidates, each evaluated with 10-fold cross-validation (360 fits).
param_grid = {'vectorizer__ngram_range': [(2, 6), (3, 7), (4, 8)], 
              'vectorizer__preprocessor': [None, normalizing_preprocessor],
              'classifier__class_weight': [None, 'balanced'], 
              'classifier__C': [1.0, 5.0, 10.0]}

# Candidates are ranked by micro-averaged F1; n_jobs=-1 uses all CPU cores.
search = GridSearchCV(model, param_grid, cv=10, scoring='f1_micro', verbose=10, n_jobs=-1)

search.fit(X, y)

# Best hyper-parameter combination and its mean cross-validated score.
print(search.best_params_)
print(search.best_score_)