In [None]:
import unicodedata

import pandas as pd
from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV, ParameterGrid, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from translate import Translator

In [None]:
# Location of the training CSV; it is expected to provide at least the
# 'description' (text) and 'type' (label) columns used below.
train_dir = 'vacatures_train.csv'

# Fixed seed so the train/dev partition is identical on every
# "Restart Kernel -> Run All".
SPLIT_SEED = 0

# Read the file (first row is the header), then:
#   * drop rows without a description or a label -- a NaN document makes
#     TfidfVectorizer raise, and a NaN label cannot be fitted;
#   * keep only the first occurrence of each description.
# Built as a chain instead of mutating in place, so re-running the cell always
# starts again from the file on disk.
df = (
    pd.read_csv(train_dir, header = 0)
    .dropna(subset = ['description', 'type'])
    .drop_duplicates(subset = ['description'])
)

# Features are the raw description texts, targets are the 'type' labels.
X = df['description']
y = df['type']

# Keep 10% of the rows aside as a development set.
X_train, X_dev, y_train, y_dev = train_test_split(
    X, y, test_size = .1, random_state = SPLIT_SEED
)

In [None]:
# NLTK stop-word lists that are merged into one custom list, in this order.
_stop_word_languages = ('dutch', 'english', 'french', 'german')
_nltk_stop_words = [word
                    for language in _stop_word_languages
                    for word in stopwords.words(language)]

# The vectorizer in this notebook is created with strip_accents='ascii', so
# tokens reach the stop-word filter with their accents already removed
# (NFKD-decomposed, non-ASCII characters dropped). An accented stop word would
# therefore never match. Add the folded spelling of every stop word, using the
# same normalisation, so the filter sees what the tokenizer produces.
_folded_stop_words = {
    unicodedata.normalize('NFKD', word).encode('ascii', 'ignore').decode('ascii')
    for word in _nltk_stop_words
}

# Original entries first (unchanged), then the new folded variants in a
# deterministic order; words that fold to the empty string are skipped.
stop_words = _nltk_stop_words + sorted(
    _folded_stop_words - set(_nltk_stop_words) - {''}
)

In [None]:
# Not included in the model, because of memory issues.
def preprocessor(application):
    """Return the text in Dutch, translating it when it is in another language.

    Parameters
    ----------
    application : str
        Text of a single document.

    Returns
    -------
    The input itself when it is not a non-blank string, when its language
    cannot be detected, or when it is detected as Dutch ('nl'); otherwise the
    result of ``Translator.translate`` with Dutch as the target language.
    """
    # Missing values and blank strings have nothing to detect or translate.
    if not isinstance(application, str) or not application.strip():
        return application

    try:
        language = detect(application)
    except LangDetectException:
        # Raised by langdetect when the text offers no usable features;
        # best effort: hand the text back untouched.
        return application

    if language == 'nl':
        return application

    # Translator assumes English source text unless from_lang is given, so pass
    # the detected language explicitly. Built only when a translation is needed.
    # NOTE(review): langdetect can return region-qualified codes such as
    # 'zh-cn' -- confirm the translation provider accepts them.
    translator = Translator(from_lang=language, to_lang="nl")
    return translator.translate(application)

In [None]:
# TF-IDF features (accents folded to ASCII) feeding a linear SVM.
tfidf = TfidfVectorizer(strip_accents = 'ascii')

svm = LinearSVC(random_state = 0)

model = Pipeline(steps = [('vectorizer', tfidf), ('classifier', svm)])

# Vectorizer settings that are explored for both analyzers.
_shared_grid = {'vectorizer__ngram_range': [(1, 2), (1, 3), (1, 4), (4, 8)],
                'vectorizer__max_df': [1.0, .9]}

# scikit-learn only applies stop_words when analyzer == 'word'. Searching over
# stop_words for the 'char' analyzer would fit each char configuration three
# times with identical results, so the grid is split per analyzer
# (32 candidates instead of 48).
param_grid = [{'vectorizer__analyzer': ['word'],
               'vectorizer__stop_words': [None, 'english', stop_words],
               **_shared_grid},
              {'vectorizer__analyzer': ['char'],
               **_shared_grid}]

search = GridSearchCV(model, param_grid, cv=10, scoring='f1_micro', verbose=10, n_jobs=-1)

# Tune on the training split only, so the development split stays unseen
# during model selection.
search.fit(X_train, y_train)

# Best configuration and its mean cross-validated micro-F1 on the training split.
print(search.best_params_)
print(search.best_score_)

# Micro-F1 of the refitted best model on the held-out development split.
print(search.score(X_dev, y_dev))