In [1]:
import pandas
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import ParameterGrid, GridSearchCV
from sklearn.metrics import classification_report

In [2]:
train_dir = 'vacatures_train.csv'
test_dir = 'vacatures_test.csv'
train_df = pandas.read_csv(train_dir, header = 0)
test_df = pandas.read_csv(test_dir, header = 0)

In [3]:
#train and test set sizes
print(str(train_df.shape[0]))
print(str(test_df.shape[0]))

24468
5000


In [4]:
#train set size after removing duplicates
train_df.drop_duplicates(subset = ['description'], inplace = True)
print(str(train_df.shape[0]))

24137


In [5]:
#calculate proportions of train set labels
labels = {}

for label in train_df['type']:
    if label in labels:
        labels[label] += 1
    else:
        labels[label] = 1
        
for key, val in labels.items():
    labels[key] = val/24137*100

labels

{'Techniek': 20.570079131623647,
 'Verkoop': 11.633591581389569,
 'Bouw': 4.259021419397605,
 'Dienstverlening': 3.9690102332518538,
 'ICT': 6.450677383270498,
 'Logistiek en transport': 12.669345817624395,
 'Gezondheid': 3.608567759042135,
 'Administratie': 14.53784645979202,
 'Onderhoud': 1.4044827443344243,
 'Juridisch': 0.48473298255789865,
 'Productie': 5.265774537017856,
 'Management': 0.44744583005344496,
 'Aankoop': 1.3879106765546672,
 'Onderzoek en ontwikkeling': 0.47230393172308077,
 'Financieel': 7.652152297302896,
 'Overheid': 0.41015867754899116,
 'Horeca en toerisme': 1.8850727099473836,
 'Onderwijs': 1.168330778472884,
 'Land- en tuinbouw': 0.4391597961635663,
 'Communicatie': 0.7291709823093177,
 'Creatief': 0.40601566060405186,
 'Human resources': 0.149148610017815}

In [6]:
#calculate proportions of test set labels
labels = {}

for label in test_df['type']:
    if label in labels:
        labels[label] += 1
    else:
        labels[label] = 1
        
for key, val in labels.items():
    labels[key] = val/24137*100

labels

{'Dienstverlening': 0.8617475245473754,
 'Techniek': 3.9275800638024614,
 'Gezondheid': 0.6877408128599246,
 'Administratie': 3.5878526743174377,
 'Financieel': 1.615776608526329,
 'Logistiek en transport': 2.3200894891660107,
 'Verkoop': 2.353233624725525,
 'Bouw': 0.7374570161991962,
 'Communicatie': 0.1698636947425115,
 'Productie': 1.006753117620251,
 'Onderwijs': 0.19472179641214732,
 'ICT': 1.2387620665368522,
 'Aankoop': 0.41015867754899116,
 'Juridisch': 0.12843352529311844,
 'Overheid': 0.06628827111902888,
 'Horeca en toerisme': 0.46401789783320213,
 'Management': 0.11600447445830053,
 'Creatief': 0.11186145751336124,
 'Land- en tuinbouw': 0.15329162696275428,
 'Onderhoud': 0.339727389485023,
 'Onderzoek en ontwikkeling': 0.1988648133570866,
 'Human resources': 0.024858101669635826}

In [7]:
X_train = train_df['description']
y_train = train_df['type']
X_test = test_df['description']
y_test = test_df['type']

In [8]:
tfidf = TfidfVectorizer(strip_accents='ascii', analyzer='char', ngram_range=(3, 7), max_df=.5)
svm = LinearSVC(random_state = 0)
model = Pipeline(steps = [("vectorizer", tfidf), ("classifier", svm)])

In [9]:
model.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('vectorizer', TfidfVectorizer(analyzer='char', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.5, max_features=None, min_df=1,
        ngram_range=(3, 7), norm='l2', preprocessor=None, smooth_idf=..., max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=0, tol=0.0001,
     verbose=0))])

In [10]:
y_test_pred = model.predict(X_test)

In [11]:
print(classification_report(y_test, y_test_pred))

                           precision    recall  f1-score   support

                  Aankoop       0.82      0.46      0.59        99
            Administratie       0.70      0.80      0.75       866
                     Bouw       0.77      0.71      0.74       178
             Communicatie       0.76      0.46      0.58        41
                 Creatief       0.82      0.33      0.47        27
          Dienstverlening       0.71      0.65      0.68       208
               Financieel       0.82      0.86      0.84       390
               Gezondheid       0.86      0.82      0.84       166
       Horeca en toerisme       0.83      0.66      0.74       112
          Human resources       0.67      0.33      0.44         6
                      ICT       0.85      0.82      0.83       299
                Juridisch       0.90      0.61      0.73        31
        Land- en tuinbouw       0.78      0.38      0.51        37
   Logistiek en transport       0.77      0.80      0.79     