In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
train_ds = pd.read_csv('train_set.csv')
print(f"Length train set: {len(train_ds)}")
test_ds = pd.read_csv('test_set.csv')
print(f"Length test set: {len(test_ds)}")

In [None]:
train_ds.groupby(['Label']).count()

In [None]:
test_ds.groupby(['Label']).count()

In [None]:
import sys
!conda install --yes --prefix {sys.prefix} numpy

In [None]:
!python -m spacy download en_core_web_md

In [None]:
len(train_ds['Job_offer'])

In [None]:

#Found some Deutsch job offers. Removed since they are outliers
stop_words_de = sw.words('german')
#Hard removing them
remove_jobs = [12, 32, 569, 834, 893, 1256, 1261]
for r in remove_jobs:
    train_ds = train_ds.drop(r)

In [122]:
import re
import nltk
from nltk.corpus import stopwords as sw
import spacy
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet

#Download if needed
#nltk.download('stopwords')

stop_words = sw.words('italian')
stop_words_eng = sw.words('english')


commonverbs = list(pd.read_csv('commonverbs.csv',header=None).values[0])
frequent = list(pd.read_csv('frequentwords.csv',header=None).values[0])
# frequent = []

include_words = ['bene','bel','buon','r&d']
for i in include_words:
    if i in stop_words:
        stop_words.remove(i)


stop_words= stop_words+stop_words_eng+stop_words_de+frequent+commonverbs
stop_words = set(stop_words)
include_words = set(include_words)

#it_core_news_sm, md, lg
lemmatizer = spacy.load("it_core_news_lg", disable=[])#"tokenizer","tagger", "parser", "ner", "textcat"
def CleanSentence(sentence,sw=stop_words):
    sentence = sentence.lower()
    #Remove all non-alphanumeric character, excluding '&' (like: R&D)
    sentence = re.sub("[^\w&-]+|_-|\d+", " ", sentence)
#     sentence = re.sub("[^\w&-]+|_-", " ", sentence)
    
    lemmas = lemmatizer(sentence)
    newSentence = ""
    removed_c = []
    min = 3
    max = 16
    for lemma in lemmas:
        word = lemma.lemma_
        if word not in stop_words:
            if  (min <= len(word) <= max or word in include_words):
                newSentence = newSentence + word + " "
            else:
                removed_c.append(word)
    if(False):
        if(len(removed_c)>0):
            print(removed_c)
            print("--- ---- ----")
    return newSentence

def CleanText(text):
    sentences = []
    for row in text:
        sentences.append((CleanSentence(row)))
    
    return sentences

In [123]:
train_ds['clean'] = CleanText(train_ds['Job_offer'])
test_ds['clean'] = CleanText(test_ds['Job_offer'])

In [None]:
ind = 123

In [None]:
train_ds['Job_offer'][ind]

In [None]:
train_ds['clean'][ind]

In [None]:
train_ds['clean'][ind]

In [124]:
counter_text = []
words_text = {}
ds = train_ds['clean']
for s in ds:
    counter_text.append(len(s.split()))
    for w in s.split():
        if w not in words_text:
            words_text[w] = 1
        words_text[w] = words_text[w] + 1
print(f'Different words in training: {len(words_text)}')

Different words in training: 5576


In [None]:
print({k: v for k, v in sorted(words_text.items(), key=lambda item: -item[1])})

In [125]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.util import ngrams

#'clean', 'Job_offer'
column = 'clean'
max_feature = len(train_ds[column])
# print(f'#Feature {max_feature}')

def X_tfidf(sentences,max_feature = max_feature):  
    tfidf = TfidfVectorizer(min_df=2, max_df=0.999, ngram_range=(1,3),lowercase=True)#,strip_accents='ascii'
    X = tfidf.fit_transform(sentences)
    return X, tfidf

train_vec, vectorizer = X_tfidf(train_ds[column])
# test_vec, vectorizer = X_tfidf(test_ds['clean'])
test_vec = vectorizer.transform(test_ds[column])

# X_train, X_test, y_train, y_test = train_ds['clean'], test_ds['clean'], train_ds['Label'], test_ds['Label']
X_train, y_train, X_test, y_test = train_vec, train_ds['Label'], test_vec, test_ds['Label']
print(f'X_train, y_train, X_test, y_test', len(X_train.todense()), len(y_train), len(X_test.todense()), len(y_test))

X_train, y_train, X_test, y_test 1745 1745 439 439


In [126]:
print(len(vectorizer.get_feature_names()),vectorizer.get_feature_names())

30679 ['abbastanza', 'abbastanza esperienza', 'abbastanza esperienza leader', 'abilità', 'abilità acquisito', 'abilità acquisito sfogare', 'abilità pratico', 'abilità pratico informatico', 'abilità relazionali', 'abilità relazionali comunicativo', 'abilità relazionali differenza', 'abilità trasversale', 'abilità trasversale azienda', 'abituato', 'abitudine', 'abitudine habits', 'abitudine habits persona', 'ac', 'ac technologies', 'ac technologies società', 'academy', 'academy collaborazione', 'academy collaborazione obiettivare', 'academy docenza', 'academy docenza alto', 'academy formazione', 'academy formazione intensivo', 'academy innovativo', 'academy innovativo rivoltare', 'academy inquadramento', 'academy inquadramento contrattuale', 'academy percorrere', 'academy percorrere formativo', 'academy percorrere formazione', 'academy progetto', 'academy progetto formazione', 'academy settimana', 'academy sviluppatori', 'academy sviluppatori firmware', 'academy team', 'academy team intr

## Models

In [127]:
from sklearn.metrics import classification_report, confusion_matrix, f1_score, precision_score, recall_score
def train_model(classifier, X_train, X_test, y_train, y_test, printAll=False):
    
    classifier.fit(X_train, y_train)
    
    y_pred = classifier.predict(X_test)
    print(f"#Precision: {precision_score(y_test, y_pred, average='macro'):.5f}")
    print(f"#Recall: {recall_score(y_test, y_pred, average='macro'):.5f}")
    print(f"#f1 Score: {f1_score(y_test, y_pred, average='macro'):.5f}")
        
    if(printAll):
        print(confusion_matrix(y_test,y_pred))      
        print(classification_report(y_test,y_pred))
    print("#--- --- ---")

In [128]:
from sklearn.svm import LinearSVC
# for c in range(1,20,1):
#     print(c)
#     lsvc = LinearSVC(tol=1e-8,max_iter=10000,random_state=0,C=c/10, penalty='l2')
#     train_model(lsvc, X_train, X_test, y_train, y_test)
#c=.5 best

lsvc = LinearSVC(tol=1e-8,max_iter=10000,random_state=0,C=5/10, penalty='l2')
train_model(lsvc, X_train, X_test, y_train, y_test)
#Precision: 0.82239
#Recall: 0.82697
#f1 Score: 0.82413
#--- --- ---

#Precision: 0.83395
#Recall: 0.83635
#f1 Score: 0.83465
#--- --- ---


In [129]:
from sklearn.linear_model import LogisticRegression

# for c in range(1,100):
#     print(c)
#     lr = LogisticRegression(penalty='l2',max_iter=10**6,C=c/10,random_state=0)
#     train_model(lr, X_train, X_test, y_train, y_test)

c=4 #best
lr = LogisticRegression(penalty='l2',max_iter=10**6,C=c,random_state=10)
train_model(lr, X_train, X_test, y_train, y_test)
#Precision: 0.83902
#Recall: 0.83753
#f1 Score: 0.83775


#Precision: 0.83906
#Recall: 0.83644
#f1 Score: 0.83733
#--- --- ---


In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB

for i in range(10,110,10):
    print(i/100)
    mNB = MultinomialNB(alpha=i/100)
    print(f'Classifier MultinomialNB')
    train_model(mNB, X_train.todense(), X_test.todense(), y_train, y_test)
    bNB = BernoulliNB(alpha=i/100)
    print(f'Classifier BernoulliNB')
    train_model(bNB, X_train.todense(), X_test.todense(), y_train, y_test)
    print("--- ---- ----")
gNB = GaussianNB()
print(f'Classifier GaussianNB')
train_model(gNB, X_train.todense(), X_test.todense(), y_train, y_test)

In [None]:
from sklearn.ensemble import RandomForestClassifier

#'gini', 'entropy'
classifier = RandomForestClassifier(n_estimators=512*1, random_state=0,max_depth=None,criterion='gini')
train_model(classifier, X_train, X_test, y_train, y_test,True)

In [None]:
from sklearn.linear_model import SGDClassifier

#perceptron, hinge, log, squared_hinge
sgd = SGDClassifier(loss='hinge', penalty='l2', validation_fraction=0.1,max_iter=3*10**6,tol=10**-8,random_state=0)
train_model(sgd, X_train, X_test, y_train, y_test)
#Precision: 0.81388
#Recall: 0.81592
#f1 Score: 0.81442
#--- --- ---


In [None]:
from sklearn.ensemble import VotingClassifier

n_estimators = 512
# clf1 = RandomForestClassifier(n_estimators=n_estimators, random_state=0,max_depth=None,criterion='gini')
clf2 = LogisticRegression(penalty='l2',max_iter=10**6,C=4.1,random_state=1)
clf3 = LinearSVC(tol=1e-8,max_iter=10000,random_state=0,C=5/10, penalty='l2')
clf4 = LinearSVC(tol=1e-8,max_iter=10000,random_state=1,C=.5, penalty='l2')
clf5 = LogisticRegression(penalty='l2',max_iter=10**6,C=4.2,random_state=0)
clf51 = LogisticRegression(penalty='l2',max_iter=10**6,C=4.3,random_state=20)
clf52 = LogisticRegression(penalty='l2',max_iter=10**6,C=4,random_state=10)
clf6 = SGDClassifier(loss='hinge', penalty='l2', validation_fraction=0.1,max_iter=3*10**6,tol=10**-8,random_state=0)
#('clf1', clf1), ('clf3', clf3), ('clf4', clf4),('clf6', clf6)
VCclf = VotingClassifier(estimators=[('clf2', clf2), ('clf5', clf5), ('clf51', clf51), ('clf52', clf52)], voting='hard')
train_model(VCclf, X_train, X_test, y_train, y_test)

In [None]:
#Precision: 0.83565
#Recall: 0.83549
#f1 Score: 0.83541
#--- --- ---


In [None]:
from wordcloud import WordCloud
def Word_Cloud(words):
    text = ""
    for i in words[1:-1]:
        if len(i[0].split(' ')) ==1:
             text = text + " " + i[0]
    plt.figure( figsize=(20,10), facecolor='k', frameon=False)
    wordcloud= WordCloud(width=1200, height=600,min_font_size=8, max_font_size=100, max_words=500, background_color="white", contour_width=0,contour_color='white').generate(text)
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.show()

In [None]:
Word_Cloud(list(vectorizer.vocabulary_.items()))

In [130]:
from sklearn.neural_network import MLPClassifier
model = MLPClassifier(hidden_layer_sizes=256,tol = 0.0001, verbose = True, n_iter_no_change = 3, max_iter=500)
train_model(model, X_train, X_test, y_train, y_test)
#Precision: 0.83057
#Recall: 0.82624
#f1 Score: 0.82788
#--- --- ---

Iteration 1, loss = 1.55565437
Iteration 2, loss = 1.31346932
Iteration 3, loss = 1.04250035
Iteration 4, loss = 0.77837028
Iteration 5, loss = 0.56099713
Iteration 6, loss = 0.40498666
Iteration 7, loss = 0.29703851
Iteration 8, loss = 0.22416227
Iteration 9, loss = 0.17396849
Iteration 10, loss = 0.13855545
Iteration 11, loss = 0.11393838
Iteration 12, loss = 0.09564505
Iteration 13, loss = 0.08201337
Iteration 14, loss = 0.07184577
Iteration 15, loss = 0.06361052
Iteration 16, loss = 0.05785536
Iteration 17, loss = 0.05199999
Iteration 18, loss = 0.04795115
Iteration 19, loss = 0.04370332
Iteration 20, loss = 0.04082072
Iteration 21, loss = 0.03827776
Iteration 22, loss = 0.03603010
Iteration 23, loss = 0.03428023
Iteration 24, loss = 0.03252003
Iteration 25, loss = 0.03099525
Iteration 26, loss = 0.02941589
Iteration 27, loss = 0.02872369
Iteration 28, loss = 0.02759595
Iteration 29, loss = 0.02678057
Iteration 30, loss = 0.02589262
Iteration 31, loss = 0.02525512
Iteration 32, los

In [131]:
for i,x in enumerate(train_ds['clean']):
    if(x.find('r&d')>0):
        print(i,x,'\n')
# [12, 32, 569, 834, 1261]

0 openjobmetis spa azienda multinazionale leader settore sanitario software developer risorsa internare team r&d sviluppare modellazione manutenzione software servizo sistemare sanitario interfacciare inoltre area funzionale convolte progettazione sino sviluppare manutenzione requisito laureare ingegneria informatico matematico fisico equipollente esperienza sviluppare java esperienza sviluppare database relazionali conoscenza sql conoscenza soluzione software object oriented design pattern buono conoscenza inglese titolo preferenziale obbligatorio conoscenza rdbms oracle sql linguaggio sql conoscenza javascript esperienza sviluppare middleware  

157 risorsa inserito reparto r&d opportunità sviluppare scheda elettronico embedded basire sistemare operativo diverso linux windows android ios segnale video audio comunicazione rete gestione bus veloce basso disegnare interfaccia grafico algoritmo efficiente campire deep learning computer vision grande attenzione processo sviluppare valoriz

In [None]:
train_ds['Job_offer'][893]

In [116]:
train_ds['Job_offer'][0]

"Openjobmetis SpA ricerca, per importante azienda multinazionale leader nel settore sanitario e ospedaliero, un Software Developer. La risorsa, all'interno del team di R&D, si occuperà dello sviluppo, modellazione e manutenzione dei software e dei servizi per il sistema sanitario. Si interfaccerà inoltre con tutte le aree funzionali convolte dalla progettazione, sino allo sviluppo e manutenzione dei prodotti. Requisiti richiesti: - Laurea in Ingegneria Informatica, Matematica, Fisica o equipollenti; - Esperienza di sviluppo in Java (2-3 anni); - Esperienza di sviluppo su database relazionali e conoscenza SQL - conoscenza su soluzioni software Object Oriented e sui Design Pattern; - Buona conoscenza dell'inglese; Costituiscono titoli preferenziali ma non obbligatori: - Conoscenza di RDBMS Oracle, SQL e linguaggio PL/SQL; - Conoscenza di Javascript; - Esperienza di sviluppo con strumenti di middleware"

In [117]:
train_ds['clean'][0]

'openjobmetis spa azienda multinazionale leader settore sanitario software developer risorsa internare team r&d sviluppare modellazione manutenzione software servizo sistemare sanitario interfacciare inoltre area funzionale convolte progettazione sino sviluppare manutenzione requisito laureare ingegneria informatico matematico fisico equipollente esperienza sviluppare java esperienza sviluppare database relazionali conoscenza sql conoscenza soluzione software object oriented design pattern buono conoscenza inglese titolo preferenziale obbligatorio conoscenza rdbms oracle sql linguaggio sql conoscenza javascript esperienza sviluppare middleware '