In [15]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [16]:
train_ds = pd.read_csv('train_set.csv')
print(f"Length train set: {len(train_ds)}")
test_ds = pd.read_csv('test_set.csv')
print(f"Length test set: {len(test_ds)}")

Length train set: 1752
Length test set: 439


In [None]:
train_ds.groupby(['Label']).count()

In [None]:
test_ds.groupby(['Label']).count()

In [None]:
import sys
!conda install --yes --prefix {sys.prefix} numpy

In [None]:
!python -m spacy download en_core_web_md

In [None]:
len(train_ds['Job_offer'])

In [17]:
import re
import nltk
from nltk.corpus import stopwords as sw
import spacy
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet

#Found some Deutsch job offers. Removed since they are outliers
# stop_words_de = sw.words('german')
#Hard removing them
remove_jobs = [12, 32, 569, 834, 893, 1256, 1261]
for r in remove_jobs:
    train_ds = train_ds.drop(r)

In [31]:
#Download if needed
#nltk.download('stopwords')

stop_words = sw.words('italian')
stop_words_eng = sw.words('english')


commonverbs = list(pd.read_csv('commonverbs.csv',header=None).values[0])
frequent = list(pd.read_csv('frequentwords.csv',header=None).values[0])
# frequent = []

include_words = ['bene','bel','buon','r&d','ar','vr']
for i in include_words:
    if i in stop_words:
        stop_words.remove(i)


stop_words= stop_words+stop_words_eng+frequent+commonverbs
stop_words = set(stop_words)
include_words = set(include_words)

#it_core_news_sm, md, lg
lemmatizer = spacy.load("it_core_news_lg", disable=[])#"tokenizer","tagger", "parser", "ner", "textcat"
def CleanSentence(sentence,sw=stop_words):
    sentence = sentence.lower()
    #Remove all non-alphanumeric character, excluding '&' (like: R&D)
    sentence = re.sub("[^\w&-]+|_|\d+", " ", sentence)
    sentence = re.sub("-", "", sentence)
    
    lemmas = lemmatizer(sentence)
    newSentence = ""
    removed_c = []
    min = 3
    max = 16
    for lemma in lemmas:
        word = lemma.lemma_
        if word not in stop_words:
            if  (min <= len(word) <= max or word in include_words):
                newSentence = newSentence + word + " "
            else:
                removed_c.append(word)
    if(False):
        if(len(removed_c)>0):
            print(removed_c)
            print("--- ---- ----")
    return newSentence

def CleanText(text):
    sentences = []
    for row in text:
        sentences.append((CleanSentence(row)))
    
    return sentences

In [32]:
train_ds['clean'] = CleanText(train_ds['Job_offer'])
test_ds['clean'] = CleanText(test_ds['Job_offer'])

In [None]:
ind = 123

In [None]:
train_ds['Job_offer'][ind]

In [None]:
train_ds['clean'][ind]

In [None]:
train_ds['clean'][ind]

In [33]:
counter_text = []
words_text = {}
ds = train_ds['clean']
for s in ds:
    counter_text.append(len(s.split()))
    for w in s.split():
        if w not in words_text:
            words_text[w] = 1
        words_text[w] = words_text[w] + 1
print(f'Different words in training: {len(words_text)}')

Different words in training: 5576


In [149]:
print({k: v for k, v in sorted(words_text.items(), key=lambda item: item[1])})

{'perito': 2, 'mvvm': 2, 'webdev': 2, 'import': 2, 'export': 2, 'collaborarai': 2, 'contract': 2, 'chiaramente': 2, 'likely': 2, 'includes': 2, 'suddetto': 2, 'prog': 2, 'mid': 2, 'sea': 2, 'elementor': 2, 'wbsadmin': 2, 'seasystem': 2, 't&e': 2, 'paradigms': 2, 'elastic': 2, 'clion': 2, 'srlsi': 2, 'anti': 2, 'predisposto': 2, 'databases': 2, 'cooperjob': 2, 'normalmente': 2, 'cda': 2, 'implementativi': 2, 'rilevazione': 2, 'ventilazione': 2, 'raffrescamento': 2, 'dashboard': 2, 'basandosi': 2, 'predittive': 2, 'rappresentazione': 2, 'modificabilità': 2, 'estensibilità': 2, 'altavilla': 2, 'sottoscrivere': 2, 'spac': 2, 'jave': 2, 'obbiettare': 2, 'young': 2, 'even': 2, 'recent': 2, 'graduato': 2, 'wants': 2, 'challenging': 2, 'lucrare': 2, 'patronato': 2, 'presidente': 2, 'repubblica': 2, 'volontariato': 2, 'essa': 2, 'volontario': 2, 'ausiliario': 2, 'umanitario': 2, 'comitato': 2, 'cicr': 2, 'federazione': 2, 'ficr': 2, 'concorso': 2, 'benchmark': 2, 'pervenire': 2, 'modalita': 2, 

In [39]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.util import ngrams

#'clean', 'Job_offer'
column = 'clean'
max_feature = len(train_ds[column])
# print(f'#Feature {max_feature}')

def X_tfidf(sentences,max_feature = max_feature):  
    tfidf = TfidfVectorizer(min_df=2, max_df=0.2, ngram_range=(1,3),lowercase=True)#,strip_accents='ascii'
    X = tfidf.fit_transform(sentences)
    return X, tfidf

train_vec, vectorizer = X_tfidf(train_ds[column])
# test_vec, vectorizer = X_tfidf(test_ds['clean'])
test_vec = vectorizer.transform(test_ds[column])

# X_train, X_test, y_train, y_test = train_ds['clean'], test_ds['clean'], train_ds['Label'], test_ds['Label']
X_train, y_train, X_test, y_test = train_vec, train_ds['Label'], test_vec, test_ds['Label']
print(f'X_train, y_train, X_test, y_test', len(X_train.todense()), len(y_train), len(X_test.todense()), len(y_test))

X_train, y_train, X_test, y_test 1745 1745 439 439


In [40]:
print(len(vectorizer.get_feature_names()))
print(vectorizer.get_feature_names())

30235
['abbastanza', 'abbastanza esperienza', 'abbastanza esperienza leader', 'abilità', 'abilità acquisito', 'abilità acquisito sfogare', 'abilità pratico', 'abilità pratico informatico', 'abilità relazionali', 'abilità relazionali comunicativo', 'abilità relazionali differenza', 'abilità trasversale', 'abilità trasversale azienda', 'abituato', 'abitudine', 'abitudine habits', 'abitudine habits persona', 'ac', 'ac technologies', 'ac technologies società', 'academy', 'academy collaborazione', 'academy collaborazione obiettivare', 'academy docenza', 'academy docenza alto', 'academy formazione', 'academy formazione intensivo', 'academy innovativo', 'academy innovativo rivoltare', 'academy inquadramento', 'academy inquadramento contrattuale', 'academy intraprendere', 'academy intraprendere percorrere', 'academy percorrere', 'academy percorrere formativo', 'academy percorrere formazione', 'academy progetto', 'academy progetto formazione', 'academy settimana', 'academy sviluppatori', 'acade

## Models

In [35]:
from sklearn.metrics import classification_report, confusion_matrix, f1_score, precision_score, recall_score
def train_model(classifier, X_train, X_test, y_train, y_test, printAll=False):
    
    classifier.fit(X_train, y_train)
    
    y_pred = classifier.predict(X_test)
    print(f"#Precision: {precision_score(y_test, y_pred, average='macro'):.5f}")
    print(f"#Recall: {recall_score(y_test, y_pred, average='macro'):.5f}")
    print(f"#f1 Score: {f1_score(y_test, y_pred, average='macro'):.5f}")
        
    if(printAll):
        print(confusion_matrix(y_test,y_pred))      
        print(classification_report(y_test,y_pred))
    print("#--- --- ---")

In [41]:
from sklearn.svm import LinearSVC
# for c in range(1,20,1):
#     print(c)
#     lsvc = LinearSVC(tol=1e-8,max_iter=10000,random_state=0,C=c/10, penalty='l2')
#     train_model(lsvc, X_train, X_test, y_train, y_test)
#c=.5 best

lsvc = LinearSVC(tol=1e-8,max_iter=10000,random_state=0,C=5/10, penalty='l2')
train_model(lsvc, X_train, X_test, y_train, y_test)
#Precision: 0.83988
#Recall: 0.84042
#f1 Score: 0.83933
#--- --- ---

#Precision: 0.84126
#Recall: 0.83794
#f1 Score: 0.83934
#--- --- ---


In [42]:
from sklearn.linear_model import LogisticRegression

# for c in range(1,100):
#     print(c)
#     lr = LogisticRegression(penalty='l2',max_iter=10**6,C=c/10,random_state=0)
#     train_model(lr, X_train, X_test, y_train, y_test)

c=4 #best
lr = LogisticRegression(penalty='l2',max_iter=10**6,C=c,random_state=10)
train_model(lr, X_train, X_test, y_train, y_test)
#Precision: 0.84848
#Recall: 0.84146
#f1 Score: 0.84435
#--- --- ---

#Precision: 0.84848
#Recall: 0.84146
#f1 Score: 0.84435
#--- --- ---


In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB

for i in range(10,110,10):
    print(i/100)
    mNB = MultinomialNB(alpha=i/100)
    print(f'Classifier MultinomialNB')
    train_model(mNB, X_train.todense(), X_test.todense(), y_train, y_test)
    bNB = BernoulliNB(alpha=i/100)
    print(f'Classifier BernoulliNB')
    train_model(bNB, X_train.todense(), X_test.todense(), y_train, y_test)
    print("--- ---- ----")
gNB = GaussianNB()
print(f'Classifier GaussianNB')
train_model(gNB, X_train.todense(), X_test.todense(), y_train, y_test)

In [None]:
from sklearn.ensemble import RandomForestClassifier

#'gini', 'entropy'
classifier = RandomForestClassifier(n_estimators=512*1, random_state=0,max_depth=None,criterion='gini')
train_model(classifier, X_train, X_test, y_train, y_test,True)

In [None]:
from sklearn.linear_model import SGDClassifier

#perceptron, hinge, log, squared_hinge
sgd = SGDClassifier(loss='hinge', penalty='l2', validation_fraction=0.1,max_iter=3*10**6,tol=10**-8,random_state=0)
train_model(sgd, X_train, X_test, y_train, y_test)
#Precision: 0.81388
#Recall: 0.81592
#f1 Score: 0.81442
#--- --- ---


In [None]:
from sklearn.ensemble import VotingClassifier

n_estimators = 512
# clf1 = RandomForestClassifier(n_estimators=n_estimators, random_state=0,max_depth=None,criterion='gini')
clf2 = LogisticRegression(penalty='l2',max_iter=10**6,C=4.1,random_state=1)
clf3 = LinearSVC(tol=1e-8,max_iter=10000,random_state=0,C=5/10, penalty='l2')
clf4 = LinearSVC(tol=1e-8,max_iter=10000,random_state=1,C=.5, penalty='l2')
clf5 = LogisticRegression(penalty='l2',max_iter=10**6,C=4.2,random_state=0)
clf51 = LogisticRegression(penalty='l2',max_iter=10**6,C=4.3,random_state=20)
clf52 = LogisticRegression(penalty='l2',max_iter=10**6,C=4,random_state=10)
clf6 = SGDClassifier(loss='hinge', penalty='l2', validation_fraction=0.1,max_iter=3*10**6,tol=10**-8,random_state=0)
#('clf1', clf1), ('clf3', clf3), ('clf4', clf4),('clf6', clf6)
VCclf = VotingClassifier(estimators=[('clf2', clf2), ('clf5', clf5), ('clf51', clf51), ('clf52', clf52)], voting='hard')
train_model(VCclf, X_train, X_test, y_train, y_test)

In [None]:
#Precision: 0.83565
#Recall: 0.83549
#f1 Score: 0.83541
#--- --- ---


In [None]:
from wordcloud import WordCloud
def Word_Cloud(words):
    text = ""
    for i in words[1:-1]:
        if len(i[0].split(' ')) ==1:
             text = text + " " + i[0]
    plt.figure( figsize=(20,10), facecolor='k', frameon=False)
    wordcloud= WordCloud(width=1200, height=600,min_font_size=8, max_font_size=100, max_words=500, background_color="white", contour_width=0,contour_color='white').generate(text)
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.show()

In [None]:
Word_Cloud(list(vectorizer.vocabulary_.items()))

In [None]:
from sklearn.neural_network import MLPClassifier
model = MLPClassifier(hidden_layer_sizes=256,tol = 0.0001, verbose = True, n_iter_no_change = 3, max_iter=500)
train_model(model, X_train, X_test, y_train, y_test)
#Precision: 0.83057
#Recall: 0.82624
#f1 Score: 0.82788
#--- --- ---

In [144]:
for i,x in enumerate(train_ds['clean']):
    if(x.find('-trice')>0):
        print(i,x,'\n')
# [12, 32, 569, 834, 1261]

In [None]:
train_ds['Job_offer'][893]

In [None]:
train_ds['Job_offer'][0]

In [None]:
train_ds['clean'][0]