## Libs

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import nltk
from nltk.corpus import stopwords as sw
import spacy
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.util import ngrams
from sklearn.metrics import classification_report, confusion_matrix, f1_score, precision_score, recall_score
import pickle

## Import data

In [None]:
# Load the training split and report its row count.
train_ds = pd.read_csv('train_set.csv')
n_train_rows = len(train_ds)
print(f"Length train set: {n_train_rows}")
# Load the test split and report its row count.
test_ds = pd.read_csv('test_set.csv')
n_test_rows = len(test_ds)
print(f"Length test set: {n_test_rows}")

In [None]:
train_ds.groupby(['Label']).count()

In [None]:
test_ds.groupby(['Label']).count()

## Preprocess

In [None]:
# Some German-language job offers were found in the training data; they are
# treated as outliers and dropped by their row labels.
# stop_words_de = sw.words('german')
remove_jobs = [12, 32, 569, 834, 893, 1256, 1261]
train_ds = train_ds.drop(index=remove_jobs)

In [None]:
# Download if needed
# nltk.download('stopwords')

# Words that are always kept: they are taken out of the Italian stop-word
# list and are exempt from the token-length filter in CleanSentence.
include_words = {'bene', 'bel', 'buon', 'r&d', 'ar', 'vr'}

stop_words_eng = sw.words('english')

# Project-specific stop words: the first row of each CSV file.
commonverbs = list(pd.read_csv('commonverbs.csv', header=None).values[0])
frequent = list(pd.read_csv('frequentwords.csv', header=None).values[0])
# frequent = []
# commonverbs = []

# Italian stop words (minus the protected words) merged with the English,
# frequent-word and common-verb lists.
stop_words = {word for word in sw.words('italian') if word not in include_words}
stop_words.update(stop_words_eng, frequent, commonverbs)

# Available Italian models: it_core_news_sm, md, lg
# Pipeline component names: "tokenizer", "tagger", "parser", "ner", "textcat"
lemmatizer = spacy.load("it_core_news_lg", disable=["tagger", "parser", "ner"])
def CleanSentence(sentence,sw=stop_words):
    """Normalise one text into a string of filtered lemmas.

    The text is lower-cased, stripped of punctuation, underscores and digits,
    lemmatized with the module-level ``lemmatizer``, and filtered.

    Parameters
    ----------
    sentence : str
        Raw text to clean.
    sw : collection of str
        Stop words to discard. Defaults to the module-level ``stop_words``.
        (Previously this argument was accepted but ignored.)

    Returns
    -------
    str
        The kept lemmas, each followed by a single space (so a non-empty
        result ends with a trailing space, as before).
    """
    # Accepted lemma length, inclusive. Words in ``include_words`` bypass it.
    min_len = 3
    max_len = 16

    sentence = sentence.lower()
    # Replace runs of anything that is not a word character, '&' or '-'
    # (so tokens like "r&d" survive), plus underscores and digits, with a space.
    sentence = re.sub(r"[^\w&-]+|_|\d+", " ", sentence)
    # Glue hyphenated words together instead of splitting them.
    sentence = sentence.replace("-", "")

    kept = []
    for token in lemmatizer(sentence):
        word = token.lemma_
        if word in sw:
            continue
        if min_len <= len(word) <= max_len or word in include_words:
            kept.append(word)
    return "".join(word + " " for word in kept)

def CleanText(text):
    """Run CleanSentence over every entry of *text*.

    Returns a list with one cleaned string per input entry, in input order.
    """
    return [CleanSentence(entry) for entry in text]

In [None]:
train_ds['clean'] = CleanText(train_ds['Job_offer'])
test_ds['clean'] = CleanText(test_ds['Job_offer'])

In [None]:
# ind = []
# for i,s in enumerate(train_ds['clean']):
#     l = len(s.split(' '))
#     if(l<=3):
#         ind.append(i)
#         print(i,l,s,train_ds['Label'][i])
# train_ds = train_ds.drop(ind)

In [None]:
#Test preprocessing step
ind = 123

In [None]:
print(train_ds['Job_offer'][ind])

In [None]:
print(train_ds['clean'][ind])

In [None]:
# Check how many different words there are
from collections import Counter

# word -> number of occurrences in the cleaned training texts.
# (The previous hand-rolled counter started every word at 2 instead of 1.)
counter_text = []
words_text_tr = Counter()
ds = train_ds['clean']
for s in ds:
    tokens = s.split()
    counter_text.append(len(tokens))
    words_text_tr.update(tokens)
print(f'Different words in training: {len(words_text_tr)}')

# Same statistics for the cleaned test texts; counter_text and ds end up
# holding the test-set values, as before.
counter_text = []
words_text_te = Counter()
ds = test_ds['clean']
for s in ds:
    tokens = s.split()
    counter_text.append(len(tokens))
    words_text_te.update(tokens)
print(f'Different words in test: {len(words_text_te)}')

In [None]:
tr = list(words_text_tr.keys())
te = list(words_text_te.keys())

In [None]:
for w in te:
    if(w in commonverbs or w in frequent):
        print(w)

In [None]:
for w in te:
    if(w in tr):
        print(w)

In [None]:
list(set(te)-set(tr))

In [None]:
# Print the training-only words that contain 'rimbor'.
# str.find() returns 0 for a match at the very start of a word, so the
# previous `> 0` test skipped words beginning with 'rimbor'; `in` matches
# at every position.
for i in set(tr) - set(te):
    if 'rimbor' in i:
        print(i, end = '')

In [None]:
print(f'Different words in training: {len(words_text_tr)}')
print({k: v for k, v in sorted(words_text_tr.items(), key=lambda item: item[1])})

In [None]:
# Test-set vocabulary ordered by ascending occurrence count.
# NOTE(review): the original referenced an undefined `words_text`; the test
# vocabulary is assumed to be what was meant, since the training one is
# printed in the previous cell.
print(f'Different words in test: {len(words_text_te)}')
print(dict(sorted(words_text_te.items(), key=lambda item: item[1])))

In [None]:
#Calculate TF-IDF

#'clean', 'Job_offer'
column = 'clean'
max_feature = len(train_ds[column])
# print(f'#Feature {max_feature}')

def X_tfidf(sentences,max_feature = max_feature):
    """Fit a TF-IDF vectorizer on *sentences*.

    Uses 1- to 3-grams and keeps terms found in at least 2 documents and in
    at most 20% of them. ``max_feature`` is accepted but not forwarded to
    the vectorizer.

    Returns the TF-IDF matrix and the fitted vectorizer, in that order.
    """
    fitted = TfidfVectorizer(
        lowercase=True,
        ngram_range=(1, 3),
        min_df=2,
        max_df=0.2,
    )  # ,strip_accents='ascii'
    matrix = fitted.fit_transform(sentences)
    return matrix, fitted

# Fit the vectorizer on the training texts only, then reuse it on the test texts.
train_vec, vectorizer = X_tfidf(train_ds[column])
test_vec = vectorizer.transform(test_ds[column])

X_train, y_train, X_test, y_test = train_vec, train_ds['Label'], test_vec, test_ds['Label']
# Row counts are read from the sparse matrices' shape; converting them to
# dense just to call len() allocated the full matrices for nothing.
print('X_train, y_train, X_test, y_test', X_train.shape[0], len(y_train), X_test.shape[0], len(y_test))

In [None]:
# get_feature_names() was removed in scikit-learn 1.2; use
# get_feature_names_out() when available and fall back on older versions.
_get_names = getattr(vectorizer, 'get_feature_names_out', None) or vectorizer.get_feature_names
feature_names = list(_get_names())
print(len(feature_names))
print(feature_names)

## Models

In [None]:
def get_score(classifier,X_test,y_test):
    """Predict on X_test and print macro-averaged precision, recall and F1.

    Returns (precision, recall, f1, predictions).
    """
    predicted = classifier.predict(X_test)
    macro = {'average': 'macro'}
    p = precision_score(y_test, predicted, **macro)
    r = recall_score(y_test, predicted, **macro)
    f = f1_score(y_test, predicted, **macro)
    print(f"#Precision: {p:.5f}")
    print(f"#Recall: {r:.5f}")
    print(f"#f1 Score: {f:.5f}")
    return p, r, f, predicted

def train_model(classifier, X_train, X_test, y_train, y_test, printAll=False):
    """Fit *classifier* on the training data and score it on the test data.

    The scores are printed by get_score. When printAll is true the confusion
    matrix and the per-class classification report are printed as well.

    Returns (precision, recall, f1, predictions).
    """
    classifier.fit(X_train, y_train)

    scores = get_score(classifier, X_test, y_test)
    predictions = scores[3]

    if printAll:
        for report in (confusion_matrix, classification_report):
            print(report(y_test, predictions))
    print("#--- --- ---")
    return scores

In [None]:
from sklearn.svm import LinearSVC
# Grid search over C (kept for reference):
# max = [{0:0},{0:0},{0:0}]
# for c in range(1,20,1):
#     print(c)
#     lsvc = LinearSVC(tol=1e-8,max_iter=10000,random_state=0,C=c/10, penalty='l2')
#     p,r,f,_ = train_model(lsvc, X_train, X_test, y_train, y_test)
#     if(p>list(max[0].values())[0]):
#         max[0] = {c:p}
#     if(r>list(max[1].values())[0]):
#         max[1] = {c:r}
#     if(f>list(max[2].values())[0]):
#         max[2] = {c:f}

c=.5 #best
# Pass the declared best value instead of a separate hard-coded 5/10, so
# the variable and the model cannot drift apart.
lsvc = LinearSVC(tol=1e-9,max_iter=10**8,random_state=0,C=c, penalty='l2')
_ = train_model(lsvc, X_train, X_test, y_train, y_test)
#Precision: 0.83988
#Recall: 0.84042
#f1 Score: 0.83933
#--- --- ---

In [None]:
# max = [{0:0},{0:0},{0:0}]
# for c in range(1,50):
#     print(c)
#     lr = LogisticRegression(penalty='l2',max_iter=10**6,C=c/10,random_state=0, solver='liblinear')
#     p,r,f,_ = train_model(lr, X_train, X_test, y_train, y_test)
#     if(p>list(max[0].values())[0]):
#         max[0] = {c:p}
#     if(r>list(max[1].values())[0]):
#         max[1] = {c:r}
#     if(f>list(max[2].values())[0]):
#         max[2] = {c:f}
        

In [None]:
from sklearn.linear_model import LogisticRegression

# Regularisation value marked as best.
c = 4.1
# Solver options: {'newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'}
lr_params = dict(penalty='l2', solver='liblinear', C=c, max_iter=10**9, random_state=10)
lr = LogisticRegression(**lr_params)
# Only the predictions are kept; the scores are printed by train_model.
*_, y_pred = train_model(lr, X_train, X_test, y_train, y_test)
#Precision: 0.85283
#Recall: 0.84650
#f1 Score: 0.84918
#--- --- ---

In [None]:
from sklearn.linear_model import LogisticRegression

# Grid search over C (kept for reference):
# max = [{0:0},{0:0},{0:0}]
# for c in range(1,100):
#     print(c)
#     lr = LogisticRegression(penalty='l2',max_iter=10**6,C=c/10,random_state=0, solver='liblinear')
#     p,r,f = train_model(lr, X_train, X_test, y_train, y_test)
#     if(p>list(max[0].values())[0]):
#         max[0] = {c:p}
#     if(r>list(max[1].values())[0]):
#         max[1] = {c:r}
#     if(f>list(max[2].values())[0]):
#         max[2] = {c:f}

c = 2  # best
# Solver options: {'newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'}
lr = LogisticRegression(
    penalty='l2',
    C=c,
    solver='liblinear',
    max_iter=10**7,
    random_state=10,
)
# This rebinds lr and y_pred, which the save cells further down read.
y_pred = train_model(lr, X_train, X_test, y_train, y_test)[3]
# NOTE(review): the scores recorded below are identical to the ones recorded
# for the C=4.1 cell - confirm they were re-measured for C=2.
#Precision: 0.85283
#Recall: 0.84650
#f1 Score: 0.84918
#--- --- ---

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB

# Densify once instead of on every call. toarray() returns a plain ndarray,
# whereas todense() returns np.matrix, which recent scikit-learn versions
# refuse as estimator input.
X_train_dense = X_train.toarray()
X_test_dense = X_test.toarray()

# Sweep the smoothing parameter alpha from 0.1 to 1.0 in steps of 0.1.
for i in range(10,110,10):
    alpha = i/100
    print(alpha)
    mNB = MultinomialNB(alpha=alpha)
    print('Classifier MultinomialNB')
    train_model(mNB, X_train_dense, X_test_dense, y_train, y_test)
    bNB = BernoulliNB(alpha=alpha)
    print('Classifier BernoulliNB')
    train_model(bNB, X_train_dense, X_test_dense, y_train, y_test)
    print("--- ---- ----")
gNB = GaussianNB()
print('Classifier GaussianNB')
train_model(gNB, X_train_dense, X_test_dense, y_train, y_test)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Split criterion options: 'gini', 'entropy'
rf_settings = {
    'n_estimators': 512*2,
    'criterion': 'gini',
    'max_depth': None,
    'random_state': 0,
}
classifier = RandomForestClassifier(**rf_settings)
# printAll=True also prints the confusion matrix and classification report.
train_model(classifier, X_train, X_test, y_train, y_test, printAll=True)

In [None]:
from sklearn.linear_model import SGDClassifier

# Candidate losses: perceptron, hinge, log, squared_hinge
sgd_options = dict(
    loss='hinge',
    penalty='l2',
    validation_fraction=0.1,
    max_iter=3*10**6,
    tol=10**-8,
    random_state=0,
)
sgd = SGDClassifier(**sgd_options)
train_model(sgd, X_train, X_test, y_train, y_test)
#Precision: 0.81388
#Recall: 0.81592
#f1 Score: 0.81442
#--- --- ---


In [None]:
from sklearn.neural_network import MLPClassifier
# One hidden layer of 256 units; training progress is printed (verbose=True).
model = MLPClassifier(
    hidden_layer_sizes=256,
    max_iter=500,
    tol=0.001,
    n_iter_no_change=3,
    verbose=True,
)
train_model(model, X_train, X_test, y_train, y_test)
#Precision: 0.83057
#Recall: 0.82624
#f1 Score: 0.82788
#--- --- ---

In [None]:
from sklearn.ensemble import VotingClassifier

n_estimators = 512
# Earlier candidates (kept for reference):
# clf1 = RandomForestClassifier(n_estimators=n_estimators, random_state=0,max_depth=None,criterion='gini')
# clf2 = LogisticRegression(penalty='l2',max_iter=10**6,C=4.1,random_state=1)
# clf3 = LinearSVC(tol=1e-8,max_iter=10000,random_state=0,C=5/10, penalty='l2')
# clf4 = LinearSVC(tol=1e-8,max_iter=10000,random_state=1,C=.5, penalty='l2')
# clf6 = SGDClassifier(loss='hinge', penalty='l2', validation_fraction=0.1,max_iter=3*10**6,tol=10**-8,random_state=0)

# The logistic-regression members differ only in their C value.
lr_common = dict(penalty='l2', max_iter=10**6, random_state=10)
clf5 = LogisticRegression(C=4.1, **lr_common)
clf51 = LogisticRegression(C=2.5, **lr_common)
clf52 = LogisticRegression(C=4, **lr_common)

# Other members that can be added:
# ('clf1', clf1), ('clf2', clf2), ('clf3', clf3), ('clf4', clf4),('clf6', clf6), ('clf52', clf52)
ensemble_members = [('clf5', clf5), ('clf51', clf51)]
VCclf = VotingClassifier(estimators=ensemble_members, voting='soft')
train_model(VCclf, X_train, X_test, y_train, y_test)

## Save Predictions

In [None]:
# Print every misclassified test offer as "offer;true label;predicted label".
# The half-commented original could not run (unexpected indent, and `file`
# was never opened). prediction.csv itself is written by the pandas cell
# that follows, so this cell only reports the errors.
for offer, label_true, label_pred in zip(test_ds['Job_offer'], y_test, y_pred):
    if label_true != label_pred:
        print(f'{offer};{label_true};{label_pred}\n')

In [None]:
# One row per test offer: original text, true label, predicted label.
columns = ['Job_description', 'Label_true', 'Label_pred']
records = [row for row in zip(test_ds['Job_offer'], y_test, y_pred)]
df = pd.DataFrame(records, columns=columns)
df.to_csv('prediction.csv', index=False, sep=';')

## Read Predictions

In [None]:
predictions_ds = pd.read_csv('prediction.csv', sep=';', header=0)

In [None]:
predictions_ds

## Save Model

In [None]:
filename = 'model.sav'
# Only the classifier `lr` is pickled here. The context manager closes the
# file even if pickling fails.
with open(filename, 'wb') as model_file:
    pickle.dump(lr, model_file)

## Load Model

In [None]:
filename = 'model.sav'
# NOTE: unpickling can execute arbitrary code - only load files you created.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)
# Re-score the restored model to check it round-tripped.
_ = get_score(loaded_model,X_test,y_test)

## Word Clouds

In [None]:
from wordcloud import WordCloud
def Word_Cloud(words):
    """Draw a word cloud from the single-word entries of *words*.

    Parameters
    ----------
    words : iterable
        Items whose first element is a term string, e.g.
        ``vectorizer.vocabulary_.items()``. Terms containing a space
        (n-grams) are skipped.

    Every entry is considered; the previous ``words[1:-1]`` slice silently
    dropped the first and last ones.
    """
    unigrams = [entry[0] for entry in words if len(entry[0].split(' ')) == 1]
    text = " ".join(unigrams)
    plt.figure(figsize=(20,10), facecolor='k', frameon=False)
    wordcloud = WordCloud(width=1200, height=600,min_font_size=8, max_font_size=100, max_words=500, background_color="white", contour_width=0,contour_color='white').generate(text)
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.show()

In [None]:
Word_Cloud(list(vectorizer.vocabulary_.items()))

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
# The maximum number of words to be used. (most frequent)
MAX_NB_WORDS = 5000
# Length every tokenized job offer is padded or truncated to.
MAX_SEQUENCE_LENGTH = 250
# Size of the embedding vectors.
EMBEDDING_DIM = 100

# The backslash in the filter string is escaped explicitly; the character set
# is unchanged, but the invalid '\]' escape no longer triggers a warning.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(train_ds['clean'])
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

In [None]:
# Turn the cleaned texts into fixed-length integer sequences.
# This rebinds X_train / X_test, replacing the TF-IDF matrices.
train_sequences = tokenizer.texts_to_sequences(train_ds['clean'])
X_train = pad_sequences(train_sequences, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', X_train.shape)

test_sequences = tokenizer.texts_to_sequences(test_ds['clean'])
X_test = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', X_test.shape)

In [None]:
# One-hot encode the labels with one shared, sorted list of classes taken
# from the training set. Encoding each split on its own gives mismatched
# columns whenever the two splits do not contain exactly the same labels.
# A test label unseen in training becomes an all-zero row.
label_classes = sorted(train_ds['Label'].dropna().unique())
label_dtype = pd.CategoricalDtype(categories=label_classes)

y_train = pd.get_dummies(train_ds['Label'].astype(label_dtype)).values
print('Shape of label tensor:', y_train.shape)

y_test = pd.get_dummies(test_ds['Label'].astype(label_dtype)).values
print('Shape of label tensor:', y_test.shape)

In [None]:
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import SpatialDropout1D
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping

# Embedding -> LSTM -> dense softmax classifier over the padded sequences.
model = Sequential()
# The original read the sequence length from an undefined `X`; the padded
# training matrix X_train is what is fed to fit() below.
model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X_train.shape[1]))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(64, dropout=0.25, recurrent_dropout=0.2))
model.add(Dense(128, activation='relu'))
# One output unit per one-hot label column, instead of a hard-coded 5.
model.add(Dense(y_train.shape[1], activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

epochs = 50
batch_size = 64

# 10% of the training data is held out for validation; training stops once
# val_loss has not improved by at least 0.0001 for 3 epochs.
history = model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size,validation_split=0.1,callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)])

In [None]:
y_pred = model.predict(X_test)
# print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr[0],accr[1]))

In [None]:
y_test = tf.argmax(y_test,1)
y_pred = tf.argmax(y_pred,1)

In [None]:
# Macro-averaged scores for the predicted class indices.
macro_kwargs = {'average': 'macro'}
precision = precision_score(y_test, y_pred, **macro_kwargs)
recall = recall_score(y_test, y_pred, **macro_kwargs)
f1_score_ = f1_score(y_test, y_pred, **macro_kwargs)
print(
    f"#Precision: {precision:.5f}",
    f"#Recall: {recall:.5f}",
    f"#f1 Score: {f1_score_:.5f}",
    sep="\n",
)