In [2]:
# import des librairies 
import pandas as pd
import matplotlib.pyplot as plt
import math
import numpy as np
import pickle

# clean data
from transformers import BertTokenizer, BertForTokenClassification, BertForSequenceClassification
import torch
from torch.optim import Adam
from torch.nn import CrossEntropyLoss
from torch.nn.functional import softmax

In [3]:
# Load the job postings.
jobs_df = pd.read_csv("../data/silver_data.csv")

# Keep only the columns the models use. `.copy()` makes `df` an independent
# frame: a new column is assigned into it further down, and assigning into a
# bare column selection of `jobs_df` raises SettingWithCopyWarning.
model_columns = [
    "jobs", "description", "ID_dep", "ville", "date", "experience",
    "skills", "tools", "industry", "company", "company_description",
]
df = jobs_df[model_columns].copy()
df.head(3)

Unnamed: 0,jobs,description,ID_dep,ville,date,experience,skills,tools,industry,company,company_description
0,Data Engineer sénior (F/H) CDI (H/F),"En tant que Data Engineer chez Quantmetry, vou...",75,PARIS 08,2023/07/12,5 ans,"concevoir et gérer un projet, concevoir un log...",,Conseil pour les affaires et autres conseils d...,QUANTMETRY,Pure player en Data et Intelligence Artificiel...
1,Data Consultant Stratégie Sénior (H/F),Nous recrutons des personnes avec une appétenc...,75,PARIS 08,2023/07/12,5 ans,"analyser les résultats d'un projet, décliner l...",,Conseil pour les affaires et autres conseils d...,QUANTMETRY,Pure player en Data et Intelligence Artificiel...
2,Chef de projets Performance Durable/Energie/Da...,Intégré(e) au sein de la Direction Performance...,92,ASNIERES SUR SEINE,2023/07/13,3 ans,contrôler et faire appliquer le respect de dis...,tableau,Activités des sièges sociaux,NEXITY,Nexity est aujourd hui leader sur les différen...


In [4]:
# Load the multilingual BERT checkpoint: the tokenizer first, then the
# sequence-classification model built from the same checkpoint name.
# NOTE(review): the load-time warning reports that the classifier weights are
# newly initialized, i.e. the classification head has not been trained.
model_name = "bert-base-multilingual-cased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name)

Downloading (…)lve/main/config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

In [6]:
# One-off export, kept commented out: writes the candidate-title CSV.
# NOTE(review): `possible_titles` is not defined at this point in the notebook,
# and running these lines would rebind `df`, which other cells use for the
# job postings.
# # Build a DataFrame from the list
# df = pd.DataFrame(possible_titles, columns=["Possible Title"])

# # Save the DataFrame as CSV
# df.to_csv("../data/possible_title.csv", index=False)

In [7]:
# Read the catalogue of candidate job titles and preview its first five rows.
possible_title_df = pd.read_csv(filepath_or_buffer="../data/possible_title.csv")
possible_title_df.head(n=5)

Unnamed: 0,Possible Title
0,ABAP Developer
1,ASIC Design Engineer
2,ASIC Engineer
3,ASP.NET Developer
4,Actuarial Associate


In [12]:
def predict_job_title(title_to_check, candidates=None, classifier=None, encoder=None):
    """Return the candidate title the classifier scores highest for a raw title.

    Each candidate is encoded together with ``title_to_check`` as a sentence
    pair and passed through the classifier; the candidate with the largest
    softmax probability for class index 1 wins.

    NOTE(review): loading the BERT checkpoint reports that the classification
    head is newly initialized, so these scores are not meaningful until the
    model has been fine-tuned on a title-matching task.

    Args:
        title_to_check: Raw job title to normalize.
        candidates: Iterable of candidate titles. Defaults to the
            notebook-level ``possible_titles``.
        classifier: Callable invoked as ``classifier(**encoded)`` whose result
            exposes ``.logits``. Defaults to the notebook-level ``model``.
        encoder: Tokenizer callable. Defaults to the notebook-level
            ``tokenizer``.

    Returns:
        The best-scoring candidate title, or ``""`` when there are no
        candidates or none scores above 0.
    """
    # Resolve the notebook-level objects only when nothing was passed in.
    # Passing them explicitly protects against `model` / `tokenizer` being
    # rebound to another model later in the notebook.
    titles = possible_titles if candidates is None else candidates
    clf = model if classifier is None else classifier
    tok = tokenizer if encoder is None else encoder

    max_prob = 0
    best_title = ""

    # Pure inference: disable autograd so that no graph is built and kept
    # alive for each of the many forward passes.
    with torch.no_grad():
        for possible_title in titles:
            # Encode the raw title and the candidate as one sentence pair.
            encoded_input = tok(title_to_check, text_pair=possible_title, padding='max_length', truncation=True, max_length=150, return_tensors='pt')
            outputs = clf(**encoded_input)

            probs = softmax(outputs.logits, dim=1)
            cur_prob = probs[0][1].item()

            # Strict comparison: on ties the earliest candidate is kept.
            if cur_prob > max_prob:
                max_prob = cur_prob
                best_title = possible_title

    return best_title

# Candidate titles and raw posting titles as plain Python lists.
title_column = possible_title_df["Possible Title"]
possible_titles = list(title_column.values)

job_column = df["jobs"]
jobs_list_1 = list(job_column.values)

In [9]:
# Preview the first ten candidate titles.
possible_titles[:10]

['ABAP Developer',
 'ASIC Design Engineer',
 'ASIC Engineer',
 'ASP.NET Developer',
 'Actuarial Associate',
 'Actuarial Consultant',
 'Actuary',
 'Administrator, Lotus Notes',
 'Alliance Manager, Enterprise Software',
 'Analyst Methods & Procedures']

In [15]:
# Work on a three-title sample. Slice from `jobs_list_1`, the full list of raw
# titles, so that this cell runs on a fresh kernel and gives the same result
# on every re-run.
jobs_list = jobs_list_1[:3]

In [None]:
# Predict a clean title for each job in the list. Every prediction scores the
# raw title against all candidate titles, so run it once per distinct raw
# title and reuse the result for duplicates.
unique_predictions = {job: predict_job_title(job) for job in dict.fromkeys(jobs_list)}
cleaned_jobs = [unique_predictions[job] for job in jobs_list]

In [None]:
# Number of cleaned titles produced.
len(cleaned_jobs)

In [None]:
# Preview up to the first 100 cleaned titles.
cleaned_jobs[:100]

In [None]:
# Attach the cleaned titles to the postings as `job_title_clean`, the column
# the description cells below group and map on. The raw `jobs` column is kept.
# Map through the raw title instead of assigning the list positionally:
# `jobs_list` may be only a sample of the rows, and rows without a prediction
# then get NaN.
title_lookup = dict(zip(jobs_list, cleaned_jobs))
df = df.assign(job_title_clean=df["jobs"].map(title_lookup))

### Modèle pour obtenir job_description

Maintenant qu'on a les bons noms de job, on va :

- séparer les descriptions (description de l'offre d'emploi) dans des listes différentes en fonction du job ;
- les envoyer au modèle pour qu'il nous en ressorte une description pour ce job.

In [5]:
# import des librairies
from transformers import T5ForConditionalGeneration, T5Tokenizer
import pandas as pd
import numpy as np 

In [None]:
# Collect, for every cleaned job title, the list of all descriptions posted
# under that title.
descriptions_by_title = df.groupby('job_title_clean')['description']
grouped_descriptions = {title: list(texts) for title, texts in descriptions_by_title}

In [None]:
# Load the T5 tokenizer and model from the same checkpoint. This rebinds
# `tokenizer` and `model`, which until here referred to the BERT objects.
t5_checkpoint = "t5-base"
tokenizer = T5Tokenizer.from_pretrained(t5_checkpoint)
model = T5ForConditionalGeneration.from_pretrained(t5_checkpoint)

In [None]:
# Summarize each group of descriptions to obtain one description per job title.
def summarize_grouped_texts(text_groups, model, tokenizer, summarize_fn=None):
    """Summarize the concatenated texts of every group.

    Args:
        text_groups: Mapping of job title -> iterable of description texts.
        model: Passed through unchanged to the summarizer.
        tokenizer: Passed through unchanged to the summarizer.
        summarize_fn: Callable ``(texts, model, tokenizer) -> sequence`` that
            returns one summary per input text. Defaults to the notebook-level
            ``summarize_texts``.
            NOTE(review): ``summarize_texts`` is not defined in the visible
            part of the notebook; it must exist before this function is called
            without ``summarize_fn``.

    Returns:
        Dict mapping each job title to the summary of its combined texts.
    """
    # Look the default up lazily so an explicit `summarize_fn` never needs
    # the notebook-level name to exist.
    summarizer = summarize_texts if summarize_fn is None else summarize_fn
    summaries = {}

    for job_title, texts in text_groups.items():
        # Keep only real strings: missing descriptions read from CSV are float
        # NaN, and `str.join` raises TypeError on any non-string element.
        combined_text = ' '.join(text for text in texts if isinstance(text, str))

        # The summarizer works on a list of texts; send one and unwrap it.
        summaries[job_title] = summarizer([combined_text], model, tokenizer)[0]

    return summaries

In [None]:
# Build one summary per cleaned job title with the loaded model and tokenizer.
summaries = summarize_grouped_texts(
    text_groups=grouped_descriptions,
    model=model,
    tokenizer=tokenizer,
)

In [None]:
# `summaries` is a dict, which cannot be sliced; preview its first three
# entries instead.
dict(list(summaries.items())[:3])

In [None]:
# Attach to every posting the summary generated for its cleaned job title.
cleaned_titles = df['job_title_clean']
df['job_description'] = cleaned_titles.map(summaries)