<a href="https://colab.research.google.com/github/GarouachiMonia/Association-Rules/blob/main/Code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Charger les fichiers CSV
df_cv=pd.read_csv('Resume.csv')
df_jobs=pd.read_csv('training_data.csv')

# Nettoyage des textes
def preprocess_text(text):
    text = text.lower()  # Convertir en minuscules
    text = re.sub(r'\W', ' ', text)  # Supprimer les caractères spéciaux
    text = re.sub(r'\s+', ' ', text)  # Supprimer les espaces multiples
    text = text.strip()  # Supprimer les espaces au début/fin
    return text

df_cv['clean_resume'] = df_cv['Resume_str'].apply(preprocess_text)
df_jobs['clean_job_desc'] = df_jobs['job_description'].apply(preprocess_text)

# Tokenisation et suppression des stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    tokens = text.split()
    filtered_tokens = [token for token in tokens if token not in stop_words]
    return ' '.join(filtered_tokens)

df_cv['clean_resume'] = df_cv['clean_resume'].apply(remove_stopwords)
df_jobs['clean_job_desc'] = df_jobs['clean_job_desc'].apply(remove_stopwords)

# Vectorisation TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_cv = tfidf_vectorizer.fit_transform(df_cv['clean_resume']).toarray()
X_jobs = tfidf_vectorizer.transform(df_jobs['clean_job_desc']).toarray()

# Exemple de matching (Random Forest)
y = df_cv['Category']  # Définir la catégorie à prédire
X_train, X_test, y_train, y_test = train_test_split(X_cv, y, test_size=0.2, random_state=42)

model = RandomForestClassifier(n_estimators=100)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Accuracy: 0.6720321931589537
                        precision    recall  f1-score   support

            ACCOUNTANT       0.76      0.90      0.83        29
              ADVOCATE       0.81      0.70      0.75        30
           AGRICULTURE       0.60      0.38      0.46         8
               APPAREL       0.60      0.30      0.40        20
                  ARTS       0.18      0.11      0.14        18
            AUTOMOBILE       0.00      0.00      0.00         6
              AVIATION       0.86      0.86      0.86        21
               BANKING       0.68      0.65      0.67        23
                   BPO       0.00      0.00      0.00         2
  BUSINESS-DEVELOPMENT       0.76      0.48      0.59        27
                  CHEF       0.85      0.71      0.77        24
          CONSTRUCTION       0.89      0.91      0.90        34
            CONSULTANT       0.78      0.35      0.48        20
              DESIGNER       0.71      0.89      0.79        19
         D

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [1]:
import spacy
nlp = spacy.load("en_core_web_sm")

def preprocess_text(text):
    doc = nlp(text.lower())
    features = {'skills': [], 'education': []}

    # Extraction des entités nommées
    for ent in doc.ents:
        if ent.label_ == 'SKILL':  # Spacy peut être adapté pour détecter les compétences spécifiques
            features['skills'].append(ent.text)
        if ent.label_ in ['ORG', 'GPE', 'DATE', 'EDUCATION']:
            features['education'].append(ent.text)
    return features

In [2]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

def lemmatize_words(words):
    return [lemmatizer.lemmatize(word) for word in words]

In [3]:
from transformers import RobertaTokenizer, RobertaModel

model_name = "roberta-base"
tokenizer = RobertaTokenizer.from_pretrained(model_name)
model = RobertaModel.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
pip install -U sentence-transformers



In [5]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-MiniLM-L6-v2')  # ou un autre modèle Sentence-BERT optimisé

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [6]:
def weighted_cosine_similarity(job_embedding, resume_embedding, skill_weight=0.6, edu_weight=0.4):
    skill_similarity = cosine_similarity([job_embedding['skills']], [resume_embedding['skills']])[0][0]
    edu_similarity = cosine_similarity([job_embedding['education']], [resume_embedding['education']])[0][0]
    return skill_weight * skill_similarity + edu_weight * edu_similarity

In [7]:
def get_batch_embeddings(texts, model, tokenizer, batch_size=16):
    embeddings = []
    for i in range(0, len(texts), batch_size):
        inputs = tokenizer(texts[i:i+batch_size], return_tensors="pt", padding=True, truncation=True).to(device)
        outputs = model(**inputs)
        batch_embeddings = outputs.last_hidden_state.mean(dim=1).detach().to("cpu").numpy()
        embeddings.append(batch_embeddings)
    return np.vstack(embeddings)

In [7]:
pip install pypdf

Collecting pypdf
  Downloading pypdf-5.1.0-py3-none-any.whl.metadata (7.2 kB)
Downloading pypdf-5.1.0-py3-none-any.whl (297 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/298.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━[0m [32m122.9/298.0 kB[0m [31m3.4 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m298.0/298.0 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pypdf
Successfully installed pypdf-5.1.0


In [8]:
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [9]:
import pandas as pd
from pypdf import PdfReader
from nltk import pos_tag, sent_tokenize, word_tokenize
from nltk.corpus import stopwords
import string
import re
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Fonction d'extraction de texte depuis un fichier PDF
def extract_text_from_pdf(file_path):
    try:
        reader = PdfReader(file_path)
        text = "".join(page.extract_text() for page in reader.pages)
        return text
    except FileNotFoundError:
        print(f"Le fichier {file_path} n'a pas été trouvé. Il sera ignoré.")
        return None

# Fonction de pré-traitement du texte
def preprocess_text(text):
    text = text.lower()
    text = re.sub('[^a-zA-Z]', ' ', text)
    sentences = sent_tokenize(text)
    features = {'feature': ""}
    stop_words = set(stopwords.words("english"))
    for sent in sentences:
        if any(criteria in sent for criteria in ['skills', 'education']):
            words = word_tokenize(sent)
            words = [word for word in words if word not in stop_words]
            tagged_words = pos_tag(words)
            filtered_words = [word for word, tag in tagged_words if tag not in ['DT', 'IN', 'TO', 'PRP', 'WP']]
            features['feature'] += " ".join(filtered_words)
    return features

# Traitement des données de CV
def process_resume_data(row):
    id = row['ID']
    category = row['Category']
    file_path = f"/content/drive/MyDrive/data/{category}/{id}.pdf"
    text = extract_text_from_pdf(file_path)
    if text:
        features = preprocess_text(text)
        row['Feature'] = features['feature']
    else:
        row['Feature'] = ""
    return row

# Obtenir des embeddings à partir du texte
def get_embeddings(text, model):
    embeddings = model.encode(text)
    return embeddings

# Affichage des correspondances les plus proches
def print_top_matching_resumes(result_group):
    for i in range(15):
        print("\nJob ID:", i)
        print("Cosine Similarity | Domain Resume | Domain Description")
        print(result_group.get_group(i)[['similarity', 'domainResume', 'domainDesc']])

def main():
    # Charger les données de CV et les offres d'emploi
    resume_data = pd.read_csv("Resume.csv")
    resume_data = resume_data.drop(["Resume_html"], axis=1)
    resume_data = resume_data.apply(process_resume_data, axis=1)
    resume_data.to_csv("/content/drive/MyDrive/resume_data.csv", index=False)

    job_description = pd.read_csv("training_data.csv")
    job_description = job_description[["job_description", "position_title"]][:15]
    job_description['Features'] = job_description['job_description'].apply(lambda x : preprocess_text(x)['feature'])

    # Charger le modèle SentenceTransformer
    model_name = "bert-base-nli-mean-tokens"
    model = SentenceTransformer(model_name)

    # Obtenir les embeddings des offres d'emploi et des CV
    job_desc_embeddings = np.array([get_embeddings(desc, model) for desc in job_description['Features']])
    resume_embeddings = np.array([get_embeddings(text, model) for text in resume_data['Feature']])

    # Calculer les similarités cosinus et organiser les résultats
    result_df = pd.DataFrame(columns=['jobId', 'resumeId', 'similarity', 'domainResume', 'domainDesc'])
    for i, job_desc_emb in enumerate(job_desc_embeddings):
        similarities = cosine_similarity([job_desc_emb], resume_embeddings)
        top_k_indices = np.argsort(similarities[0])[::-1][:5]
        for j in top_k_indices:
            result_df.loc[len(result_df)] = [
                i, resume_data['ID'].iloc[j], similarities[0][j],
                resume_data['Category'].iloc[j], job_description['position_title'].iloc[i]
            ]

    result_df = result_df.sort_values(by='similarity', ascending=False)
    result_group = result_df.groupby("jobId")
    print_top_matching_resumes(result_group)

if __name__ == "__main__":
    main()


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.99k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/399 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]


Job ID: 0
Cosine Similarity | Domain Resume | Domain Description
   similarity          domainResume        domainDesc
0    0.912439          CONSTRUCTION  Sales Specialist
1    0.911003                  ARTS  Sales Specialist
2    0.910609            CONSULTANT  Sales Specialist
3    0.899933         DIGITAL-MEDIA  Sales Specialist
4    0.896590  BUSINESS-DEVELOPMENT  Sales Specialist

Job ID: 1
Cosine Similarity | Domain Resume | Domain Description
   similarity          domainResume                  domainDesc
5    0.864427  BUSINESS-DEVELOPMENT  Apple Solutions Consultant
6    0.859329                 SALES  Apple Solutions Consultant
7    0.858578                 SALES  Apple Solutions Consultant
8    0.856729            CONSULTANT  Apple Solutions Consultant
9    0.855073  BUSINESS-DEVELOPMENT  Apple Solutions Consultant

Job ID: 2
Cosine Similarity | Domain Resume | Domain Description
    similarity          domainResume  \
10    0.820559               BANKING   
11    0.814799

In [14]:
dff=pd.read_csv("training_data.csv")
dff.head(10)

Unnamed: 0,company_name,job_description,position_title,description_length,model_response
0,Google,minimum qualifications\nbachelors degree or eq...,Sales Specialist,2727,"{\n ""Core Responsibilities"": ""Responsible fo..."
1,Apple,description\nas an asc you will be highly infl...,Apple Solutions Consultant,828,"{\n ""Core Responsibilities"": ""as an asc you ..."
2,Netflix,its an amazing time to be joining netflix as w...,Licensing Coordinator - Consumer Products,3205,"{\n ""Core Responsibilities"": ""Help drive bus..."
3,Robert Half,description\n\nweb designers looking to expand...,Web Designer,2489,"{\n ""Core Responsibilities"": ""Designing webs..."
4,TrackFive,at trackfive weve got big goals were on a miss...,Web Developer,3167,"{\n ""Core Responsibilities"": ""Build and layo..."
5,DesignUps,designups is a nashville based design and inte...,Frontend Web Developer,892,"{\n ""Core Responsibilities"": ""Translate desi..."
6,"Equisolve, Inc.",about the position\n\nthe web designer is resp...,Remote Website Designer,3471,"{\n ""Core Responsibilities"": ""Provide design..."
7,Zander Insurance Agency,job description\n\nzander insurance group is o...,Web Designer,2896,"{\n ""Core Responsibilities"": ""Design compell..."
8,Tuff,tuff is a growth marketing team working with c...,Web Designer,5143,"{\n ""Core Responsibilities"": ""Work on variou..."
9,General Dynamics Information Technology,type of requisition regular\n\nclearance level...,SR. Web Designer,4023,"{\n ""Core Responsibilities"": ""Designs and bu..."


In [15]:
# prompt: Avec le DataFrame dff: supprimer colonnes

# Delete the specified columns
dff = dff.drop(['company_name', 'description_length'], axis=1)

dff.head(10)

Unnamed: 0,job_description,position_title,model_response
0,minimum qualifications\nbachelors degree or eq...,Sales Specialist,"{\n ""Core Responsibilities"": ""Responsible fo..."
1,description\nas an asc you will be highly infl...,Apple Solutions Consultant,"{\n ""Core Responsibilities"": ""as an asc you ..."
2,its an amazing time to be joining netflix as w...,Licensing Coordinator - Consumer Products,"{\n ""Core Responsibilities"": ""Help drive bus..."
3,description\n\nweb designers looking to expand...,Web Designer,"{\n ""Core Responsibilities"": ""Designing webs..."
4,at trackfive weve got big goals were on a miss...,Web Developer,"{\n ""Core Responsibilities"": ""Build and layo..."
5,designups is a nashville based design and inte...,Frontend Web Developer,"{\n ""Core Responsibilities"": ""Translate desi..."
6,about the position\n\nthe web designer is resp...,Remote Website Designer,"{\n ""Core Responsibilities"": ""Provide design..."
7,job description\n\nzander insurance group is o...,Web Designer,"{\n ""Core Responsibilities"": ""Design compell..."
8,tuff is a growth marketing team working with c...,Web Designer,"{\n ""Core Responsibilities"": ""Work on variou..."
9,type of requisition regular\n\nclearance level...,SR. Web Designer,"{\n ""Core Responsibilities"": ""Designs and bu..."


In [None]:
import pandas as pd
from pypdf import PdfReader
from nltk import pos_tag, sent_tokenize, word_tokenize
from nltk.corpus import stopwords
import string
import re
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
import torch
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Fonction d'extraction de texte depuis un fichier PDF
def extract_text_from_pdf(file_path):
    try:
        reader = PdfReader(file_path)
        text = "".join(page.extract_text() for page in reader.pages)
        return text
    except FileNotFoundError:
        print(f"Le fichier {file_path} n'a pas été trouvé. Il sera ignoré.")
        return None

# Fonction de pré-traitement du texte
def preprocess_text(text):
    text = text.lower()
    text = re.sub('[^a-zA-Z]', ' ', text)
    sentences = sent_tokenize(text)
    features = {'feature': ""}
    stop_words = set(stopwords.words("english"))
    for sent in sentences:
        if any(criteria in sent for criteria in ['skills', 'education']):
            words = word_tokenize(sent)
            words = [word for word in words if word not in stop_words]
            tagged_words = pos_tag(words)
            filtered_words = [word for word, tag in tagged_words if tag not in ['DT', 'IN', 'TO', 'PRP', 'WP']]
            features['feature'] += " ".join(filtered_words)
    return features

# Traitement des données de CV
def process_resume_data(row):
    id = row['ID']
    category = row['Category']
    file_path = f"/content/drive/MyDrive/data/{category}/{id}.pdf"
    text = extract_text_from_pdf(file_path)
    if text:
        features = preprocess_text(text)
        row['Feature'] = features['feature']
    else:
        row['Feature'] = ""
    return row

# Obtenir des embeddings à partir du texte
def get_embeddings(text, model, device):
    embeddings = model.encode(text, device=device)
    return embeddings

# Affichage des correspondances les plus proches
def print_top_matching_resumes(result_group):
    for i in range(15):
        print("\nJob ID:", i)
        print("Cosine Similarity | Domain Resume | Domain Description")
        print(result_group.get_group(i)[['similarity', 'domainResume', 'domainDesc']])

def main():
    # Charger les données de CV et les offres d'emploi
    resume_data = pd.read_csv("Resume.csv")
    resume_data = resume_data.drop(["Resume_html"], axis=1)
    resume_data = resume_data.apply(process_resume_data, axis=1)
    resume_data.to_csv("/content/drive/MyDrive/resume_data.csv", index=False)

    job_description = pd.read_csv("training_data.csv")
    job_description = job_description[["job_description", "position_title"]][:15]
    job_description['Features'] = job_description['job_description'].apply(lambda x : preprocess_text(x)['feature'])

    # Charger le modèle BERT avec SentenceTransformer
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model_name = "bert-base-uncased"
    model = SentenceTransformer(model_name)
    model.to(device)

    # Obtenir les embeddings des offres d'emploi et des CV
    job_desc_embeddings = np.array([get_embeddings(desc, model, device) for desc in job_description['Features']])
    resume_embeddings = np.array([get_embeddings(text, model, device) for text in resume_data['Feature']])

    # Calculer les similarités cosinus et organiser les résultats
    result_df = pd.DataFrame(columns=['jobId', 'resumeId', 'similarity', 'domainResume', 'domainDesc'])
    for i, job_desc_emb in enumerate(job_desc_embeddings):
        similarities = cosine_similarity([job_desc_emb], resume_embeddings)
        top_k_indices = np.argsort(similarities[0])[::-1][:5]
        for j in top_k_indices:
            result_df.loc[len(result_df)] = [
                i, resume_data['ID'].iloc[j], similarities[0][j],
                resume_data['Category'].iloc[j], job_description['position_title'].iloc[i]
            ]

    result_df = result_df.sort_values(by='similarity', ascending=False)
    result_group = result_df.groupby("jobId")
    print_top_matching_resumes(result_group)

if __name__ == "__main__":
    main()


In [13]:
import os

directory = "/content/drive/MyDrive/data/"
files = os.listdir(directory)
print("Fichiers dans le répertoire:", files)

Fichiers dans le répertoire: ['DIGITAL-MEDIA', 'FITNESS', 'TEACHER', 'HR', 'FINANCE', 'ENGINEERING', 'INFORMATION-TECHNOLOGY', 'HEALTHCARE', 'SALES', 'PUBLIC-RELATIONS', 'AVIATION', 'BUSINESS-DEVELOPMENT', 'DESIGNER', 'BPO', 'CONSULTANT', 'BANKING', 'AUTOMOBILE', 'CHEF', 'ARTS', 'CONSTRUCTION', 'APPAREL', 'ACCOUNTANT', 'AGRICULTURE', 'ADVOCATE']
