In [11]:
import os
import cv2
import pytesseract
import pandas as pd
import numpy as np
import re
import pyodbc
import configparser
import spacy
from spacy.matcher import Matcher
from ultralytics import YOLO

class CVAnalyzer:
    def __init__(self):
        self.model = None
        self.nlp = spacy.load("fr_core_news_sm")

    def load_yolo_model(self, model_path):
        self.model = YOLO(model_path)

    def get_predictions(self, image_path, confidence):
        img = cv2.imread(image_path)
        results = self.model.predict(source=img, conf=confidence)
        return results
    
    def extract_values_from_image(self, img_path, save_path, name, bboxes, probs, names):
        img = cv2.imread(img_path)
        img2 = img.copy()
        class_names = {
            0: 'profile',
            1: 'competance',
            2: 'experience_professionnelle',
            3: 'formation',
            4: 'langues',
            5: 'centre',
            6: 'Contact'
        }
        dicts = {0: {}, 1: {}, 2: {}, 3: {}, 4: {}, 5: {}, 6: {}}
        df = pd.DataFrame(columns=['profile', 'competance', 'experience_professionnelle', 'formation', 'langues', 'centre', 'Contact'])

        for box, prob, index in zip(bboxes, probs.tolist(), names):
            class_dict = dicts[int(index)]
            class_dict[prob] = box

        for index, class_dict in dicts.items():
            if len(class_dict) != 0:
                max_prob = max(class_dict.keys())
                box = class_dict[max_prob]
                x1, y1, x2, y2 = box
                x1, y1, x2, y2 = int(x1), int(y1), int(x2), int(y2)
                cv2.rectangle(img, (x1, y1), (x2, y2), (0, 0, 255), 2)
                cropped_image = img2[y1:y2, x1:x2]
                folder_path = class_names[index]
                if not os.path.exists(os.path.join(save_path, folder_path)):
                    os.makedirs(os.path.join(save_path, folder_path))
                file_path = os.path.join(save_path, folder_path, f"{name.split('.')[0]}-{index}.jpg")
                cv2.imwrite(file_path, cropped_image)
                print(file_path)
                text = pytesseract.image_to_string(cropped_image)
                df.at[0, folder_path] = text if text else None

        df.to_csv('my_dataframe.csv', index=False)
        return df
    def clean_and_sort_dataframe(self, df):
        df = pd.read_csv(csv_file)
        df = df.apply(lambda x: x.astype(str).str.lower().str.strip() if x.dtype == "object" else x)
        df = df.replace({'\n': ' ', '\r': ' '}, regex=True)
        for col in df.columns:
            if df[col].dtype == "object":
                df[col] = df[col].sort_values()
        print(df)
        return(df)
    def extract_languages_from_dataframe(self, df, language_keywords):
        for index, row in df.iterrows():
            text = row['langues']
            languages = re.findall(r"\b([A-Za-zéèëêàâîïôùûü]+)\b", text)
            normalized_languages = []
            for language in languages:
                for keyword in language_keywords:
                    if keyword.lower() in language.lower():
                        normalized_languages.append(keyword)
            df.at[index, 'langues'] = ', '.join(normalized_languages) if normalized_languages else None
        print(df)
    def extract_contact_info(self, df):
        # Définition des expressions régulières
        phone_regex = r"\b\d{8}\b"
        email_regex = r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+"

        for index, row in df.iterrows():
            # Texte d'origine dans df['Contact']
            text = row['Contact']
            # Extraction du numéro de téléphone
            phone_number = re.search(phone_regex, text)
            phone_number = phone_number.group() if phone_number else ""
            df.at[index, 'phone number'] = phone_number

            # Extraction de l'email
            email = re.search(email_regex, text)
            email = email.group() if email else ""
            df.at[index, 'email'] = email

            # Extraction du profil LinkedIn
            linkedin_profile = ""
            start_index = text.find("linkedin.com/in/")
            if start_index != -1:
                end_index = text.find(" ", start_index)
                if end_index == -1:
                    end_index = len(text)
                linkedin_profile = text[start_index:end_index]
            df.at[index, 'linkedin'] = linkedin_profile
        return df
    def extract_profile(self, text):
        doc = self.nlp(text)
        profile = ""
        for sent in doc.sents:
            if "actuellement" in sent.text.lower():
                profile = sent.text
                break
        return profile

    def apply_extraction(self, df):
        df["extracted_profile"] = df["profile"].apply(self.extract_profile)
        return df
    def replace_empty_with_none(self,df):
        # Iterate over each column in the DataFrame
        for column in df.columns:
            # Check if the column has empty values
            if df[column].empty or df[column].dtype == None:
                # Replace empty values with None
                df[column] = df[column].replace(np.nan, None)
        return df
    def replace(self, df):
        new_df = replace_empty_with_none(df)
        return new_df
    def save_in_database(self, df):
        # Load the config file
        config = configparser.ConfigParser()
        config.read('../../config.ini')
        cnxn_table = (
            "Driver={"+config['Database']['Driver']+"};"
            "Server="+config['Database']['Server']+";"
            "Database="+config['Database']['Database']+";"
            "Trusted_Connection="+config['Database']['Trusted_Connection']+";")
        print(cnxn_table)
        # Establish a database connection
        connection_table = pyodbc.connect(cnxn_table)
        print("successfully") 
        # Connexion à la base de données
        connection_table = pyodbc.connect(cnxn_table)

        # Activation de l'autocommit pour valider automatiquement les transactions
        connection_table.autocommit = True

        # Création d'un curseur pour exécuter les commandes SQL
        cursor = connection_table.cursor()
        # Itération sur les lignes de la DataFrame et insertion de nouvelles lignes dans la table
        for index, row in df.iterrows():
            Profil = row['profile'] 
            Competences = row['competance']
            Experiences_Professionnelles = row['experience_professionnelle']
            Formation = row['formation']
            Langues = row['langues']
            Centre = row['centre']
            Contact = row['Contact']
            Phone_Number = row['phone number'] 
            Email = row['email']
            LinkedIn = row['linkedin']
            cursor.execute('SELECT CV_id FROM CV WHERE Profil = ?  AND Competences = ?', Profil, Competences)
            CV_id = cursor.fetchone()
            CV_id = cursor.execute('SELECT @@IDENTITY').fetchone()[0]
            # Insertion d'une nouvelle ligne dans la table CV avec toutes les colonnes
            cursor.execute('INSERT INTO CV (Profil, Competences, Experiences_Professionnelles, Formation, Langues, Centre, Contact, Phone_Number, Email, LinkedIn) '
                'VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)',
            (Profil, Competences, Experiences_Professionnelles, Formation, Langues, Centre, Contact, Phone_Number, Email, LinkedIn))
        # Validation de la transaction et fermeture du curseur et de la connexion
        connection_table.commit()
        cursor.close()
        connection_table.close()

In [12]:
analyzer = CVAnalyzer()
# Load the YOLO model
analyzer.load_yolo_model('C:/Users/ASUS/code/src/content/runs/content/runs/detect/train/weights/best.pt')
results = analyzer.get_predictions('C:/Users/ASUS/code/src/content/datasets/content/datasets/train/images/cv14_page-0001.jpg', confidence=0.8)
bboxes = results[0].boxes.xyxy  # les coordonnées des boîtes englobantes
probs = results[0].boxes.conf  # les confiances des prédictions
names = results[0].boxes.cls   # les classes des prédictions
name = "cv14_page-0001.jpg"
# Extract values from the image and get the dataframe
df = analyzer.extract_values_from_image('C:/Users/ASUS/code/src/content/datasets/content/datasets/train/images/cv14_page-0001.jpg', 'C:/Users/ASUS/code/src/content/output/content/output', name, bboxes, probs, names)
csv_file='my_dataframe.csv'
df=analyzer.clean_and_sort_dataframe(csv_file)
language_keywords = ['francais', 'anglais', 'allemand', 'espagnol', 'italien', 'arab']
analyzer.extract_languages_from_dataframe(df,language_keywords)
df = analyzer.extract_contact_info(df)  # Apply to the first row (index 0)
analyzer.apply_extraction(df)
new_df = analyzer.replace(df)
analyzer.save_in_database(df)


0: 800x576 1 profile, 1 experience, 1 formation, 1 langues, 1 contact, 548.9ms
Speed: 22.8ms preprocess, 548.9ms inference, 0.0ms postprocess per image at shape (1, 3, 800, 800)


C:/Users/ASUS/code/src/content/output/content/output\profile\cv14_page-0001-0.jpg
C:/Users/ASUS/code/src/content/output/content/output\experience_professionnelle\cv14_page-0001-2.jpg
C:/Users/ASUS/code/src/content/output/content/output\formation\cv14_page-0001-3.jpg
C:/Users/ASUS/code/src/content/output/content/output\langues\cv14_page-0001-4.jpg
C:/Users/ASUS/code/src/content/output/content/output\Contact\cv14_page-0001-6.jpg
                                             profile  competance  \
0  profil  actuellement, étudiant en 3eme année c...         NaN   

                          experience_professionnelle  \
0  experiences professionnelles  stagiaire, arab ...   

                                           formation  \
0  education  ingénierie en intelligence artifici...   

                                             langues  centre  \
0  langues arabe francais anglais  allemand  dist...     NaN   

                                             Contact  
0  aloulou karim etudi

In [13]:
df

Unnamed: 0,profile,competance,experience_professionnelle,formation,langues,centre,Contact,phone number,email,linkedin,extracted_profile
0,"profil actuellement, étudiant en 3eme année c...",,"experiences professionnelles stagiaire, arab ...",education ingénierie en intelligence artifici...,"arab, francais, anglais, allemand",,aloulou karim etudiant en ingénierie en ia * ...,20107299,karim.aloulou@esprit.tn,linkedin.com/in/karim-aloulou-,"profil actuellement, étudiant en 3eme année c..."
