# 🧠 LeadCraftr - Vectorisation & Matching

In [69]:
# Imports principaux
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [71]:
freelance_df = pd.read_csv('../generate_datasets/freelances_dataset.csv', index_col='freelance_id').reset_index()
prospect_df = pd.read_csv('../generate_datasets/prospects_dataset.csv', index_col='prospect_id').reset_index()
freelance_df

Unnamed: 0,freelance_id,nom,titre,secteur_principal,skills_top3,ville,tjm,mission_statement,tonalite_preferee,style_prefere
0,1,Alexandre Traore,"Marketing digital (SEO, SEM)",Marketing,Analytics|Google Ads|SEO,Moulin,397.68,Le confort d'avancer à l'état pur,Professionnel,Storytelling
1,2,Aurore Maury-Briand,Stratégie de marque,Marketing,Copywriting|SEO|Email Marketing,Saint Éric,365.59,L'avantage d'innover de manière sûre,Bienveillant,Storytelling
2,3,Édouard Rocher,"Gestion de cloud (AWS, Azure)",Tech/SaaS,Docker|PostgreSQL|CI/CD,Leblanc-la-Forêt,454.31,Le pouvoir d'avancer avant-tout,Sérieux,Chaleureux
3,4,Julie Bertrand,Gestion de produit,Tech/SaaS,FastAPI|CI/CD|AWS,RegnierBourg,494.51,L'avantage de changer à sa source,Bienveillant,Chaleureux
4,5,Alphonse Peltier,"Marketing digital (SEO, SEM)",Marketing,Copywriting|SEO|Google Ads,Leleu,893.25,Le pouvoir d'innover naturellement,Sérieux,Chaleureux
...,...,...,...,...,...,...,...,...,...,...
295,296,Georges Klein,Connaissance des normes ESG,GreenTech,Energy Modeling|Carbon Accounting|LCA,Gomes,622.44,L'art de louer naturellement,Bienveillant,Chaleureux
296,297,Gabriel-Auguste Renaud,Développement de systèmes sécurisés,FinTech,SQL|Python|AML/KYC,Valette,836.24,Le confort d'avancer plus facilement,Professionnel,Formel
297,298,Suzanne Bouvier,Développement de systèmes sécurisés,FinTech,SQL|PowerBI|Risk Modeling,Mathieuboeuf,718.45,Le droit d'innover naturellement,Premium,Formel
298,299,Charles-Benjamin Barbier,Développement de systèmes sécurisés,FinTech,Python|PowerBI|SQL,Fischer,677.83,La liberté d'innover en toute tranquilité,Professionnel,Formel


In [72]:
prospect_df

Unnamed: 0,prospect_id,societe,secteur,contact_principal,role_contact,ville,mission_statement,company_size,funding_stage,ticket_size_class,tonalite_cible
0,1,Rivière Richard SA,FinTech,Christophe Philippe-Maréchal,CTO,Torres,Whiteboard strategic applications,ETI (201-1000),Seed,Medium,Créatif
1,2,Sanchez,Wellness/Bien-Être,Arthur Philippe,CTO,RodriguezBourg,Productize open-source initiatives,ETI (201-1000),Pre-Seed,Medium,Premium
2,3,Pinto SA,GreenTech,Sébastien Lesage,Head of Marketing,MaheVille,Synergize cutting-edge roi,Groupe (1000+),Pre-Seed,High,Premium
3,4,Benoit,GreenTech,Guillaume Guyot,Head of Data,GérardVille,Re-intermediate interactive experiences,ETI (201-1000),Series B,High,Bienveillant
4,5,Didier,Wellness/Bien-Être,Valentine Bonnet,Head of Data,Sainte Alice-les-Bains,Deploy 24/365 experiences,Groupe (1000+),Seed,High,Bienveillant
...,...,...,...,...,...,...,...,...,...,...,...
2995,2996,Baudry,GreenTech,Émile-Auguste Loiseau,CEO,Didier,Maximize 24/365 platforms,PME (21-200,Series A,Medium,Premium
2996,2997,Ledoux Lejeune S.A.,Retail/E-commerce,Maggie Verdier,Head of Marketing,Schmittdan,Strategize plug-and-play e-commerce,PME (21-200,Series C+,High,Premium
2997,2998,Ollivier,Éducation/Ed-Tech,Mathilde Duhamel,Head of Data,Alves,Generate cutting-edge partnerships,PME (21-200,Series A,Medium,Bienveillant
2998,2999,Diallo Ruiz SA,Design,Joséphine Valentin,CTO,Sainte MatthieuVille,Engage seamless roi,Groupe (1000+),Series C+,High,Premium


In [73]:
def vectorize_missions(freelance_df, prospect_df):
    """Vectorise les mission statements des freelances et prospects avec TF-IDF."""
    vectorizer = TfidfVectorizer(stop_words='english')
    freelance_tfidf = vectorizer.fit_transform(freelance_df["mission_statement"])
    prospect_tfidf = vectorizer.transform(prospect_df["mission_statement"])
    return freelance_tfidf, prospect_tfidf

def get_top_20_leads(freelance_vec, prospect_tfidf, prospect_df):
    """
    Retourne les 20 prospects les plus similaires pour un vecteur freelance donné.

    Args:
        freelance_vec: Vecteur TF-IDF (1D ou 2D) du freelance (shape: (1, n_features)).
        prospect_tfidf: Matrice TF-IDF des prospects.
        prospect_df: DataFrame des prospects.

    Returns:
        DataFrame des 20 prospects les plus similaires avec score de similarité.
    """
    similarities = cosine_similarity(freelance_vec, prospect_tfidf).flatten()
    top_20_indices = similarities.argsort()[-20:][::-1]

    return prospect_df.iloc[top_20_indices].assign(
        similarity=similarities[top_20_indices]
    )

In [74]:
freelance_tfidf, prospect_tfidf = vectorize_missions(freelance_df, prospect_df)
top_20_df = get_top_20_leads(freelance_tfidf[0], prospect_tfidf, prospect_df)

In [75]:
top_20_df

Unnamed: 0,prospect_id,societe,secteur,contact_principal,role_contact,ville,mission_statement,company_size,funding_stage,ticket_size_class,tonalite_cible,similarity
0,1,Rivière Richard SA,FinTech,Christophe Philippe-Maréchal,CTO,Torres,Whiteboard strategic applications,ETI (201-1000),Seed,Medium,Créatif,0.0
2999,3000,Marin,FinTech,Étienne Bourgeois,Head of Marketing,Seguin-sur-Richard,Innovate holistic interfaces,ETI (201-1000),Series C+,High,Professionnel,0.0
2998,2999,Diallo Ruiz SA,Design,Joséphine Valentin,CTO,Sainte MatthieuVille,Engage seamless roi,Groupe (1000+),Series C+,High,Premium,0.0
2997,2998,Ollivier,Éducation/Ed-Tech,Mathilde Duhamel,Head of Data,Alves,Generate cutting-edge partnerships,PME (21-200,Series A,Medium,Bienveillant,0.0
2996,2997,Ledoux Lejeune S.A.,Retail/E-commerce,Maggie Verdier,Head of Marketing,Schmittdan,Strategize plug-and-play e-commerce,PME (21-200,Series C+,High,Premium,0.0
2995,2996,Baudry,GreenTech,Émile-Auguste Loiseau,CEO,Didier,Maximize 24/365 platforms,PME (21-200,Series A,Medium,Premium,0.0
2994,2995,Toussaint,FinTech,Brigitte de Lagarde,CEO,Petitjeanboeuf,Benchmark cross-media relationships,PME (21-200,Series C+,High,Énergique,0.0
2993,2994,Bigot Pascal et Fils,FinTech,Noël Boulay,Head of Marketing,Sainte Gabrielle,Unleash magnetic architectures,PME (21-200,Series C+,High,Créatif,0.0
2992,2993,Lesage,Design,Capucine Brunet,Head of Marketing,Arnaud,Embrace proactive architectures,ETI (201-1000),Series C+,High,Bienveillant,0.0
2991,2992,Guibert,GreenTech,François-Jean Dijoux,Head of Data,Delorme,Matrix e-business initiatives,Groupe (1000+),Series C+,High,Bienveillant,0.0
