# 💞 Vectorisation & Matching

In [60]:
# Imports principaux
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [61]:
# Load both datasets. The id column of each CSV is consumed as the index and
# then dropped, so every frame ends up with a fresh 0..n-1 positional index.
freelance_df = (
    pd.read_csv(
        '../generate_datasets/updated_freelances_dataset.csv',
        index_col='freelance_id',
    )
    .reset_index(drop=True)
)
prospect_df = (
    pd.read_csv(
        '../generate_datasets/updated_prospects_dataset.csv',
        index_col='prospect_id',
    )
    .reset_index(drop=True)
)
freelance_df

Unnamed: 0,name,title,main_sector,top3_skills,city,daily_rate,mission_statement,preferred_tone,preferred_style,remote
0,Allison Hill,Digital Marketing,Marketing,Analytics|Google Ads|SEO,East Jill,397.68,Allison Hill delivers reliable strategies usin...,Professional,Storytelling,Yes
1,Javier Johnson,Renewable Energy Expertise,GreenTech,LCA|Project Management|Energy Modeling,East William,462.86,Javier Johnson advances focused sustainability...,Serious,Formal,Yes
2,Meredith Barnes,Environmental Impact Analysis,GreenTech,Energy Modeling|IoT|LCA,Lawrencetown,831.17,Partner with Meredith Barnes for collaborative...,Friendly,Storytelling,No
3,Donald Lewis,Supply Chain Management,Retail/E-commerce,Facebook Ads|Customer Support|Data Analytics,Curtisfurt,371.55,Donald Lewis boosts imaginative e-commerce wit...,Creative,Formal,No
4,Renee Blair,Customer Service,Retail/E-commerce,Supply-Chain|Shopify|Customer Support,South Christianport,621.60,Renee Blair creates strategic shopping experie...,Professional,Warm,Yes
...,...,...,...,...,...,...,...,...,...,...
295,Tracy Rivera,Project Management,GreenTech,Energy Modeling|IoT|Carbon Accounting,Rodriguezville,847.87,Tracy Rivera promotes passionate GreenTech wit...,Energetic,Warm,Yes
296,Kristin Watts,Ed-Tech Integration,Education/Ed-Tech,Learning Analytics|Storyline360|HTML5,South Neil,374.88,"With Kristin Watts, motivational expertise in ...",Energetic,Formal,No
297,Kathryn Cooper,Customer Service,Retail/E-commerce,Data Analytics|Supply-Chain|Facebook Ads,South Danchester,534.32,Kathryn Cooper drives premium growth with Data...,Premium,Storytelling,Yes
298,Eric Gibson,Community Management,Wellness,Copywriting|Pilates|Nutrition,New Timothy,519.93,With Eric Gibson's innovative approach to Copy...,Creative,Warm,No


In [62]:
# Display the prospects DataFrame as the cell output.
prospect_df

Unnamed: 0,company,sector,main_contact,contact_role,city,mission_statement,company_size,funding_stage,ticket_size_class,target_tone,remote
0,"Marsh, Spears and Yang",FinTech,Kristen Rivera,CEO,Mullenbury,Transforming finance for sme through agile pla...,SME (21-200),Series C+,High,Premium,No
1,Lopez Ltd,FinTech,Allen Mendez,CEO,South Brittany,enterprise rely on our secure FinTech services...,Enterprise (1000+),Series A,High,Creative,No
2,Ramirez Ltd,Design,Amy Garcia,Head of Marketing,West Mark,We streamline visual identity visual experienc...,Mid-size (201-1000),Seed,Medium,Premium,Yes
3,"Huffman, Rose and Fowler",GreenTech,Lisa Matthews,Head of Data,Michaelmouth,Our resilient solutions help enterprise drive ...,Enterprise (1000+),Pre-Seed,High,Serious,No
4,"Mcbride, Sellers and Lawrence",GreenTech,Emily Allen,CEO,East Nathanielberg,Our climate-conscious solutions help enterpris...,Enterprise (1000+),Series C+,High,Energetic,Yes
...,...,...,...,...,...,...,...,...,...,...,...
2995,Hill PLC,Retail/E-commerce,Amanda Bird,CEO,Fergusonport,We support startup in pioneer conversion rates...,Startup (1-20),Series B,High,Friendly,No
2996,"Mendoza, Hayes and Lynch",Retail/E-commerce,Timothy Alvarado,CEO,Andrewfort,mid-size boost customer loyalty by pioneer pur...,Mid-size (201-1000),Seed,Medium,Friendly,Yes
2997,"Carpenter, Nielsen and Stone",Tech/SaaS,Elaine Rodriguez,CTO,Burchmouth,Scaling success: our modular software empowers...,SME (21-200),Series B,High,Premium,Yes
2998,Wells Ltd,Marketing,Renee Hurst,CTO,Stanleyland,Our audience-focused strategies enable startup...,Startup (1-20),Series B,High,Premium,Yes


In [63]:
def vectorize_missions(freelance_df, prospect_df):
    """Build TF-IDF representations of the freelance and prospect mission statements.

    The vocabulary and IDF weights are learned from the freelance
    ``mission_statement`` column only; the prospect statements are then
    projected into that same feature space (English stop words removed).

    Args:
        freelance_df: DataFrame exposing a ``mission_statement`` column.
        prospect_df: DataFrame exposing a ``mission_statement`` column.

    Returns:
        Tuple ``(freelance_tfidf, prospect_tfidf)`` of TF-IDF matrices that
        share the same feature columns.
    """
    tfidf = TfidfVectorizer(stop_words='english')
    freelance_texts = freelance_df["mission_statement"]
    prospect_texts = prospect_df["mission_statement"]

    # Fit on the freelances, transform-only on the prospects.
    freelance_matrix = tfidf.fit_transform(freelance_texts)
    prospect_matrix = tfidf.transform(prospect_texts)
    return freelance_matrix, prospect_matrix

def get_top_20_leads(freelance_vec, prospect_tfidf, prospect_df, top_n=20):
    """
    Return the prospects most similar to one freelance TF-IDF vector.

    Args:
        freelance_vec: TF-IDF vector of the freelance. Either a 2D row of
            shape (1, n_features), e.g. one row sliced from the freelance
            TF-IDF matrix, or a 1D NumPy array of length n_features.
        prospect_tfidf: TF-IDF matrix of the prospects. Its rows are matched
            by position to the rows of ``prospect_df``.
        prospect_df: DataFrame of the prospects.
        top_n: Maximum number of prospects to return (default 20). Fewer rows
            are returned when there are fewer prospects than ``top_n``.

    Returns:
        New DataFrame holding the selected ``prospect_df`` rows, ordered by
        decreasing cosine similarity, with an extra ``similarity`` column.

    Raises:
        ValueError: If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError(f"top_n must be non-negative, got {top_n}")

    # cosine_similarity only accepts 2D inputs: promote a 1D dense vector to
    # a single-row matrix so both documented input shapes actually work.
    if isinstance(freelance_vec, np.ndarray) and freelance_vec.ndim == 1:
        freelance_vec = freelance_vec.reshape(1, -1)

    similarities = cosine_similarity(freelance_vec, prospect_tfidf).flatten()

    # Reverse the ascending argsort first, then truncate. For top_n > 0 this
    # selects exactly the same indices as [-top_n:][::-1], but it is also
    # correct for top_n == 0, where [-0:] would select every row.
    top_indices = similarities.argsort()[::-1][:top_n]

    return prospect_df.iloc[top_indices].assign(
        similarity=similarities[top_indices]
    )

In [64]:
# Vectorise both datasets, then rank the prospects against the first
# freelance (row 0 of the freelance TF-IDF matrix).
freelance_tfidf, prospect_tfidf = vectorize_missions(freelance_df, prospect_df)
top_20_df = get_top_20_leads(freelance_tfidf[0], prospect_tfidf, prospect_df)

In [65]:
# Display the ranked prospects together with their similarity score.
top_20_df

Unnamed: 0,company,sector,main_contact,contact_role,city,mission_statement,company_size,funding_stage,ticket_size_class,target_tone,remote,similarity
1181,Solis LLC,Marketing,Laurie Johns,Head of Data,East Davidville,From insight to impact: we help enterprise opt...,Enterprise (1000+),Seed,High,Premium,No,0.304745
2079,"Rivera, Fuller and Bradshaw",Marketing,Katherine Medina,Head of Data,Sanfordland,Our disruptive strategies enable enterprise to...,Enterprise (1000+),Pre-Seed,High,Serious,No,0.268355
1301,"Murphy, Carter and Cochran",Marketing,Keith Mcdonald,Head of Data,Port Meredith,Our disruptive strategies enable sme to optimi...,SME (21-200),Series A,Medium,Friendly,No,0.268355
588,Smith Group,Marketing,Clarence Manning,Head of Data,Theodoremouth,Our compelling strategies enable mid-size to o...,Mid-size (201-1000),Series B,High,Professional,Yes,0.268355
2296,Young and Sons,Marketing,Kevin Peterson,Head of Data,Evelynton,Our disruptive strategies enable startup to op...,Startup (1-20),Series C+,High,Serious,No,0.268355
2235,"Stone, Alvarado and Cross",Marketing,Dr. Jacqueline Martin,CTO,South Kimberlytown,Our disruptive strategies enable enterprise to...,Enterprise (1000+),Seed,High,Premium,Yes,0.253364
57,Wright Inc,Marketing,Richard Le,Head of Data,South Samuelchester,We optimise brand storytelling brand presence ...,SME (21-200),Pre-Seed,Medium,Energetic,No,0.250232
1654,Ellis PLC,Marketing,Lauren Morgan,Head of Data,Lake Deborah,From insight to impact: we help startup optimi...,Startup (1-20),Series B,High,Premium,Yes,0.249807
1835,Williams-Graves,Marketing,Robert Mcdowell,Head of Data,Lake Tinachester,From insight to impact: we help enterprise opt...,Enterprise (1000+),Pre-Seed,High,Professional,No,0.249807
408,Pham Group,Marketing,Stephanie Crane,Head of Data,Sherrymouth,From insight to impact: we help startup optimi...,Startup (1-20),Series C+,High,Premium,No,0.249807


# 📧 Mail generator

In [97]:
from langchain.chat_models import init_chat_model
from IPython.display import Markdown
import time

In [101]:
def mail_generator(freelance, prospect, model=None):
    """
    Generate a personalised cold email in English from a freelance profile and a target company.

    Parameters
    ----------
    freelance : dict or pandas.Series
        Freelance information, looked up by key:
        - 'name' : full name
        - 'title' : job title
        - 'main_sector' : main business sector
        - 'city' : city
        - 'top3_skills' : key skills (text)
        - 'daily_rate' : daily rate
        - 'remote' : "Yes"/"No"
        - 'mission_statement' : summary of the value proposition
        - 'preferred_tone' : preferred tone (e.g. Professional)
        - 'preferred_style' : preferred style (e.g. Storytelling)

    prospect : pandas.Series or dict
        Target company information, looked up by key:
        - 'company' : company name
        - 'city' : city
        - 'sector' : business sector
        - 'main_contact' : name of the main contact
        - 'contact_role' : role of the contact
        - 'company_size' : company size
        - 'funding_stage' : funding stage
        - 'remote' : "Yes"/"No"
        - 'target_tone' : tone expected by the company

    model : object, optional
        Chat model exposing ``invoke(prompt)``. When omitted, a
        ``gemini-2.0-flash`` model is created through ``init_chat_model`` on
        every call. Pass an existing model to reuse it across calls or to
        substitute a stub.

    Returns
    -------
    pandas.Series or dict
        A copy of ``prospect`` (the input is left untouched) with an extra
        'mail' entry holding the generated email. If the model call raises,
        'mail' holds ``"ERROR: <message>"`` instead and the error is printed.
    """

    if model is None:
        model = init_chat_model("gemini-2.0-flash", model_provider="google_genai")
    # Work on a copy so the caller's object does not gain a 'mail' entry.
    prospect = prospect.copy()

    prompt = f"""
    Write a clear, professional, and personalized cold email in English, addressed to {prospect['main_contact']} ({prospect['contact_role']})
    from the company {prospect['company']}, based in {prospect['city']} and operating in the {prospect['sector']} sector.

    You are {freelance['name']}, a {freelance['title']} based in {freelance['city']}, specialized in the {freelance['main_sector']} sector.
    You provide services with expertise in {freelance['top3_skills']}, at a daily rate of {freelance['daily_rate']} EUR (remote: {freelance['remote']}).
    Your mission is: {freelance['mission_statement']}

    The company is a {prospect['company_size']} at the {prospect['funding_stage']} stage, and remote work availability is {prospect['remote']}.

    The email should:
    - Open with a brief and relevant introduction.
    - Present the value you can bring to this company in 2–3 concise sentences.
    - Match the company's tone: {prospect['target_tone']}, while reflecting your preferred tone: {freelance['preferred_tone']} and style: {freelance['preferred_style']}.
    - Be business-oriented and adapted to the company's context.
    - End with a clear, actionable closing (e.g., propose a short call or ask for availability).
    - Sign with your name
    Ensure the language is polite, direct, and free from repetition or generic phrases. Avoid using placeholders or uncertain formulations.
    Return only the body of the email (no subject line or explanation).
    """

    try:
        response = model.invoke(prompt)
        content = response.content if hasattr(response, "content") else str(response)
    except Exception as e:
        # Best effort: a failed generation must not abort the caller, so the
        # error text is stored in place of the email.
        print(f"Error : {e}")
        content = f"ERROR: {e}"

    # Store the value computed above. Reading `response` here again would
    # raise UnboundLocalError whenever `invoke` failed, and would bypass the
    # str(response) fallback for responses without a `content` attribute.
    prospect['mail'] = content

    return prospect


In [106]:
# Pick one freelance by position and rank the prospects against that
# freelance's TF-IDF row. This overwrites the earlier `top_20_df`.
freelance_index = 2
freelance = freelance_df.iloc[freelance_index]
top_20_df = get_top_20_leads(freelance_tfidf[freelance_index], prospect_tfidf, prospect_df)
# Take the row at position 5 of the ranking, i.e. the sixth-best match.
prospect = top_20_df.iloc[5]

# Generate the email for this freelance/prospect pair and render it as Markdown.
prospect_mail = mail_generator(freelance, prospect)
Markdown(prospect_mail['mail'])

Dear Latoya,

I recently came across Poole PLC's work in Jacksonmouth, and I'm genuinely impressed by your commitment to GreenTech innovation. It feels like you're building a future where sustainability isn't just a buzzword, but a core principle.

As an Environmental Impact Analyst specializing in Energy Modeling, IoT, and LCA, I believe I can help Poole PLC further refine its eco-impact strategies. Think of me as a translator, turning complex environmental data into actionable insights that drive innovation and efficiency – all within your existing infrastructure. Imagine being able to predict the environmental impact of your newest projects before they even leave the drawing board.

Would you be open to a brief 15-minute call next week to explore how my expertise can specifically benefit Poole PLC?

Sincerely,

Meredith Barnes