# 💞 Vectorisation & Matching

In [107]:
# Imports principaux
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [108]:
# Load both datasets. The id column is used as the index while reading and is
# then discarded, leaving a plain 0..n-1 positional index on each frame.
freelance_df = (
    pd.read_csv(
        '../generate_datasets/updated_freelances_dataset.csv',
        index_col='freelance_id',
    )
    .reset_index(drop=True)
)
prospect_df = (
    pd.read_csv(
        '../generate_datasets/updated_prospects_dataset.csv',
        index_col='prospect_id',
    )
    .reset_index(drop=True)
)
freelance_df

Unnamed: 0,name,title,main_sector,top3_skills,city,daily_rate,mission_statement,preferred_tone,preferred_style,remote,email
0,Allison Hill,Digital Marketing,Marketing,Analytics|Google Ads|SEO,East Jill,397.68,Allison Hill delivers strategic strategies usi...,Professional,Storytelling,Yes,allison.hill@marketingfreelance.com
1,Javier Johnson,Renewable Energy Expertise,GreenTech,LCA|Project Management|Energy Modeling,East William,462.86,Javier Johnson drives analytical eco-solutions...,Serious,Formal,Yes,javier.johnson@greentechfreelance.com
2,Meredith Barnes,Environmental Impact Analysis,GreenTech,Energy Modeling|IoT|LCA,Lawrencetown,831.17,Meredith Barnes advances eco-friendly sustaina...,Friendly,Storytelling,No,meredith.barnes@greentechfreelance.com
3,Donald Lewis,Supply Chain Management,Retail/E-commerce,Facebook Ads|Customer Support|Data Analytics,Curtisfurt,371.55,Donald Lewis boosts imaginative e-commerce wit...,Creative,Formal,No,donald.lewis@retailecommercefreelance.com
4,Renee Blair,Customer Service,Retail/E-commerce,Supply-Chain|Shopify|Customer Support,South Christianport,621.60,Partner with Renee Blair for customer-focused ...,Professional,Warm,Yes,renee.blair@retailecommercefreelance.com
...,...,...,...,...,...,...,...,...,...,...,...
295,Tracy Rivera,Project Management,GreenTech,Energy Modeling|IoT|Carbon Accounting,Rodriguezville,847.87,With Tracy Rivera's bold approach to Energy Mo...,Energetic,Warm,Yes,tracy.rivera@greentechfreelance.com
296,Kristin Watts,Ed-Tech Integration,Education/Ed-Tech,Learning Analytics|Storyline360|HTML5,South Neil,374.88,Partner with Kristin Watts for engaging Learni...,Energetic,Formal,No,kristin.watts@educationedtechfreelance.com
297,Kathryn Cooper,Customer Service,Retail/E-commerce,Data Analytics|Supply-Chain|Facebook Ads,South Danchester,534.32,Kathryn Cooper creates exclusive shopping expe...,Premium,Storytelling,Yes,kathryn.cooper@retailecommercefreelance.com
298,Eric Gibson,Community Management,Wellness,Copywriting|Pilates|Nutrition,New Timothy,519.93,Eric Gibson creates visionary wellness program...,Creative,Warm,No,eric.gibson@wellnessfreelance.com


In [109]:
# Display the prospects table loaded in the previous cell.
prospect_df

Unnamed: 0,company,sector,main_contact,contact_role,city,mission_statement,company_size,funding_stage,ticket_size_class,target_tone,remote,email
0,"Marsh, Spears and Yang",FinTech,Kristen Rivera,CEO,Mullenbury,sme rely on our intelligent FinTech services t...,SME (21-200),Series C+,High,Premium,No,kristen.rivera@marshspearsandyang.com
1,Lopez Ltd,FinTech,Allen Mendez,CEO,South Brittany,At the heart of our mission: transform digital...,Enterprise (1000+),Series A,High,Creative,No,allen.mendez@lopezltd.com
2,Ramirez Ltd,Design,Amy Garcia,Head of Marketing,West Mark,Helping mid-size redefine storytelling brand s...,Mid-size (201-1000),Seed,Medium,Premium,Yes,amy.garcia@ramirezltd.com
3,"Huffman, Rose and Fowler",GreenTech,Lisa Matthews,Head of Data,Michaelmouth,enterprise partner with us to accelerate susta...,Enterprise (1000+),Pre-Seed,High,Serious,No,lisa.matthews@huffmanroseandfowler.com
4,"Mcbride, Sellers and Lawrence",GreenTech,Emily Allen,CEO,East Nathanielberg,Our impact-driven solutions help enterprise pi...,Enterprise (1000+),Series C+,High,Energetic,Yes,emily.allen@mcbridesellersandlawrence.com
...,...,...,...,...,...,...,...,...,...,...,...,...
2995,Hill PLC,Retail/E-commerce,Amanda Bird,CEO,Fergusonport,We support startup in transform conversion rat...,Startup (1-20),Series B,High,Friendly,No,amanda.bird@hillplc.com
2996,"Mendoza, Hayes and Lynch",Retail/E-commerce,Timothy Alvarado,CEO,Andrewfort,Bringing retail online: our conversion-driven ...,Mid-size (201-1000),Seed,Medium,Friendly,Yes,timothy.alvarado@mendozahayesandlynch.com
2997,"Carpenter, Nielsen and Stone",Tech/SaaS,Elaine Rodriguez,CTO,Burchmouth,Scaling success: our resilient software empowe...,SME (21-200),Series B,High,Premium,Yes,elaine.rodriguez@carpenternielsenandstone.com
2998,Wells Ltd,Marketing,Renee Hurst,CTO,Stanleyland,Helping startup optimise brand storytelling br...,Startup (1-20),Series B,High,Premium,Yes,renee.hurst@wellsltd.com


In [110]:
def vectorize_missions(freelance_df, prospect_df):
    """Encode freelance and prospect mission statements as TF-IDF matrices.

    The vectorizer (English stop words removed) is fitted on the freelance
    mission statements only; the prospect statements are then transformed
    with that same fitted vectorizer, so both matrices share one feature space.

    Args:
        freelance_df: DataFrame with a "mission_statement" column.
        prospect_df: DataFrame with a "mission_statement" column.

    Returns:
        Tuple ``(freelance_tfidf, prospect_tfidf)`` of TF-IDF matrices.
    """
    freelance_texts = freelance_df["mission_statement"]
    prospect_texts = prospect_df["mission_statement"]

    tfidf = TfidfVectorizer(stop_words='english')
    freelance_matrix = tfidf.fit_transform(freelance_texts)
    prospect_matrix = tfidf.transform(prospect_texts)
    return freelance_matrix, prospect_matrix

def get_top_20_leads(freelance_vec, prospect_tfidf, prospect_df, top_n=20):
    """
    Return the prospects most similar to a single freelance TF-IDF vector.

    Args:
        freelance_vec: TF-IDF vector of one freelance. Either a 2D row of
            shape (1, n_features) (e.g. ``freelance_tfidf[i]``) or a 1D NumPy
            array, which is reshaped to a single row because
            ``cosine_similarity`` only accepts 2D input.
        prospect_tfidf: TF-IDF matrix of the prospects; its rows are matched
            to the rows of ``prospect_df`` by position.
        prospect_df: DataFrame of the prospects.
        top_n: Maximum number of prospects to return (default 20). Fewer rows
            come back when there are fewer prospects than ``top_n``.

    Returns:
        The selected rows of ``prospect_df`` as a new DataFrame, ordered by
        decreasing similarity, with an added ``similarity`` column.

    Raises:
        ValueError: If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError(f"top_n must be non-negative, got {top_n}")

    # A bare 1D dense vector would be rejected by cosine_similarity; promote it
    # to a single-row matrix. Sparse rows such as freelance_tfidf[i] are
    # already 2D and are passed through untouched.
    if isinstance(freelance_vec, np.ndarray) and freelance_vec.ndim == 1:
        freelance_vec = freelance_vec.reshape(1, -1)

    similarities = cosine_similarity(freelance_vec, prospect_tfidf).flatten()

    # Reverse the ascending ranking first, then slice: same result as
    # `[-top_n:][::-1]` for top_n > 0, but top_n == 0 correctly yields no rows
    # (whereas `[-0:]` would select everything).
    top_indices = similarities.argsort()[::-1][:top_n]

    return prospect_df.iloc[top_indices].assign(
        similarity=similarities[top_indices]
    )

In [111]:
# Vectorise every mission statement once, then rank prospects for the first
# freelance (row 0 of the freelance TF-IDF matrix).
freelance_tfidf, prospect_tfidf = vectorize_missions(
    freelance_df=freelance_df,
    prospect_df=prospect_df,
)
top_20_df = get_top_20_leads(
    freelance_vec=freelance_tfidf[0],
    prospect_tfidf=prospect_tfidf,
    prospect_df=prospect_df,
)

In [112]:
# Display the ranked leads computed in the previous cell (includes the `similarity` column).
top_20_df

Unnamed: 0,company,sector,main_contact,contact_role,city,mission_statement,company_size,funding_stage,ticket_size_class,target_tone,remote,email,similarity
558,Anderson Inc,Design,Corey Thomas,Head of Data,New Jameston,Shaping how sme are perceived by elevate brand...,SME (21-200),Series B,High,Creative,Yes,corey.thomas@andersoninc.com,0.498487
2634,"Walker, Alexander and Stevens",Design,Taylor Smith,Head of Data,New Debra,Shaping how enterprise are perceived by elevat...,Enterprise (1000+),Series C+,High,Professional,Yes,taylor.smith@walkeralexanderandstevens.com,0.498487
1595,Sharp-Wagner,Design,Elizabeth Smith,Head of Data,Chaneychester,Shaping how mid-size are perceived by elevate ...,Mid-size (201-1000),Pre-Seed,Medium,Serious,No,elizabeth.smith@sharpwagner.com,0.498487
193,Parsons PLC,Design,Cody Smith,Head of Marketing,Kimberlybury,Shaping how startup are perceived by elevate b...,Startup (1-20),Pre-Seed,Low,Energetic,No,cody.smith@parsonsplc.com,0.434225
1392,"Martinez, Hudson and Munoz",Design,Lawrence Bass,Head of Marketing,Joshuamouth,Shaping how sme are perceived by elevate brand...,SME (21-200),Series A,Medium,Creative,Yes,lawrence.bass@martinezhudsonandmunoz.com,0.434225
2642,Clark-Parker,Marketing,Madison Garcia,Head of Data,Jenniferton,From insight to impact: we help enterprise opt...,Enterprise (1000+),Series B,High,Energetic,Yes,madison.garcia@clarkparker.com,0.427103
1855,Jacobs Group,Marketing,Stephen Williamson,Head of Data,South Ericfort,We optimise brand storytelling brand presence ...,Startup (1-20),Seed,Low,Serious,No,stephen.williamson@jacobsgroup.com,0.375663
1107,King-Rice,Marketing,Suzanne Rodgers,CEO,Connieview,From insight to impact: we help mid-size pione...,Mid-size (201-1000),Pre-Seed,Medium,Premium,No,suzanne.rodgers@kingrice.com,0.373402
673,Holden LLC,Marketing,Shannon Clark,CEO,Palmerburgh,From insight to impact: we help mid-size pione...,Mid-size (201-1000),Series C+,High,Energetic,Yes,shannon.clark@holdenllc.com,0.373402
931,Cooper-Quinn,Marketing,Sabrina Miller,Head of Data,Smithland,mid-size grow their reach by optimise brand st...,Mid-size (201-1000),Seed,Medium,Premium,No,sabrina.miller@cooperquinn.com,0.371422


# 📧 Mail generator

In [113]:
from langchain.chat_models import init_chat_model
from IPython.display import Markdown
import time

In [114]:
def mail_generator(freelance, prospect):
    """
    Generate a personalized English cold email from a freelance to a target company.

    Parameters
    ----------
    freelance : dict or pandas.Series
        Freelance information. Keys read to build the prompt:
        - 'name' : full name
        - 'title' : job title
        - 'main_sector' : main business sector
        - 'city' : city
        - 'top3_skills' : key skills (text)
        - 'daily_rate' : daily rate
        - 'remote' : "Yes"/"No"
        - 'mission_statement' : summary of the value proposition
        - 'preferred_tone' : preferred tone (e.g. Professional)
        - 'preferred_style' : preferred style (e.g. Storytelling)

    prospect : pandas.Series or dict
        Target company information. Keys read to build the prompt:
        - 'company' : company name
        - 'city' : city
        - 'sector' : business sector
        - 'main_contact' : name of the main contact
        - 'contact_role' : role of the contact
        - 'company_size' : company size
        - 'funding_stage' : funding stage
        - 'remote' : "Yes"/"No"
        - 'target_tone' : tone expected by the company

    Returns
    -------
    pandas.Series or dict
        A copy of `prospect` (the input is not modified) with an extra 'mail'
        field. It holds the generated email body, or a string starting with
        "ERROR: " when the model call raised an exception.
    """

    model = init_chat_model("gemini-2.0-flash", model_provider="google_genai")
    # Work on a copy so the caller's row/dict is left untouched.
    prospect = prospect.copy()

    prompt = f"""
    Write a clear, professional, and personalized cold email in English, addressed to {prospect['main_contact']} ({prospect['contact_role']})
    from the company {prospect['company']}, based in {prospect['city']} and operating in the {prospect['sector']} sector.

    You are {freelance['name']}, a {freelance['title']} based in {freelance['city']}, specialized in the {freelance['main_sector']} sector.
    You provide services with expertise in {freelance['top3_skills']}, at a daily rate of {freelance['daily_rate']} EUR (remote: {freelance['remote']}).
    Your mission is: {freelance['mission_statement']}

    The company is a {prospect['company_size']} at the {prospect['funding_stage']} stage, and remote work availability is {prospect['remote']}.

    The email should:
    - Open with a brief and relevant introduction.
    - Present the value you can bring to this company in 2–3 concise sentences.
    - Match the company's tone: {prospect['target_tone']}, while reflecting your preferred tone: {freelance['preferred_tone']} and style: {freelance['preferred_style']}.
    - Be business-oriented and adapted to the company's context.
    - End with a clear, actionable closing (e.g., propose a short call or ask for availability).
    - Sign with your name
    Ensure the language is polite, direct, and free from repetition or generic phrases. Avoid using placeholders or uncertain formulations.
    Return only the body of the email (no subject line or explanation).
    """

    try:
        response = model.invoke(prompt)
        content = response.content if hasattr(response, "content") else str(response)
    except Exception as e:
        # Best-effort: report the failure and store it in place of the email
        # so a batch of prospects can keep going.
        print(f"Error : {e}")
        content = f"ERROR: {e}"

    # Use `content`, not `response`: when invoke() fails, `response` is never
    # bound, and reading it here would raise and discard the error fallback.
    prospect['mail'] = content

    return prospect


In [115]:
# Demo: pick one freelance, rank the prospects for them, then draft an email
# to the prospect at position 5 of that ranking.
freelance_index = 2
freelance = freelance_df.iloc[freelance_index]
top_20_df = get_top_20_leads(
    freelance_vec=freelance_tfidf[freelance_index],
    prospect_tfidf=prospect_tfidf,
    prospect_df=prospect_df,
)
prospect = top_20_df.iloc[5]

prospect_mail = mail_generator(freelance=freelance, prospect=prospect)
Markdown(prospect_mail['mail'])

Hi Shannon,

I came across Deleon-Herrera’s work in GreenTech and was particularly impressed by [mention a specific project or initiative]. It’s inspiring to see companies so dedicated to sustainable solutions.

As an Environmental Impact Analyst specializing in Energy Modeling, IoT, and LCA, I help GreenTech companies like yours optimize their environmental impact and ensure long-term sustainability. I can deliver tailored strategies and impactful results, making sure your innovative technologies truly lead the way in eco-friendly practices.

Would you be open to a brief 15-minute call next week to explore how my expertise could support Deleon-Herrera's goals?

Best regards,

Meredith Barnes