# ⚙️ Preprocessing

In [1]:
# Imports principaux
import pandas as pd
import numpy as np

In [2]:
# Load the freelance and prospect datasets. Each id column is read as the index
# and then discarded by reset_index(drop=True), so both frames end up with a
# fresh 0-based positional index.
freelance_df = pd.read_csv('../generate_datasets/freelances_dataset.csv', index_col='freelance_id').reset_index(drop=True)
prospect_df = pd.read_csv('../generate_datasets/prospects_dataset.csv', index_col='prospect_id').reset_index(drop=True)
# Bare last expression: the freelance frame is rendered as the cell output.
freelance_df

Unnamed: 0,name,title,main_sector,top3_skills,city,daily_rate,mission_statement,preferred_tone,preferred_style,remote,email,tfidf_vector
0,Norma Fisher,Sustainable Engineering,GreenTech,LCA|IoT|Energy Modeling,Altrincham,583.45,With Norma Fisher's proactive approach to LCA ...,Energetic,Warm,No,norma.fisher@greentechfreelance.com,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,Richard Turner,Risk Management,FinTech,Risk Modeling|AML/KYC|SQL,Lewes,366.38,Richard Turner empowers innovative FinTech ven...,Energetic,Storytelling,Yes,richard.turner@fintechfreelance.com,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,Colleen Taylor,Health & Wellness Coaching,Wellness,Yoga|Community Management|Pilates,Redcar,370.49,Colleen Taylor inspires holistic well-being wi...,Creative,Warm,Yes,colleen.taylor@wellnessfreelance.com,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,Danielle Browning,Educational Data Analysis,Education/Ed-Tech,Learning Analytics|Python|Pedagogy,Dewsbury,941.20,With Danielle Browning's educational approach ...,Professional,Storytelling,No,danielle.browning@educationedtechfreelance.com,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,Nicholas Nolan,Cloud Management,Tech/SaaS,CI/CD|AWS|Docker,Reigate,527.64,Partner with Nicholas Nolan for scalable CI/CD...,Professional,Formal,Yes,nicholas.nolan@techsaasfreelance.com,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...
295,Matthew Brown,Secure System Development,FinTech,Risk Modeling|PowerBI|Python,Margate,596.67,With Matthew Brown's transparent approach to R...,Friendly,Formal,No,matthew.brown@fintechfreelance.com,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
296,Adriana Atkins,Ed-Tech Integration,Education/Ed-Tech,Pedagogy|Python|HTML5,Northampton,370.70,Adriana Atkins creates engaging learning platf...,Friendly,Storytelling,Yes,adriana.atkins@educationedtechfreelance.com,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
297,Michelle Rodriguez,Cloud Management,Tech/SaaS,Python|CI/CD|AWS,Malvern,710.09,Michelle Rodriguez builds high-performance sol...,Premium,Storytelling,No,michelle.rodriguez@techsaasfreelance.com,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
298,Grace Davis,UI/UX Design,Design,Figma|User Research|AdobeXD,Hinckley,953.42,Grace Davis creates collaborative designs usin...,Friendly,Formal,Yes,grace.davis@designfreelance.com,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [3]:
import string
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import pos_tag

In [4]:
def get_wordnet_pos(word):
    """Return the WordNet POS letter to use when lemmatizing `word`.

    The word is tagged on its own with `pos_tag` and the first letter of the
    resulting tag is mapped to one of the POS codes accepted by
    `WordNetLemmatizer.lemmatize`.

    Parameters
    ----------
    word : str
        A single token.

    Returns
    -------
    str
        'n' (noun), 'v' (verb), 'a' (adjective) or 'r' (adverb). Any tag that
        is not recognised falls back to 'n'.
    """
    tag_initial = pos_tag([word])[0][1][0].lower()  # first letter of the POS tag
    # pos_tag emits Penn Treebank tags by default, where adjectives are JJ/JJR/JJS,
    # so the adjective initial is 'j' (not 'a'). Without the 'j' entry every
    # adjective silently fell through to the noun default.
    # The 'a' entry is retained from the original mapping.
    return {'n': 'n', 'v': 'v', 'j': 'a', 'a': 'a', 'r': 'r'}.get(tag_initial, 'n')

def cleaning(sentence):
    """Normalise a sentence into a space-separated string of lemmas.

    Steps, in order: trim and lowercase, drop every digit character, drop
    every character listed in `string.punctuation`, tokenize, discard English
    stop words, then lemmatize each remaining token using the POS returned by
    `get_wordnet_pos`.

    Parameters
    ----------
    sentence : str
        Raw text to clean.

    Returns
    -------
    str
        The lemmatized tokens joined with single spaces.
    """
    # Character-level normalisation
    lowered = sentence.strip().lower()
    digit_free = ''.join(filter(lambda ch: not ch.isdigit(), lowered))
    punctuation_table = str.maketrans('', '', string.punctuation)
    plain_text = digit_free.translate(punctuation_table)

    # Token-level processing
    tokens = word_tokenize(plain_text)
    english_stop_words = set(stopwords.words('english'))
    wordnet = WordNetLemmatizer()

    lemmas = []
    for token in tokens:
        if token in english_stop_words:
            continue
        lemmas.append(wordnet.lemmatize(token, get_wordnet_pos(token)))

    return ' '.join(lemmas)

In [5]:
# Store the cleaned, lemmatized mission statement in 'tfidf_vector'. Despite the
# column name, at this stage it holds plain text; vectorize_missions later
# replaces it with the actual TF-IDF vectors.
freelance_df['tfidf_vector'] = freelance_df['mission_statement'].apply(cleaning)
prospect_df['tfidf_vector'] = prospect_df['mission_statement'].apply(cleaning)
# Bare last expression: the freelance frame is rendered as the cell output.
freelance_df

Unnamed: 0,name,title,main_sector,top3_skills,city,daily_rate,mission_statement,preferred_tone,preferred_style,remote,email,tfidf_vector
0,Norma Fisher,Sustainable Engineering,GreenTech,LCA|IoT|Energy Modeling,Altrincham,583.45,With Norma Fisher's proactive approach to LCA ...,Energetic,Warm,No,norma.fisher@greentechfreelance.com,norma fisher proactive approach lca iot innova...
1,Richard Turner,Risk Management,FinTech,Risk Modeling|AML/KYC|SQL,Lewes,366.38,Richard Turner empowers innovative FinTech ven...,Energetic,Storytelling,Yes,richard.turner@fintechfreelance.com,richard turner empowers innovative fintech ven...
2,Colleen Taylor,Health & Wellness Coaching,Wellness,Yoga|Community Management|Pilates,Redcar,370.49,Colleen Taylor inspires holistic well-being wi...,Creative,Warm,Yes,colleen.taylor@wellnessfreelance.com,colleen taylor inspires holistic wellbeing yog...
3,Danielle Browning,Educational Data Analysis,Education/Ed-Tech,Learning Analytics|Python|Pedagogy,Dewsbury,941.20,With Danielle Browning's educational approach ...,Professional,Storytelling,No,danielle.browning@educationedtechfreelance.com,danielle browning educational approach learn a...
4,Nicholas Nolan,Cloud Management,Tech/SaaS,CI/CD|AWS|Docker,Reigate,527.64,Partner with Nicholas Nolan for scalable CI/CD...,Professional,Formal,Yes,nicholas.nolan@techsaasfreelance.com,partner nicholas nolan scalable cicd aws solut...
...,...,...,...,...,...,...,...,...,...,...,...,...
295,Matthew Brown,Secure System Development,FinTech,Risk Modeling|PowerBI|Python,Margate,596.67,With Matthew Brown's transparent approach to R...,Friendly,Formal,No,matthew.brown@fintechfreelance.com,matthew brown transparent approach risk model ...
296,Adriana Atkins,Ed-Tech Integration,Education/Ed-Tech,Pedagogy|Python|HTML5,Northampton,370.70,Adriana Atkins creates engaging learning platf...,Friendly,Storytelling,Yes,adriana.atkins@educationedtechfreelance.com,adriana atkins creates engage learn platform p...
297,Michelle Rodriguez,Cloud Management,Tech/SaaS,Python|CI/CD|AWS,Malvern,710.09,Michelle Rodriguez builds high-performance sol...,Premium,Storytelling,No,michelle.rodriguez@techsaasfreelance.com,michelle rodriguez build highperformance solut...
298,Grace Davis,UI/UX Design,Design,Figma|User Research|AdobeXD,Hinckley,953.42,Grace Davis creates collaborative designs usin...,Friendly,Formal,Yes,grace.davis@designfreelance.com,grace davis creates collaborative design use f...


In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import vstack

In [7]:
def vectorize_missions(freelance_df, prospect_df):
    """
    Replace the cleaned text in the 'tfidf_vector' column with TF-IDF vectors.

    A single TfidfVectorizer is fitted on the 'tfidf_vector' text of both
    frames together, so freelances and prospects share one vocabulary. Each
    frame is then copied and its 'tfidf_vector' column is overwritten with one
    transformed row per record. The input frames are left untouched.

    Returns
    -------
    tuple
        (freelance_df_copy, prospect_df_copy) with vectorized 'tfidf_vector'.
    """
    text_column = "tfidf_vector"

    # Shared corpus: the (already cleaned) text of both populations
    corpus = pd.concat([freelance_df[text_column], prospect_df[text_column]])

    tfidf = TfidfVectorizer(stop_words='english')
    tfidf.fit(corpus)

    # Work on copies so the callers' frames keep their text column
    freelance_out = freelance_df.copy()
    prospect_out = prospect_df.copy()
    for frame in (freelance_out, prospect_out):
        row_vectors = tfidf.transform(frame[text_column])
        frame[text_column] = list(row_vectors)

    return freelance_out, prospect_out

# Rebind both names to the vectorized copies. NOTE(review): after this line
# 'tfidf_vector' no longer holds text, so re-running this cell without first
# re-running the cleaning cell would presumably fail — confirm before re-executing.
freelance_df, prospect_df = vectorize_missions(freelance_df, prospect_df)

# 💞 Matching

In [8]:
def get_top_20_leads(freelance_vec, prospect_df):
    """
    Rank prospects by cosine similarity to one freelance vector and keep the best 20.

    The per-row vectors stored in prospect_df['tfidf_vector'] are stacked into a
    single matrix, compared against `freelance_vec`, and the (at most) 20 rows
    with the highest score are returned, best match first, with an extra
    'similarity' column holding the score.
    """
    stacked_prospects = vstack(prospect_df["tfidf_vector"].values)
    scores = cosine_similarity(freelance_vec, stacked_prospects).flatten()

    # argsort is ascending: take the tail, then flip it so the best comes first
    ascending_positions = scores.argsort()
    best_first = ascending_positions[-20:][::-1]

    leads = prospect_df.iloc[best_first]
    return leads.assign(similarity=scores[best_first])

# Rank the prospects against the first freelance (position 0) and display the result.
top_20_df = get_top_20_leads(freelance_df.iloc[0]["tfidf_vector"], prospect_df)
top_20_df

Unnamed: 0,company,sector,main_contact,contact_role,city,mission_statement,company_size,funding_stage,ticket_size_class,target_tone,remote,email,tfidf_vector,similarity
1759,"Jackson, Gallagher and Kirk",Marketing,Molly King,CEO,Tamworth,Our disruptive strategies enable startup to dr...,Startup (1-20),Seed,Low,Energetic,Yes,molly.king@jacksongallagherandkirk.com,<Compressed Sparse Row sparse matrix of dtype ...,0.071773
1841,Giles-Weaver,GreenTech,Heather Ellison,CEO,Sunderland,We support enterprise in transform green value...,Enterprise (1000+),Series A,High,Creative,Yes,heather.ellison@gilesweaver.com,<Compressed Sparse Row sparse matrix of dtype ...,0.0647
2592,Casey-Hansen,GreenTech,Lori Smith,CTO,Gateshead,We support sme in accelerate sustainability gr...,SME (21-200),Series A,Medium,Energetic,No,lori.smith@caseyhansen.com,<Compressed Sparse Row sparse matrix of dtype ...,0.064163
1305,"Harris, Yang and Duncan",GreenTech,Diana Chavez,Head of Marketing,Kingston upon Hull,We support mid-size in enhance energy efficien...,Mid-size (201-1000),Pre-Seed,Medium,Premium,No,diana.chavez@harrisyangandduncan.com,<Compressed Sparse Row sparse matrix of dtype ...,0.063919
2157,Jenkins-Vincent,GreenTech,Brenda Alvarez,Head of Marketing,Solihull,We support sme in drive environmental impact g...,SME (21-200),Series A,Medium,Friendly,No,brenda.alvarez@jenkinsvincent.com,<Compressed Sparse Row sparse matrix of dtype ...,0.063757
1719,Kelley LLC,GreenTech,Judith Smith,CTO,Taunton,We support startup in drive environmental impa...,Startup (1-20),Series C+,High,Premium,Yes,judith.smith@kelleyllc.com,<Compressed Sparse Row sparse matrix of dtype ...,0.063471
2050,Ellis Inc,GreenTech,Charles Wood,CTO,Coalville,We support startup in drive environmental impa...,Startup (1-20),Seed,Low,Energetic,Yes,charles.wood@ellisinc.com,<Compressed Sparse Row sparse matrix of dtype ...,0.063355
408,"Newman, Smith and Johnson",GreenTech,Michael Tapia,Head of Marketing,Horsham,We support sme in drive environmental impact g...,SME (21-200),Series B,High,Professional,Yes,michael.tapia@newmansmithandjohnson.com,<Compressed Sparse Row sparse matrix of dtype ...,0.063316
2640,"Shepherd, Garcia and Quinn",GreenTech,Richard Schultz,Head of Marketing,Stourbridge,We support startup in drive environmental impa...,Startup (1-20),Series B,High,Premium,Yes,richard.schultz@shepherdgarciaandquinn.com,<Compressed Sparse Row sparse matrix of dtype ...,0.063284
115,Young Group,GreenTech,Donald Banks,CEO,Halesowen,We support sme in pioneer green value chains u...,SME (21-200),Series B,High,Premium,Yes,donald.banks@younggroup.com,<Compressed Sparse Row sparse matrix of dtype ...,0.063125


# 📧 Mail generator

In [9]:
from langchain.chat_models import init_chat_model
from IPython.display import Markdown
import time

In [10]:
def mail_generator(freelance, prospect):
    """
    Generate a personalised English cold email from a freelance profile and a target company.

    Parameters
    ----------
    freelance : pandas.Series or dict
        Freelance information, read by key:
        - 'name' : full name
        - 'title' : job title
        - 'main_sector' : main business sector
        - 'city' : city
        - 'top3_skills' : key skills (text)
        - 'daily_rate' : daily rate
        - 'remote' : "Yes"/"No"
        - 'mission_statement' : summary of the value proposition
        - 'preferred_tone' : preferred tone (e.g. Professional)
        - 'preferred_style' : preferred style (e.g. Storytelling)

    prospect : pandas.Series or dict
        Target company information, read by key:
        - 'company' : company name
        - 'city' : city
        - 'sector' : business sector
        - 'main_contact' : name of the main contact
        - 'contact_role' : role of the contact
        - 'company_size' : company size
        - 'funding_stage' : funding stage
        - 'remote' : "Yes"/"No"
        - 'target_tone' : tone expected by the company

    Returns
    -------
    str
        The generated email body. If the model call raises, the error is
        printed and the string "ERROR: <exception message>" is returned instead.

    Raises
    ------
    KeyError
        If one of the keys listed above is missing from `freelance` or `prospect`.
    """

    model = init_chat_model("gemini-2.0-flash", model_provider="google_genai")

    # `prospect` is only read below, so no defensive copy is needed.
    prompt = f"""
    Write a clear, professional, and personalized cold email in English, addressed to {prospect['main_contact']} ({prospect['contact_role']})
    from the company {prospect['company']}, based in {prospect['city']} and operating in the {prospect['sector']} sector.

    You are {freelance['name']}, a {freelance['title']} based in {freelance['city']}, specialized in the {freelance['main_sector']} sector.
    You provide services with expertise in {freelance['top3_skills']}, at a daily rate of {freelance['daily_rate']} EUR (remote: {freelance['remote']}).
    Your mission is: {freelance['mission_statement']}

    The company is a {prospect['company_size']} at the {prospect['funding_stage']} stage, and remote work availability is {prospect['remote']}.

    The email should:
    - Open with a brief and relevant introduction.
    - Present the value you can bring to this company in 2–3 concise sentences.
    - Match the company's tone: {prospect['target_tone']}, while reflecting your preferred tone: {freelance['preferred_tone']} and style: {freelance['preferred_style']}.
    - Be business-oriented and adapted to the company's context.
    - End with a clear, actionable closing (e.g., propose a short call or ask for availability).
    - Sign with your name
    Ensure the language is polite, direct, and free from repetition or generic phrases. Avoid using placeholders or uncertain formulations.
    Return only the body of the email ready to be sent (no subject line or explanation).
    """

    try:
        response = model.invoke(prompt)
        content = response.content if hasattr(response, "content") else str(response)
    except Exception as e:
        # Best effort: report the failure and hand back an error marker rather
        # than crashing the notebook. `response` is unbound on this path, so
        # nothing after the try/except may touch it.
        print(f"Error : {e}")
        content = f"ERROR: {e}"

    return content


In [11]:
# Demo: take the freelance at position 2 and rank the prospects against that profile.
freelance_index = 2
freelance = freelance_df.iloc[freelance_index]
top_20_df = get_top_20_leads(freelance_df.iloc[freelance_index]["tfidf_vector"], prospect_df)
# iloc is 0-based: position 10 is the 11th-ranked lead, not the best match.
prospect = top_20_df.iloc[10]
# Render the generated email body as Markdown in the cell output.
Markdown(mail_generator(freelance, prospect))

Hi Jessica,

I came across Villa-Morris and was impressed by your energetic approach to wellness. As a Health & Wellness Coach based in Redcar, I specialize in Yoga, Pilates, and Community Management, and I believe my expertise could significantly enhance your startup's wellness initiatives. I can help you foster a strong community, boost engagement, and create a more holistic wellness experience for your users.

My daily rate is EUR 370.49, and I'm fully equipped for remote collaboration. Would you be open to a brief chat next week to explore how my skills can contribute to Villa-Morris's success?

Best regards,

Colleen Taylor

# 🌐 API

In [12]:
import requests

# Root URL of the deployed API; every smoke-test function below builds its endpoint from it.
BASE_URL = "https://leadcraftr-api-cloud-623673804405.europe-west1.run.app"

def test_root():
    response = requests.get(f"{BASE_URL}/")
    print("Root:", response.status_code, response.json())

def test_match_freelance():
    mission = "Looking for a data engineer with cloud and ETL experience"
    response = requests.get(f"{BASE_URL}/match_freelance", params={"mission_statement": mission})
    print("Match Freelance:", response.status_code)
    print(response.json())

def test_match_prospect():
    mission = "Senior frontend developer specialized in React and UX"
    response = requests.get(f"{BASE_URL}/match_prospect", params={"mission_statement": mission})
    print("Match Prospect:", response.status_code)
    print(response.json())

def test_generate_mail_freelance():
    payload = {
        "freelance": {
            "name": "Alice",
            "title": "Data Scientist",
            "city": "Paris",
            "main_sector": "Finance",
            "top3_skills": "Python, Machine Learning, SQL",
            "daily_rate": 700,
            "remote": "yes",
            "mission_statement": "Helping companies leverage data for smarter decisions.",
            "preferred_tone": "professional",
            "preferred_style": "concise"
        },
        "prospect": {
            "main_contact": "John Doe",
            "contact_role": "CTO",
            "company": "FinTech Corp",
            "city": "London",
            "sector": "Finance",
            "company_size": "Startup",
            "funding_stage": "Series A",
            "remote": "yes",
            "mission_statement": "We build AI tools for investment analysis.",
            "target_tone": "business"
        },
        "basic_mail_content": ""
    }
    response = requests.post(f"{BASE_URL}/generate_mail_freelance", json=payload)
    print("Mail Freelance:", response.status_code)
    print(response.json())

def test_generate_mail_prospect():
    payload = {
        "freelance": {
            "name": "Alice",
            "title": "Data Scientist",
            "city": "Paris",
            "main_sector": "Finance",
            "top3_skills": "Python, Machine Learning, SQL",
            "daily_rate": 700,
            "remote": "yes",
            "mission_statement": "Helping companies leverage data for smarter decisions.",
            "preferred_tone": "professional",
            "preferred_style": "concise"
        },
        "prospect": {
            "main_contact": "John Doe",
            "contact_role": "CTO",
            "company": "FinTech Corp",
            "city": "London",
            "sector": "Finance",
            "company_size": "Startup",
            "funding_stage": "Series A",
            "remote": "yes",
            "mission_statement": "We build AI tools for investment analysis.",
            "target_tone": "business"
        },
        "basic_mail_content": ""
    }
    response = requests.post(f"{BASE_URL}/generate_mail_prospect", json=payload)
    print("Mail Prospect:", response.status_code)
    print(response.json())

# Run every smoke test in sequence: root, the two matching endpoints, then the
# two mail-generation endpoints. Each call performs a live HTTP request against BASE_URL.
if __name__ == "__main__":
    test_root()
    test_match_freelance()
    test_match_prospect()
    test_generate_mail_freelance()
    test_generate_mail_prospect()


Root: 200 {'message': 'Hi, The API is running!'}
Match Freelance: 200
[{'prospect_id': 2195, 'company': 'Cox, Simpson and Harris', 'sector': 'Tech/SaaS', 'main_contact': 'Amber Griffith', 'contact_role': 'CEO', 'city': 'Coventry', 'mission_statement': 'Powering transformation in sme with scalable digital tools that drive performance through cloud-native architecture.', 'company_size': 'SME (21-200)', 'funding_stage': 'Series A', 'ticket_size_class': 'Medium', 'target_tone': 'Creative', 'remote': True, 'email': 'amber.griffith@coxsimpsonandharris.com', 'similarity': 0.40595415097852067}, {'prospect_id': 352, 'company': 'Peck-Livingston', 'sector': 'Tech/SaaS', 'main_contact': 'Benjamin Thomas', 'contact_role': 'CEO', 'city': 'Torquay', 'mission_statement': 'Powering transformation in enterprise with intelligent digital tools that transform performance through cloud-native architecture.', 'company_size': 'Enterprise (1000+)', 'funding_stage': 'Series C+', 'ticket_size_class': 'High', 'ta