# ⚙️ Preprocessing

In [61]:
# Imports principaux
import pandas as pd
import numpy as np

In [73]:
freelance_df = pd.read_csv('../generate_datasets/freelances_dataset.csv', index_col='freelance_id').reset_index(drop=True)
prospect_df = pd.read_csv('../generate_datasets/prospects_dataset.csv', index_col='prospect_id').reset_index(drop=True)
freelance_df

Unnamed: 0,name,title,main_sector,top3_skills,city,daily_rate,mission_statement,preferred_tone,preferred_style,remote,email
0,Norma Fisher,Sustainable Engineering,GreenTech,LCA|IoT|Energy Modeling,Tammyfort,640.15,Norma Fisher drives inspiring eco-solutions us...,Energetic,Warm,No,norma.fisher@greentechfreelance.com
1,Heather Snow,Risk Management,FinTech,Risk Modeling|AML/KYC|SQL,New Donald,829.06,Partner with Heather Snow for analytical Risk ...,Serious,Warm,Yes,heather.snow@fintechfreelance.com
2,Danielle Browning,Health & Wellness Coaching,Wellness,Yoga|Community Management|Pilates,West Corey,691.87,Danielle Browning promotes vibrant wellness wi...,Energetic,Warm,No,danielle.browning@wellnessfreelance.com
3,Brian Hamilton,Risk Management,FinTech,Python|PowerBI|AML/KYC,New Thomas,482.34,Brian Hamilton empowers focused FinTech ventur...,Serious,Formal,Yes,brian.hamilton@fintechfreelance.com
4,Kimberly Smith,Renewable Energy Expertise,GreenTech,Project Management|Energy Modeling|Carbon Acco...,Pagetown,470.74,Partner with Kimberly Smith for dynamic Projec...,Energetic,Storytelling,Yes,kimberly.smith@greentechfreelance.com
...,...,...,...,...,...,...,...,...,...,...,...
295,Kevin Pearson,Software Development,Tech/SaaS,Python|AWS|Docker,Lake Monica,834.04,"With Kevin Pearson, secure expertise in Python...",Serious,Storytelling,No,kevin.pearson@techsaasfreelance.com
296,Nicholas Lee,Nutrition Expertise,Wellness,Nutrition|Copywriting|Pilates,Lake Casey,454.27,Nicholas Lee promotes exclusive wellness with ...,Premium,Formal,Yes,nicholas.lee@wellnessfreelance.com
297,Hailey Huffman,Brand Strategy,Marketing,SEO|Analytics|Copywriting,Dunnside,871.68,Hailey Huffman transforms marketing with trust...,Professional,Storytelling,Yes,hailey.huffman@marketingfreelance.com
298,Luis Black,Software Development,Tech/SaaS,Python|AWS|FastAPI,Clarkborough,871.04,Partner with Luis Black for forward-thinking P...,Creative,Storytelling,Yes,luis.black@techsaasfreelance.com


In [74]:
import string
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import pos_tag

In [75]:
def get_wordnet_pos(word):
    """Map POS tag to first character for WordNetLemmatizer"""
    tag = pos_tag([word])[0][1][0].lower()  # Get first letter of POS tag
    return{'n': 'n', 'v': 'v', 'a': 'a', 'r': 'r'}.get(tag, 'n')

def cleaning(sentence):
    # Basic cleaning
    sentence = sentence.strip().lower()
    sentence = ''.join(char for char in sentence if not char.isdigit())
    sentence = sentence.translate(str.maketrans('', '', string.punctuation))

    # Tokenization
    tokenized_sentence = word_tokenize(sentence)

    # Stopwords removal
    stop_words = set(stopwords.words('english'))
    tokenized_sentence_cleaned = [w for w in tokenized_sentence if w not in stop_words]

    # Lemmatization with correct POS tagging
    lemmatizer = WordNetLemmatizer()
    lemmatized = [lemmatizer.lemmatize(word, get_wordnet_pos(word)) for word in tokenized_sentence_cleaned]

    return ' '.join(lemmatized)

In [76]:
freelance_df['tfidf_vector'] = freelance_df['mission_statement'].apply(cleaning)
prospect_df['tfidf_vector'] = prospect_df['mission_statement'].apply(cleaning)
freelance_df

Unnamed: 0,name,title,main_sector,top3_skills,city,daily_rate,mission_statement,preferred_tone,preferred_style,remote,email,tfidf_vector
0,Norma Fisher,Sustainable Engineering,GreenTech,LCA|IoT|Energy Modeling,Tammyfort,640.15,Norma Fisher drives inspiring eco-solutions us...,Energetic,Warm,No,norma.fisher@greentechfreelance.com,norma fisher drive inspire ecosolutions use lc...
1,Heather Snow,Risk Management,FinTech,Risk Modeling|AML/KYC|SQL,New Donald,829.06,Partner with Heather Snow for analytical Risk ...,Serious,Warm,Yes,heather.snow@fintechfreelance.com,partner heather snow analytical risk model aml...
2,Danielle Browning,Health & Wellness Coaching,Wellness,Yoga|Community Management|Pilates,West Corey,691.87,Danielle Browning promotes vibrant wellness wi...,Energetic,Warm,No,danielle.browning@wellnessfreelance.com,danielle browning promotes vibrant wellness yo...
3,Brian Hamilton,Risk Management,FinTech,Python|PowerBI|AML/KYC,New Thomas,482.34,Brian Hamilton empowers focused FinTech ventur...,Serious,Formal,Yes,brian.hamilton@fintechfreelance.com,brian hamilton empowers focus fintech venture ...
4,Kimberly Smith,Renewable Energy Expertise,GreenTech,Project Management|Energy Modeling|Carbon Acco...,Pagetown,470.74,Partner with Kimberly Smith for dynamic Projec...,Energetic,Storytelling,Yes,kimberly.smith@greentechfreelance.com,partner kimberly smith dynamic project managem...
...,...,...,...,...,...,...,...,...,...,...,...,...
295,Kevin Pearson,Software Development,Tech/SaaS,Python|AWS|Docker,Lake Monica,834.04,"With Kevin Pearson, secure expertise in Python...",Serious,Storytelling,No,kevin.pearson@techsaasfreelance.com,kevin pearson secure expertise python aws scal...
296,Nicholas Lee,Nutrition Expertise,Wellness,Nutrition|Copywriting|Pilates,Lake Casey,454.27,Nicholas Lee promotes exclusive wellness with ...,Premium,Formal,Yes,nicholas.lee@wellnessfreelance.com,nicholas lee promotes exclusive wellness nutri...
297,Hailey Huffman,Brand Strategy,Marketing,SEO|Analytics|Copywriting,Dunnside,871.68,Hailey Huffman transforms marketing with trust...,Professional,Storytelling,Yes,hailey.huffman@marketingfreelance.com,hailey huffman transforms marketing trust seo ...
298,Luis Black,Software Development,Tech/SaaS,Python|AWS|FastAPI,Clarkborough,871.04,Partner with Luis Black for forward-thinking P...,Creative,Storytelling,Yes,luis.black@techsaasfreelance.com,partner luis black forwardthinking python aws ...


In [77]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import vstack

In [78]:
def vectorize_missions(freelance_df, prospect_df):
    """
    Vectorise les mission statements avec TF-IDF et stocke les vecteurs dans la colonne 'tfidf_vector'.
    """
    # Copie des textes bruts
    all_missions = pd.concat([
        freelance_df["tfidf_vector"],
        prospect_df["tfidf_vector"]
    ])

    vectorizer = TfidfVectorizer(stop_words='english')
    vectorizer.fit(all_missions)

    # Ajoute les vecteurs TF-IDF dans une nouvelle colonne
    freelance_df = freelance_df.copy()
    prospect_df = prospect_df.copy()

    freelance_df["tfidf_vector"] = list(vectorizer.transform(freelance_df["tfidf_vector"]))
    prospect_df["tfidf_vector"] = list(vectorizer.transform(prospect_df["tfidf_vector"]))
    return freelance_df, prospect_df

freelance_df, prospect_df = vectorize_missions(freelance_df, prospect_df)

# 💞 Matching

In [71]:
def get_top_20_leads(freelance_vec, prospect_df):
    """
    Retourne les 20 prospects les plus proches du vecteur freelance donné.
    """
    prospect_matrix = vstack(prospect_df["tfidf_vector"].values)

    similarities = cosine_similarity(freelance_vec, prospect_matrix).flatten()
    top_20_idx = similarities.argsort()[-20:][::-1]

    return prospect_df.iloc[top_20_idx].assign(similarity=similarities[top_20_idx])

top_20_df = get_top_20_leads(freelance_df.iloc[0]["tfidf_vector"], prospect_df)
top_20_df

Unnamed: 0,company,sector,main_contact,contact_role,city,mission_statement,company_size,funding_stage,ticket_size_class,target_tone,remote,email,tfidf_vector,similarity
1160,Anderson and Sons,GreenTech,Michelle Woods,Head of Data,Kimtown,Empowering enterprise to accelerate sustainabi...,Enterprise (1000+),Pre-Seed,High,Professional,No,michelle.woods@andersonandsons.com,<Compressed Sparse Row sparse matrix of dtype ...,0.206394
2616,"Carr, Zhang and Green",GreenTech,Kimberly Williams,Head of Data,Port Beth,Our eco-friendly solutions help startup accele...,Startup (1-20),Series A,Medium,Friendly,No,kimberly.williams@carrzhangandgreen.com,<Compressed Sparse Row sparse matrix of dtype ...,0.173981
2720,Byrd-Pearson,GreenTech,Lauren Boyle,Head of Data,Taraborough,Our eco-friendly solutions help enterprise acc...,Enterprise (1000+),Seed,High,Professional,No,lauren.boyle@byrdpearson.com,<Compressed Sparse Row sparse matrix of dtype ...,0.173892
2895,"Cox, Webb and Chan",GreenTech,Michael Brown,CTO,Debratown,Our eco-friendly solutions help enterprise acc...,Enterprise (1000+),Series A,High,Professional,No,michael.brown@coxwebbandchan.com,<Compressed Sparse Row sparse matrix of dtype ...,0.171224
1194,Farmer PLC,GreenTech,Hannah King,CTO,West Daniel,Our eco-friendly solutions help enterprise acc...,Enterprise (1000+),Series A,High,Energetic,Yes,hannah.king@farmerplc.com,<Compressed Sparse Row sparse matrix of dtype ...,0.171224
433,"Dudley, Gross and Gregory",GreenTech,Amanda Hernandez,Head of Marketing,South Karen,We support startup in accelerate sustainabilit...,Startup (1-20),Series C+,High,Premium,No,amanda.hernandez@dudleygrossandgregory.com,<Compressed Sparse Row sparse matrix of dtype ...,0.162528
1424,Lee Group,GreenTech,Lisa Edwards,CEO,Thomasstad,Empowering startup to transform sustainability...,Startup (1-20),Series A,Medium,Creative,Yes,lisa.edwards@leegroup.com,<Compressed Sparse Row sparse matrix of dtype ...,0.162043
1578,Stanley and Sons,GreenTech,Katie Booth,CEO,Port Eileenshire,Empowering mid-size to transform sustainabilit...,Mid-size (201-1000),Series A,Medium,Creative,Yes,katie.booth@stanleyandsons.com,<Compressed Sparse Row sparse matrix of dtype ...,0.158649
652,White-Simmons,GreenTech,Matthew Snyder,Head of Data,Lake Anthonyshire,"At the intersection of ecology and innovation,...",Startup (1-20),Series B,High,Friendly,Yes,matthew.snyder@whitesimmons.com,<Compressed Sparse Row sparse matrix of dtype ...,0.157608
2835,Snow-Mitchell,GreenTech,Susan Martin,CTO,North Mitchellbury,"At the intersection of ecology and innovation,...",Enterprise (1000+),Series A,High,Energetic,Yes,susan.martin@snowmitchell.com,<Compressed Sparse Row sparse matrix of dtype ...,0.155492


# 📧 Mail generator

In [54]:
from langchain.chat_models import init_chat_model
from IPython.display import Markdown
import time

In [55]:
def mail_generator(freelance, prospect):
    """
    Génère un email de prospection personnalisé en anglais à partir des données d’un freelance et d’une entreprise cible.

    Paramètres :
    -----------
    freelance : dict
        Dictionnaire contenant les informations du freelance :
        - 'name' : Nom complet
        - 'title' : Titre ou métier
        - 'main_sector' : Secteur principal d’activité
        - 'city' : Ville
        - 'top3_skills' : Compétences clés (format texte)
        - 'daily_rate' : Tarif journalier
        - 'remote' : "Yes"/"No"
        - 'mission_statement' : Résumé de la proposition de valeur
        - 'preferred_tone' : Ton préféré (ex. : Professional)
        - 'preferred_style' : Style préféré (ex. : Storytelling)

    prospect : pandas.Series ou dict
        Informations sur l’entreprise cible :
        - 'company' : Nom de l’entreprise
        - 'city' : Ville
        - 'sector' : Secteur d’activité
        - 'main_contact' : Nom du contact principal
        - 'contact_role' : Poste du contact
        - 'company_size' : Taille de l’entreprise
        - 'funding_stage' : Stade de financement
        - 'remote' : "Yes"/"No"
        - 'target_tone' : Ton attendu côté entreprise

    Retour :
    --------
    prospect : pandas.Series ou dict
        Le même objet que `prospect` mais avec un champ supplémentaire 'mail' contenant l’email généré.
    """

    model = init_chat_model("gemini-2.0-flash", model_provider="google_genai")
    prospect = prospect.copy()

    prompt = f"""
    Write a clear, professional, and personalized cold email in English, addressed to {prospect['main_contact']} ({prospect['contact_role']})
    from the company {prospect['company']}, based in {prospect['city']} and operating in the {prospect['sector']} sector.

    You are {freelance['name']}, a {freelance['title']} based in {freelance['city']}, specialized in the {freelance['main_sector']} sector.
    You provide services with expertise in {freelance['top3_skills']}, at a daily rate of {freelance['daily_rate']} EUR (remote: {freelance['remote']}).
    Your mission is: {freelance['mission_statement']}

    The company is a {prospect['company_size']} at the {prospect['funding_stage']} stage, and remote work availability is {prospect['remote']}.

    The email should:
    - Open with a brief and relevant introduction.
    - Present the value you can bring to this company in 2–3 concise sentences.
    - Match the company's tone: {prospect['target_tone']}, while reflecting your preferred tone: {freelance['preferred_tone']} and style: {freelance['preferred_style']}.
    - Be business-oriented and adapted to the company's context.
    - End with a clear, actionable closing (e.g., propose a short call or ask for availability).
    - Sign with your name
    Ensure the language is polite, direct, and free from repetition or generic phrases. Avoid using placeholders or uncertain formulations.
    Return only the body of the email ready to be sent (no subject line or explanation).
    """

    try:
        response = model.invoke(prompt)
        content = response.content if hasattr(response, "content") else str(response)
    except Exception as e:
        print(f"Error : {e}")
        content = f"ERROR: {e}"

    content = response.__dict__['content']

    return content


In [56]:
freelance_index = 2
freelance = freelance_df.iloc[freelance_index]
top_20_df = get_top_20_leads(freelance_df.iloc[freelance_index]["tfidf_vector"], prospect_df)
prospect = top_20_df.iloc[4]

Markdown(mail_generator(freelance, prospect))

Dear Aaron,

I'm Danielle Browning, a Health & Wellness Coach based in West Corey, specializing in the Wellness sector, and I was particularly interested in Erickson, Johnson and Gonzalez's innovative work in Stevenland.

With my expertise in Yoga, Pilates, and Community Management, I can help Erickson, Johnson and Gonzalez foster a thriving and engaged employee community, boosting morale and promoting overall well-being. My tailored strategies can directly contribute to a healthier and more productive workforce, aligning perfectly with your company's focus on wellness.

Would you be open to a brief 15-minute call next week to discuss how I can contribute to Erickson, Johnson and Gonzalez's wellness initiatives?

Sincerely,

Danielle Browning