# 💞 Vectorisation & Matching

In [1]:
# Imports principaux
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from packagename.preprocessing import cleaning
from packagename.matching import vectorize_missions_dataset, get_top_20_leads

from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import vstack

[nltk_data] Downloading package punkt to
[nltk_data]     /home/david_veryser/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/david_veryser/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/david_veryser/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /home/david_veryser/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


reimagining digital presence help midsize empower user interaction customer engagement elegant design


In [2]:
freelance_df = pd.read_csv('../generate_datasets/freelances_dataset.csv', index_col='freelance_id').reset_index(drop=True)
prospect_df = pd.read_csv('../generate_datasets/prospects_dataset.csv', index_col='prospect_id').reset_index(drop=True)


In [3]:
freelance_df['TfidfVect']=freelance_df['mission_statement'].apply(cleaning)
prospect_df['TfidfVect']=prospect_df['mission_statement'].apply(cleaning)

In [None]:
prospect_df.columns

Index(['company', 'sector', 'main_contact', 'contact_role', 'city',
       'mission_statement', 'company_size', 'funding_stage',
       'ticket_size_class', 'target_tone', 'remote', 'email', 'TfidfVect'],
      dtype='object')

In [34]:
freelance_df_vectorized, prospect_df_vectorized, vectorizer = vectorize_missions_dataset(freelance_df, prospect_df)

In [36]:
freelance_df_vectorized.columns

Index(['name', 'title', 'main_sector', 'top3_skills', 'city', 'daily_rate',
       'mission_statement', 'preferred_tone', 'preferred_style', 'remote',
       'email', 'TfidfVect', 'tfidf_vector'],
      dtype='object')

In [38]:
freelance_index = 2
freelance = freelance_df_vectorized.iloc[freelance_index]
print("Informations sur le freelance sélectionné (index 2) :")
print(freelance[['name', 'title', 'main_sector', 'mission_statement', 'top3_skills', 'city', 'daily_rate', 'remote', 'preferred_tone', 'preferred_style']])


Informations sur le freelance sélectionné (index 2) :
name                                                    Colleen Taylor
title                                       Health & Wellness Coaching
main_sector                                                   Wellness
mission_statement    Colleen Taylor inspires holistic well-being wi...
top3_skills                          Yoga|Community Management|Pilates
city                                                            Redcar
daily_rate                                                      370.49
remote                                                             Yes
preferred_tone                                                Creative
preferred_style                                                   Warm
Name: 2, dtype: object


In [41]:
freelance_vec = freelance_df_vectorized['tfidf_vector'].iloc[freelance_index]
top_20_df = get_top_20_leads(freelance_vec, prospect_df_vectorized)

In [42]:
# Sélectionner le prospect à l'index 5
prospect = top_20_df.iloc[5]
print("\nInformations sur le prospect sélectionné (index 5) :")
prospect_columns = ['company', 'sector', 'main_contact', 'contact_role',
                    'city', 'company_size', 'funding_stage', 'remote', 'target_tone']
available_prospect_columns = [col for col in prospect_columns if col in prospect_df_vectorized.columns]
print(prospect[available_prospect_columns])


Informations sur le prospect sélectionné (index 5) :
company          Shaw, Hanson and Alvarado
sector                            Wellness
main_contact                 Michael Adams
contact_role                           CTO
city                           Bournemouth
company_size           Mid-size (201-1000)
funding_stage                    Series C+
remote                                 Yes
target_tone                      Energetic
Name: 2316, dtype: object


# 📧 Mail generator

In [21]:
from langchain.chat_models import init_chat_model
from IPython.display import Markdown
import time

In [44]:
def mail_generator(freelance, prospect):
    """
    Génère un email de prospection personnalisé en anglais à partir des données d’un freelance et d’une entreprise cible.

    Paramètres :
    -----------
    freelance : dict
        Dictionnaire contenant les informations du freelance :
        - 'name' : Nom complet
        - 'title' : Titre ou métier
        - 'main_sector' : Secteur principal d’activité
        - 'city' : Ville
        - 'top3_skills' : Compétences clés (format texte)
        - 'daily_rate' : Tarif journalier
        - 'remote' : "Yes"/"No"
        - 'mission_statement' : Résumé de la proposition de valeur
        - 'preferred_tone' : Ton préféré (ex. : Professional)
        - 'preferred_style' : Style préféré (ex. : Storytelling)

    prospect : pandas.Series ou dict
        Informations sur l’entreprise cible :
        - 'company' : Nom de l’entreprise
        - 'city' : Ville
        - 'sector' : Secteur d’activité
        - 'main_contact' : Nom du contact principal
        - 'contact_role' : Poste du contact
        - 'company_size' : Taille de l’entreprise
        - 'funding_stage' : Stade de financement
        - 'remote' : "Yes"/"No"
        - 'target_tone' : Ton attendu côté entreprise

    Retour :
    --------
    prospect : pandas.Series ou dict
        Le même objet que `prospect` mais avec un champ supplémentaire 'mail' contenant l’email généré.
    """

    model = init_chat_model("gemini-2.0-flash", model_provider="google_genai")
    prospect = prospect.copy()

    prompt = f"""
    Write a clear, professional, and personalized cold email in English, addressed to {prospect['main_contact']} ({prospect['contact_role']})
    from the company {prospect['company']}, based in {prospect['city']} and operating in the {prospect['sector']} sector.

    You are {freelance['name']}, a {freelance['title']} based in {freelance['city']}, specialized in the {freelance['main_sector']} sector.
    You provide services with expertise in {freelance['top3_skills']}, at a daily rate of {freelance['daily_rate']} EUR (remote: {freelance['remote']}).
    Your mission is: {freelance['mission_statement']}

    The company is a {prospect['company_size']} at the {prospect['funding_stage']} stage, and remote work availability is {prospect['remote']}.

    The email should:
    - Open with a brief and relevant introduction.
    - Present the value you can bring to this company in 2–3 concise sentences.
    - Match the company's tone: {prospect['target_tone']}, while reflecting your preferred tone: {freelance['preferred_tone']} and style: {freelance['preferred_style']}.
    - Be business-oriented and adapted to the company's context.
    - End with a clear, actionable closing (e.g., propose a short call or ask for availability).
    - Sign with your name
    Ensure the language is polite, direct, and free from repetition or generic phrases. Avoid using placeholders or uncertain formulations.
    Return only the body of the email (no subject line or explanation).
    """

    try:
        response = model.invoke(prompt)
        content = response.content if hasattr(response, "content") else str(response)
    except Exception as e:
        print(f"Error : {e}")
        content = f"ERROR: {e}"

    prospect['mail'] = response.__dict__['content']

    return prospect


In [45]:
prospect_mail = mail_generator(freelance, prospect)
Markdown(prospect_mail['mail'])

Retrying langchain_google_genai.chat_models._chat_with_retry.<locals>._chat_with_retry in 2.0 seconds as it raised PermissionDenied: 403 Generative Language API has not been used in project 1084728417149 before or it is disabled. Enable it by visiting https://console.developers.google.com/apis/api/generativelanguage.googleapis.com/overview?project=1084728417149 then retry. If you enabled this API recently, wait a few minutes for the action to propagate to our systems and retry. [reason: "SERVICE_DISABLED"
domain: "googleapis.com"
metadata {
  key: "service"
  value: "generativelanguage.googleapis.com"
}
metadata {
  key: "serviceTitle"
  value: "Generative Language API"
}
metadata {
  key: "containerInfo"
  value: "1084728417149"
}
metadata {
  key: "consumer"
  value: "projects/1084728417149"
}
metadata {
  key: "activationUrl"
  value: "https://console.developers.google.com/apis/api/generativelanguage.googleapis.com/overview?project=1084728417149"
}
, locale: "en-US"
message: "Genera

Error : 403 Generative Language API has not been used in project 1084728417149 before or it is disabled. Enable it by visiting https://console.developers.google.com/apis/api/generativelanguage.googleapis.com/overview?project=1084728417149 then retry. If you enabled this API recently, wait a few minutes for the action to propagate to our systems and retry. [reason: "SERVICE_DISABLED"
domain: "googleapis.com"
metadata {
  key: "service"
  value: "generativelanguage.googleapis.com"
}
metadata {
  key: "serviceTitle"
  value: "Generative Language API"
}
metadata {
  key: "containerInfo"
  value: "1084728417149"
}
metadata {
  key: "consumer"
  value: "projects/1084728417149"
}
metadata {
  key: "activationUrl"
  value: "https://console.developers.google.com/apis/api/generativelanguage.googleapis.com/overview?project=1084728417149"
}
, locale: "en-US"
message: "Generative Language API has not been used in project 1084728417149 before or it is disabled. Enable it by visiting https://console.

UnboundLocalError: local variable 'response' referenced before assignment