In [2]:
!pip install google-cloud-aiplatform
!pip install google-cloud-storage



In [3]:
from google.colab import auth
from google.cloud import aiplatform
from google.cloud import storage
from vertexai.language_models import TextEmbeddingInput, TextEmbeddingModel
from typing import List, Optional
import numpy as np
import pandas as pd
import json



In [4]:
# --- Configuration: every value a reader may need to change lives here ---
project = "shift-aihack-nantes24-10"
location = "europe-west1"
BUCKET_URI = "gs://jobijobai_index_data_bucket"  # passed as contents_delta_uri when the index is created
index_id = "jobijobai-vector-search"  # NOTE(review): not referenced by any other cell visible here

# Authenticate the Colab user, then set the default project/region for aiplatform calls.
auth.authenticate_user()
aiplatform.init(location=location, project=project)

In [5]:
def download_blob(bucket_name, source_blob_name, destination_file_name):
    """Fetch one object from a Cloud Storage bucket and save it to a local file.

    Args:
        bucket_name: Name of the bucket that holds the object.
        source_blob_name: Path of the object inside the bucket.
        destination_file_name: Local path the object is written to.
    """
    # Resolve client -> bucket -> blob in one chain; the transfer itself
    # happens in download_to_filename.
    remote_object = storage.Client().bucket(bucket_name).blob(source_blob_name)
    remote_object.download_to_filename(destination_file_name)

    print(f"Blob {source_blob_name} downloaded to {destination_file_name}.")

bucket_name = 'esco_raw_data'

# Each French ESCO export sits under fr/ in the bucket and is copied to /tmp
# under the same file name.
for csv_name in ('occupations_fr.csv', 'skills_fr.csv', 'occupationSkillRelations_fr.csv'):
    download_blob(bucket_name, f'fr/{csv_name}', f'/tmp/{csv_name}')

Blob fr/occupations_fr.csv downloaded to /tmp/occupations_fr.csv.
Blob fr/skills_fr.csv downloaded to /tmp/skills_fr.csv.
Blob fr/occupationSkillRelations_fr.csv downloaded to /tmp/occupationSkillRelations_fr.csv.


In [18]:
# Read the three downloaded CSVs into DataFrames (occupations, skills, and the
# occupation -> skill relation table), in that order.
raw_occupations, raw_skills, raw_occupations_skills = (
    pd.read_csv(f'/tmp/{table_name}_fr.csv')
    for table_name in ('occupations', 'skills', 'occupationSkillRelations')
)

# Preview the occupations table.
raw_occupations.head()

Unnamed: 0,conceptType,conceptUri,iscoGroup,preferredLabel,altLabels,hiddenLabels,status,modifiedDate,regulatedProfessionNote,scopeNote,definition,inScheme,description,code
0,Occupation,http://data.europa.eu/esco/occupation/00030d09...,2654,directeur technique/directrice technique,régisseuse\nrégisseur\ndirectrice technique\nr...,,released,2024-01-25T11:28:50.295Z,http://data.europa.eu/esco/regulated-professio...,,,http://data.europa.eu/esco/concept-scheme/occu...,Les directeurs techniques/directrices techniqu...,2654.1.7
1,Occupation,http://data.europa.eu/esco/occupation/000e93a3...,8121,opérateur de tréfilerie/opératrice de tréfilerie,opérateur de tréfilerie\nopératrice de tréfilerie,,released,2024-01-23T10:09:32.099Z,http://data.europa.eu/esco/regulated-professio...,,,http://data.europa.eu/esco/concept-scheme/memb...,Les opérateurs de tréfilerie installent et fon...,8121.4
2,Occupation,http://data.europa.eu/esco/occupation/0019b951...,7543,contrôleur qualité des instruments de précisio...,contrôleuse qualité des instruments de précisi...,,released,2024-01-25T15:00:12.188Z,http://data.europa.eu/esco/regulated-professio...,,,http://data.europa.eu/esco/concept-scheme/occu...,Les contrôleurs qualité des instruments de pré...,7543.10.3
3,Occupation,http://data.europa.eu/esco/occupation/0022f466...,3155,technicien de la sécurité du trafic aérien/tec...,technicienne de la sécurité du trafic aérien\n...,,released,2024-01-29T16:01:13.998Z,http://data.europa.eu/esco/regulated-professio...,,,http://data.europa.eu/esco/concept-scheme/memb...,Les techniciens de la sécurité aérienne apport...,3155.1
4,Occupation,http://data.europa.eu/esco/occupation/002da35b...,2431,gestionnaire de recettes dans le secteur Horeca,responsable des revenus hôteliers\nyield manag...,,released,2024-01-11T10:28:45.871Z,http://data.europa.eu/esco/regulated-professio...,,,http://data.europa.eu/esco/concept-scheme/memb...,Les gestionnaires de recettes dans le secteur ...,2431.9


In [19]:
# Columns kept from each raw table.
occupation_columns = ['conceptUri', 'code', 'preferredLabel', 'altLabels', 'description']
skill_columns = ['conceptUri', 'preferredLabel']
relation_columns = ['occupationUri', 'skillUri']

# Trimmed views of the raw frames.
occupations = raw_occupations[occupation_columns]
skills = raw_skills[skill_columns]
occupations_skills = raw_occupations_skills[relation_columns]

# Lookup dicts keyed by column name, then by conceptUri.
# Note that skillsDic is built from the full raw_skills frame, not the trimmed one.
occupationsDic = occupations.set_index('conceptUri').to_dict()
skillsDic = raw_skills.set_index('conceptUri').to_dict()

In [21]:
def _join_labels(labels):
    """Concatenate the skill labels of one occupation into a comma-separated string."""
    return ', '.join(labels)


# Attach each skill's row to its occupation/skill relation ...
occupations_skills = raw_occupations_skills.merge(
    raw_skills,
    left_on='skillUri',
    right_on='conceptUri',
)

# ... then collapse to one row per occupation holding all of its skill labels.
groupedSkills = (
    occupations_skills
    .groupby('occupationUri')
    .agg({'preferredLabel': _join_labels})
)

In [22]:
# Both frames carry a preferredLabel column, so the merge suffixes them with
# _x (occupation title) and _y (joined skill labels); give them readable names.
column_renames = {
    'preferredLabel_x': 'jobTitle',
    'altLabels': 'similarJobTitles',
    'preferredLabel_y': 'skills',
}
output_columns = ['conceptUri', 'code', 'jobTitle', 'similarJobTitles', 'description', 'skills']

# Join occupations to their grouped skills (groupedSkills is indexed by occupation URI),
# rename, and keep the output columns in a fixed order.
final_jobs_and_skills = (
    occupations
    .merge(groupedSkills, left_on='conceptUri', right_index=True)
    .rename(columns=column_renames)
)[output_columns]



In [23]:
# Build the text that gets embedded for each occupation: title, alternative
# titles, description and skills, separated by spaces, with newlines flattened.
text_columns = ['jobTitle', 'similarJobTitles', 'description', 'skills']

# Missing cells are skipped: pandas loads empty CSV cells as NaN, and formatting
# NaN into the text would embed the literal word "nan".
# NOTE(review): whether the ESCO export actually has empty cells in these
# columns is not visible here; for fully populated rows the text is unchanged.
all_line_strings = [
    ' '.join(str(value) for value in values if pd.notna(value)).replace('\n', ' ')
    for values in final_jobs_and_skills[text_columns].itertuples(index=False, name=None)
]

print(all_line_strings[0])

directeur technique/directrice technique régisseuse régisseur directrice technique régisseur/régisseuse directeur technique Les directeurs techniques/directrices techniques donnent corps aux visions artistiques des créateurs dans le respect de contraintes techniques. Ils/elles coordonnent les opérations des différentes unités de production, telles que la scène, les costumes, le son, l’éclairage et le maquillage. Ils/elles adaptent le prototype et étudient la faisabilité, la mise en œuvre, l’exploitation et le suivi technique du projet artistique. Ils/elles sont également responsables du matériel scénique et de l’équipement technique. techniques théâtrales, organiser des répétitions, rédiger une évaluation des risques sur la représentation d'une production artistique, assurer la coordination avec les services créatifs, s’adapter aux exigences créatives d’artistes, négocier des questions de santé et de sécurité avec des tiers, adapter le travail des concepteurs à la salle de spectacle, p

In [24]:
def embed_text(
    texts: Optional[List[str]] = None,
    task: str = "RETRIEVAL_QUERY",
    model_name: str = "textembedding-gecko@003",
    dimensionality: Optional[int] = None,
) -> List[List[float]]:
    """Embed a batch of texts with a pre-trained text embedding model.

    Args:
        texts: Texts to embed. ``None`` or an empty list returns ``[]``
            without loading the model or sending a request.
        task: Task type attached to every ``TextEmbeddingInput``.
        model_name: Identifier passed to ``TextEmbeddingModel.from_pretrained``.
        dimensionality: Optional output size; forwarded to the model as
            ``output_dimensionality`` only when truthy.

    Returns:
        The ``values`` of each embedding returned by the model, one list per
        returned embedding, in the order the model returns them.
    """
    # Nothing to embed: skip the model load and the request entirely.
    if not texts:
        return []

    model = TextEmbeddingModel.from_pretrained(model_name)
    inputs = [TextEmbeddingInput(text, task) for text in texts]
    # Only send output_dimensionality when the caller asked for one.
    kwargs = {"output_dimensionality": dimensionality} if dimensionality else {}
    return [embedding.values for embedding in model.get_embeddings(inputs, **kwargs)]

In [27]:
# Embed every occupation text in batches, then write one JSON record per line.
vectors = []
n = 20  # number of texts sent to the embedding model per request
iteration = 0
maxIteration = 99999999999  # lower this to stop after a few batches while debugging

# NOTE(review): embed_text is called with its default task ("RETRIEVAL_QUERY").
# Vertex AI's embedding docs describe RETRIEVAL_DOCUMENT as the task type for
# corpus documents; confirm which one the query side expects before re-indexing.

# Step by start offset so the trailing partial batch is embedded as well;
# looping over len(all_line_strings) // n full batches skipped the last
# len(all_line_strings) % n texts.
for start in range(0, len(all_line_strings), n):
    iteration = iteration + 1
    if iteration > maxIteration:
        break
    print(iteration)
    vectors.extend(embed_text(all_line_strings[start:start + n]))


def _json_value(value):
    """Map a missing pandas value to None so json.dump writes null instead of NaN."""
    return None if pd.isna(value) else value


# Pair each row with its vector by position. zip stops at the shorter input, so
# a run capped by maxIteration only emits the rows that were actually embedded.
indexed_items = [
    {
        'id': _json_value(record['code']),
        'jobTitle': _json_value(record['jobTitle']),
        'similarJobTitles': _json_value(record['similarJobTitles']),
        'skills': _json_value(record['skills']),
        'description': _json_value(record['description']),
        'embedding': vector,
    }
    for record, vector in zip(final_jobs_and_skills.to_dict('records'), vectors)
]

# JSON Lines output: one record per line.
with open('/tmp/vectors.json', 'w') as outfile:
    for line in indexed_items:
        json.dump(line, outfile)
        outfile.write('\n')



1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151


In [15]:
! gsutil cp "/tmp/vectors.json" "gs://jobijobai_index_data_bucket"

Copying file:///tmp/vectors.json [Content-Type=application/json]...
- [1 files][ 56.2 MiB/ 56.2 MiB]                                                
Operation completed over 1 objects/56.2 MiB.                                     


In [28]:
# Create the Tree-AH index from the data stored under BUCKET_URI.
index_settings = dict(
    display_name="jobijobai-vector-search-all-data",
    contents_delta_uri=BUCKET_URI,
    dimensions=768,  # presumably the size of the vectors written to vectors.json; TODO confirm
    approximate_neighbors_count=10,
)
my_index = aiplatform.MatchingEngineIndex.create_tree_ah_index(**index_settings)

INFO:google.cloud.aiplatform.matching_engine.matching_engine_index:Creating MatchingEngineIndex
INFO:google.cloud.aiplatform.matching_engine.matching_engine_index:Create MatchingEngineIndex backing LRO: projects/145034832321/locations/europe-west1/indexes/8210915341719568384/operations/6795918093806731264
INFO:google.cloud.aiplatform.matching_engine.matching_engine_index:MatchingEngineIndex created. Resource name: projects/145034832321/locations/europe-west1/indexes/8210915341719568384
INFO:google.cloud.aiplatform.matching_engine.matching_engine_index:To use this MatchingEngineIndex in another session:
INFO:google.cloud.aiplatform.matching_engine.matching_engine_index:index = aiplatform.MatchingEngineIndex('projects/145034832321/locations/europe-west1/indexes/8210915341719568384')
