In [1]:
import faiss
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sentence_transformers import SentenceTransformer
from tqdm import tqdm  


# ---------------------------------------------------------------------------
# Data preparation: load the job/company table, embed the text description,
# standardize the numeric columns and merge both into one feature matrix.
# Produces the module-level names used by later cells: raw_data, model,
# numerical_features, scaler, text_embeddings, combined_text_embeddings and
# combined_features.
# ---------------------------------------------------------------------------
data_path = "Projectdata/cleaned_job_company_pair.csv"
raw_data = pd.read_csv(data_path)

# Synthesize a description only when the CSV does not already provide one.
if 'description' not in raw_data.columns:
    # Fill the parts BEFORE concatenating: `NaN + " in " + x` evaluates to NaN,
    # which the later fillna('') would turn into an empty description and so
    # discard whichever half (skill or industry) was actually present.
    skill_text = raw_data['skill_name'].fillna('').astype(str)
    industry_text = raw_data['industry_name'].fillna('').astype(str)
    has_both_parts = skill_text.ne('') & industry_text.ne('')
    # "<skill> in <industry>" when both exist, otherwise just the part we have.
    raw_data['description'] = (skill_text + " in " + industry_text).where(
        has_both_parts, skill_text + industry_text
    )

print("Loading lightweight embedding model...")
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
print("Model loaded.")


print("Normalizing numerical features...")

numerical_features = ['annual_min_salary', 'annual_max_salary', 'employee_count', 'follower_count']
scaler = StandardScaler()
# NOTE(review): missing values are imputed with 0 before fitting, so they take
# part in the fitted mean/std — confirm this is intended rather than e.g.
# median imputation.
# NOTE: this overwrites the raw numeric columns of raw_data with their
# standardized values; rows returned from raw_data later carry scaled numbers.
raw_data[numerical_features] = scaler.fit_transform(raw_data[numerical_features].fillna(0))


text_features = ['description']
text_embeddings = []

print("Embedding text features...")
for feature in tqdm(text_features, desc="Text Embeddings"):
    # The encoder is fed a plain list of strings, so blank out missing text.
    raw_data[feature] = raw_data[feature].fillna('')
    embeddings = model.encode(raw_data[feature].tolist(), batch_size=128, show_progress_bar=True)
    text_embeddings.append(embeddings)


print("Combining text embeddings...")
# One block of columns per text feature, stacked side by side.
combined_text_embeddings = np.hstack(text_embeddings)


print("Merging features...")
# FAISS works on float32 vectors. Stacking with the scaler output would
# otherwise upcast the whole matrix to float64 (twice the memory) only to be
# converted back at index time, so cast once here and keep it C-contiguous.
combined_features = np.ascontiguousarray(
    np.hstack([combined_text_embeddings, raw_data[numerical_features].values]),
    dtype=np.float32,
)

# Row i of the feature matrix must describe row i of raw_data, because search
# results are mapped back to raw_data by position.
assert combined_features.shape[0] == len(raw_data), "feature rows out of sync with raw_data"


def create_faiss_index(embeddings, nlist=100, m=8, nbits=8):
    """Build, train and populate a FAISS IVF-PQ index over ``embeddings``.

    Args:
        embeddings: 2-D array-like of shape (n_vectors, dim). It is converted
            to a C-contiguous float32 array, the layout FAISS operates on.
        nlist: Number of inverted lists (coarse clusters) of the IVF index.
        m: Number of product-quantizer sub-vectors. The indexed dimension must
            be a multiple of ``m``; when ``dim`` is not, the vectors are
            right-padded with zero columns up to the next multiple. Zero
            padding leaves L2 distances unchanged, and queries must be padded
            the same way (``index.d`` reports the padded dimension).
        nbits: Bits used to encode each sub-vector.

    Returns:
        The trained ``faiss.IndexIVFPQ`` containing all vectors.

    Raises:
        ValueError: If ``embeddings`` is empty or not 2-D, or ``m`` is not a
            positive integer.
    """
    vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
    if vectors.ndim != 2 or vectors.shape[0] == 0 or vectors.shape[1] == 0:
        raise ValueError(
            "embeddings must be a non-empty 2-D array of shape (n_vectors, dim); "
            f"got shape {vectors.shape}"
        )
    if m <= 0:
        raise ValueError(f"m must be a positive integer; got {m}")

    n_vectors, d = vectors.shape

    # IndexIVFPQ splits every vector into m equally sized sub-vectors, so the
    # dimension has to be divisible by m. Pad instead of failing inside FAISS.
    missing_dims = (-d) % m
    if missing_dims:
        print(f"Padding embeddings from {d} to {d + missing_dims} dimensions (multiple of m={m}).")
        zero_block = np.zeros((n_vectors, missing_dims), dtype=np.float32)
        vectors = np.hstack([vectors, zero_block])
        d += missing_dims

    quantizer = faiss.IndexFlatL2(d)  # coarse quantizer assigning vectors to lists
    index = faiss.IndexIVFPQ(quantizer, d, nlist, m, nbits)
    print("Training FAISS index...")
    index.train(vectors)
    print("Adding embeddings to FAISS index...")
    index.add(vectors)
    print("FAISS index created.")
    return index







  from tqdm.autonotebook import tqdm, trange



Loading lightweight embedding model...
Model loaded.
Normalizing numerical features...
Embedding text features...


Text Embeddings:   0%|                                                                           | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/2098 [00:00<?, ?it/s]

Text Embeddings: 100%|███████████████████████████████████████████████████████████████████| 1/1 [00:26<00:00, 26.73s/it]


Combining text embeddings...
Merging features...


In [3]:
def query_jobs(user_description, numerical_inputs, top_n=25):
    """Return the jobs most similar to a free-text query plus numeric preferences.

    Relies on the notebook-level objects ``model``, ``scaler``,
    ``numerical_features``, ``faiss_index`` and ``raw_data``.

    Args:
        user_description: Free-text description of the job the user wants.
        numerical_inputs: Mapping with one value for every name in
            ``numerical_features`` (a missing key raises ``KeyError``).
        top_n: Maximum number of neighbours to request from the index.

    Returns:
        A DataFrame holding the matching rows of ``raw_data`` in ranking
        order. It may contain fewer than ``top_n`` rows, and is empty (but
        keeps the ``raw_data`` columns) when the index returns no neighbour.
    """
    print("Encoding user query...")
    user_embedding = model.encode([user_description], show_progress_bar=False)

    # Standardize the numeric inputs with the scaler fitted on the dataset.
    # The scaler was fitted on a DataFrame, so pass a DataFrame with the same
    # column names/order to avoid scikit-learn's feature-name warning.
    user_numerical = pd.DataFrame(
        [[numerical_inputs[col] for col in numerical_features]],
        columns=numerical_features,
    )
    numerical_scaled = scaler.transform(user_numerical)

    # Same layout as the indexed vectors: text embedding followed by numerics.
    query_features = np.hstack([user_embedding, numerical_scaled]).astype(np.float32)

    index_dim = faiss_index.d
    query_dim = query_features.shape[1]
    if query_dim != index_dim:
        print(f"Adjusting query features from {query_dim} to {index_dim}.")
        if query_dim < index_dim:
            # Zero-pad on the right up to the index dimension.
            padding = np.zeros((1, index_dim - query_dim), dtype=np.float32)
            query_features = np.hstack([query_features, padding])
        else:
            # NOTE(review): truncation drops trailing features; a query wider
            # than the index most likely means the index was built from a
            # different feature set — verify.
            query_features = query_features[:, :index_dim]

    print("Searching for similar jobs...")
    # FAISS expects a C-contiguous float32 matrix (the slice above is a view).
    query_matrix = np.ascontiguousarray(query_features.reshape(1, -1), dtype=np.float32)
    _distances, neighbor_ids = faiss_index.search(query_matrix, top_n)

    # FAISS fills missing neighbours with label -1. Without the lower bound,
    # iloc[-1] would silently return the LAST row of raw_data as a "match".
    n_rows = len(raw_data)
    valid_positions = [int(idx) for idx in neighbor_ids[0] if 0 <= idx < n_rows]

    # Positional selection keeps column names and dtypes even with zero hits.
    return raw_data.iloc[valid_positions].copy()







# Demo queries: each entry pairs a free-text request with the numeric
# preferences expected by query_jobs (one value per numerical feature).
queries = [
    dict(
        query="I want to be an engineer. I have good python skills and knowledge about machine learning.",
        numerical_inputs=dict(
            annual_min_salary=0,
            annual_max_salary=80000,
            employee_count=1000,
            follower_count=500,
        ),
    ),
    dict(
        query="Looking for a software developer position specializing in cloud computing.",
        numerical_inputs=dict(
            annual_min_salary=60000,
            annual_max_salary=120000,
            employee_count=200,
            follower_count=300,
        ),
    ),
    dict(
        query="I have experience in data science and want a job in data analytics.",
        numerical_inputs=dict(
            annual_min_salary=50000,
            annual_max_salary=90000,
            employee_count=500,
            follower_count=400,
        ),
    ),
    dict(
        query="I wanna be a teacher.",
        numerical_inputs=dict(
            annual_min_salary=0,
            annual_max_salary=150000,
            employee_count=1500,
            follower_count=600,
        ),
    ),
]

# Columns shown for every hit.
_display_columns = ['company_name', 'description','skill_name', 'industry_name']

# Run every demo query (numbered from 1) and print its ten best matches.
for query_number, spec in enumerate(queries, start=1):
    question = spec["query"]
    print(f"\nPerforming query {query_number}: {question}")
    result = query_jobs(question, numerical_inputs=spec["numerical_inputs"], top_n=10)
    print(f"Top matching jobs for query {query_number}:")
    print(result[_display_columns])



Performing query 1: I want to be an engineer. I have good python skills and knowledge about machine learning.
Encoding user query...
Adjusting query features from 388 to 392.
here------------------------------------------
Searching for similar jobs...
Top matching jobs for query 1:
                             company_name  \
242308  Innovative Construction Solutions   
267286                           HCRC Inc   
261749                            Amplify   
261750                            Amplify   
238337                          LandrumHR   
29451                              Engtal   
29452                              Engtal   
29453                              Engtal   
122814    Peskind Executive Search, Inc.    
202823                             Astrix   

                                              description   skill_name  \
242308                      Training in Civil Engineering     Training   
267286                      Training in Civil Engineering     Training  

