In [1]:
import os
import json
from langchain_groq import ChatGroq
from langchain_community.document_loaders import WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import PromptTemplate
from groq import Groq
import pandas as pd
import numpy as np

USER_AGENT environment variable not set, consider setting it to identify your requests.


In [11]:
from dotenv import load_dotenv

# Load GROQ_API_KEY from a local, git-ignored `.env` file (or rely on the
# process environment). Secrets must never be written into the notebook
# itself: a key that has been committed in source is compromised and has to
# be revoked and re-issued in the Groq console.
load_dotenv()

groq_api_key = os.environ.get("GROQ_API_KEY")
if not groq_api_key:
    # Fail here with an actionable message instead of a vague auth error
    # on the first API call further down.
    raise RuntimeError(
        "GROQ_API_KEY is not set. Add it to a .env file next to this notebook "
        "or export it in the environment before starting the kernel."
    )

In [3]:
# Shared Groq API client used by every classification call below.
# The key is read from the environment; `.get` yields None when it is unset.
client = Groq(api_key=os.environ.get("GROQ_API_KEY"))

In [4]:
# System prompt for the job-title classifier.
# The label strings must be spelled identically everywhere in this prompt:
# the model's reply is later compared with plain string equality to group
# users, so "Data Science" vs "Data Scientist" would split one domain in two.
prompt_template = """
You are trying to match different job titles of various professionals with the following domains: 
1. Software
2. Data Scientist
3. Consultant
4. Management
5. Product Engineering
6. Academia
7. Finance
8. Law
9. Sales & Marketing
10. Mechatronics

The job title must be matched with the corresponding domain it is included in, so that professionals of each domain can be grouped together. The given titles will be in a list form and the groupings must be presented in a list form as well.

Here are a few examples:
1. Software: Oracle | IBM Research; Software Developer @ Times Internet; Full Stack Developer; SDE-1 @ Amazon
2. Data Scientist: Senior Data Scientist @ American Express; Data Science @Amex
3. Consultant: Training Consultant; Associate, Client Services at AlphaSights; Risk Consultant; Human Capital Consultant 
4. Management: Backing India's next generation brands; BIU-Business Analyst at Axis Bank; Hiring React/Flutter Developers
5. Product Engineering: Product@AzaFashions; Product @ Liquiloans; APM at MasterCard; 
6. Academia: IIM Calcutta PGP '25; Chemical Engineering | Indian Institute of Technology, Kharagpur; Werkstudent: Deutz AG
7. Finance: CA || MBA(FMS); Deutsche Bank; Capital Markets - Blackstone Real Estate
8. Law: Legal Operations & Technology | In-House | Lawyer
9. Sales & Marketing: Insights Manager; Fabric Manufacturer and Exporter
10. Mechatronics: Incoming @ JLR India
As you can see, each job title is separated by a semi colon for clarity. I will be providing a list of job titles, and you are supposed to classify them in a similar fashion in the 10 listed domains. 

You should understand what work the person with the specific job title would be doing, and which domain it would fall into, with the help of the given example. If a proper title is not available, you can consider how the work in the company/other details specified would be.
Provide only a single domain name (e.g., Software, Data Scientist, etc.) that the job title suits maximum to without any explanations. If there are multiple matching domains, choose any one.

"""

In [5]:


# Define a function to categorize job titles
def categorize_job_title(job_title, model="llama-3.3-70b-versatile"):
    """Ask the Groq chat model which domain a job title belongs to.

    Args:
        job_title: The title / headline text to classify. It is sent as the
            user message; ``prompt_template`` is sent as the system message.
        model: Groq model id. Defaults to the same model the sanity-check
            cell in this notebook exercises, so what gets validated is what
            gets used for the batch run.

    Returns:
        The model's reply with surrounding whitespace removed, or ``""`` when
        the reply has no content. Callers test the result for truthiness.
    """
    response = client.chat.completions.create(
        messages=[
            {"role": "system", "content": prompt_template},
            {"role": "user", "content": job_title},
        ],
        model=model,
        # Labels are grouped by exact string equality downstream, so keep
        # sampling as deterministic as the API allows.
        temperature=0,
    )
    content = response.choices[0].message.content
    # Strip so that "Software" and "Software\n" land in the same group.
    return content.strip() if content else ""

In [6]:
# Smoke test: classify one known title and print the raw reply so the prompt
# and API credentials can be checked before the batch run.
llm = client.chat.completions.create(
    model="llama-3.3-70b-versatile",
    messages=[
        dict(role="system", content=prompt_template),
        dict(role="user", content="Data Science @Amex"),
    ],
)

print(llm.choices[0].message.content)

Data Scientist


In [7]:
def process_json_files(folder_path):
    """Classify every ``*.json`` profile in ``folder_path`` in one pass.

    NOTE: a later cell in this notebook defines another ``process_json_files``
    with a three-argument, resumable signature; once that cell runs it
    replaces this definition.

    Args:
        folder_path: Directory containing one JSON file per user. The file
            name without its extension is used as the user id.

    Returns:
        dict mapping user id -> ``{"job_title": <first '|' segment / first
        line of the "position" field>, "domain": <classifier reply>}``.
        Profiles whose "position" is missing or empty are left out.
    """
    user_data = {}
    for file_name in os.listdir(folder_path):
        if not file_name.endswith(".json"):
            continue

        file_path = os.path.join(folder_path, file_name)
        # JSON is UTF-8 by specification; do not depend on the platform's
        # default encoding. The handle is closed before the (slow) API call.
        with open(file_path, "r", encoding="utf-8") as file:
            data = json.load(file)

        job_title = data.get("position", "")
        if not job_title:
            continue

        user_id = os.path.splitext(file_name)[0]
        # The full "position" text is what gets classified; only the
        # shortened headline is stored.
        domain = categorize_job_title(job_title)
        user_data[user_id] = {
            "job_title": job_title.split('|')[0].strip().split('\n')[0].strip(),
            "domain": domain,
        }
    return user_data

In [8]:
def generate_adjacency_matrix(user_data):
    """Build a same-domain adjacency matrix for the given users.

    Args:
        user_data: dict mapping user id -> dict with a ``"domain"`` entry.

    Returns:
        A square integer ``pandas.DataFrame`` indexed and labelled by user id
        (in ``user_data`` iteration order). Cell ``(a, b)`` is 1 when both
        users have equal ``"domain"`` values and 0 otherwise, so the diagonal
        is 1 for ordinary string labels.

    Raises:
        KeyError: if any entry has no ``"domain"`` key.
    """
    user_ids = list(user_data.keys())
    # Object dtype keeps plain Python `==` semantics for each pair.
    domains = np.array(
        [user_data[user_id]["domain"] for user_id in user_ids], dtype=object
    )
    # Broadcast a column against a row to compare every pair in one step,
    # replacing the per-cell Python double loop.
    matrix = (domains[:, None] == domains[None, :]).astype(int)
    return pd.DataFrame(matrix, index=user_ids, columns=user_ids)

In [17]:
from tqdm import tqdm
def _primary_job_title(position):
    """Return the first '|' segment / first line of a headline, or a placeholder."""
    if not isinstance(position, str):
        return "Unknown Position"
    headline = position.split('|')[0].strip().split('\n')[0].strip()
    return headline or "Unknown Position"


def _append_user_to_matrix(adjacency_matrix, user_id, domain, user_data):
    """Return a copy of the matrix grown by one row and one column for ``user_id``."""
    existing_ids = adjacency_matrix.index
    new_row = pd.DataFrame(0, index=[user_id], columns=existing_ids)
    new_column = pd.DataFrame(0, index=existing_ids, columns=[user_id])

    for existing_user_id in existing_ids:
        if (
            existing_user_id in user_data
            and domain == user_data[existing_user_id]["domain"]
        ):
            new_row.loc[user_id, existing_user_id] = 1
            new_column.loc[existing_user_id, user_id] = 1

    grown = pd.concat([adjacency_matrix, new_row], axis=0)
    grown = pd.concat([grown, new_column], axis=1)
    # The new user's own cell is absent from both pieces and comes out of the
    # concat as NaN; it becomes 0 here. Cast back so the CSV holds only 0/1.
    return grown.fillna(0).astype(int)


def _save_progress(user_data, adjacency_matrix, output_user_data, output_adjacency_matrix):
    """Write both checkpoint files via temp-file-then-rename so an interrupt cannot truncate them."""
    tmp_user_data = f"{output_user_data}.tmp"
    with open(tmp_user_data, "w", encoding="utf-8") as file:
        json.dump(user_data, file)
    os.replace(tmp_user_data, output_user_data)

    # Nothing to persist for an empty matrix; avoid creating a header-only CSV.
    if adjacency_matrix is not None and not adjacency_matrix.empty:
        tmp_matrix = f"{output_adjacency_matrix}.tmp"
        adjacency_matrix.to_csv(tmp_matrix)
        os.replace(tmp_matrix, output_adjacency_matrix)


def process_json_files(json_folder, output_user_data, output_adjacency_matrix):
    """Classify every profile in ``json_folder``, checkpointing after each user.

    Existing checkpoint files are loaded first and users already present in
    them are skipped, so an interrupted run can be restarted with the same
    arguments.

    Args:
        json_folder: Directory with one JSON profile per user; the file name
            without extension is the user id.
        output_user_data: Path of the JSON checkpoint mapping user id ->
            ``{"details": <profile>, "domain": <label>}``.
        output_adjacency_matrix: Path of the CSV checkpoint holding the
            same-domain 0/1 matrix. A user's own diagonal cell is 0.

    Raises:
        Exception: any error raised while loading the checkpoints or while
            processing is propagated. Errors during processing trigger one
            more checkpoint save before being re-raised.
    """
    user_data = {}
    adjacency_matrix = pd.DataFrame()

    # Load checkpoints *before* entering the save-on-error region: if a
    # checkpoint is unreadable we must not overwrite it with an empty state.
    if os.path.exists(output_user_data):
        with open(output_user_data, "r", encoding="utf-8") as file:
            user_data = json.load(file)

    if os.path.exists(output_adjacency_matrix):
        adjacency_matrix = pd.read_csv(output_adjacency_matrix, index_col=0)
        # read_csv may parse numeric-looking ids as ints; user_data keys come
        # from JSON and are always strings, so normalise both axes to str.
        adjacency_matrix.index = adjacency_matrix.index.astype(str)
        adjacency_matrix.columns = adjacency_matrix.columns.astype(str)

    try:
        user_files = os.listdir(json_folder)
        for file_name in tqdm(user_files, desc="Processing JSON files"):
            user_id = os.path.splitext(file_name)[0]
            if user_id in user_data:
                continue  # Skip already processed users

            file_path = os.path.join(json_folder, file_name)
            if not os.path.isfile(file_path):
                continue  # e.g. checkpoint sub-directories created by Jupyter

            # One unreadable profile must not abort (and, on every resume,
            # re-abort) the whole batch.
            try:
                with open(file_path, "r", encoding="utf-8") as file:
                    user_details = json.load(file)
            except (json.JSONDecodeError, UnicodeDecodeError) as e:
                print(f"Skipping user {user_id}: could not parse {file_name}: {e}")
                continue

            if not isinstance(user_details, dict):
                print(f"Skipping user {user_id}: profile is not a JSON object.")
                continue

            # Extract job title and classify domain
            job_title = _primary_job_title(
                user_details.get("position", "Unknown Position")
            )
            domain = categorize_job_title(job_title)

            if not domain:
                print(f"Skipping user {user_id}: domain could not be classified.")
                continue

            # Update adjacency matrix
            try:
                updated_matrix = _append_user_to_matrix(
                    adjacency_matrix, user_id, domain, user_data
                )
            except KeyError as e:
                print(f"Error updating adjacency matrix for user {user_id}: {e}")
                continue

            # Record the user only once the matrix update succeeded, so the
            # two checkpoints always describe the same set of users.
            adjacency_matrix = updated_matrix
            user_data[user_id] = {
                "details": user_details,
                "domain": domain,
            }

            # Save progress after processing each user
            _save_progress(
                user_data, adjacency_matrix, output_user_data, output_adjacency_matrix
            )

    except Exception as e:
        print(f"An error occurred: {e}")
        print("Saving progress...")
        _save_progress(
            user_data, adjacency_matrix, output_user_data, output_adjacency_matrix
        )
        raise


In [None]:
def main():
    """Run the resumable classification over the default profile folder.

    Reads profiles from ``final_user_profiles`` and checkpoints to
    ``user_data.json`` and ``adjacency_matrix.csv`` in the working directory.
    """
    process_json_files(
        json_folder="final_user_profiles",  # Replace with your JSON folder path
        output_user_data="user_data.json",
        output_adjacency_matrix="adjacency_matrix.csv",
    )


if __name__ == "__main__":
    main()

Processing JSON files:  65%|██████▌   | 2200/3370 [44:58<2:13:01,  6.82s/it]