In [None]:
import os
from dotenv import load_dotenv, find_dotenv
from pymongo import MongoClient


# Load environment variables from the nearest .env file
load_dotenv(find_dotenv())

# MongoDB connection string, read from the environment
mongo_url = os.getenv("MongoClientURI")

# Fail fast when the variable is missing: MongoClient(None) does not raise,
# it falls back to its default host, so a missing setting would otherwise
# silently point this notebook at the wrong server.
if not mongo_url:
    raise RuntimeError(
        "Environment variable 'MongoClientURI' is not set; "
        "define it in the environment or in a .env file."
    )

# Define the collection name
mongo_collection_name = "KGL_LIN_PRF_USA"

# Connect to the "raw_data" database and select the collection
mongodb = MongoClient(mongo_url)["raw_data"]
collection = mongodb[mongo_collection_name]

In [12]:
import re


def process_experience(experience_dict: dict) -> dict:
    """
    Preprocess one experience entry from a LinkedIn profile by cleaning its text fields.

    Only the 'company', 'title', 'description' and 'location' keys are kept;
    every other key of the input is ignored.

    :param experience_dict: A dictionary describing a single experience.

    :return: A dictionary with exactly the four keys above. Each value is the
        cleaned text, or None when the field is missing, empty, or contains
        nothing but whitespace.
    """
    # Define the keys to extract and clean
    keys_to_extract = ['company', 'title', 'description', 'location']

    # Initialize an empty dictionary to store the preprocessed data
    preprocessed_data = {}

    for key in keys_to_extract:
        # Missing keys and falsy values (None, '') are treated as "no content"
        value = experience_dict.get(key)

        # Clean up the text:
        # - Strip leading and trailing whitespace
        # - Collapse every run of whitespace (spaces, tabs, new lines) into a single space
        cleaned_value = re.sub(r'\s+', ' ', value.strip()) if value else None

        # A whitespace-only field cleans down to '' — store None instead so that
        # "no content" is always represented the same way.
        preprocessed_data[key] = cleaned_value or None

    # Return the preprocessed data
    return preprocessed_data

In [13]:
# Sample experience entry used to exercise process_experience by hand.
# It carries more keys than the function keeps (dates, URLs), and its
# description contains "\n" separators that the cleaning step replaces with spaces.
test_experience = {
      "starts_at": {
        "day": 1,
        "month": 5,
        "year": 2012
      },
      "ends_at": {
        "day": 31,
        "month": 8,
        "year": 2012
      },
      "company": "Limited Brands",
      "company_linkedin_profile_url": "https://www.linkedin.com/company/lbrands",
      "title": "Raw Materials Intern- MAST Global",
      "description": "- Supported the day to day management of the print and pattern process\n- Worked cross functionally with Design, Tech Design, and Pre-production\n- Sat in on sketch line review and created PowerPoints to be sent out to our regional partners in order to receive costing\n- Communicated with our regional partners to follow up on lab-dips, strike-offs, and sampling as requested by the Design team\n- Completed numerous projects that were presented to co-workers for eye opening opportunities to further progress the growth of the brand",
      "location": "Columbus, Ohio Area",
      "logo_url": "https://media-exp1.licdn.com/dms/image/C4E0BAQHD8okj9rA0EQ/company-logo_100_100/0/1547228096275?e=1655942400&v=beta&t=2y-txs4X8PWRyl-UFfX_lGv0JtryM8MA7AQqNN6wlqo"
    }

# Clean the sample entry; only company, title, description and location are kept
result = process_experience(test_experience)

# Show the cleaned dictionary
print(result)

{'company': 'Limited Brands', 'title': 'Raw Materials Intern- MAST Global', 'description': '- Supported the day to day management of the print and pattern process - Worked cross functionally with Design, Tech Design, and Pre-production - Sat in on sketch line review and created PowerPoints to be sent out to our regional partners in order to receive costing - Communicated with our regional partners to follow up on lab-dips, strike-offs, and sampling as requested by the Design team - Completed numerous projects that were presented to co-workers for eye opening opportunities to further progress the growth of the brand', 'location': 'Columbus, Ohio Area'}


In [18]:
import json
from transformers import AutoTokenizer


def prepare_model_input(preprocessed_dict: dict, tokenizer: "AutoTokenizer"):
    """
    Build the extraction prompt for a preprocessed experience and tokenize it.

    :param preprocessed_dict: A JSON-serializable dictionary, such as the
        output of process_experience.
    :param tokenizer: A callable tokenizer; it is invoked as
        tokenizer(prompt, return_tensors="pt").

    :return: Whatever the tokenizer returns for the prompt (the complete
        encoding object, not only its input ids).
    """
    # Convert the dictionary to a JSON string to maintain structure in the prompt.
    # ensure_ascii=False keeps accented and non-Latin characters readable; the
    # default would rewrite them as \uXXXX escape sequences inside the prompt.
    data_json = json.dumps(preprocessed_dict, ensure_ascii=False)

    # Construct the prompt
    prompt = f"Extract and categorize information from the following LinkedIn profile entry: {data_json}"

    # Tokenize the prompt and hand back the tokenizer's result unchanged
    return tokenizer(prompt, return_tensors="pt")

In [23]:
import os
import dotenv
from transformers import AutoTokenizer


# Load environment variables
dotenv.load_dotenv(dotenv.find_dotenv())

# Location of the Llama 3 checkpoint, read from the environment
model_id = os.getenv("LLAMA_3_PATH")

# Fail fast with a clear message instead of handing None to from_pretrained
if not model_id:
    raise RuntimeError(
        "Environment variable 'LLAMA_3_PATH' is not set; "
        "define it in the environment or in a .env file."
    )

# Load the tokenizer (no model weights are loaded in this cell)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Preprocess the experience
result = process_experience(test_experience)

# Get tokenized input ready for the model
tokenized_input = prepare_model_input(result, tokenizer)

print(tokenized_input)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


{'input_ids': tensor([[128000,  30059,    323,  22824,    553,   2038,    505,    279,   2768,
          33867,   5643,   4441,     25,   5324,  10348,    794,    330,  75577,
          55332,    498,    330,   2150,    794,    330,  20613,  32009,   4514,
             12,    386,   6483,   8121,    498,    330,   4789,    794,   6660,
          50080,    279,   1938,    311,   1938,   6373,    315,    279,   1194,
            323,   5497,   1920,    482,   5664,    291,   5425,    734,    750,
            449,   7127,     11,  17829,   7127,     11,    323,   5075,  70666,
            482,  13479,    304,    389,  26610,   1584,   3477,    323,   3549,
           7572,  11665,    311,    387,   3288,    704,    311,   1057,  15481,
           8717,    304,   2015,    311,   5371,  54824,    482,  16838,    660,
            449,   1057,  15481,   8717,    311,   1833,    709,    389,  10278,
           1773,   3153,     11,  13471,  65039,     11,    323,  25936,    439,
          1147

In [None]:
# Source collection names, each paired with an `id_origin` value
collections = [
    {"name": "KGL_LIN_PRF_USA", "id_origin": 1},
    {"name": "KGL_LIN_PRF_IND", "id_origin": 2},
    {"name": "KGL_LIN_PRF_CAN", "id_origin": 3},
    {"name": "KGL_LIN_PRF_SNG", "id_origin": 4},
    {"name": "KGL_LIN_PRF_ISR", "id_origin": 5},
    {"name": "KGL_LIN_PRF_BRS", "id_origin": 6},
    {"name": "KGL_LIN_PRF_JPN", "id_origin": 7},
    {"name": "KGL_LIN_PRF_DEN", "id_origin": 8}
]


# Run the insertion function
# NOTE(review): `insert_collection_documents`, `dwh_id_origin`, `mysql_url` and
# `dwh_schema_name` are not defined in the visible cells, and the `collections`
# list above is not used by this call. Presumably the call was meant to run once
# per entry using its "name" and "id_origin" — confirm before running.
insert_collection_documents(
    mongo_collection_name,
    dwh_id_origin,
    mysql_url,
    mongo_url,
    dwh_schema_name
)

    # Skip documents without experiences
    if not doc.get('experiences', []):
        continue

    # Initialize counter variables
    counter = 0
    total = collection.count_documents({})

    # Get the collection cursor
    documents = collection.find()

    # Processing loop
    for doc in documents:
        try:
            # Print progress
            counter += 1
            print(f"{collection_str}: {counter}/{total}")