In [1]:
import jsonlines
from tqdm import tqdm

all_users = []
with jsonlines.open('output_3.jsonl', mode='r') as writer:
    a = list(writer)
    all_users.extend(a)
with jsonlines.open('output_2.jsonl', mode='r') as writer:
    b = list(writer)
    all_users.extend(b)

In [2]:
print(len(all_users))

3710


In [3]:
def format_profile(profile):
    if not profile: return None
    headline = profile["headline"].strip() if "headline" in profile and profile["headline"] else "Not listed"
    industryName = profile["industryName"].strip() if "industryName" in profile and profile["industryName"] else "Not listed"
    name = profile["firstName"] + " " + profile["lastName"]
    country = profile["locationName"].strip() if "locationName" in profile and profile["locationName"] else "Not listed"
    summary = profile["summary"].strip() if "summary" in profile and profile["summary"] else "Not listed"
    experience = profile["experience"] if "experience" in profile and profile["experience"] else None
    volunteering = profile["volunteer"] if "volunteer" in profile and profile["volunteer"] else None
    education = profile["education"] if "education" in profile and profile["education"] else None
    awards = profile["honors"] if "honors" in profile else None

    def formatList(l, keys):
        res = ""
        for item_num, item in enumerate(l):
            for i, key in enumerate(keys.keys()):
                if (key not in item or not item[key]): continue
                # res += "" if i != 0 else ""
                res += keys[key] + ": " + item[key].strip() + ("\n" if i != len(keys) - 1 else "")
            if item_num != len(l) - 1:
                res += "\n"
        return res
    
    def formatExperience(experience, keys):
        res = []
        for item in experience:
            temp = ""
            for i, key in enumerate(keys.keys()):
                if (key not in item or not item[key]): continue
                # res += "" if i != 0 else ""
                temp += keys[key] + ": " + item[key].strip() + "\n"
            time_period_valid = item['timePeriod'] and 'startDate' in item['timePeriod'] and item['timePeriod']['startDate']
            month_valid = time_period_valid and 'month' in item['timePeriod']['startDate'] and item['timePeriod']['startDate']['month']
            year_valid = time_period_valid and 'year' in item['timePeriod']['startDate'] and item['timePeriod']['startDate']['year']
            date_str = f"{item['timePeriod']['startDate']['month']:0>2}/{item['timePeriod']['startDate']['year']}"if time_period_valid and month_valid and year_valid else "Not listed"
            temp += f"Starting Date: " + date_str
            res.append((temp,
                        int(item['timePeriod']['startDate']['year']) if time_period_valid and year_valid else None,
                        int(item['timePeriod']['startDate']['month']) if time_period_valid and month_valid else None))
        
        return list(reversed(res))


    formatted_education = formatList(education, { 'schoolName': 'School name', 'description': 'Description' }) if education else "Not listed"
    formatted_volunteering = formatList(volunteering, { 'companyName': 'Company name', 'role': 'Role', 'description': 'Description' }) if volunteering else "Not listed"
    formatted_awards = formatList(awards, { 'title': 'Title', 'issuer': 'Issuer', 'description': "Description" }) if awards else "Not listed"
    formatted_experience = formatExperience(experience, { 'companyName': 'Company name', 'title': 'Title', 'description': 'Description' }) if experience else []

    formatting_string = """Name: {name}

Industry: {industryName}
Country: {country}
Headline: {headline}
Summary: {summary}
---
Volunteering:
{formatted_volunteering}
---
Honors & Awards:
{formatted_awards}
---
Education:
{formatted_education}
"""
    return (formatting_string.format(name=name, country=country, industryName=industryName,
                                    headline=headline, summary=summary, 
                                    formatted_volunteering=formatted_volunteering, 
                                    formatted_education=formatted_education, formatted_awards=formatted_awards), formatted_experience)


In [4]:
real_users = [user for user in all_users if user]
len(real_users)

3515

In [5]:
from datasets import Dataset
ds = Dataset.from_list(real_users)

In [23]:
def map_formatting(examples):
    new_examples = []
    unbatched = []
    for i in range(len(examples["firstName"])):
        unbatched.append({
            k: v[i] for k, v in examples.items()
        })

    texts = []
    labels = []
    for example in unbatched:
        text, experiences = format_profile(example)
        if len(experiences) <= 0: continue
        experience_to_give = experiences[:-1]
        experience_label = experiences[-1]

        texts.append(text + "---\nExperience:\n" + "\n\n".join(e[0] for e in experience_to_give))
        labels.append(experience_label[0])
    
    return {"input": texts, "output": labels}

new_ds = ds.map(map_formatting, remove_columns=ds.column_names, batched=True, batch_size=10)
new_ds = new_ds.add_column("instruction", ["This is my LinkedIn profile information."]*len(new_ds["input"]))

Map:   0%|          | 0/3515 [00:00<?, ? examples/s]

In [24]:
new_ds.to_json("formatted_full.jsonl", orient="records", lines=True)

Creating json from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

6985877

In [22]:
print(new_ds[0]["input"])

Name: Anthony Wang

Industry: Information Technology & Services
Country: Canada
Headline: Software Developer @ blueRover | Math/BBA @ uwaterloo & wlu
Summary: Not listed
---
Volunteering:
Company name: University of Waterloo Chinese Students Association
Role: VP of Internals
Description: 2022: Executive & Director of internals
2023: VP of internals
Company name: UW Computer Science Club
Role: Event Coordinator

Company name: Tech+ UW
Role: Logistics Organizer

---
Honors & Awards:
Not listed
---
Education:
School name: University of Waterloo

School name: Wilfrid Laurier University

School name: International Baccalaureate

---
Experience:
Company name: Kumon North America, Inc.
Title: Kumon Math/English Tutor
Starting Date: 06/2015

Company name: City of Kitchener
Title: Swim Instructor/Lifeguard
Starting Date: 01/2019

Company name: University of Waterloo
Title: Sub Team Lead
Description: We Accelerate program w/ Manulife: Modern Web Application Design
Starting Date: 04/2022

Company