In [20]:
import pandas as pd
import pickle
from datasets import Dataset

In [21]:

import sys
sys.path.append('/home/ec2-user/SageMaker/llama_root/src')
sys.path.append('../llama-recipes/src/llama_recipes/')

In [22]:
from transformers import LlamaForCausalLM, LlamaTokenizer

model_id="meta-llama/Llama-2-7b-chat-hf"

tokenizer = LlamaTokenizer.from_pretrained(model_id)

In [23]:
df = pd.read_csv('custom_data/gpt4_parsed_resumes.csv')

In [24]:
df.columns

Index(['id', 'resume', 'basic_details', 'work_experience', 'education',
       'technical_skills'],
      dtype='object')

## Work Experience Training Data
We try and define a decent text prompt here for the base untrained model to understand the general task of extracting work experience from a resume

In [25]:
work_prompt = f'''
You are a helpful language model working for a job platform. You will be given the raw 
 unstructured text of a user's resume, and the task is to extract the work experience of the 
 user from the resume. Do NOT include the education or other certifications in the output.
 Extract the work experience from the raw text in the following format: \n{{query_format}}\n

 This is the resume text:\n{{resume_text}}\n
 This is the output in the required_format:\n{{output}}\n{{eos_token}}
'''

### Output Format 
We're telling the model how to format the output and give us a repsonse

In [26]:
work_format = '''{
    'work_experience': [{'company': 'company Name 1',
                         'role': 'job designation 1',
                         'start_date': 'mm/yyyy',
                         'end_date': 'mm/yyyy',
                         'description': 'complete Job description taken from resume'},
                        {'company': 'company name 2',
                         'role': 'job designation 2',
                         'start_date': mm/yyyy',
                         'end_date': 'mm/yyyy',
                         'description': 'complete Job description taken from resume'}]
}'''

In [27]:
work_df = df[['resume','work_experience']]

In [28]:
work_df

Unnamed: 0,resume,work_experience
0,Vaishnavi Moholkar\n Software Developer\n vai...,{'company': 'Fintech Credit Systems India Pvt....
1,Akshat Srivastav - Resume\n\n\n\nContact Info...,"{'company': 'Turno, Bangalore', 'role': 'Data ..."
2,DEVA KUMAR\nEmail: devasqldev@gmail.com\nConta...,"{'company': 'Infosys Technologies PVT ltd', 'r..."
3,"I&rsquo;m\n lo,\n\n Ashiti Khanuja\n Hel\n UX...","{'company': 'Turno', 'role': 'UX / UI Designer..."
4,Hi! I'am Arun S\n UI/UX Designer\n\n +91-8610...,"{'company': 'Tata Consultancy Services', 'role..."
...,...,...
136,"GAYATHIRI R\n Thanjavur, Tamil Nadu | P: +91 ...","{'company': 'BANK OF AMERICA', 'role': 'Softwa..."
137,Sivakumar Akella\n\n Technical Program Manager...,"{'company': 'NSPlus Technology Pvt. Ltd.', 'ro..."
138,Deepak Mishra\n deepakmishrapc2020@gmail.com\...,"{'company': 'Gainsight Software Pvt Ltd', 'rol..."
139,Kumari Anjali\n +91-9074799694 | anjali41256@...,"{'company': 'Vias Groups', 'role': 'DevOps Eng..."


In [29]:
work_df.columns = ['resume','output']

In [30]:
work_df['format'] = work_format
work_df['prompt'] = work_prompt
work_data = Dataset.from_pandas(work_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  work_df['format'] = work_format
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  work_df['prompt'] = work_prompt


In [31]:
from llama_recipes.utils.dataset_utils import get_preprocessed_dataset
from llama_recipes.datasets.utils import Concatenator


In [32]:
work_data

Dataset({
    features: ['resume', 'output', 'format', 'prompt'],
    num_rows: 141
})

In [33]:
print(work_data[4]['format'])

{
    'work_experience': [{'company': 'company Name 1',
                         'role': 'job designation 1',
                         'start_date': 'mm/yyyy',
                         'end_date': 'mm/yyyy',
                         'description': 'complete Job description taken from resume'},
                        {'company': 'company name 2',
                         'role': 'job designation 2',
                         'start_date': mm/yyyy',
                         'end_date': 'mm/yyyy',
                         'description': 'complete Job description taken from resume'}]
}


In [38]:
print(work_data[4]['prompt'])


You are a helpful language model working for a job platform. You will be given the raw 
 unstructured text of a user's resume, and the task is to extract the work experience of the 
 user from the resume. Do NOT include the education or other certifications in the output.
 Extract the work experience from the raw text in the following format: 
{query_format}


 This is the resume text:
{resume_text}

 This is the output in the required_format:
{output}
{eos_token}



In [39]:

# def apply_prompt_template(sample):
#     return {
#         "text": work_prompt.format(
#             work_format=sample["format"],
#             resume_text=sample["resume"],
#             output=sample["output"],
#             eos_token=tokenizer.eos_token,
#         )
#     }

# work_ds = work_data.map(apply_prompt_template, remove_columns=list(work_data.features))

# work_ds = work_ds.map(
#     lambda sample: tokenizer(sample["text"]),
#     batched=True,
#     remove_columns=list(work_ds.features),
# ).map(Concatenator(), batched=True)



## Personal Information Training Data
We try and define a decent text prompt here for the base untrained model to understand the general task of extracting work experience from a resume

In [40]:
pi_format = '''{
    'personal_information': {'name': "Name",
                         'email_id': "Valid Email ID",
                         'phone_number': "10 Digit phone number",
                         'location': "User's current location"}
}'''

In [41]:
pi_prompt = f'''
You are a helpful language model working for a job platform. You will be given the raw 
 unstructured text of a user's resume, and the task is to extract the personal information (name, phone number, email ID and the location) of the 
 user from the raw text in the following format: \n{{query_format}}\n
 If the information is not available, return 'NA'
 This is the resume text:\n{{resume_text}}\n
 This is the output in the required_format:\n{{output}}\n{{eos_token}}
'''

In [42]:
pi_df = df[['resume','basic_details']]

In [43]:
pi_df.columns = ['resume','output']

In [44]:
pi_df

Unnamed: 0,resume,output
0,Vaishnavi Moholkar\n Software Developer\n vai...,"{'name': 'Vaishnavi Moholkar', 'location': 'Pu..."
1,Akshat Srivastav - Resume\n\n\n\nContact Info...,"{'name': 'Akshat Srivastav', 'location': 'Kora..."
2,DEVA KUMAR\nEmail: devasqldev@gmail.com\nConta...,"{'name': 'Deva Kumar', 'location': 'Hyderabad'..."
3,"I&rsquo;m\n lo,\n\n Ashiti Khanuja\n Hel\n UX...","{'name': 'Ashiti Khanuja', 'location': 'Baroda..."
4,Hi! I'am Arun S\n UI/UX Designer\n\n +91-8610...,"{'name': 'Arun S', 'location': 'Chennai, Tamil..."
...,...,...
136,"GAYATHIRI R\n Thanjavur, Tamil Nadu | P: +91 ...","{'name': 'GAYATHIRI R', 'location': 'Thanjavur..."
137,Sivakumar Akella\n\n Technical Program Manager...,"{'name': 'Sivakumar Akella', 'location': 'Bang..."
138,Deepak Mishra\n deepakmishrapc2020@gmail.com\...,"{'name': 'Deepak Mishra', 'location': 'Mumbai'..."
139,Kumari Anjali\n +91-9074799694 | anjali41256@...,"{'name': 'Kumari Anjali', 'location': 'Unknown..."


In [45]:
pi_df['format'] = pi_format
pi_df['prompt'] = pi_prompt
pi_data = Dataset.from_pandas(pi_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pi_df['format'] = pi_format
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pi_df['prompt'] = pi_prompt


In [150]:
pi_data

Dataset({
    features: ['resume', 'output', 'format', 'prompt'],
    num_rows: 141
})

In [151]:
# def pi_prompt_template(sample):
    
#     prompt = f'''
#     You are a helpful language model working for a job platform. You will be given the raw 
#      unstructured text of a user's resume, and the task is to extract the personal information (name, phone number, email ID and the location) of the 
#      user from the raw text in the following format: \n{{pi_format}}\n
#      If the information is not available, return 'NA'
#      This is the resume text:\n{{resume_text}}\n
#      This is the output in the required_format:\n{{output}}\n{{eos_token}}
#     '''
    
#     return {
#         "text": prompt.format(
#             pi_format=sample["format"],
#             resume_text=sample["resume"],
#             output=sample["output"],
#             eos_token=tokenizer.eos_token,
#         )
#     }

# pi_ds = pi_data.map(pi_prompt_template, remove_columns=list(pi_data.features))

# pi_ds = pi_ds.map(
#     lambda sample: tokenizer(sample["text"]),
#     batched=True,
#     remove_columns=list(pi_ds.features),
# ).map(Concatenator(), batched=True)


## Education Training Data
We try and define a decent text prompt here for the base untrained model to understand the general task of extracting Education from a resume

In [152]:
from string import Template

In [51]:
edu_prompt = f'''
You are a helpful language model working for a job platform. You will be given the raw 
 unstructured text of a user's resume, and the task is to extract 
 the educational information (graduate/post graduate degree name, institution name, start and end dates of the program) of the 
 user from the raw text in the following format: \n{{query_format}}\n
 If the information for a certain field is not available, return 'NA'
 This is the resume text:\n{{resume_text}}\n
 This is the output in the required_format:\n${{output}}\n{{eos_token}}
'''

In [52]:
edu_format = '''[
    {
        "institution": "put name of educational institution here"
        "program" : "name of degree/certification/diploma as given in the resume"
        "start_date" : "start date in dd/mm/yyyy format"
        "end_date" : "end date in dd/mm/yyyy format"
        
    },
    {
        "institution": "put name of educational institution here"
        "program" : "name of degree/certification/diploma as given in the resume"
        "start_date" : "start date in dd/mm/yyyy format"
        "end_date" : "end date in dd/mm/yyyy format"
        
    }
]'''

In [53]:
edu_df = df[['resume','education']]

In [54]:
edu_df.columns = ['resume','output']

In [55]:
edu_df['prompt'] = edu_prompt
edu_df['format'] = edu_format
edu_data = Dataset.from_pandas(edu_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  edu_df['prompt'] = edu_prompt
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  edu_df['format'] = edu_format


In [56]:
# def edu_prompt_template(sample):

#     edu_prompt = Template('''You are a helpful language model working for a job platform. You will be given the raw 
#                      unstructured text of a user's resume, and the task is to extract 
#                      the educational information (graduate/post graduate degree name, institution name, email ID and the location) of the 
#                      user from the raw text in the following format: \n${edu_format}\n
#                      If the information for a certain field is not available, return 'NA'
#                      This is the resume text:\n${resume_text}\n
#                      This is the output in the required_format:\n${output}\n{eos_token}
#                     ''')

#     return {
#         "text": edu_prompt.substitute(
#         edu_format=sample["format"],
#         resume_text=sample["resume"],
#         output=sample["output"],
#         eos_token=tokenizer.eos_token
#         )
#     }

# edu_ds = edu_data.map(edu_prompt_template, remove_columns=list(edu_data.features))

# edu_ds = edu_ds.map(
#     lambda sample: tokenizer(sample["text"]),
#     batched=True,
#     remove_columns=list(edu_ds.features),
# ).map(Concatenator(), batched=True)


In [57]:
# from datasets import concatenate_datasets
# train_data = concatenate_datasets([edu_ds, pi_ds, work_ds])

In [58]:
import pandas as pd 
train_df = pd.concat([pi_df,work_df,edu_df])

In [59]:
train_df = train_df.sample(frac=1)

In [60]:
train_df

Unnamed: 0,resume,output,format,prompt
101,Dayakar A\nEmail : dayakar.a1993@gmail.com\nMo...,[{'institution': 'Sree Vidyanikethan Engineeri...,"[\n {\n ""institution"": ""put name of ...",\nYou are a helpful language model working for...
112,,"{'name': 'John Doe', 'location': 'San Francisc...","{\n 'personal_information': {'name': ""Name""...",\nYou are a helpful language model working for...
49,Sandeep Jaiswal Professional Experience\nData ...,"{'company': 'X.L Dynamics Pvt Ltd', 'role': 'D...",{\n 'work_experience': [{'company': 'compan...,\nYou are a helpful language model working for...
108,Devyani Gaikwad Seeking long term employment i...,"{'name': 'Devyani Gaikwad', 'location': 'Benga...","{\n 'personal_information': {'name': ""Name""...",\nYou are a helpful language model working for...
38,Bipin Gupta\n Software QA Engineer\n Total 5+...,[{'institution': 'Lokmanya Tilak College of En...,"[\n {\n ""institution"": ""put name of ...",\nYou are a helpful language model working for...
...,...,...,...,...
106,Birari Sujata\n\nMob: + 91-9307576882\n\nE-mai...,"[{'institution': 'Pune University', 'program':...","[\n {\n ""institution"": ""put name of ...",\nYou are a helpful language model working for...
103,"SHA MOHAMED BIJILI.H\nS/o. HUSSAIN BIJILI,\n#8...","{'company': 'Ford Motor Company', 'role': 'SPC...",{\n 'work_experience': [{'company': 'compan...,\nYou are a helpful language model working for...
94,Divya Wadhwa\nMobile: +91- 9910344206\nEmail: ...,"{'name': 'Divya Wadhwa ', 'location': 'Not Men...","{\n 'personal_information': {'name': ""Name""...",\nYou are a helpful language model working for...
130,",,","[{'institution': 'Columbia University', 'progr...","[\n {\n ""institution"": ""put name of ...",\nYou are a helpful language model working for...


In [61]:
data_list = []
for row in train_df.itertuples():
    text = row.prompt.format(
            query_format=row.format,
            resume_text=row.resume,
            output=row.output,
            eos_token=tokenizer.eos_token)
    data_list.append(text)

In [80]:
dl = {
    'text':data_list
}

In [81]:
td = Dataset.from_dict(dl)

In [82]:
len(data_list)

423

In [83]:
sp = Dataset.from_dict(dl)

In [84]:

td = td.map(
    lambda sample: tokenizer(sample["text"]),
    batched=True,
    remove_columns=list(td.features),
).map(Concatenator(), batched=True)


Map:   0%|          | 0/423 [00:00<?, ? examples/s]

Map:   0%|          | 0/423 [00:00<?, ? examples/s]

In [85]:
td

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 349
})

In [86]:
td.save_to_disk('custom_data/train_data.hf')

Saving the dataset (0/1 shards):   0%|          | 0/349 [00:00<?, ? examples/s]

In [99]:
# work_ds.save_to_disk('custom_data/work_data.hf')

In [100]:
# edu_ds.save_to_disk('custom_data/education_data.hf')

In [101]:
# pi_ds.save_to_disk('custom_data/pi_data.hf')