# JsonL processing
This file contains auxiliary functions used to transform our OpenAI `.jsonl` batch output files into a format that is easier to use for model training.

In [1]:
import pandas as pd

In [None]:
# Batch output holding the generated titles
titles_df = pd.read_json('title_batch_output.jsonl', lines=True)

# Batch output holding the generated descriptions, stored as two halves
# that are stacked into a single frame with a fresh 0..n-1 index
description_df_1, description_df_2 = (
    pd.read_json(half_path, lines=True)
    for half_path in (
        'description_batch_output_half_1.jsonl',
        'description_batch_output_half_2.jsonl',
    )
)
description_df = pd.concat([description_df_1, description_df_2], ignore_index=True)
 

### Functions

In [3]:
def get_generation_index(response):
    """Return the ``index`` value of every choice in a batch response.

    Args:
        response: Mapping whose choices are stored under
            ``response['body']['choices']``; every choice must provide
            an ``'index'`` key.

    Returns:
        list: One index per choice, in the order the choices appear.
    """
    return [choice['index'] for choice in response['body']['choices']]

def get_generation(response):
    """Return the generated text of every choice in a batch response.

    Args:
        response: Mapping whose choices are stored under
            ``response['body']['choices']``; the text of each choice is
            read from ``choice['message']['content']``.

    Returns:
        list: One text per choice, in the order the choices appear.
    """
    # Only the message content is needed here; the choice's 'index' is not
    # read, so choices without that key are handled as well.
    return [choice['message']['content'] for choice in response['body']['choices']]


### Processing

The following cells use `custom_id` to match each generated description with its title and with its UDC codes.

In [4]:
# For every title request, attach the list of choice indices and the
# list of generated titles extracted from its response.
df_2 = titles_df.assign(
    generation_index=lambda frame: frame['response'].apply(get_generation_index),
    generated_title=lambda frame: frame['response'].apply(get_generation),
)
df_2.head()

Unnamed: 0,id,custom_id,response,error,generation_index,generated_title
0,batch_req_684b77e26ee08190ba793c08abf53c22,0-request-book,"{'status_code': 200, 'request_id': '15d4c3729a...",,"[0, 1, 2, 3]","[Exploring Contemporary Philosophies at Beida,..."
1,batch_req_684b77e27bb8819096a2cc6522eb3673,0-request-sci,"{'status_code': 200, 'request_id': '54d2bc5aaf...",,"[0, 1]",[Investigating the Differential Genetic Expres...
2,batch_req_684b77e289e4819088a40fc182834b56,1-request-book,"{'status_code': 200, 'request_id': 'e2ae2d4f65...",,"[0, 1, 2, 3]",[Understanding and Managing Soil Profiles: A G...
3,batch_req_684b77e29ebc8190940591794a84e99b,1-request-sci,"{'status_code': 200, 'request_id': '4f7aeaef24...",,"[0, 1]",[Characterization and Comparative Analysis of ...
4,batch_req_684b77e2abfc8190a1de5d8c8e828316,2-request-book,"{'status_code': 200, 'request_id': '33c2f321c7...",,"[0, 1, 2, 3]","[""The Harmonious World of A Cappella: Vocal Mu..."


In [28]:
# One row per generated title (the original row label is kept), plus:
#   type           - 'book' when the custom_id contains 'book', otherwise 'article'
#   uid            - integer prefix of the custom_id (text before the first '-')
#   desc_custom_id - custom_id with the generation index appended
df_3 = (
    df_2
    .explode(['generation_index', 'generated_title'])
    .assign(
        type=lambda frame: frame['custom_id'].map(
            lambda cid: 'book' if 'book' in cid else 'article'
        ),
        uid=lambda frame: frame['custom_id'].map(lambda cid: int(cid.split('-')[0])),
        desc_custom_id=lambda frame: (
            frame['custom_id'] + '-' + frame['generation_index'].astype(str)
        ),
    )
)
df_3

Unnamed: 0,id,custom_id,response,error,generation_index,generated_title,type,uid,desc_custom_id
0,batch_req_684b77e26ee08190ba793c08abf53c22,0-request-book,"{'status_code': 200, 'request_id': '15d4c3729a...",,0,Exploring Contemporary Philosophies at Beida,book,0,0-request-book-0
0,batch_req_684b77e26ee08190ba793c08abf53c22,0-request-book,"{'status_code': 200, 'request_id': '15d4c3729a...",,1,Exploring Chinese Culture in Modern Times,book,0,0-request-book-1
0,batch_req_684b77e26ee08190ba793c08abf53c22,0-request-book,"{'status_code': 200, 'request_id': '15d4c3729a...",,2,Whispers of the Old Library,book,0,0-request-book-2
0,batch_req_684b77e26ee08190ba793c08abf53c22,0-request-book,"{'status_code': 200, 'request_id': '15d4c3729a...",,3,Whispers of the Eastern Lotus,book,0,0-request-book-3
1,batch_req_684b77e27bb8819096a2cc6522eb3673,0-request-sci,"{'status_code': 200, 'request_id': '54d2bc5aaf...",,0,Investigating the Differential Genetic Express...,article,0,0-request-sci-0
...,...,...,...,...,...,...,...,...,...
12718,batch_req_684b7ae6e4e881909d1a8b4361c9a9f7,6359-request-book,"{'status_code': 200, 'request_id': 'f341445798...",,1,"""Shadows of Azure: Secrets Beneath the Blue""",book,6359,6359-request-book-1
12718,batch_req_684b7ae6e4e881909d1a8b4361c9a9f7,6359-request-book,"{'status_code': 200, 'request_id': 'f341445798...",,2,Secrets of the Azure Stone,book,6359,6359-request-book-2
12718,batch_req_684b7ae6e4e881909d1a8b4361c9a9f7,6359-request-book,"{'status_code': 200, 'request_id': 'f341445798...",,3,Whispers Beneath the Azure Veil,book,6359,6359-request-book-3
12719,batch_req_684b7ae6f2248190a7ef068ec4342c09,6359-request-sci,"{'status_code': 200, 'request_id': '2d685baf64...",,0,Characterization and Properties of Azurite: A ...,article,6359,6359-request-sci-0


In [31]:
# Keep only the columns used by the later merges on 'uid' and 'desc_custom_id'.
df_titles_pruned = df_3.loc[:, ['uid', 'type', 'generated_title', 'desc_custom_id']]
df_titles_pruned.head()

Unnamed: 0,uid,type,generated_title,desc_custom_id
0,0,book,Exploring Contemporary Philosophies at Beida,0-request-book-0
0,0,book,Exploring Chinese Culture in Modern Times,0-request-book-1
0,0,book,Whispers of the Old Library,0-request-book-2
0,0,book,Whispers of the Eastern Lotus,0-request-book-3
1,0,article,Investigating the Differential Genetic Express...,0-request-sci-0


In [12]:
# Pull the generated text out of every description response; the per-response
# list is exploded so the column holds the text itself rather than a list.
df_description_2 = description_df.assign(
    generated_description=lambda frame: frame['response'].apply(get_generation).explode()
)
df_description_2.head()

Unnamed: 0,id,custom_id,response,error,generated_description
0,batch_req_684d8104f56c819083d15084cf597f89,0-request-book-0,"{'status_code': 200, 'request_id': '9c77c5ed08...",,"""Exploring Contemporary Philosophies at Beida""..."
1,batch_req_684d81050c1c8190911d5a046358d633,0-request-book-1,"{'status_code': 200, 'request_id': '7332a7920a...",,"""Exploring Chinese Culture in Modern Times"" of..."
2,batch_req_684d81051de48190b63c4856b3b181cf,0-request-book-2,"{'status_code': 200, 'request_id': '4d6b1a7d55...",,"In ""Whispers of the Old Library,"" A Beida weav..."
3,batch_req_684d81052f7881909f4dd3f329247808,0-request-book-3,"{'status_code': 200, 'request_id': '1cb06c974b...",,"""Whispers of the Eastern Lotus"" is a captivati..."
4,batch_req_684d810543dc8190b1b688dab6ceea2f,0-request-sci-0,"{'status_code': 200, 'request_id': '329f02e16f...",,This article explores the variations in geneti...


In [41]:
# Reduce the description frame to the generated text and its join key,
# exposing custom_id under the name 'desc_custom_id'.
df_description_pruned = (
    df_description_2[['generated_description', 'custom_id']]
    .rename(columns={'custom_id': 'desc_custom_id'})
)
df_description_pruned.head()

Unnamed: 0,generated_description,desc_custom_id
0,"""Exploring Contemporary Philosophies at Beida""...",0-request-book-0
1,"""Exploring Chinese Culture in Modern Times"" of...",0-request-book-1
2,"In ""Whispers of the Old Library,"" A Beida weav...",0-request-book-2
3,"""Whispers of the Eastern Lotus"" is a captivati...",0-request-book-3
4,This article explores the variations in geneti...,0-request-sci-0


In [38]:
# Load the category table that is later merged with the generated titles on 'uid'.
# NOTE(review): the .info() output further down lists the UDC columns with a
# leading space (' udc_1', ' udc_2', ...), which suggests the CSV header has a
# space after each comma - confirm before normalizing, since the exported CSV
# files inherit these column names.
df_categories = pd.read_csv('categories.csv')
df_categories.head()

Unnamed: 0,uid,text,udc_1,udc_2,udc_3,udc_4,udc_5
0,0,A Beida,1¢(533.22),,,,
1,1,A-C horizons,631.472.6,,,,
2,2,a cappella music,784.1,,,,
3,3,a la carte menus,642.53,,,,
4,4,a posteriori (natural language) systems,1¢=929.3,,,,


In [39]:
# Attach the generated titles to their category rows via the shared 'uid';
# the outer join keeps uids that appear on only one side.
df_cat_title = df_categories.merge(df_titles_pruned, how='outer', on='uid')
df_cat_title.head()

Unnamed: 0,uid,text,udc_1,udc_2,udc_3,udc_4,udc_5,type,generated_title,desc_custom_id
0,0,A Beida,1¢(533.22),,,,,book,Exploring Contemporary Philosophies at Beida,0-request-book-0
1,0,A Beida,1¢(533.22),,,,,book,Exploring Chinese Culture in Modern Times,0-request-book-1
2,0,A Beida,1¢(533.22),,,,,book,Whispers of the Old Library,0-request-book-2
3,0,A Beida,1¢(533.22),,,,,book,Whispers of the Eastern Lotus,0-request-book-3
4,0,A Beida,1¢(533.22),,,,,article,Investigating the Differential Genetic Express...,0-request-sci-0


In [42]:
# Attach each generated description to its title row via 'desc_custom_id';
# the outer join keeps keys that appear on only one side.
df_cat_title_description = df_cat_title.merge(
    df_description_pruned,
    how='outer',
    on='desc_custom_id',
)
df_cat_title_description.head()

Unnamed: 0,uid,text,udc_1,udc_2,udc_3,udc_4,udc_5,type,generated_title,desc_custom_id,generated_description
0,0,A Beida,1¢(533.22),,,,,book,Exploring Contemporary Philosophies at Beida,0-request-book-0,"""Exploring Contemporary Philosophies at Beida""..."
1,0,A Beida,1¢(533.22),,,,,book,Exploring Chinese Culture in Modern Times,0-request-book-1,"""Exploring Chinese Culture in Modern Times"" of..."
2,0,A Beida,1¢(533.22),,,,,book,Whispers of the Old Library,0-request-book-2,"In ""Whispers of the Old Library,"" A Beida weav..."
3,0,A Beida,1¢(533.22),,,,,book,Whispers of the Eastern Lotus,0-request-book-3,"""Whispers of the Eastern Lotus"" is a captivati..."
4,0,A Beida,1¢(533.22),,,,,article,Investigating the Differential Genetic Express...,0-request-sci-0,This article explores the variations in geneti...


In [46]:
# Summarize the merged frame: row count, per-column non-null counts and dtypes.
df_cat_title_description.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38160 entries, 0 to 38159
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   uid                    38160 non-null  int64  
 1   text                   38160 non-null  object 
 2    udc_1                 38160 non-null  object 
 3    udc_2                 1230 non-null   object 
 4    udc_3                 78 non-null     object 
 5    udc_4                 6 non-null      object 
 6    udc_5                 0 non-null      float64
 7   type                   38160 non-null  object 
 8   generated_title        38160 non-null  object 
 9   desc_custom_id         38160 non-null  object 
 10  generated_description  38160 non-null  object 
dtypes: float64(1), int64(1), object(9)
memory usage: 3.2+ MB


### Saving

In [45]:
# Preview the frame after dropping rows that repeat a generated_title;
# the deduplicated result is only inspected here, not stored.
df_cat_title_description.drop_duplicates(subset=['generated_title']).info()

<class 'pandas.core.frame.DataFrame'>
Index: 37373 entries, 0 to 38159
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   uid                    37373 non-null  int64  
 1   text                   37373 non-null  object 
 2    udc_1                 37373 non-null  object 
 3    udc_2                 1205 non-null   object 
 4    udc_3                 75 non-null     object 
 5    udc_4                 6 non-null      object 
 6    udc_5                 0 non-null      float64
 7   type                   37373 non-null  object 
 8   generated_title        37373 non-null  object 
 9   desc_custom_id         37373 non-null  object 
 10  generated_description  37373 non-null  object 
dtypes: float64(1), int64(1), object(9)
memory usage: 3.4+ MB


Our features are:  
generated_title: The title generated by the model.  
generated_description: The description generated by the model.

In [47]:
# Keep the first row for every distinct generated title and summarize the result.
non_duplicated_df = df_cat_title_description.drop_duplicates(
    subset='generated_title',
    keep='first',
)
non_duplicated_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 37373 entries, 0 to 38159
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   uid                    37373 non-null  int64  
 1   text                   37373 non-null  object 
 2    udc_1                 37373 non-null  object 
 3    udc_2                 1205 non-null   object 
 4    udc_3                 75 non-null     object 
 5    udc_4                 6 non-null      object 
 6    udc_5                 0 non-null      float64
 7   type                   37373 non-null  object 
 8   generated_title        37373 non-null  object 
 9   desc_custom_id         37373 non-null  object 
 10  generated_description  37373 non-null  object 
dtypes: float64(1), int64(1), object(9)
memory usage: 3.4+ MB


In [48]:
# Export the title-deduplicated dataset; index=False leaves the DataFrame index out of the file.
non_duplicated_df.to_csv('udc_dataset_no_duplicate_titles.csv', index=False)

In [43]:
# Export the complete (non-deduplicated) dataset.
# index=False keeps the positional row index out of the file, consistent with
# the deduplicated export, so reading the CSV back does not produce an extra
# unnamed index column.
final_df = df_cat_title_description.copy()
final_df.to_csv('udc_dataset.csv', index=False)