In [5]:
import pandas as pd
import pyarrow.parquet as pq

# Load one postings shard through pyarrow instead of pandas' own parquet reader.
# ignore_metadata=True asks pyarrow not to use the pandas metadata stored in
# the file when it builds the DataFrame.
postings_path = "postings_linkedin_individual_0000_part_01.parquet"
table = pq.read_table(postings_path)
df = table.to_pandas(ignore_metadata=True)

# Quick sanity check of what was loaded: (rows, columns) and the column names.
column_names = df.columns.to_list()
print("Dataset shape:", df.shape)
print("Columns:", column_names)


Dataset shape: (81920, 34)
Columns: ['job_id', 'rcid', 'company', 'rics_k50', 'rics_k200', 'rics_k400', 'title_raw', 'title_translated', 'job_category', 'role_k50', 'role_k150', 'role_k300', 'role_k500', 'role_k1000', 'role_k1250', 'role_k1500', 'location_raw', 'region', 'country', 'state', 'metro_area', 'salary', 'post_date', 'remove_date', 'ultimate_parent_rcid', 'ultimate_parent_company_name', 'onet_code', 'onet_title', 'remote_type', 'jobtitle', 'description', 'salary_min', 'salary_max', 'salary_predicted']


In [6]:
# Preview only the five leading columns: print their names, then show the
# first rows restricted to those columns.
leading_columns = df.columns[:5]
print("Columns:", leading_columns.tolist())

preview = df[leading_columns].head()
display(preview)


Columns: ['job_id', 'rcid', 'company', 'rics_k50', 'rics_k200']


Unnamed: 0,job_id,rcid,company,rics_k50,rics_k200
0,373831869800000000002,6070410,SchÃ¶n Klinik Gruppe,Healthcare and Wellness Services,Mental and Public Health Services
1,370402171700000000002,405626,"Nevada HAND, Inc.",Real Estate and Development,Real Estate Development and Management
2,347726240300000000002,730244,"Great Clips, Inc.",Wellness Products,Health and Beauty Care
3,3851002279,980711,Elora SAS,Apparel Retail,Apparel and Fashion Retail
4,3892258137,14069521,Jobs via eFinancialCareers,Human Resources Services,Employment and Staffing Services


In [7]:
# Tally missing values per column, then order the tally so the columns with
# the most missing entries come first.
missing_per_column = df.isna().sum()
null_counts = missing_per_column.sort_values(ascending=False)

print("Number of null values per column:")
display(null_counts)


Number of null values per column:


salary_min                      76124
salary_max                      76122
remove_date                       666
description                       113
location_raw                       85
title_translated                   69
jobtitle                           65
title_raw                          33
role_k1000                         11
role_k1250                         11
role_k300                          11
role_k500                          11
role_k150                          11
role_k50                           11
job_category                       11
salary                             10
salary_predicted                   10
rics_k400                           1
rics_k200                           1
ultimate_parent_rcid                1
ultimate_parent_company_name        1
rics_k50                            1
rcid                                0
job_id                              0
company                             0
role_k1500                          0
region      

In [8]:
import pandas as pd

# df1: the sampled postings file that carries the clustering columns.
# df2: the cluster-representatives checkpoint file.
df1 = pd.read_parquet("data/sampled_engineers_with_clusters_20251105_175242.parquet")
df2 = pd.read_parquet("data/cluster_reps_checkpoint_final_20251106_163826.parquet")

# Print the same three-line summary (banner, shape, column names) for each
# frame. The second banner starts with a newline so the two summaries are
# separated by a blank line.
frame_summaries = (
    ("=== sampled_engineers_with_clusters ===", df1),
    ("\n=== cluster_reps_checkpoint_final ===", df2),
)
for banner, frame in frame_summaries:
    print(banner)
    print(frame.shape)
    print(frame.columns.tolist())


=== sampled_engineers_with_clusters ===
(4500, 38)
['job_id', 'rcid', 'company', 'rics_k50', 'rics_k200', 'rics_k400', 'title_raw', 'title_translated', 'job_category', 'role_k50', 'role_k150', 'role_k300', 'role_k500', 'role_k1000', 'role_k1250', 'role_k1500', 'location_raw', 'region', 'country', 'state', 'metro_area', 'salary', 'post_date', 'remove_date', 'ultimate_parent_rcid', 'ultimate_parent_company_name', 'onet_code', 'onet_title', 'remote_type', 'jobtitle', 'description', 'salary_min', 'salary_max', 'salary_predicted', 'description_embedding', 'cluster_label', 'cluster_probability', 'clustered']

=== cluster_reps_checkpoint_final ===
(293, 13)
['cluster_id', 'row_index', 'title_raw', 'company', 'description', 'job_category', 'role_k50', 'cv_standard', 'cv_error_standard', 'cv_extracted_requirements_standard', 'cv_desc_only', 'cv_error_desc', 'cv_extracted_requirements_desc']


In [7]:
# Report how many distinct values the cluster_label column holds and show the
# first ten distinct values in order of appearance.
# NOTE(review): the recorded output lists -1 among the labels; if -1 marks
# rows that were not assigned to a cluster, it is included in this count.
# Confirm before quoting the figure as the number of clusters.
cluster_labels = df1["cluster_label"]
distinct_labels = cluster_labels.unique()

print("Nombre total de clusters :", cluster_labels.nunique())
print("Exemples :", distinct_labels[:10])



Nombre total de clusters : 295
Exemples : [ -1 152   7 284 236 283 281 184 268 333]


In [3]:
# Count how many rows of df2 share each cluster_id and show the result as a
# two-column table (cluster_id, count_per_cluster).
rows_per_cluster = df2.groupby("cluster_id").size()
rows_per_cluster.reset_index(name="count_per_cluster")


Unnamed: 0,cluster_id,count_per_cluster
0,1,1
1,2,1
2,3,1
3,4,1
4,5,1
...,...,...
288,295,1
289,296,1
290,297,1
291,305,1


In [11]:
# Hand-picked cluster labels to inspect.
selected_clusters = [
    86, 265, 46, 179, 146, 127, 230, 120, 10, 92,
    150, 281, 34, 287, 224, 79, 47, 6, 234, 247,
    126, 241, 285, 103, 77, 276, 279, 111, 62, 205,
]

# Keep only the rows of df1 whose label is in the selection, then count the
# rows per label and list the counts in ascending label order.
in_selection = df1["cluster_label"].isin(selected_clusters)
(
    df1[in_selection]
    .groupby("cluster_label")
    .size()
    .reset_index(name="count")
    .sort_values("cluster_label")
)



Unnamed: 0,cluster_label,count
0,6,3
1,10,5
2,34,3
3,46,3
4,47,4
5,62,3
6,77,3
7,79,3
8,86,5
9,92,6
