## Environment and Data

### Modules import, config, custom functions

Data:
https://www.kaggle.com/datasets/arshkon/linkedin-job-postings

In [3]:
# Environment and configs
import sys
import os
from pathlib import Path

import kagglehub
from kagglehub import KaggleDatasetAdapter

parent_dir = Path.cwd().resolve().parent
sys.path.append(str(parent_dir))

from src.config import Config

config = Config()
print(parent_dir)
print('Config initialized')

# Modules for data 
import pandas as pd

C:\Users\Мариан\Desktop\Jupyter Notes\Projects\Trainee_iFortex\Git\job_posting
Config initialized


In [4]:
def size_memory_info(df: pd.DataFrame, name: str = 'current df'):
    size_in_bytes = df.memory_usage(deep=True).sum()
    size_in_megabytes = size_in_bytes / (1024 ** 2)
    size_in_gigabytes = size_in_bytes / (1024 ** 3)

    print(f"\nMemory usage of {name}: {size_in_megabytes:.2f} MB ~ {size_in_gigabytes:.2f} GB\
                \nNumber of rows in this table: {df.shape[0]}\
                \nNumber of columns in this table: {df.shape[1]}\n")

### Download Dataset directly from kaggle

In [4]:
# os.environ["KAGGLEHUB_CACHE"] = str(config.get('raw_dir'))
# # Set the path to the file you'd like to load
# path = kagglehub.dataset_download("arshkon/linkedin-job-postings")

# # Load the latest version
# df = kagglehub.load_dataset(
#   KaggleDatasetAdapter.PANDAS,
#   "arshkon/linkedin-job-postings",
#   file_path,
#   # Provide any additional arguments like 
#   # sql_query or pandas_kwargs. See the 
#   # documenation for more information:
#   # https://github.com/Kaggle/kagglehub/blob/main/README.md#kaggledatasetadapterpandas
# )

# print("First 5 records:", df.head())

### Reading Raw_Data

In [9]:
postings_raw = pd.read_csv(config.get('postings'))

skills_id = pd.read_csv(config.get('skills_id'))
mapping_skills = pd.read_csv(config.get('mapping_skills'))

industries_id = pd.read_csv(config.get('industries_id'))
mapping_industries = pd.read_csv(config.get('mapping_industries'))

In [10]:
print(postings_raw.shape)

print(skills_id.shape)
print(mapping_skills.shape)

print(industries_id.shape)
print(mapping_industries.shape)

(123849, 31)
(213768, 2)
(35, 2)
(164808, 2)
(422, 2)


### Basic Info about Data

In [7]:
postings_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 123849 entries, 0 to 123848
Data columns (total 31 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   job_id                      123849 non-null  int64  
 1   company_name                122130 non-null  object 
 2   title                       123849 non-null  object 
 3   description                 123842 non-null  object 
 4   max_salary                  29793 non-null   float64
 5   pay_period                  36073 non-null   object 
 6   location                    123849 non-null  object 
 7   company_id                  122132 non-null  float64
 8   views                       122160 non-null  float64
 9   med_salary                  6280 non-null    float64
 10  min_salary                  29793 non-null   float64
 11  formatted_work_type         123849 non-null  object 
 12  applies                     23320 non-null   float64
 13  original_liste

In [8]:
postings_raw[['job_id','title']].head(4)

Unnamed: 0,job_id,title
0,921716,Marketing Coordinator
1,1829192,Mental Health Therapist/Counselor
2,10998357,Assitant Restaurant Manager
3,23221523,Senior Elder Law / Trusts and Estates Associat...


In [9]:
industries_id.head(5)

Unnamed: 0,job_id,industry_id
0,3884428798,82
1,3887473071,48
2,3887465684,41
3,3887467939,82
4,3887467939,80


In [10]:
mapping_industries.head(5)

Unnamed: 0,industry_id,industry_name
0,1,Defense and Space Manufacturing
1,3,Computer Hardware Manufacturing
2,4,Software Development
3,5,Computer Networking Products
4,6,"Technology, Information and Internet"


Would like to see how our feature looks:

In [11]:
list(postings_raw.loc[:,'description'].sample(5)[1:20])

["Angi® is transforming the home services industry, creating an environment for homeowners, service professionals and employees to feel right at “home.” For most home maintenance needs, our platform makes it easier than ever to find a qualified service professional for indoor and outdoor jobs, home renovations (or anything in between!). We are on a mission to become the home for everything home by helping small businesses thrive and providing solutions to financing and booking home jobs with just a few clicks.\n\nOver the last 25 years we have opened our doors to a network of over 200K service professionals and helped over 150 million homeowners love where they live. We believe home is the most important place on earth and are embarking on a journey to redefine how people care for their homes. Angi is an amazing place to build your dream career, join us—we cannot wait to welcome you home!\n\nOur Inside Sales Representatives are responsible for expanding our network of service providers

## DQC

### Duplicates postings_raw

In [12]:
postings_raw.duplicated().sum()

np.int64(0)

No full duplicates

In [13]:
postings_raw[postings_raw.duplicated(subset=["job_id"], keep = False)]

Unnamed: 0,job_id,company_name,title,description,max_salary,pay_period,location,company_id,views,med_salary,...,skills_desc,listed_time,posting_domain,sponsored,work_type,currency,compensation_type,normalized_salary,zip_code,fips


No duplicates for job_id.

### 'Description' Duplicates in postings_raw

In [28]:
print('Number of duplicates values in description: ',postings_raw.duplicated(subset=["description"], keep = False).sum())

Number of duplicates values in description:  22122


In [15]:
def description_duplicates(postings_raw):
    print(f' \'description\' duplicates precentage: {postings_raw[postings_raw.duplicated(subset=["description"], keep=False)].sort_values("description").shape[0] / postings_raw.shape[0]*100:.2f}% ')

In [16]:
description_duplicates(postings_raw=postings_raw)

 'description' duplicates precentage: 17.86% 


Exact values:

In [17]:
postings_raw[postings_raw.duplicated(subset=["description"], keep=False)].sort_values("description").head(10)

Unnamed: 0,job_id,company_name,title,description,max_salary,pay_period,location,company_id,views,med_salary,...,skills_desc,listed_time,posting_domain,sponsored,work_type,currency,compensation_type,normalized_salary,zip_code,fips
88229,3904393150,Boehringer Ingelheim,"US_Specialist III, LMS",\n\n\nDescription \n\n\n\n\n\nLearning is a cr...,,,"Ridgefield, CT",3235.0,112.0,,...,,1713395000000.0,jobs.boehringer-ingelheim.com,0,FULL_TIME,,,,6877.0,9001.0
108217,3905337321,Boehringer Ingelheim,"Global Specialist III, LMS",\n\n\nDescription \n\n\n\n\n\nLearning is a cr...,,,"Ridgefield, CT",3235.0,4.0,,...,,1713481000000.0,jobs.boehringer-ingelheim.com,0,FULL_TIME,,,,6877.0,9001.0
41184,3899520881,TEKsystems,"Sales Development Rep - Jackson, MS",\n\nAbout TEKsystems and TEKsystems Global Ser...,,HOURLY,"Ridgeland, MS",2152.0,18.0,21.0,...,,1713277000000.0,ars2.equest.com,0,FULL_TIME,USD,BASE_SALARY,43680.0,39157.0,28089.0
41159,3899519973,TEKsystems,Sales Development Rep (Columbus),\n\nAbout TEKsystems and TEKsystems Global Ser...,,,"Dublin, OH",2152.0,7.0,,...,,1713277000000.0,ars2.equest.com,0,FULL_TIME,,,,43016.0,
42343,3899533522,TEKsystems,Sales Development Rep-Madison,\n\nAbout TEKsystems and TEKsystems Global Ser...,,HOURLY,"Madison, WI",2152.0,14.0,21.0,...,,1713280000000.0,ars2.equest.com,0,FULL_TIME,USD,BASE_SALARY,43680.0,53703.0,55025.0
41612,3899525622,TEKsystems,Sales Development Representative - Fort Worth,\n\nAbout TEKsystems and TEKsystems Global Ser...,,HOURLY,"Fort Worth, TX",2152.0,11.0,21.0,...,,1713278000000.0,ars2.equest.com,0,FULL_TIME,USD,BASE_SALARY,43680.0,76102.0,48439.0
42460,3899535274,TEKsystems,Sales Development Rep-2 Greensboro (Summer 202...,\n\nAbout TEKsystems and TEKsystems Global Ser...,,,"Greensboro, NC",2152.0,4.0,,...,,1713280000000.0,ars2.equest.com,0,FULL_TIME,,,,27395.0,
42459,3899535273,TEKsystems,"Sales Development Rep 2 - Roanoke, VA (Summer ...",\n\nAbout TEKsystems and TEKsystems Global Ser...,,,"Roanoke, VA",2152.0,14.0,,...,,1713280000000.0,ars2.equest.com,0,FULL_TIME,,,,24011.0,51770.0
42190,3899531573,TEKsystems,Sales Development Rep-2- San Antonio,\n\nAbout TEKsystems and TEKsystems Global Ser...,,,"San Antonio, TX",2152.0,27.0,,...,,1713280000000.0,ars2.equest.com,0,FULL_TIME,,,,78201.0,48029.0
57242,3901942961,CJ,Associate Program Compliance Manager,\n\nFrom helping to invent the affiliate indus...,,,"Chicago, IL",5679.0,2.0,,...,,1713473000000.0,jobs.smartrecruiters.com,0,FULL_TIME,,,,60601.0,17031.0


### Duplicates in Vocabs

In [18]:
industries_id

Unnamed: 0,job_id,industry_id
0,3884428798,82
1,3887473071,48
2,3887465684,41
3,3887467939,82
4,3887467939,80
...,...,...
164803,3902882321,104
164804,3902879720,27
164805,3902876855,80
164806,3902878689,116


In [27]:
industries_id[industries_id.duplicated(subset=["job_id"], keep=False)].sort_values("job_id")

Unnamed: 0,job_id,industry_id
16315,112576855,34
16314,112576855,31
53188,115639136,128
53187,115639136,44
53189,115639136,46
...,...,...
164730,3906266272,12
164728,3906266272,17
164606,3906267126,4
164605,3906267126,6


In [None]:
industries_id.isnull().sum()

job_id         0
industry_id    0
dtype: int64

In [None]:
mapping_industries.isnull().sum()

industry_id       0
industry_name    34
dtype: int64

In [33]:
mapping_industries[mapping_industries['industry_name'].isnull()]

Unnamed: 0,industry_id,industry_name
160,431,
168,564,
171,616,
243,1517,
246,1600,
247,1602,
248,1633,
249,1649,
262,1759,
271,1909,


### Missing values

In [19]:
postings_raw.isnull().sum()

job_id                             0
company_name                    1719
title                              0
description                        7
max_salary                     94056
pay_period                     87776
location                           0
company_id                      1717
views                           1689
med_salary                    117569
min_salary                     94056
formatted_work_type                0
applies                       100529
original_listed_time               0
remote_allowed                108603
job_posting_url                    0
application_url                36665
application_type                   0
expiry                             0
closed_time                   122776
formatted_experience_level     29409
skills_desc                   121410
listed_time                        0
posting_domain                 39968
sponsored                          0
work_type                          0
currency                       87776
c

We are happy! At leas we don't see a lot nulls in 'description' columns

In [20]:
size_memory_info(postings_raw)


Memory usage of current df: 928.20 MB ~ 0.91 GB                
Number of rows in this table: 123849                
Number of columns in this table: 31



### Vocabs

Job id supposed to be a unique value as part of page URL, duplicates will be removed from skills_id

In [23]:
skills_id['skill'] = skills_id['skill_abr'].\
    map(mapping_skills.set_index('skill_abr')['skill_name'])
skills_id

Unnamed: 0,job_id,skill_abr,skill
0,3884428798,MRKT,Marketing
1,3884428798,PR,Public Relations
2,3884428798,WRT,Writing/Editing
3,3887473071,SALE,Sales
4,3887465684,FIN,Finance
...,...,...,...
213763,3902876855,HR,Human Resources
213764,3902878689,MGMT,Management
213765,3902878689,MNFC,Manufacturing
213766,3902883233,SALE,Sales


In [24]:
postings_raw['target'] = postings_raw['job_id'].map(skills_id.set_index('job_id')['skill'])
postings_raw

InvalidIndexError: Reindexing only valid with uniquely valued Index objects

### Transform

In [None]:
postings_raw['description'] = postings_raw['description'].astype('string')

# Other

In [None]:


df1 = pd.DataFrame({
    "id": [1, 2, 3, 4]
})

df2 = pd.DataFrame({
    "id": [1, 2, 3],
    "value": ["a", "b", "c"]
})


In [None]:
df1

Unnamed: 0,id
0,1
1,2
2,3
3,4


In [None]:
df1["value"] = df1["id"].map(df2.set_index("id")["value"])

In [36]:
companies_raw = pd.read_csv(config.get('companies'))

In [37]:
companies_raw

Unnamed: 0,company_id,name,description,company_size,state,country,city,zip_code,address,url
0,1009,IBM,"At IBM, we do more than work. We create. We cr...",7.0,NY,US,"Armonk, New York",10504,International Business Machines Corp.,https://www.linkedin.com/company/ibm
1,1016,GE HealthCare,Every day millions of people feel the impact o...,7.0,0,US,Chicago,0,-,https://www.linkedin.com/company/gehealthcare
2,1025,Hewlett Packard Enterprise,Official LinkedIn of Hewlett Packard Enterpris...,7.0,Texas,US,Houston,77389,1701 E Mossy Oaks Rd Spring,https://www.linkedin.com/company/hewlett-packa...
3,1028,Oracle,We’re a cloud technology company that provides...,7.0,Texas,US,Austin,78741,2300 Oracle Way,https://www.linkedin.com/company/oracle
4,1033,Accenture,Accenture is a leading global professional ser...,7.0,0,IE,Dublin 2,0,Grand Canal Harbour,https://www.linkedin.com/company/accenture
...,...,...,...,...,...,...,...,...,...,...
24468,103463217,JRC Services,,2.0,0,0,0,0,0,https://www.linkedin.com/company/jrcservices
24469,103466352,Centent Consulting LLC,Centent Consulting LLC is a reputable human re...,,0,0,0,0,0,https://www.linkedin.com/company/centent-consu...
24470,103467540,"Kings and Queens Productions, LLC",We are a small but mighty collection of thinke...,,0,0,0,0,0,https://www.linkedin.com/company/kings-and-que...
24471,103468936,WebUnite,Our mission at WebUnite is to offer experience...,,Pennsylvania,US,Southampton,18966,720 2nd Street Pike,https://www.linkedin.com/company/webunite


In [5]:
company_industries = pd.read_csv(config.get('company_industries'))

In [6]:
company_industries

Unnamed: 0,company_id,industry
0,391906,Book and Periodical Publishing
1,22292832,Construction
2,20300,Banking
3,3570660,Book and Periodical Publishing
4,878353,Staffing and Recruiting
...,...,...
24370,32313,Retail Luxury Goods and Jewelry
24371,15225088,IT Services and IT Consulting
24372,2852377,Hospitality
24373,19114724,Construction


In [7]:
company_industries[company_industries.duplicated(subset=["company_id"], keep=False)].sort_values("company_id")

Unnamed: 0,company_id,industry
2908,4721,International Trade and Development
14958,4721,IT Services and IT Consulting
9906,21792,Financial Services
3361,21792,Real Estate
9890,27292,IT Services and IT Consulting
1170,27292,Staffing and Recruiting
24142,1687254,Retail Apparel and Fashion
793,1687254,Manufacturing
10987,2471716,Retail
7849,2471716,Food and Beverage Manufacturing


In [11]:
postings_raw

Unnamed: 0,job_id,company_name,title,description,max_salary,pay_period,location,company_id,views,med_salary,...,skills_desc,listed_time,posting_domain,sponsored,work_type,currency,compensation_type,normalized_salary,zip_code,fips
0,921716,Corcoran Sawyer Smith,Marketing Coordinator,Job descriptionA leading real estate firm in N...,20.0,HOURLY,"Princeton, NJ",2774458.0,20.0,,...,Requirements: \n\nWe are seeking a College or ...,1.713398e+12,,0,FULL_TIME,USD,BASE_SALARY,38480.0,8540.0,34021.0
1,1829192,,Mental Health Therapist/Counselor,"At Aspen Therapy and Wellness , we are committ...",50.0,HOURLY,"Fort Collins, CO",,1.0,,...,,1.712858e+12,,0,FULL_TIME,USD,BASE_SALARY,83200.0,80521.0,8069.0
2,10998357,The National Exemplar,Assitant Restaurant Manager,The National Exemplar is accepting application...,65000.0,YEARLY,"Cincinnati, OH",64896719.0,8.0,,...,We are currently accepting resumes for FOH - A...,1.713278e+12,,0,FULL_TIME,USD,BASE_SALARY,55000.0,45202.0,39061.0
3,23221523,"Abrams Fensterman, LLP",Senior Elder Law / Trusts and Estates Associat...,Senior Associate Attorney - Elder Law / Trusts...,175000.0,YEARLY,"New Hyde Park, NY",766262.0,16.0,,...,This position requires a baseline understandin...,1.712896e+12,,0,FULL_TIME,USD,BASE_SALARY,157500.0,11040.0,36059.0
4,35982263,,Service Technician,Looking for HVAC service tech with experience ...,80000.0,YEARLY,"Burlington, IA",,3.0,,...,,1.713452e+12,,0,FULL_TIME,USD,BASE_SALARY,70000.0,52601.0,19057.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
123844,3906267117,Lozano Smith,Title IX/Investigations Attorney,Our Walnut Creek office is currently seeking a...,195000.0,YEARLY,"Walnut Creek, CA",56120.0,1.0,,...,,1.713571e+12,,0,FULL_TIME,USD,BASE_SALARY,157500.0,94595.0,6013.0
123845,3906267126,Pinterest,"Staff Software Engineer, ML Serving Platform",About Pinterest:\n\nMillions of people across ...,,,United States,1124131.0,3.0,,...,,1.713572e+12,www.pinterestcareers.com,0,FULL_TIME,,,,,
123846,3906267131,EPS Learning,"Account Executive, Oregon/Washington",Company Overview\n\nEPS Learning is a leading ...,,,"Spokane, WA",90552133.0,3.0,,...,,1.713572e+12,epsoperations.bamboohr.com,0,FULL_TIME,,,,99201.0,53063.0
123847,3906267195,Trelleborg Applied Technologies,Business Development Manager,The Business Development Manager is a 'hunter'...,,,"Texas, United States",2793699.0,4.0,,...,,1.713573e+12,,0,FULL_TIME,,,,,


New Dataset

In [17]:
jobs = pd.read_csv('../data/01_raw/job_description.csv')

In [18]:
jobs

Unnamed: 0.1,Unnamed: 0,Category,Description,Benefits,Requirement,Requirements
0,0,Business Analyst,Description\nJob Title: Junior Business Analys...,Benefits\nMethods is passionate about its peop...,Requirements\n• Confidence in communicating an...,
1,1,Business Analyst,Description\nThe Business Analyst (BA) will ca...,Benefits\nA competitive base salary\nBonus sch...,Requirements\nYou have…\nA methodical approach...,
2,2,Business Analyst,Description\nWe help our clients design and de...,Benefits\nWe have 2 promotion windows open eac...,"Requirements\nYou are open, curious, and excit...",
3,3,Business Analyst,Description\nFounded in 2015 and rapidly expan...,Benefits\nIn addition to a competitive package...,Requirements\nKey Responsibilities Include:\nP...,
4,4,Business Analyst,Description\nWe are seeking a skilled Business...,Benefits\nFood Allowance\nGovernment Benefits\...,Requirements\nBachelor's Degree in management ...,
...,...,...,...,...,...,...
320,320,UI/UX,Description\nThe UX Designer will be focused o...,Benefits\nGovernment-mandated contributions: S...,,Requirements\nMust Haves:\nNative or Fluent En...
321,321,UI/UX,Description\nWe are seeking a dynamic UI/UX de...,Benefits\nPrivate Health Insurance\nTraining &...,,Requirements\nGather and evaluate user require...
322,322,UI/UX,Description\nWe are looking for a UI/UX Design...,Benefits\nAttractive salaries\nHybrid work mod...,,Requirements\nProven work experience as a UI/U...
323,323,UI/UX,Description\nRemote – Full-Time\nClay is a glo...,Benefits\nFully remote work\nExcellent compens...,,Requirements\nYou have 5+ years of digital des...
