# Environment and Data

### Modules import, config, custom functions

In [162]:
# Environment and configs
import sys
import os
from pathlib import Path

import kagglehub
from kagglehub import KaggleDatasetAdapter

# Parent of the current working directory (assumed to be the project root that
# contains the `src` package — confirm).
parent_dir = Path.cwd().resolve().parent
# Guard the insertion so that re-running this cell does not keep appending the
# same entry to sys.path.
if str(parent_dir) not in sys.path:
    sys.path.append(str(parent_dir))

from src.config import Config

# Instantiate the project configuration object (class imported from src.config above);
# later cells read file locations from it via config.get(...).
config = Config()
# Echo the resolved parent directory so the reader can confirm which checkout is in use.
print(parent_dir)
print('Config initialized')

# Modules for data 
import pandas as pd

C:\Users\Мариан\Desktop\Jupyter Notes\Projects\Trainee_iFortex\Git\job_posting
Config initialized


In [278]:
def size_memory_info(df: pd.DataFrame, name: str = 'current df') -> None:
    """Print the memory footprint and shape of a DataFrame.

    Args:
        df: Frame to measure. Memory is computed with ``memory_usage(deep=True)``.
        name: Label used for the frame in the printed report.

    Returns:
        None. The report is written to stdout.
    """
    size_in_bytes = df.memory_usage(deep=True).sum()
    size_in_megabytes = size_in_bytes / (1024 ** 2)
    size_in_gigabytes = size_in_bytes / (1024 ** 3)
    n_rows, n_cols = df.shape

    # Adjacent f-strings are concatenated by the parser, so the source indentation
    # never leaks into the printed text (a backslash continuation inside a single
    # string literal would embed it as trailing spaces).
    print(
        f"\nMemory usage of {name}: {size_in_megabytes:.2f} MB ~ {size_in_gigabytes:.2f} GB"
        f"\nNumber of rows in this table: {n_rows}"
        f"\nNumber of columns in this table: {n_cols}\n"
    )

### Download the dataset directly from Kaggle

In [2]:
# NOTE: this cell is intentionally disabled; the raw CSVs are loaded through `config`
# in the "Reading Raw_Data" section instead.
# NOTE(review): `file_path` is not defined in the visible cells — set it to the file
# you want to load before re-enabling this cell.
# os.environ["KAGGLEHUB_CACHE"] = str(config.get('raw_dir'))
# # Set the path to the file you'd like to load
# path = kagglehub.dataset_download("arshkon/linkedin-job-postings")

# # Load the latest version
# df = kagglehub.load_dataset(
#   KaggleDatasetAdapter.PANDAS,
#   "arshkon/linkedin-job-postings",
#   file_path,
#   # Provide any additional arguments like 
#   # sql_query or pandas_kwargs. See the 
#   # documentation for more information:
#   # https://github.com/Kaggle/kagglehub/blob/main/README.md#kaggledatasetadapterpandas
# )

# print("First 5 records:", df.head())

### Reading Raw_Data

In [163]:
def _read_raw(config_key):
    """Read the CSV whose location is stored under ``config_key`` in ``config``."""
    return pd.read_csv(config.get(config_key))


# Main postings table.
postings_raw = _read_raw('postings')

# Job -> skill link table and the skill lookup.
skills_id = _read_raw('skills_id')
mapping_skills = _read_raw('mapping_skills')

# Job -> industry link table and the industry lookup.
industries_id = _read_raw('industries_id')
mapping_industries = _read_raw('mapping_industries')

In [164]:
# One (rows, columns) tuple per line, in load order.
print(
    *(
        frame.shape
        for frame in (
            postings_raw,
            skills_id,
            mapping_skills,
            industries_id,
            mapping_industries,
        )
    ),
    sep="\n",
)

(123849, 31)
(213768, 2)
(35, 2)
(164808, 2)
(422, 2)


### Basic Info about Data

In [117]:
# Column names, non-null counts and dtypes of the raw postings table.
postings_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 123849 entries, 0 to 123848
Data columns (total 31 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   job_id                      123849 non-null  int64  
 1   company_name                122130 non-null  object 
 2   title                       123849 non-null  object 
 3   description                 123842 non-null  object 
 4   max_salary                  29793 non-null   float64
 5   pay_period                  36073 non-null   object 
 6   location                    123849 non-null  object 
 7   company_id                  122132 non-null  float64
 8   views                       122160 non-null  float64
 9   med_salary                  6280 non-null    float64
 10  min_salary                  29793 non-null   float64
 11  formatted_work_type         123849 non-null  object 
 12  applies                     23320 non-null   float64
 13  original_liste

In [68]:
# Peek at the identifier and title of the first four postings.
postings_raw.loc[:, ['job_id', 'title']].head(4)

Unnamed: 0,job_id,title
0,921716,Marketing Coordinator
1,1829192,Mental Health Therapist/Counselor
2,10998357,Assitant Restaurant Manager
3,23221523,Senior Elder Law / Trusts and Estates Associat...


In [59]:
# First five job -> industry links (head() defaults to 5 rows).
industries_id.head()

Unnamed: 0,job_id,industry_id
0,3884428798,82
1,3887473071,48
2,3887465684,41
3,3887467939,82
4,3887467939,80


In [58]:
# First five rows of the industry lookup (head() defaults to 5 rows).
mapping_industries.head()

Unnamed: 0,industry_id,industry_name
0,1,Defense and Space Manufacturing
1,3,Computer Hardware Manufacturing
2,4,Software Development
3,5,Computer Networking Products
4,6,"Technology, Information and Internet"


Let's look at a few sample values of the `description` feature:

In [274]:
# Show five example descriptions. Missing values are dropped so only real text is
# displayed, and the fixed random_state (arbitrary seed) makes the sample
# reproducible on a fresh kernel run.
postings_raw['description'].dropna().sample(5, random_state=42).tolist()

['Morrison Healthcare\n\nSalary: \n\nOther Forms of Compensation:\n\nPay Grade: 12\n\nMorrison Healthcare is a leading national food and nutrition services company exclusively dedicated to serving more than 600 hospitals and healthcare systems. Morrison\'s hospital kitchens, restaurants, and cafés feature socially responsible practices and exceptional guest experiences. The company\'s comprehensive Mindful Choices® wellness and sustainability platform includes the latest in healthful eating and an understanding of behavioral change in food consumption. Morrison\'s alignment with Partnership for a Healthier America\'s (PHA) Hospital Healthy Food Initiative positively impacts up to 41 million patients and 500 million hospital meals annually. Morrison has been named one of Modern Healthcare\'s "Top 100 Best Places to Work in Healthcare" for the past five years, and Training Magazine\'s Top 125 organizations for the past six consecutive years. The company is a division of Compass Group and

## DQC (Data Quality Check)

### Duplicates Basic

In [165]:
# Number of rows that repeat an earlier row across all columns.
postings_raw.duplicated().sum()

np.int64(0)

No full duplicates

In [166]:
# Every row whose job_id appears more than once (keep=False flags all members of a group).
shared_job_id_mask = postings_raw.duplicated(subset=["job_id"], keep=False)
postings_raw[shared_job_id_mask]

Unnamed: 0,job_id,company_name,title,description,max_salary,pay_period,location,company_id,views,med_salary,...,skills_desc,listed_time,posting_domain,sponsored,work_type,currency,compensation_type,normalized_salary,zip_code,fips


No duplicates for job_id.

### Description Duplicates

In [283]:
# Count every row whose description is shared with at least one other row
# (keep=False marks all members of a repeated group, not only the later ones).
# NOTE(review): pandas appears to treat missing values as equal to each other here,
# so rows with a null description may be included in this count — confirm.
print('Number of deplicates values: ',postings_raw.duplicated(subset=["description"], keep = False).sum())

Number of deplicates values:  22122


In [167]:
def description_duplicates(postings_raw):
    """Print the share of rows whose 'description' value occurs more than once.

    Every member of a repeated group is counted (``keep=False``), not only the
    second and later occurrences.

    Args:
        postings_raw: DataFrame with a 'description' column.

    Returns:
        None. The percentage is printed with two decimals.
    """
    total_rows = postings_raw.shape[0]
    # Summing the boolean mask is enough; there is no need to materialise or sort
    # the duplicated rows just to count them.
    duplicate_rows = int(
        postings_raw.duplicated(subset=["description"], keep=False).sum()
    )
    # An empty frame has no duplicates; report 0 instead of dividing by zero.
    percentage = duplicate_rows / total_rows * 100 if total_rows else 0.0
    print(f' \'description\' duplicates precentage: {percentage:.2f}% ')

In [168]:
# Share of postings whose description text is not unique.
description_duplicates(postings_raw)

 'description' duplicates precentage: 17.86% 


Exact values:

In [169]:
# Rows sharing a description, sorted so identical texts sit next to each other.
repeated_description_mask = postings_raw.duplicated(subset=["description"], keep=False)
postings_raw[repeated_description_mask].sort_values("description").head(10)

Unnamed: 0,job_id,company_name,title,description,max_salary,pay_period,location,company_id,views,med_salary,...,skills_desc,listed_time,posting_domain,sponsored,work_type,currency,compensation_type,normalized_salary,zip_code,fips
88229,3904393150,Boehringer Ingelheim,"US_Specialist III, LMS",\n\n\nDescription \n\n\n\n\n\nLearning is a cr...,,,"Ridgefield, CT",3235.0,112.0,,...,,1713395000000.0,jobs.boehringer-ingelheim.com,0,FULL_TIME,,,,6877.0,9001.0
108217,3905337321,Boehringer Ingelheim,"Global Specialist III, LMS",\n\n\nDescription \n\n\n\n\n\nLearning is a cr...,,,"Ridgefield, CT",3235.0,4.0,,...,,1713481000000.0,jobs.boehringer-ingelheim.com,0,FULL_TIME,,,,6877.0,9001.0
41184,3899520881,TEKsystems,"Sales Development Rep - Jackson, MS",\n\nAbout TEKsystems and TEKsystems Global Ser...,,HOURLY,"Ridgeland, MS",2152.0,18.0,21.0,...,,1713277000000.0,ars2.equest.com,0,FULL_TIME,USD,BASE_SALARY,43680.0,39157.0,28089.0
41159,3899519973,TEKsystems,Sales Development Rep (Columbus),\n\nAbout TEKsystems and TEKsystems Global Ser...,,,"Dublin, OH",2152.0,7.0,,...,,1713277000000.0,ars2.equest.com,0,FULL_TIME,,,,43016.0,
42343,3899533522,TEKsystems,Sales Development Rep-Madison,\n\nAbout TEKsystems and TEKsystems Global Ser...,,HOURLY,"Madison, WI",2152.0,14.0,21.0,...,,1713280000000.0,ars2.equest.com,0,FULL_TIME,USD,BASE_SALARY,43680.0,53703.0,55025.0
41612,3899525622,TEKsystems,Sales Development Representative - Fort Worth,\n\nAbout TEKsystems and TEKsystems Global Ser...,,HOURLY,"Fort Worth, TX",2152.0,11.0,21.0,...,,1713278000000.0,ars2.equest.com,0,FULL_TIME,USD,BASE_SALARY,43680.0,76102.0,48439.0
42460,3899535274,TEKsystems,Sales Development Rep-2 Greensboro (Summer 202...,\n\nAbout TEKsystems and TEKsystems Global Ser...,,,"Greensboro, NC",2152.0,4.0,,...,,1713280000000.0,ars2.equest.com,0,FULL_TIME,,,,27395.0,
42459,3899535273,TEKsystems,"Sales Development Rep 2 - Roanoke, VA (Summer ...",\n\nAbout TEKsystems and TEKsystems Global Ser...,,,"Roanoke, VA",2152.0,14.0,,...,,1713280000000.0,ars2.equest.com,0,FULL_TIME,,,,24011.0,51770.0
42190,3899531573,TEKsystems,Sales Development Rep-2- San Antonio,\n\nAbout TEKsystems and TEKsystems Global Ser...,,,"San Antonio, TX",2152.0,27.0,,...,,1713280000000.0,ars2.equest.com,0,FULL_TIME,,,,78201.0,48029.0
57242,3901942961,CJ,Associate Program Compliance Manager,\n\nFrom helping to invent the affiliate indus...,,,"Chicago, IL",5679.0,2.0,,...,,1713473000000.0,jobs.smartrecruiters.com,0,FULL_TIME,,,,60601.0,17031.0


### Missing values

In [171]:
# Missing-value count per column (isna is the same check as isnull).
postings_raw.isna().sum()

job_id                             0
company_name                    1719
title                              0
description                        7
max_salary                     94056
pay_period                     87776
location                           0
company_id                      1717
views                           1689
med_salary                    117569
min_salary                     94056
formatted_work_type                0
applies                       100529
original_listed_time               0
remote_allowed                108603
job_posting_url                    0
application_url                36665
application_type                   0
expiry                             0
closed_time                   122776
formatted_experience_level     29409
skills_desc                   121410
listed_time                        0
posting_domain                 39968
sponsored                          0
work_type                          0
currency                       87776
c

We are happy! At least we don't see a lot of nulls in the 'description' column.

In [279]:
# Report the memory footprint and shape of the raw postings frame
# (helper defined near the top of the notebook).
size_memory_info(postings_raw)


Memory usage of current df: 928.20 MB ~ 0.91 GB                
Number of rows in this table: 123849                
Number of columns in this table: 31



### Vocabs

The job id is supposed to be unique because it is part of the posting's page URL, so duplicates will be removed from skills_id.

In [286]:
# Job -> industry link table: one row per (job_id, industry_id) pair.
industries_id

Unnamed: 0,job_id,industry_id
0,3884428798,82
1,3887473071,48
2,3887465684,41
3,3887467939,82
4,3887467939,80
...,...,...
164803,3902882321,104
164804,3902879720,27
164805,3902876855,80
164806,3902878689,116


In [287]:
# Jobs linked to more than one industry: every row whose job_id repeats.
multi_industry_mask = industries_id.duplicated(subset=["job_id"], keep=False)
industries_id[multi_industry_mask]

Unnamed: 0,job_id,industry_id
3,3887467939,82
4,3887467939,80
5,3887471331,57
6,3887471331,332
7,3887471331,383
...,...,...
164791,3906264345,12
164792,3902880673,59
164793,3902880673,3243
164799,3906261853,15


In [None]:
# Translate each skill abbreviation into its readable name via the lookup table.
skill_name_by_abr = mapping_skills.set_index('skill_abr')['skill_name']
skills_id['skill'] = skills_id['skill_abr'].map(skill_name_by_abr)
skills_id

In [None]:
# skills_id has more rows than there are postings, so job_id repeats there.
# Series.map needs a uniquely indexed lookup and raises on a duplicated index,
# so keep only the first skill row per job_id before building the lookup.
# NOTE(review): this discards any additional skills of a posting — aggregate them
# instead if a multi-label target is wanted.
skill_by_job_id = (
    skills_id
    .drop_duplicates(subset='job_id')
    .set_index('job_id')['skill']
)
postings_raw['target'] = postings_raw['job_id'].map(skill_by_job_id)
postings_raw

### Transform

In [121]:
# Cast the description column from generic object dtype to pandas' dedicated 'string' dtype.
postings_raw['description'] = postings_raw['description'].astype('string')

# Other

In [5]:


# Toy frames for trying out a Series.map lookup: id 4 in df1 has no match in df2.
df1 = pd.DataFrame(dict(id=[1, 2, 3, 4]))
df2 = pd.DataFrame(dict(id=[1, 2, 3], value=["a", "b", "c"]))


In [6]:
# Show df1 before the mapped 'value' column is added.
df1

Unnamed: 0,id
0,1
1,2
2,3
3,4


In [7]:
# Look up each df1 id in df2; ids without a match get a missing value.
value_by_id = df2.set_index("id")["value"]
df1["value"] = df1["id"].map(value_by_id)