## Environment and Data

### Archive

Also checked: https://www.kaggle.com/datasets/arshkon/linkedin-job-postings/data — a multi-category dataset, but it has less data.  

### Modules import, config, custom functions

Data:
https://huggingface.co/datasets/2024-mcm-everitt-ryan/job-postings-english-clean

In [1]:
# System/env config
import sys
import os
from pathlib import Path
from dotenv import load_dotenv

parent_dir = Path.cwd().resolve().parent
sys.path.append(str(parent_dir))
print('Current dir for import:', parent_dir)

from src.config import Config
config = Config()
print('Config initialized')


import kagglehub
from kagglehub import KaggleDatasetAdapter
from datasets import load_dataset

# Modules for data 
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

Current dir for import: C:\Users\Мариан\Desktop\Jupyter Notes\Projects\Trainee_iFortex\Git\job_posting
Config initialized


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def size_memory_info(df: pd.DataFrame, name: str = 'current df') -> None:
    """Print the deep memory footprint and the shape of a DataFrame.

    Args:
        df: Frame to measure. ``memory_usage(deep=True)`` is used, so the
            contents of object (string) columns are included in the total.
        name: Label used for the frame in the printed report.

    Returns:
        None. The report is written to stdout.
    """
    size_in_bytes = df.memory_usage(deep=True).sum()
    size_in_megabytes = size_in_bytes / (1024 ** 2)
    size_in_gigabytes = size_in_bytes / (1024 ** 3)
    n_rows, n_cols = df.shape

    # Assemble the report from separate lines. The previous version used
    # backslash continuations inside a single f-string, which copied the
    # source-code indentation into the output as trailing spaces.
    report = "\n".join([
        f"Memory usage of {name}: {size_in_megabytes:.2f} MB ~ {size_in_gigabytes:.2f} GB",
        f"Number of rows in this table: {n_rows}",
        f"Number of columns in this table: {n_cols}",
    ])
    print(f"\n{report}\n")

### Download Dataset directly from HF

In [103]:
load_dotenv()
token = os.getenv("HF_TOKEN")

# `datasets` reads HF_DATASETS_CACHE when the package is imported, and it was
# already imported in the first cell, so assigning the variable here does not
# relocate the cache on its own. Pass the directory explicitly through
# `cache_dir`; the environment variable is still exported as before.
raw_dir = str(config.get('raw_dir'))
os.environ["HF_DATASETS_CACHE"] = raw_dir

data = load_dataset(
    "2024-mcm-everitt-ryan/job-postings-english-clean",
    token=token,
    cache_dir=raw_dir,
)

In [104]:
print(data)

DatasetDict({
    train: Dataset({
        features: ['document_id', 'country', 'locale', 'language', 'source', 'scraped_date', 'company_id', 'companyName', 'city', 'dataset_id', 'dataset_type', 'dataset_license', 'dataset_source', 'dataset_title', 'dataset_url', 'category', 'position_name', 'job_posting', 'job_posting_html'],
        num_rows: 1760394
    })
})


### Converting to DF and Exploring

In [105]:
df = data["train"].select_columns(['category','job_posting']).to_pandas()

In [106]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1760394 entries, 0 to 1760393
Data columns (total 2 columns):
 #   Column       Dtype 
---  ------       ----- 
 0   category     object
 1   job_posting  object
dtypes: object(2)
memory usage: 26.9+ MB


In [None]:
#df = df[['category','job_posting']]

In [107]:
size_memory_info(df)


Memory usage of current df: 4957.29 MB ~ 4.84 GB                
Number of rows in this table: 1760394                
Number of columns in this table: 2



In [108]:
list(df['job_posting'][0:50])

['$100 Sign On BONUS! We are seeking a caring, compassionate and reliable CAREGiver for a wonderful client in Lindenhurst. This is a fantastic position for someone who does not want to work weekends or evenings. Call or text now 631 Mon-Friday 10am-2pm. Primary responsibilities include: Companionship and conversation. Light housekeeping tasks and meal preparation. Medication and appointment reminders. Dementia CareEach Home Instead franchise is independently owned and operated.',
 "Read Below Only If You Want to Make Money Online & Be Financially Independent! Work From Home. Work Profile. Digital Media Projects. Recruitment and Team Management. Learn a New Things. Social Media Optimization. Jobs Description. Leads Generation and Sales Generation. Promotion and Advertisement. Assets Managment. Digital Entrepreneurship Project. Social Media Marketing. How To Generate Income By Using Social Media. Benefit. Communication Skills. Personal Development Skills. Confidence Level Skills. Financi

### Data Quality Checks (DQC)

In [109]:
df

Unnamed: 0,category,job_posting
0,Management,"$100 Sign On BONUS! We are seeking a caring, c..."
1,Entrepreneurship,Read Below Only If You Want to Make Money Onli...
2,UNKNOWN,FANTASTIC CAREER OPPORTUNITIES! Community Care...
3,Warehouse,£500 Bonus for Attendance during Black Friday ...
4,Seasonal,£500 Bonus for Attendance during Black Friday ...
...,...,...
1760389,UNKNOWN,€35k plus. Excellent Development opportunities...
1760390,UNKNOWN,"€45, 000 - €52, 000 Basic (OTE 70k). Hybrid C..."
1760391,Vertrieb,"Quick summary . Despite the global pandemic, C..."
1760392,Engineering,Are you a Fire Protection Engineer looking for...


In [116]:
df[df['category'] != 'UNKNOWN'].shape[0]

1041882

In [117]:
df = df[df['category'] != 'UNKNOWN']

In [118]:
def description_duplicates(df, column="job_posting"):
    """Print the percentage of rows whose value in `column` is not unique.

    Every member of a duplicate group is counted (``keep=False``), so two
    identical postings contribute two rows to the numerator.

    Args:
        df: Frame containing `column`.
        column: Column checked for duplicates; defaults to "job_posting".

    Returns:
        None. The percentage is written to stdout.
    """
    total_rows = df.shape[0]
    if total_rows == 0:
        # An empty frame has no duplicates; avoids dividing by zero.
        share = 0.0
    else:
        # Summing the boolean mask gives the row count directly; no need to
        # materialise and sort the duplicated rows just to count them.
        duplicated_rows = int(df.duplicated(subset=[column], keep=False).sum())
        share = duplicated_rows / total_rows * 100
    print(f" '{column}' duplicates precentage: {share:.2f}% ")

In [119]:
description_duplicates(df)

 'job_posting' duplicates precentage: 7.57% 


In [120]:
df.isnull().sum()

category       0
job_posting    0
dtype: int64

In [121]:
df.columns

Index(['category', 'job_posting'], dtype='object')

In [122]:
df[df.duplicated(subset=["job_posting"], keep=False)].sort_values("job_posting")

Unnamed: 0,category,job_posting
575008,Medical,"$1, 000 SIGN ON BONUS - FULL-TIME. $500 SIGN O..."
574627,Medical,"$1, 000 SIGN ON BONUS - FULL-TIME. $500 SIGN O..."
574883,Medical,"$1, 000 SIGN ON BONUS - FULL-TIME. $500 SIGN O..."
495806,Medical,"$1, 000 SIGN-ON BONUS. FULL TIME AND PART TIME..."
495240,Nursing,"$1, 000 SIGN-ON BONUS. FULL TIME AND PART TIME..."
...,...,...
3180,Manufacturing,£9.75 per hour. IMMEDIATE START. Temp. Perm. A...
1758952,Manufacturing,£9.75 per hour. IMMEDIATE START. Temp. Perm. A...
3371,Manufacturing,£9.75 per hour. IMMEDIATE START. Temp. Perm. A...
1759025,Multilingual,£9.91 per hour. IMMEDIATE START. Pertemps are ...


In [41]:
list(df[df.duplicated(subset=["job_posting"], keep=False)].sort_values("job_posting")['job_posting'][0:10])

["$1, 000 SIGN ON BONUS - FULL-TIME. $500 SIGN ON BONUS - PART TIME. Differentials. -6p-6a $2.00. -weekends: $2.50. General Purpose. Perform direct resident care under the supervision of licensed nursing personnel. Assist with promoting a compassionate physical and psychosocial environment for the residents. Essential Job Duties. Properly use and maintain facility equipment. Assist residents with activities of daily living including bathing, dressing, grooming, toileting, changing of bed linens, and positioning in and out of bed, chair, etc. Prepare residents for meals and snacks. Assist residents in eating where needed and record food intake. Maintain residents' rooms in neat, orderly and clutter-free manner. Ensure residents are given correct diet. Support residents' participation in activity programs. Directly respond, within scope, to needs and concerns of residents and family members including call lights. Ensure residents rights are being honored at all times. Provide resident ca

In [123]:
# Drop every row whose posting text occurs more than once. keep=False marks all
# members of a duplicate group, so no representative copy is retained.
# NOTE(review): presumably intentional, because identical texts can carry
# different categories (see the duplicate listing above) — confirm. Use
# keep="first" instead if one copy per text should survive.
df = df[~(df.duplicated(subset=["job_posting"], keep=False))]

In [124]:
size_memory_info(df)


Memory usage of current df: 2811.64 MB ~ 2.75 GB                
Number of rows in this table: 963026                
Number of columns in this table: 2



### Category

In [125]:
df['category'].nunique()

1138

In [128]:
list(df['category'].unique())

['Management',
 'Entrepreneurship',
 'Automotive',
 'Warehouse',
 'Manufacturing',
 'Seasonal',
 'Government',
 'Accounting',
 'Transportation',
 'Operations',
 'Telecommunications',
 'Executive',
 'Physician',
 'Legal',
 'Developer',
 'Administration',
 'Graduate',
 'Chemist',
 'Fertigung',
 'Insurance',
 'Service',
 'Engineering',
 'Nursing',
 'Science',
 'Finance',
 'Education',
 'Lawyer',
 'Research',
 'Facilities',
 'Banking',
 'Sales',
 'Logistics',
 'Marketing',
 'Healthcare',
 'Design',
 'Student',
 'Support',
 'Environmental',
 'ONET',
 'Temporary',
 'Aviation',
 'Pharmaceutical',
 'Technician',
 'Driving',
 'Bank',
 'Other',
 'Counselor',
 'Maintenance',
 'Retail',
 'Administrative',
 'Workplace',
 'Construction',
 'Chef',
 'Technology',
 'Internet',
 'Multilingual',
 'Accountancy',
 'Assistants',
 'Training',
 'Welding',
 'Factory',
 'Delivery',
 'Veterinarian',
 'Nonprofit',
 'Consulting',
 'Biotechnology',
 'Military',
 'Art',
 'Medical',
 'Caregiver',
 'Restaurant',
 'Car

In [126]:
df = df[~(df['category'].str.contains('UNKNOWN', na=False))]

In [127]:
df['category'].nunique()

462

In [53]:
list(df['category'].unique())

['Management',
 'Entrepreneurship',
 'Automotive',
 'Warehouse',
 'Manufacturing',
 'Seasonal',
 'Government',
 'Accounting',
 'Transportation',
 'Operations',
 'Telecommunications',
 'Executive',
 'Physician',
 'Legal',
 'Developer',
 'Administration',
 'Graduate',
 'Chemist',
 'Fertigung',
 'Insurance',
 'Service',
 'Engineering',
 'Nursing',
 'Science',
 'Finance',
 'Education',
 'Lawyer',
 'Research',
 'Facilities',
 'Banking',
 'Sales',
 'Logistics',
 'Marketing',
 'Healthcare',
 'Design',
 'Student',
 'Support',
 'Environmental',
 'ONET',
 'Temporary',
 'Aviation',
 'Pharmaceutical',
 'Technician',
 'Driving',
 'Bank',
 'Other',
 'Counselor',
 'Maintenance',
 'Retail',
 'Administrative',
 'Workplace',
 'Construction',
 'Chef',
 'Technology',
 'Internet',
 'Multilingual',
 'Accountancy',
 'Assistants',
 'Training',
 'Welding',
 'Factory',
 'Delivery',
 'Veterinarian',
 'Nonprofit',
 'Consulting',
 'Biotechnology',
 'Military',
 'Art',
 'Medical',
 'Caregiver',
 'Restaurant',
 'Car

In [129]:
df[df['category'] == 'Maschinenbauingenieure']

Unnamed: 0,category,job_posting
1447186,Maschinenbauingenieure,Manager Engineering System Operations (m/f/d) ...
1700215,Maschinenbauingenieure,We are an established partner and engineering ...


In [130]:
df[df['category'] == 'Kommissionierer']

Unnamed: 0,category,job_posting
1754116,Kommissionierer,You are looking for a new job in Monchengladba...


In [57]:
size_memory_info(df)


Memory usage of current df: 2642.31 MB ~ 2.58 GB                
Number of rows in this table: 919811                
Number of columns in this table: 2



In [100]:
df['job_posting'].unique().shape

(919169,)

In [None]:
# Keep only categories that hold at least 1% as many postings as the largest one.
counts = df["category"].value_counts()
max_count = counts.max()
is_frequent = counts.ge(0.01 * max_count)
valid_categories = counts.loc[is_frequent].index
# Preview the surviving categories, in order of first appearance in df.
df.loc[df["category"].isin(valid_categories), "category"].unique()

array(['Management', 'Entrepreneurship', 'Automotive', 'Warehouse',
       'Manufacturing', 'Seasonal', 'Government', 'Accounting',
       'Transportation', 'Operations', 'Telecommunications', 'Executive',
       'Legal', 'Administration', 'Graduate', 'Insurance', 'Service',
       'Engineering', 'Nursing', 'Science', 'Finance', 'Education',
       'Research', 'Facilities', 'Banking', 'Sales', 'Logistics',
       'Marketing', 'Healthcare', 'Design', 'Support', 'Environmental',
       'ONET', 'Temporary', 'Pharmaceutical', 'Other', 'Maintenance',
       'Retail', 'Administrative', 'Workplace', 'Construction',
       'Technology', 'Internet', 'Multilingual', 'Accountancy',
       'Assistants', 'Training', 'Consulting', 'Art', 'Medical',
       'Restaurant', 'Careers', 'Managerial', 'IT', 'Cleaning',
       'Hospitality', 'Purchasing', 'Developers', 'Recruitment',
       'Procurement', 'Business', 'Computer', 'Architecture', 'Media',
       'NHS', 'Security', 'Creative', 'Travel', 'Energy

In [132]:
df = df[df["category"].isin(valid_categories)].copy()

In [134]:
list(df['category'].unique())

['Management',
 'Entrepreneurship',
 'Automotive',
 'Warehouse',
 'Manufacturing',
 'Seasonal',
 'Government',
 'Accounting',
 'Transportation',
 'Operations',
 'Telecommunications',
 'Executive',
 'Legal',
 'Administration',
 'Graduate',
 'Insurance',
 'Service',
 'Engineering',
 'Nursing',
 'Science',
 'Finance',
 'Education',
 'Research',
 'Facilities',
 'Banking',
 'Sales',
 'Logistics',
 'Marketing',
 'Healthcare',
 'Design',
 'Support',
 'Environmental',
 'ONET',
 'Temporary',
 'Pharmaceutical',
 'Other',
 'Maintenance',
 'Retail',
 'Administrative',
 'Workplace',
 'Construction',
 'Technology',
 'Internet',
 'Multilingual',
 'Accountancy',
 'Assistants',
 'Training',
 'Consulting',
 'Art',
 'Medical',
 'Restaurant',
 'Careers',
 'Managerial',
 'IT',
 'Cleaning',
 'Hospitality',
 'Purchasing',
 'Developers',
 'Recruitment',
 'Procurement',
 'Business',
 'Computer',
 'Architecture',
 'Media',
 'NHS',
 'Security',
 'Creative',
 'Travel',
 'Energy',
 'Others',
 'FMCG',
 'Consultancy

In [135]:
df["category"].value_counts().shape

(88,)

In [138]:
list(df["category"].value_counts())

[70873,
 53311,
 47889,
 47639,
 42576,
 42388,
 39392,
 32126,
 27999,
 26350,
 23620,
 22646,
 21218,
 20038,
 17451,
 17373,
 17057,
 16361,
 15251,
 12334,
 11888,
 11862,
 10737,
 10543,
 10145,
 9545,
 9507,
 8997,
 8872,
 8835,
 8296,
 7908,
 7320,
 7063,
 6761,
 6748,
 6626,
 5297,
 5283,
 4868,
 4830,
 4366,
 4317,
 4303,
 4217,
 3821,
 3675,
 3544,
 3529,
 3520,
 3516,
 3187,
 2982,
 2931,
 2716,
 2704,
 2664,
 2595,
 2540,
 2420,
 2168,
 2134,
 2032,
 1893,
 1857,
 1855,
 1738,
 1618,
 1610,
 1515,
 1510,
 1439,
 1431,
 1396,
 1266,
 1236,
 1227,
 1145,
 1099,
 1078,
 1077,
 1072,
 1004,
 921,
 907,
 831,
 828,
 757]

In [151]:
# Canonical category -> raw labels that are folded into it. A canonical name
# appears in its own list wherever the flat mapping carried an identity entry.
_category_aliases = {
    'Administrative': ['Administration', 'Administrative', 'Receptionists', 'Assistants'],
    'Accounting': ['Accountancy', 'Accounting'],
    'Telecommunications': ['Telecommunication', 'Telecommunications'],
    'Management': ['Executive', 'Managerial', 'Management'],
    'IT': ['IT', 'Computer', 'Technology', 'Developers', 'Web', 'Internet'],
    'Healthcare': ['Medical', 'Healthcare', 'Nursing'],
    'Science': ['Scientific', 'Science', 'Research'],
    'Hospitality': ['Restaurant', 'Hospitality', 'Housekeeping'],
    'Consulting': ['Consultancy', 'Consulting'],
    'Quality Assurance': ['QA'],
    'Manufacturing': ['Manufacturing', 'Production'],
    'Training': ['Graduate', 'Training'],
    'Education': ['Education'],
    'Procurement': ['Purchasing'],
}

# Flat lookup used downstream: raw label -> canonical label.
synonym_map = {
    alias: canonical
    for canonical, aliases in _category_aliases.items()
    for alias in aliases
}

# Labels that are removed from the data set altogether.
bad_categories = set([
    'ONET',
    'NHS',
    'Careers',
    'Workplace',
    'Other',
    'Others',
    'Multilingual',
    'Seasonal',
    'Temporary',
])

In [152]:
def _clean_text(x):
    if pd.isna(x):
        return x
    x = str(x).strip()
    
    return x

df['category'] = df['category'].apply(_clean_text)

In [153]:
# Discard rows labelled with a non-informative category, then collapse
# synonymous labels; labels absent from synonym_map are kept as they are.
is_bad = df['category'].isin(bad_categories)
df = df[~is_bad].copy()

def _to_canonical(label):
    """Look up the canonical category for `label`, defaulting to `label` itself."""
    return synonym_map.get(label, label)

df['category'] = df['category'].map(_to_canonical)

In [154]:
df['category'].unique()

array(['Management', 'Entrepreneurship', 'Automotive', 'Warehouse',
       'Manufacturing', 'Government', 'Accounting', 'Transportation',
       'Operations', 'Telecommunications', 'Legal', 'Administrative',
       'Training', 'Insurance', 'Service', 'Engineering', 'Healthcare',
       'Science', 'Finance', 'Education', 'Facilities', 'Banking',
       'Sales', 'Logistics', 'Marketing', 'Design', 'Support',
       'Environmental', 'Pharmaceutical', 'Maintenance', 'Retail',
       'Construction', 'IT', 'Consulting', 'Art', 'Hospitality',
       'Cleaning', 'Procurement', 'Recruitment', 'Business',
       'Architecture', 'Media', 'Security', 'Creative', 'Travel',
       'Energy', 'FMCG', 'Actuarial', 'Mechanical', 'Agriculture',
       'Labourers', 'Quality Assurance', 'Editorial', 'Farming',
       'Communications', 'Sports', 'Leisure'], dtype=object)

In [155]:
df.shape

(819711, 2)

In [156]:
size_memory_info(df)


Memory usage of current df: 2374.14 MB ~ 2.32 GB                
Number of rows in this table: 819711                
Number of columns in this table: 2



### Exploring Descriptions

In [158]:
df.isnull().sum()

category       0
job_posting    0
dtype: int64

In [159]:
df.columns

Index(['category', 'job_posting'], dtype='object')

In [160]:
df['job_posting'] = df['job_posting'].apply(_clean_text)

In [162]:
df.duplicated(subset=["job_posting"], keep = False).sum()

np.int64(0)

In [None]:
def get_len(df):
    """Print a size, missing-value and text-length summary of a postings frame.

    Expects the columns 'job_posting' and 'category'. Reports the row count,
    missing values per column, the number of distinct categories, `describe()`
    of the posting length in characters and in whitespace-separated words, and
    the 20 most frequent categories.
    """
    print("Total rows:", len(df))

    postings = df['job_posting']
    print("Missing in job_posting:", postings.isna().sum())

    categories = df['category']
    print("Missing in category:", categories.isna().sum())
    print("Unique categories:", categories.nunique())

    # Posting length measured in characters ...
    char_lengths = postings.str.len()
    print(char_lengths.describe())
    # ... and in whitespace-separated words.
    word_lengths = postings.str.split().str.len()
    print(word_lengths.describe())

    # Most frequent categories.
    top_categories = categories.value_counts().head(20)
    print(top_categories)


Total rows: 819711
Missing in job_posting: 0
Missing in category: 0
Unique categories: 57
count    819711.000000
mean       2829.764311
std        1918.435829
min           8.000000
25%        1509.000000
50%        2443.000000
75%        3804.000000
max       46599.000000
Name: job_posting, dtype: float64
count    819711.000000
mean        413.859602
std         275.013288
min           1.000000
25%         225.000000
50%         363.000000
75%         554.000000
max        6320.000000
Name: job_posting, dtype: float64
category
Engineering           70873
Healthcare            60415
IT                    58492
Management            55663
Sales                 53311
Education             47889
Accounting            46018
Construction          42576
Administrative        37157
Legal                 32126
Manufacturing         26160
Marketing             21218
Automotive            20038
Science               18833
Banking               17057
Finance               16361
Hospitality      

In [30]:
def show(df, number):
    """Print `number` randomly sampled values of 'job_posting' as one list."""
    sampled = df['job_posting'].sample(number)
    print(list(sampled))

In [32]:
# Sample from `df`: `df_filtered` is only created in the next section, so
# referencing it here raises NameError on Restart & Run All.
show(df, 50)

['A great career opportunity for a Nurse Manager Ambulatory Surgery is now available in at a fantastic medical facility in exciting New York City. The Nurse Manager Ambulatory Surgery will come work for a 500. bed medical center that has been serving its local community for over 100 years. This medical center is an accredited Level I Trauma Center and is fully equipped to meet both adult and pediatric health emergencies. This medical center is one of the Best Regional Hospitals according to US News and is equipped with a multilingual staff to make all patients welcome. This hospital is an accredited teaching institute and clinicians grow and develop in this prestigious setting. The Nurse Manager Ambulatory Surgery will be a Nurse with a strong Perioperative Services background. The ideal candidate for the position will have previous designated leadership experience as a Preceptor Surgery or Nurse Educator Surgery and is looking to continue to help develop and lead new OR nurses. This e

### Cut long and short rows

In [183]:
# Keep postings whose whitespace-separated word count lies between 100 and
# 1000 (Series.between includes both ends by default).
word_counts = df["job_posting"].astype(str).str.split().str.len()
in_range = word_counts.between(100, 1000)
df_filtered = df.loc[in_range].copy()

print("Before filtering:", len(df))
print("After filtering:", len(df_filtered))


Before filtering: 819711
After filtering: 716675


In [None]:
get_len(df_filtered)

Total rows: 716675
Missing in job_posting: 0
Missing in category: 0
Unique categories: 57
count    716675.000000
mean       2872.942521
std        1440.040629
min         453.000000
25%        1748.000000
50%        2570.000000
75%        3798.000000
max       11558.000000
Name: job_posting, dtype: float64
count    716675.000000
mean        420.812771
std         205.398371
min         100.000000
25%         260.000000
50%         380.000000
75%         553.000000
max        1000.000000
Name: job_posting, dtype: float64
category
Engineering           62046
IT                    54638
Healthcare            52937
Education             45885
Management            44629
Sales                 41903
Accounting            40186
Construction          35441
Administrative        34480
Legal                 29622
Manufacturing         22388
Science               17126
Automotive            16394
Marketing             15942
Finance               15553
Banking               12903
Telecommunication

### Preprocessing for BERT

In [None]:
import re
def clean_for_bert(text):
    """Strip markup and contact noise from a posting, keeping words and punctuation.

    Each removed fragment is replaced by a single space and whitespace is
    normalised at the end, so neighbouring words never get glued together.

    Args:
        text: Raw posting; it is converted with ``str()`` first.

    Returns:
        The cleaned text with single spaces and no leading/trailing whitespace.
    """
    text = str(text)
    # remove html tags
    text = re.sub(r"<.*?>", " ", text)
    # remove urls; the dot after "www" is escaped so that ordinary words which
    # merely start with "www" are not deleted
    text = re.sub(r"http\S+|www\.\S+", " ", text)
    # remove emails
    text = re.sub(r"\S+@\S+", " ", text)
    # remove phone numbers; note this also matches any other run of 9+
    # digits/spaces/dashes/parentheses that starts and ends with a digit
    text = re.sub(r"\+?\d[\d\-\(\) ]{7,}\d", " ", text)
    # normalize spaces
    text = re.sub(r"\s+", " ", text).strip()
    return text


### Saving cleaned and interim data for BERT and logistic regression

In [194]:
df_filtered.to_parquet('../data/02_cleaned/job_posting_filtered_len_full.parquet')

In [190]:
# Fixed seed so that the 10k interim sample contains the same rows on every run.
df_filtered.sample(10000, random_state=42).to_csv('../data/03_interim/job_posting_filtered_len_sample_10k.csv')

In [196]:
df.to_parquet('../data/03_interim/job_posting_length_unclean.parquet')

In [195]:
size_memory_info(df_filtered)


Memory usage of current df: 2486.31 MB ~ 2.43 GB                
Number of rows in this table: 716675                
Number of columns in this table: 2



In [3]:
df_filtered = pd.read_parquet('../data/02_cleaned/job_posting_filtered_len_full.parquet')

In [4]:
df_filtered

Unnamed: 0,category,job_posting
6,Automotive,£500 Bonus on Attandance during Black Friday a...
7,Warehouse,WAREHOUSE OPERATIVES WANTED FOR A FANTASTIC SI...
9,Warehouse,URGENT! Pertemps are currently seeking 3 peopl...
10,Manufacturing,"$2, 500 POTENTIAL RETENTION BONUS! WHAT'S NEW ..."
17,Automotive,£500 Bonus on Attandance during Black Friday a...
...,...,...
1760381,Sales,Structured training and development programmes...
1760386,Management,Be responsible for managing impact to people f...
1760387,Operations,"At Liberty Mutual, technology isn't just a par..."
1760388,Science,€30k per annum. 12 month contract. Dublin. We ...


### Cleaning function for TF-IDF

In [4]:
top_categories = df_filtered['category'].value_counts().head(25).index
df_filtered = df_filtered[df_filtered['category'].isin(top_categories)].copy()

In [None]:
# Long Operation
# nltk.download('stopwords')
# nltk.download('punkt')
# nltk.download('wordnet')


# def preprocess_text(text):
    
#     text = text.lower()
    
#     text = re.sub(r'[^a-z\s]', '', text)
    
#     tokens = nltk.word_tokenize(text)
    
#     tokens = [t for t in tokens if t not in stopwords.words('english')]
    
#     lemmatizer = WordNetLemmatizer()
#     tokens = [lemmatizer.lemmatize(t) for t in tokens]
    
#     return " ".join(tokens)


# df_filtered["clean_description"] = df_filtered["job_posting"].astype(str).apply(preprocess_text)

# print(df_filtered[["job_posting", "clean_description"]].head())


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Мариан\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Мариан\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Мариан\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [25]:
size_memory_info(df_filtered)


Memory usage of current df: 1847.37 MB ~ 1.80 GB                
Number of rows in this table: 633115                
Number of columns in this table: 2



In [11]:
# Subsample for faster experiments; seeded so that a re-run draws the same rows.
df_filtered = df_filtered.sample(100000, random_state=42)

In [26]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score


# Stratified hold-out: 5% of the postings are kept for testing.
X_train, X_test, y_train, y_test = train_test_split(
    df_filtered["job_posting"],
    df_filtered["category"],
    test_size=0.05,
    random_state=42,
    stratify=df_filtered["category"]    
)

# Unigram TF-IDF over alphabetic tokens of length >= 2.
vectorizer = TfidfVectorizer(
    stop_words="english",
    lowercase=True,
    strip_accents="unicode",
    token_pattern=r"(?u)\b[a-zA-Z]{2,}\b",
    ngram_range=(1,1),     
    max_df=0.95,
    min_df=5,              
    max_features=100_000,  
    dtype="float32",
    sublinear_tf=True
)
# Fit the vocabulary on the training split only, then reuse it for the test split.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# `multi_class` is deprecated in scikit-learn (since 1.5). For a multiclass
# target the saga solver already fits a multinomial model by default, so
# dropping the argument leaves the model unchanged. saga shuffles the data,
# hence the fixed random_state for reproducible scores.
clf = LogisticRegression(max_iter=1000, n_jobs=-1, solver='saga', random_state=42)
clf.fit(X_train_tfidf, y_train)
y_pred = clf.predict(X_test_tfidf)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Macro F1:", f1_score(y_test, y_pred, average="macro"))



Accuracy: 0.634824361890321
Macro F1: 0.5961151185291798


In [28]:
from sklearn.metrics import classification_report

y_pred = clf.predict(X_test_tfidf)

report = classification_report(y_test, y_pred, digits=3)
print(report)


                    precision    recall  f1-score   support

        Accounting      0.653     0.689     0.671      2009
    Administrative      0.594     0.557     0.575      1724
        Automotive      0.761     0.749     0.755       820
           Banking      0.533     0.524     0.529       645
      Construction      0.687     0.657     0.672      1772
        Consulting      0.454     0.302     0.363       599
         Education      0.809     0.749     0.778      2294
       Engineering      0.616     0.687     0.650      3102
           Finance      0.509     0.288     0.368       778
        Government      0.624     0.603     0.613       393
        Healthcare      0.735     0.825     0.778      2647
       Hospitality      0.718     0.788     0.751       593
                IT      0.611     0.686     0.646      2732
         Insurance      0.680     0.641     0.660       513
             Legal      0.725     0.709     0.717      1481
        Management      0.484     0.543

In [20]:
df_filtered.nunique()

category           57
job_posting    716675
dtype: int64

In [29]:
df_filtered["category"].value_counts()

category
Engineering           62046
IT                    54638
Healthcare            52937
Education             45885
Management            44629
Sales                 41903
Accounting            40186
Construction          35441
Administrative        34480
Legal                 29622
Manufacturing         22388
Science               17126
Automotive            16394
Marketing             15942
Finance               15553
Banking               12903
Telecommunications    12060
Consulting            11973
Hospitality           11858
Transportation        10348
Insurance             10250
Procurement           10156
Retail                 8451
Operations             8096
Government             7850
Name: count, dtype: int64

### Training BERT for classification

In [8]:
df_filtered['category']

6             Automotive
10         Manufacturing
17            Automotive
18            Automotive
20            Government
               ...      
1760381            Sales
1760386       Management
1760387       Operations
1760388          Science
1760392      Engineering
Name: category, Length: 633115, dtype: object

In [9]:
df_filtered.columns

Index(['category', 'job_posting'], dtype='object')

In [None]:
import re
import json
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

from datasets import Dataset
from transformers import (
    AutoTokenizer, AutoConfig, AutoModelForSequenceClassification,
    TrainingArguments, Trainer, EarlyStoppingCallback
)
import evaluate

# 0) Light cleaning for BERT: strip obvious noise, leave words and punctuation alone
def clean_for_bert(text: str) -> str:
    """Replace markup and contact details with spaces, then normalise whitespace.

    The input is converted with ``str()``; the patterns are applied in the
    order listed below, each match being replaced by a single space.
    """
    noise_patterns = (
        r"<.*?>",                     # HTML tags
        r"http\S+|www\.\S+",          # URLs
        r"\S+@\S+",                   # e-mail addresses
        r"\+?\d[\d\-\(\) ]{7,}\d",    # phone numbers
    )
    cleaned = str(text)
    for pattern in noise_patterns:
        cleaned = re.sub(pattern, " ", cleaned)
    # Collapse whitespace runs and trim both ends.
    collapsed = re.sub(r"\s+", " ", cleaned)
    return collapsed.strip()

# --- IMPORTANT: make sure the column names are the ones this section relies on
assert {"job_posting", "category"}.issubset(df_filtered.columns), df_filtered.columns

# 1) Preparation: text + labels
# Note: this rebinds `df` (previously the pre-filter frame) to a cleaned copy
# of df_filtered with rows containing missing values dropped.
df = df_filtered[["job_posting", "category"]].dropna().copy()
df["job_posting"] = df["job_posting"].apply(clean_for_bert)



In [None]:
del df_filtered

In [67]:
df.sample(49)

Unnamed: 0,job_posting,category,text
1065357,Our Client: This is an opportunity within the ...,Administrative,Our Client: This is an opportunity within the ...
959289,We are recruiting for an experienced Nursery N...,Education,We are recruiting for an experienced Nursery N...
1620619,Strata Solicitors have a great opportunity for...,Management,Strata Solicitors have a great opportunity for...
1365878,Influencer enables brands to be seen at scale ...,Management,Influencer enables brands to be seen at scale ...
424553,Description We are looking for a Grill Chef to...,Hospitality,Description We are looking for a Grill Chef to...
646279,"Job Description. At Blue. Scope, we create and...",Engineering,"Job Description. At Blue. Scope, we create and..."
1641313,"The Client. A market leading, cutting edge mar...",Management,"The Client. A market leading, cutting edge mar..."
1434427,Located at Orlando International Airport. WE O...,Transportation,Located at Orlando International Airport. WE O...
1121824,A Manufacturing Worker is responsible for asse...,Manufacturing,A Manufacturing Worker is responsible for asse...
1119323,"A highly respected, prestigeous Construction C...",Legal,"A highly respected, prestigeous Construction C..."


In [68]:

# Encode labels: sorted category names -> consecutive integer ids
labels = sorted(df["category"].unique())
label2id = {lbl: i for i, lbl in enumerate(labels)}
id2label = {i: lbl for lbl, i in label2id.items()}

# Write the model inputs to dedicated columns instead of overwriting
# "category": the following cells select df[["text", "label"]], and leaving
# the source columns intact makes this cell safe to re-run.
df["text"] = df["job_posting"]
df["label"] = df["category"].map(label2id).astype(int)

# Persist the mappings (needed for inference). JSON object keys are always
# strings, so the integer ids of id2label are read back as str.
with open("label2id.json", "w", encoding="utf-8") as f:
    json.dump(label2id, f)
with open("id2label.json", "w", encoding="utf-8") as f:
    json.dump(id2label, f)


Map: 100%|██████████| 569803/569803 [10:09<00:00, 935.00 examples/s] 
Map: 100%|██████████| 63312/63312 [01:11<00:00, 883.82 examples/s]


ValueError: PyTorch needs to be installed to be able to return PyTorch tensors.

In [74]:
df = df[['text','label']]

In [75]:
df

Unnamed: 0,text,label
6,£500 Bonus on Attandance during Black Friday a...,2
10,"$2, 500 POTENTIAL RETENTION BONUS! WHAT'S NEW ...",16
17,£500 Bonus on Attandance during Black Friday a...,2
18,£500 Bonus on Attandance during Black Friday a...,2
20,"AS9102 First article & ANSI Y14.5M 1982 "" Basi...",9
...,...,...
1760381,Structured training and development programmes...,21
1760386,Be responsible for managing impact to people f...,15
1760387,"At Liberty Mutual, technology isn't just a par...",18
1760388,€30k per annum. 12 month contract. Dublin. We ...,22


In [79]:
df.to_parquet('../data/02_cleaned/bert_train_data.parquet')

In [None]:
train, test_df = train_test_split(df[["text", "label"]], test_size=0.05, random_state=42, stratify=df["label"])

In [83]:

# 2) train/val split (stratified): 10% of the training part becomes validation
train_df, val_df = train_test_split(
    train[["text", "label"]], test_size=0.1, random_state=42, stratify=train["label"]
)


In [87]:

# 3) Datasets + tokenization
model_name = "bert-base-uncased"   # alternatives to try: roberta-base / distilbert-base-uncased
max_length = 512                   # BERT's maximum sequence length

train_ds = Dataset.from_pandas(train_df, preserve_index=False)
val_ds   = Dataset.from_pandas(val_df,   preserve_index=False)

tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize(batch):
    """Tokenize a batch of texts, truncating each one to `max_length` tokens.

    No padding is applied here. Padding every posting to 512 tokens at map
    time inflates the stored dataset and the compute per step; the Trainer
    below receives the tokenizer and therefore pads each batch to its own
    longest sequence (DataCollatorWithPadding is its default collator then).
    """
    return tokenizer(
        batch["text"],
        truncation=True,
        max_length=max_length,
    )

train_ds = train_ds.map(tokenize, batched=True, remove_columns=["text"])
val_ds   = val_ds.map(tokenize,   batched=True, remove_columns=["text"])


KeyboardInterrupt: 

In [None]:

import torch

# The model receives columns by name; the target column must be called 'labels'
train_ds = train_ds.rename_column("label", "labels")
val_ds   = val_ds.rename_column("label", "labels")
train_ds.set_format(type="torch")
val_ds.set_format(type="torch")

# 4) Model + Trainer
num_labels = len(labels)
# Named `model_config` so that it does not overwrite the project-level
# `config = Config()` created in the first cell (still needed by the data
# loading cell on a re-run).
model_config = AutoConfig.from_pretrained(
    model_name,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id
)
model = AutoModelForSequenceClassification.from_pretrained(model_name, config=model_config)

metric_acc = evaluate.load("accuracy")
metric_f1  = evaluate.load("f1")
def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged F1 from (logits, true labels)."""
    logits, y_true = eval_pred
    y_pred = np.argmax(logits, axis=1)
    return {
        "accuracy": metric_acc.compute(predictions=y_pred, references=y_true)["accuracy"],
        "macro_f1": metric_f1.compute(predictions=y_pred, references=y_true, average="macro")["f1"]
    }

# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` and the Trainer's `tokenizer` argument to `processing_class`
# — confirm against the installed version.
training_args = TrainingArguments(
    output_dir="bert_jobcls",
    evaluation_strategy="steps",
    save_strategy="steps",
    eval_steps=1000,
    save_steps=1000,
    logging_steps=200,
    per_device_train_batch_size=8,     # 16 is feasible on a larger GPU
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=2,     # emulates a larger effective batch
    num_train_epochs=3,
    learning_rate=2e-5,
    warmup_ratio=0.06,
    weight_decay=0.01,
    lr_scheduler_type="linear",
    fp16=torch.cuda.is_available(),    # mixed precision only when a CUDA GPU is present
    load_best_model_at_end=True,
    metric_for_best_model="macro_f1",
    greater_is_better=True,
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

trainer.train()

# 5) Evaluation (on the validation split) and saving
eval_metrics = trainer.evaluate()
print(eval_metrics)  # accuracy, macro_f1

trainer.save_model("bert_jobcls/best_model")
tokenizer.save_pretrained("bert_jobcls/best_model")

### 