In [1]:
from glob import glob
from datasets import Dataset
import numpy as np
import pandas as pd
from pprint import pprint
import os
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rcParams
import seaborn as sns
import warnings

# Apply seaborn's default theme to subsequent matplotlib figures.
sns.set()
# Default figure size as (width, height).
rcParams["figure.figsize"] = (20, 10)
# Do not truncate columns when displaying wide DataFrames.
pd.options.display.max_columns = None
# NOTE(review): this silences *every* warning for the rest of the session,
# including deprecation and convergence warnings from the modelling libraries.
warnings.filterwarnings("ignore")
from pprint import pprint
import json

In [10]:
import json

import joblib
import numpy as np

import os

# Directory holding the feature-value JSON files and the fitted vectorizer.
# Set the DS_PROJECT_FEATURES_DIR environment variable to run the notebook on
# another machine; when it is unset the original location is used.
base_path = os.environ.get(
    "DS_PROJECT_FEATURES_DIR",
    "/Users/phamhoang1408/Desktop/20231/DS/ds_project/models/features",
)


def _load_feature_values(name):
    """Return the parsed JSON content of ``{base_path}/{name}.json``."""
    # Read as UTF-8 explicitly so the result does not depend on the platform's
    # default text encoding.
    with open(f"{base_path}/{name}.json", "r", encoding="utf-8") as f:
        return json.load(f)


# Values used to one-hot encode each categorical column.
cap_bac_feature_values = _load_feature_values("cap_bac")
dia_diem_lam_viec_feature_values = _load_feature_values("dia_diem_lam_viec")
hinh_thuc_feature_values = _load_feature_values("hinh_thuc")
loai_hinh_hoat_dong_feature_values = _load_feature_values("loai_hinh_hoat_dong")
nganh_nghe_feature_values = _load_feature_values("nganh_nghe")
quy_mo_cong_ty_feature_values = _load_feature_values("quy_mo_cong_ty")
ten_cong_ty_feature_values = _load_feature_values("ten_cong_ty")

# Ranges used to bucket the numeric ``num_followers`` column.
num_followers_feature_values = _load_feature_values("num_followers")

# NOTE: joblib.load unpickles the file, which can execute arbitrary code; only
# load artifacts produced by this project.
# Presumably a fitted TF-IDF vectorizer for the job title column - confirm.
vi_tri_viec_vectorizer = joblib.load(f"{base_path}/vi_tri_viec_vectorizer.joblib")


def convert_raw_data_to_feature(raw_data):
    """Encode one raw job posting as a single flat feature vector.

    Args:
        raw_data: mapping that provides the keys ``cap_bac``,
            ``dia_diem_lam_viec``, ``hinh_thuc``, ``loai_hinh_hoat_dong``,
            ``nganh_nghe``, ``quy_mo_cong_ty``, ``ten_cong_ty``,
            ``vi_tri_viec`` and ``num_followers``.

    Returns:
        The result of ``np.concatenate`` over, in order: one one-hot segment
        per categorical key (in the order listed above), the vectorizer output
        for ``vi_tri_viec``, and the bucketed ``num_followers`` segment.
    """

    def one_hot(vocabulary, item):
        """One-hot encode ``item`` against ``vocabulary``.

        An item missing from the vocabulary sets the last slot when the
        vocabulary contains ``None``; otherwise the segment stays all zeros.
        """
        encoded = np.zeros(len(vocabulary))
        if item in vocabulary:
            encoded[vocabulary.index(item)] = 1
        elif None in vocabulary:
            encoded[-1] = 1
        return encoded

    def bucketize(bounds, number):
        """Flag the first inclusive ``[low, high]`` entry containing ``number``.

        ``None`` sets the last slot; a number outside every entry leaves the
        segment all zeros.
        """
        encoded = np.zeros(len(bounds))
        if number is None:
            encoded[-1] = 1
            return encoded
        first_hit = next(
            (pos for pos, bound in enumerate(bounds) if bound[0] <= number <= bound[1]),
            None,
        )
        if first_hit is not None:
            encoded[first_hit] = 1
        return encoded

    def vectorize_text(fitted_vectorizer, text):
        """Return the first row of the dense ``transform`` output for ``text``."""
        return fitted_vectorizer.transform([text]).toarray()[0]

    # (vocabulary, raw_data key) pairs; the order fixes the segment layout.
    categorical_columns = (
        (cap_bac_feature_values, "cap_bac"),
        (dia_diem_lam_viec_feature_values, "dia_diem_lam_viec"),
        (hinh_thuc_feature_values, "hinh_thuc"),
        (loai_hinh_hoat_dong_feature_values, "loai_hinh_hoat_dong"),
        (nganh_nghe_feature_values, "nganh_nghe"),
        (quy_mo_cong_ty_feature_values, "quy_mo_cong_ty"),
        (ten_cong_ty_feature_values, "ten_cong_ty"),
    )
    segments = [one_hot(vocab, raw_data[key]) for vocab, key in categorical_columns]
    segments.append(vectorize_text(vi_tri_viec_vectorizer, raw_data["vi_tri_viec"]))
    segments.append(bucketize(num_followers_feature_values, raw_data["num_followers"]))
    return np.concatenate(segments)


def salary_mapper(x):
    """Parse the free-text ``luong`` field of a row into numeric bounds.

    Recognised shapes:
      * ``"<a> ... - <b> ..."``: both bounds, taken from the first
        space-separated token on each side of the single ``-``.
      * ``"Trên <a> ..."``: lower bound only (second token).
      * ``"Lên đến <b> ..."``: upper bound only (third token).
    ``,`` and ``.`` are stripped from a token before it is converted with
    ``int``. Any other text yields ``None`` for both bounds.

    When the text mentions "usd" (case-insensitive) and not "tr vnd", each
    non-zero bound is multiplied by 23 / 1000 - presumably converting USD to
    millions of VND at 23,000 VND per USD; confirm.

    Args:
        x: mapping with a string ``luong`` entry.

    Returns:
        ``{"min_luong": ..., "max_luong": ...}`` where each value is a number
        or ``None``.

    Raises:
        ValueError: if a token is not an integer after stripping separators,
            or the text contains more than one ``-``.
    """
    text = x["luong"]
    lowered = text.lower()
    # "tr vnd" takes precedence over "usd" when both appear in the text.
    in_usd = "tr vnd" not in lowered and "usd" in lowered

    def to_int(token):
        # Drop both kinds of separator before converting.
        return int(token.replace(",", "").replace(".", ""))

    lower, upper = None, None
    if "-" in text:
        left, right = text.split("-")
        lower = to_int(left.strip().split(" ")[0])
        upper = to_int(right.strip().split(" ")[0])
    elif "Trên" in text:
        lower = to_int(text.split(" ")[1])
    elif "Lên đến" in text:
        upper = to_int(text.split(" ")[2])

    if in_usd:
        # Truthiness check: None and 0 are both left untouched.
        if lower:
            lower = lower * 23 / 1000
        if upper:
            upper = upper * 23 / 1000

    return {"min_luong": lower, "max_luong": upper}


def min_filter(x):
    """Keep rows whose ``min_luong`` is present and does not exceed 200."""
    lower = x["min_luong"]
    if lower is None:
        return False
    return lower <= 200


def min_range_mapper(x):
    """Map ``x["min_luong"]`` to the index of its salary bucket.

    Bucket bounds are inclusive on both ends and checked in order, so a value
    sitting on a shared boundary falls into the lower bucket.

    Returns:
        ``{"min_luong_range": index}``, or ``None`` when no bucket contains
        the value.
    """
    buckets = (
        (0, 5),
        (5, 10),
        (10, 15),
        (15, 20),
        (20, 25),
        (25, 30),
        (30, 50),
        (50, 75),
        (75, 100),
        (100, 999999),
    )
    salary = x["min_luong"]
    bucket_index = next(
        (idx for idx, bounds in enumerate(buckets) if bounds[0] <= salary <= bounds[1]),
        None,
    )
    if bucket_index is None:
        return None
    return {"min_luong_range": bucket_index}


def max_filter(x):
    """Keep rows whose ``max_luong`` is present and does not exceed 200."""
    upper = x["max_luong"]
    if upper is None:
        return False
    return upper <= 200


def max_range_mapper(x):
    """Map ``x["max_luong"]`` to the index of its salary bucket.

    Bucket bounds are inclusive on both ends and checked in order, so a value
    sitting on a shared boundary falls into the lower bucket.

    Returns:
        ``{"max_luong_range": index}``, or ``None`` when no bucket contains
        the value.
    """
    buckets = (
        (0, 10),
        (10, 20),
        (20, 30),
        (30, 50),
        (50, 75),
        (75, 100),
        (100, 999999),
    )
    salary = x["max_luong"]
    bucket_index = next(
        (idx for idx, bounds in enumerate(buckets) if bounds[0] <= salary <= bounds[1]),
        None,
    )
    if bucket_index is None:
        return None
    return {"max_luong_range": bucket_index}


In [11]:
# Load the job postings from a JSON Lines file; the path is relative to the
# notebook's working directory.
ds = Dataset.from_json("../crawl/final/final_dataset.jsonl")

In [12]:
# Display the raw dataset's column names and row count.
ds

Dataset({
    features: ['vi_tri_viec', 'ten_cong_ty', 'dia_diem_lam_viec', 'ngay_cap_nhat', 'nganh_nghe', 'hinh_thuc', 'luong', 'cap_bac', 'het_han_nop', 'dia_chi_cong_ty', 'loai_hinh_hoat_dong', 'quy_mo_cong_ty', 'num_followers', 'min_exp', 'max_exp'],
    num_rows: 7184
})

In [13]:
# 1) Drop postings whose salary is one of the non-numeric labels, parse the
#    remaining salary text, then build one bucketed dataset per salary bound.
# NOTE: `ds` is rebound here and loses its "luong" column, so this cell cannot
# be re-run without reloading `ds` first.
non_numeric_salary_labels = ["Thoả thuận", "Thương lượng", "Cạnh tranh"]
ds = ds.filter(lambda row: row["luong"] not in non_numeric_salary_labels)
ds = ds.map(salary_mapper, remove_columns=["luong"])

rows_with_min = ds.filter(min_filter)
ds_min = rows_with_min.map(min_range_mapper)

rows_with_max = ds.filter(max_filter)
ds_max = rows_with_max.map(max_range_mapper)

Map:   0%|          | 0/3992 [00:00<?, ? examples/s]

In [14]:
# Display the columns and row counts of the two bucketed datasets.
ds_min, ds_max

(Dataset({
     features: ['vi_tri_viec', 'ten_cong_ty', 'dia_diem_lam_viec', 'ngay_cap_nhat', 'nganh_nghe', 'hinh_thuc', 'cap_bac', 'het_han_nop', 'dia_chi_cong_ty', 'loai_hinh_hoat_dong', 'quy_mo_cong_ty', 'num_followers', 'min_exp', 'max_exp', 'min_luong', 'max_luong', 'min_luong_range'],
     num_rows: 4185
 }),
 Dataset({
     features: ['vi_tri_viec', 'ten_cong_ty', 'dia_diem_lam_viec', 'ngay_cap_nhat', 'nganh_nghe', 'hinh_thuc', 'cap_bac', 'het_han_nop', 'dia_chi_cong_ty', 'loai_hinh_hoat_dong', 'quy_mo_cong_ty', 'num_followers', 'min_exp', 'max_exp', 'min_luong', 'max_luong', 'max_luong_range'],
     num_rows: 3992
 }))

In [15]:
def _attach_feature_vector(row):
    """Return the encoded feature vector for `row` plus its salary bounds."""
    return {
        "feature_vector": convert_raw_data_to_feature(row),
        "min_luong": row["min_luong"],
        "max_luong": row["max_luong"],
    }


# Encode every row, then keep only the model input and the matching target
# columns for each salary bound.
min_columns = ["feature_vector", "min_luong", "min_luong_range"]
ds_min = ds_min.map(_attach_feature_vector).select_columns(min_columns)

max_columns = ["feature_vector", "max_luong", "max_luong_range"]
ds_max = ds_max.map(_attach_feature_vector).select_columns(max_columns)

Map:   0%|          | 0/3992 [00:00<?, ? examples/s]

In [16]:
# Split each dataset once (90% train / 10% test, fixed seed) and reuse the
# result for both halves rather than computing the same seeded split twice.
split_min = ds_min.train_test_split(train_size=0.9, seed=42)
ds_min_train = split_min["train"]
ds_min_test = split_min["test"]
X_train_min = ds_min_train["feature_vector"]
y_train_min = ds_min_train["min_luong_range"]
X_test_min = ds_min_test["feature_vector"]
y_test_min = ds_min_test["min_luong_range"]

split_max = ds_max.train_test_split(train_size=0.9, seed=42)
ds_max_train = split_max["train"]
ds_max_test = split_max["test"]
X_train_max = ds_max_train["feature_vector"]
y_train_max = ds_max_train["max_luong_range"]
X_test_max = ds_max_test["feature_vector"]
y_test_max = ds_max_test["max_luong_range"]

In [22]:
# random forest
from sklearn.ensemble import RandomForestClassifier

# Same hyper-parameters for both targets; the fixed random_state keeps the
# fitted forests reproducible.
rf_params = dict(n_estimators=100, max_depth=10, random_state=42)

rf_min = RandomForestClassifier(**rf_params).fit(X_train_min, y_train_min)
rf_max = RandomForestClassifier(**rf_params).fit(X_train_max, y_train_max)

# Mean accuracy on the held-out split for each salary bound.
rf_min_accuracy = rf_min.score(X_test_min, y_test_min)
rf_max_accuracy = rf_max.score(X_test_max, y_test_max)
print("Min score:", rf_min_accuracy)
print("Max score:", rf_max_accuracy)

Min score: 0.5656324582338902
Max score: 0.575


In [23]:
# lightgbm
from lightgbm import LGBMClassifier

# Default hyper-parameters; one classifier per salary bound.
model_min = LGBMClassifier().fit(X_train_min, y_train_min)
min_accuracy = model_min.score(X_test_min, y_test_min)
print("Min score:", min_accuracy)

model_max = LGBMClassifier().fit(X_train_max, y_train_max)
max_accuracy = model_max.score(X_test_max, y_test_max)
print("Max score:", max_accuracy)

# Persist both fitted models. The file names are kept exactly as they are
# because other code may load them by these names.
joblib.dump(model_min, "../models/models/lgmb_min.joblib")
joblib.dump(model_max, "../models/models/lgmb_max.joblib")

Min score: 0.5990453460620525
Max score: 0.595


['../models/models/lgmb_max.joblib']

In [24]:
# xgboost
from xgboost import XGBClassifier

# Default hyper-parameters; one classifier per salary bound.
# NOTE: `model_min` / `model_max` are rebound here, replacing any models
# previously stored under those names.
model_min = XGBClassifier().fit(X_train_min, y_train_min)
min_accuracy = model_min.score(X_test_min, y_test_min)
print("Min score:", min_accuracy)

model_max = XGBClassifier().fit(X_train_max, y_train_max)
max_accuracy = model_max.score(X_test_max, y_test_max)
print("Max score:", max_accuracy)

# Persist both fitted models.
joblib.dump(model_min, "../models/models/xgb_min.joblib")
joblib.dump(model_max, "../models/models/xgb_max.joblib")

Min score: 0.5918854415274463
Max score: 0.5925


['../models/models/xgb_max.joblib']