In [2]:
import pandas as pd
import re
from sklearn import preprocessing
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
import xgboost as xgb
import lightgbm as lgb
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score,plot_confusion_matrix
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.utils import column_or_1d
import numpy as np
import pickle
from imblearn.over_sampling import RandomOverSampler
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
from xgboost import XGBClassifier

In [3]:
xgb.__version__

In [4]:
# All six competition tables sit in one input folder; only the file name changes.
DATA_DIR = '../input/uet-hackathon-dataset/uet-hackathon-2022-data-science'

# Read order is info, work, label -- first the train split, then the test split.
df_train_info, df_train_work, df_train_label = (
    pd.read_csv(f'{DATA_DIR}/{table}_train.csv') for table in ('info', 'work', 'label')
)
df_test_info, df_test_work, df_test_label = (
    pd.read_csv(f'{DATA_DIR}/{table}_test.csv') for table in ('info', 'work', 'label')
)

In [5]:
df_train_info.head()

In [6]:
df_train_work.head()

In [7]:
df_train_label.head()

# Khảo sát dữ liệu

In [8]:
# The work and info tables each carry an "address" column; rename them so the
# two stay distinguishable once the tables are merged.
df_train_work = df_train_work.rename(columns={'address': 'work_address'})
df_test_work = df_test_work.rename(columns={'address': 'work_address'})

df_train_info = df_train_info.rename(columns={'address': 'home_address'})
df_test_info = df_test_info.rename(columns={'address': 'home_address'})

# Tiền xử lý dữ liệu

## Missing values

In [9]:
# Placeholder written into every missing free-text value.
MISSING_TEXT = "chưa rõ"

# (frame, column) pairs, in the same order the columns were originally filled.
for frame, column in (
    (df_train_work, "job/role"),
    (df_train_work, "work_address"),
    (df_train_info, "home_address"),
    (df_test_work, "job/role"),
    (df_test_work, "work_address"),
    (df_test_info, "home_address"),
):
    frame[column] = frame[column].replace(np.nan, MISSING_TEXT)

In [10]:
pd.set_option("display.max_rows", None)

## Tiếng Việt

In [11]:
import unicodedata

# (compiled character class of accented letters, plain replacement), applied in order.
_ACCENT_RULES = tuple(
    (re.compile(pattern), plain)
    for pattern, plain in (
        ('[áàảãạăắằẳẵặâấầẩẫậ]', 'a'),
        ('[ÁÀẢÃẠĂẮẰẲẴẶÂẤẦẨẪẬ]', 'A'),
        ('[éèẻẽẹêếềểễệ]', 'e'),
        ('[ÉÈẺẼẸÊẾỀỂỄỆ]', 'E'),
        ('[óòỏõọôốồổỗộơớờởỡợ]', 'o'),
        ('[ÓÒỎÕỌÔỐỒỔỖỘƠỚỜỞỠỢ]', 'O'),
        ('[íìỉĩị]', 'i'),
        ('[ÍÌỈĨỊ]', 'I'),
        ('[úùủũụưứừửữự]', 'u'),
        ('[ÚÙỦŨỤƯỨỪỬỮỰ]', 'U'),
        ('[ýỳỷỹỵ]', 'y'),
        ('[ÝỲỶỸỴ]', 'Y'),
        ('đ', 'd'),
        ('Đ', 'D'),
    )
)


def process_accent_vietnamese(s):
    """Replace accented Vietnamese letters in ``s`` with their plain ASCII letter.

    Letter case is preserved; every other character is left untouched.

    The input is first normalised to Unicode NFC. The character classes above
    only contain single precomposed code points, so without this step a letter
    stored as "base letter + combining mark" would keep its accent.

    Parameters
    ----------
    s : str
        Text to clean.

    Returns
    -------
    str
        ``s`` with Vietnamese diacritics removed.

    Raises
    ------
    TypeError
        If ``s`` is not a string.
    """
    s = unicodedata.normalize('NFC', s)
    for accented, plain in _ACCENT_RULES:
        s = accented.sub(plain, s)
    return s

In [12]:
# Strip diacritics from every free-text column, frame by frame.
for frame, text_columns in (
    (df_train_info, ("home_address",)),
    (df_train_work, ("work_address", "job/role")),
    (df_test_info, ("home_address",)),
    (df_test_work, ("work_address", "job/role")),
):
    for column in text_columns:
        frame[column] = frame[column].apply(process_accent_vietnamese)

## Địa chỉ (tham khảo Brain not found team)

In [13]:
def str_normalize(s):
    """Trim, lower-case and collapse runs of spaces in a string.

    Values whose type is not exactly ``str`` are returned unchanged.
    """
    if type(s) != str:
        return s
    trimmed = s.strip().lower()
    return re.sub(' +', " ", trimmed)

In [14]:
# Returned when no rule matches, and for non-string input.
_ADDRESS_FALLBACK = "viet nam"

# Ordered (province, keywords) rules. A rule matches when ANY of its keywords
# occurs as a substring of the address; the FIRST matching rule wins, so the
# order below is part of the behaviour.
_ADDRESS_RULES = (
    ("ha noi", (
        "ha noi", "hn", "ha no", "ha n?i", "thµnh phe hµ nei",
        "ha tay", "tu liem",
        "cau giay", "thanh xuan", "dong da", "dich vong", "ha dong", "nhan chinh",
        "dao tan", "kim ma", "thanh cong", "phuc xa", "keangnam",
        "nui truc", "lang ha", "nguyen van cu", "thanh oai",
        "ba dinh", "doi can", "hoang hoa tham", "yen phu", "lotte center",
        "hoang minh giam", "pho bach dang", "viet hung", "pho vong duc",
        "lac long quan", "doc tam da", "bo de", "quan nhan", "giang vo",
        "nguyen chi thanh", "tho quan", "de la thanh", "khu mieu",
        "nam dong", "kham thien", "dang tien dong",
        "thai ha", "dang van ngu", "thai thinh", "phuong mai", "tay ho",
        "hai ba trung", "van ho ", "doi cung", "bach khoa", "vinh tuy",
        "mai huong", "hai ba t", "tap the det 8/3", "truong dinh", "tan trieu",
        "yen hoa", "quan tho ", "nguyen khang", "pham tuan tai",
        "me tro thuong", "nguyen ngoc nai", "linh dam", "trung hoa",
        "nguyen trai", "nguyen xien", "hoang van thai", "chelsea park", "phu kieu",
        "dai thanh", "duong 32", "khu tap the 664", "huynh cung", "yen thi",
    )),
    ("dak lak", ("dak lak", "daklak", "dakk lak", "tp.bmt")),
    ("an giang", ("an giang",)),
    ("ba ria vung tau", ("ba ria", "vung tau", "brvt")),
    ("bac giang", ("bac giang", "bg", "yen the", "bac gian", "thon buom", "lang giang", "hiep hoa")),
    ("bac kan", ("bac kan", "bac can")),
    ("bac lieu", ("bac lieu",)),
    ("bac ninh", ("bac ninh",)),
    ("ben tre", ("ben tre",)),
    ("binh dinh", ("binh dinh",)),
    ("binh duong", ("binh duong",)),
    ("binh phuoc", ("binh phuoc",)),
    ("binh thuan", ("binh thuan",)),
    ("ca mau", ("ca mau",)),
    ("can tho", ("can tho",)),
    ("cao bang", ("cao bang",)),
    ("da nang", ("da nang",)),
    ("dak nong", ("dak nong", "daknong")),
    ("dien bien", ("dien bien",)),
    ("dong nai", ("dong nai",)),
    ("dong thap", ("dong thap",)),
    ("gia lai", ("gia lai",)),
    ("ha giang", ("ha giang",)),
    ("ha nam", ("ha nam",)),
    ("ha tinh", ("ha tinh",)),
    ("hai duong", ("hai duong", "an nhan tay")),
    ("hai phong", ("hai phong",)),
    ("hau giang", ("hau giang",)),
    ("hoa binh", ("hoa binh", "hb", "phuong huu nghi")),
    ("hung yen", ("hung yen", "hung yon")),
    ("khanh hoa", ("khanh hoa",)),
    ("kien giang", ("kien giang",)),
    ("lam dong", ("lam dong", "ld", "da lat", "bao loc", "duc trong", "dalat", "di linh")),
    ("thai nguyen", ("thai nguyen",)),
    ("nghe an", ("nghe an", "do luong", "dien chau", "quynh luu", "q.luu", "n.an", "nge an")),
    ("kon tum", ("kon tum",)),
    ("lai chau", ("lai chau",)),
    ("lang son", ("lang son",)),
    ("lao cai", ("lao cai",)),
    ("long an", ("long an",)),
    # "y yen" is the accent-stripped spelling: the notebook removes diacritics
    # before this function runs, so the accented "ý yen" alone never matched.
    ("nam dinh", ("nam dinh", "y yen", "ý yen")),
    ("ninh binh", ("ninh binh",)),
    ("ninh thuan", ("ninh thuan",)),
    ("phu tho", ("phu tho",)),
    ("phu yen", ("phu yen",)),
    ("quang binh", ("quang binh",)),
    ("quang nam", ("quang nam",)),
    ("quang ngai", ("quang ngai",)),
    ("quang ninh", ("quang ninh", "ha long")),
    ("quang tri", ("quang tri",)),
    ("tp ho chi minh", ("ho chi minh", "hcm")),
    ("soc trang", ("soc trang",)),
    ("son la", ("son la",)),
    ("tay ninh", ("tay ninh",)),
    ("thai binh", ("thai binh", "thon bich du")),
    ("thanh hoa", ("thanh hoa",)),
    ("thua thien hue", ("thua thien hue", "hue")),
    ("tien giang", ("tien giang",)),
    ("tra vinh", ("tra vinh",)),
    ("tuyen quang", ("tuyen quang",)),
    ("vinh long", ("vinh long",)),
    ("vinh phuc", ("vinh phuc", "vp", "vinh phu", "vinhphuc")),
    ("yen bai", ("yen bai",)),
)


def address_category(x):
    """Map a normalised address string to a province label.

    The address is scanned against ``_ADDRESS_RULES`` in order and the province
    of the first rule that has a keyword contained in ``x`` is returned.
    Matching is plain substring search, so short keywords such as "hn", "bg",
    "hb", "ld", "vp" or "hue" also match inside longer words.

    Parameters
    ----------
    x : str
        Address text; the rules are written in lower case without diacritics.

    Returns
    -------
    str
        The matched province label, or "viet nam" when nothing matches.
        Non-string input also yields "viet nam" (it used to fall through and
        return ``None``).
    """
    if not isinstance(x, str):
        return _ADDRESS_FALLBACK
    for province, keywords in _ADDRESS_RULES:
        if any(keyword in x for keyword in keywords):
            return province
    return _ADDRESS_FALLBACK

In [15]:
def address_preprocess(x):
    """Normalise an address with ``str_normalize`` and map it via ``address_category``."""
    return address_category(str_normalize(x))

In [16]:
# Collapse each address column to a province label (train first, then test).
for frame, column in (
    (df_train_info, "home_address"),
    (df_train_work, "work_address"),
    (df_test_info, "home_address"),
    (df_test_work, "work_address"),
):
    frame[column] = frame[column].apply(address_preprocess)

In [17]:
# Horizontal bars of home-address label frequency, smallest count first.
home_address_counts = df_train_info["home_address"].value_counts().sort_values()
home_address_counts.plot.barh(figsize=(10, 20))

## Nghề nghiệp

In [18]:
def str_normalize(s):
    """Return ``s`` trimmed, lower-cased and with repeated spaces collapsed.

    Only exact ``str`` values are changed; anything else is passed through.
    NOTE(review): a function with this name and the same behaviour is already
    defined in the address section of this notebook.
    """
    if type(s) == str:
        return re.sub(' +', " ", s.strip().lower())
    return s

In [19]:
# Returned when no rule matches, and for non-string input.
_JOB_FALLBACK = 'khong ro'

# Ordered (category, substrings, prefixes) rules. A rule matches when the job
# title contains any of `substrings` or starts with any of `prefixes`; the
# FIRST matching rule wins, so earlier rules shadow later ones (for example
# 'co van' starts with 'co ' and is therefore classified as 'giao vien').
_JOB_RULES = (
    ('tai xe', ('lai', 'tai xe'), ()),
    ('giao vien', (), (
        'giao vien', 'giang vien', 'tro giang', 'teacher', 'co ', 'gv', 'gioo vion',
        'chu nhiem lop', 'g. vien', 'g.vien',
    )),
    ('dau bep', ('bep',), ()),
    ('tro ly', ('thu ky', 'tro ly'), ('thu ky', 'assistant')),
    ('nghe sy', ('ca sy', 'hoa sy', 'dien vien'), ('nhac', 'hoa si')),
    ('y te', ('bac si', 'bac sy', 'y ta'), ('dieu duong', 'y sy', 'y i', 'nha si', 'nha sy', 'y te')),
    ('ban hang', (), ('ban',)),
    ('lanh dao',
     ('truong phong', 'giam doc', 'giom doc', 'chu tich', 'vien truong', 'vien pho',
      'hieu truong', 'hieu pho'),
     ('dao dien', 'bi thu', 'tong', 'pgd', 'gd', 'pho giam doc', 'quan doc', 'pho gd', 'ptgd',
      'uy vien', 'pho bi thu', 'bt ', 'ct ', 'ct.', 'chi cuc', 'chu tich', 'tham phan ')),
    ('quan ly',
     ('giam sat', 'truong'),
     ('quan ly', 'truong', 'doi truong', 'to truong', 'quan tri', 'pho', 'chi', 'chanh',
      'phu trach', 'quan', 'tp', 'chu quan', 'chu nhiem', 'to pho', 'thanh tra')),
    # 'thac sy' is the accent-stripped form of 'thac sỹ' (titles reach this
    # function after diacritics have been removed).
    ('chuyen gia', (), (
        'chuyen gia', 'thac sỹ', 'thac sy', 'thac si', 'tien si', 'tien sy', 'chuyen vien',
        'kiem soat', 'co van', 'duoc ', 'chuyen gia ',
    )),
    # 'dai ta' is the accent-stripped form of 'đai ta'.
    ('cong an, quan su', (), (
        'cong an ', 'chien si', 'chien sy', 'trung ta', 'đai ta', 'dai ta', 'truong cong an',
    )),
    # 'nghien cuu' is the accent-stripped form of 'nghien cứu'.
    ('trinh do cao', (), (
        'cu nhan', 'ki thuat', 'lap trinh', 'admin', 'ky thuat', 'chuyen trach ', 'nghien cứu',
        'nghien cuu', 'bao tri', 'uy vien', 'kiem dinh', 'ky su', 'dai dien', 'ki su',
        'kinh doanh', 'giao dich', 'kiem sat', 'kiem pham', 'to chuc', 'tu phap', 'tham ',
        'thong ke', 'chuan doan', 'kien truc', 'cong trinh', 'thiet ke', 'dieu',
    )),
    ('san xuat', (), ('che bien', 'nhan vien san xuat')),
    ('bao ve', (), ('truc ', 'thu kho', 'an ', 'bao ve')),
    ('kinh te vien', (), (
        'ke toan', 'kiem tra', 'kiem thu', 'kiem hoa', 'kiem dich', 'kiem hang', 'kinh te vien',
        'thu ngan', 'ket toan', 'van thu', 'thu quy', 'bien',
    )),
    # 'customer ' and 'phien dich' are two separate prefixes: a missing comma
    # used to glue them into the single, never-matching 'customer phien dich'.
    # 'tap vu' is the accent-stripped form of 'tap vụ'.
    ('dich vu vien', (), (
        'tiep vien', 'hanh chinh', 'tap vụ', 'tap vu', 'nu ho sinh', 'mau dich', 'giao nhan',
        'bhxh ', 'khuyen ', 'customer ', 'phien dich', 'dia chinh', 'cham soc', 'bao mau',
        'le tan', 'ho ly', 'huan luyen', 'dien thoai', 'tu van', 'ho tro', 'thuong ',
    )),
    ('nhan vien', (), (
        'nhan vien', 'nv', 'nhon vion', 'cb', 'can bo', 'phuc vu', 'cv', 'can su', 'hoc viec',
        'thuc tap sinh', 'trung cap', 'vu', 'giao hang', 'trac dac', 'hlv', 'cong chuc',
        'nhanvien', 'nhan  vien', 'nhiep anh', 'ke hoach', 'kiem lam', 'hop dong', 'ho sinh',
        'hat pho', 'ctv ', 'cao dang', 'tap ', 'thong ',
    )),
    ('cong nhan', (), (
        'tho ', 'cung nhon', 'cn', 'lao dong', 'thuy thu', 'cong nhan', 'boc xep', 'lao cong',
        've sinh', 'may', 'lan', 'ld', 'lam', 'kho', 'han ', 'gap xep', 'Coong nhaon',
        'c«ng nh©n', 'con nhan', 'cong  nhan', 'thuyen ', 'sua ', 'san ', 'lap', 'khai thac',
        'in ', 'moi truong', 'sua', 'tho', 'sua chua',
    )),
)


def job_category(job):
    """Map a normalised job title to a coarse occupation category.

    ``_JOB_RULES`` is scanned in order; the category of the first rule whose
    substrings occur in ``job`` or whose prefixes start ``job`` is returned.

    Parameters
    ----------
    job : str
        Job title; the rules are written in lower case without diacritics.

    Returns
    -------
    str
        The matched category, or 'khong ro' when nothing matches. Non-string
        input also yields 'khong ro' (it used to raise ``TypeError``).
    """
    if not isinstance(job, str):
        return _JOB_FALLBACK
    for category, substrings, prefixes in _JOB_RULES:
        if job.startswith(prefixes) or any(part in job for part in substrings):
            return category
    return _JOB_FALLBACK

In [20]:
def job_preprocess(x):
    """Normalise a job title with ``str_normalize`` and map it via ``job_category``."""
    return job_category(str_normalize(x))

In [21]:
# Replace the free-text job title with its category in both splits.
for work_frame in (df_train_work, df_test_work):
    work_frame["job/role"] = work_frame["job/role"].apply(job_preprocess)

In [22]:
# Horizontal bars of job-category frequency, smallest count first.
job_role_counts = df_train_work["job/role"].value_counts().sort_values()
job_role_counts.plot.barh(figsize=(5, 10))

# Feature engineering

In [23]:
df_train_info.head(7)

In [24]:
df_train_work.head(7)

In [25]:
# Join the personal-info and work tables on the shared key id_bh
# (default join type, i.e. only keys present in both tables are kept).
df_train = pd.merge(df_train_info, df_train_work, on="id_bh")
df_test = pd.merge(df_test_info, df_test_work, on="id_bh")

In [26]:
# Encode gender as integer codes. The category list is inferred once from the
# training split and reused for the test split, so a given gender value always
# receives the same code in both frames. Categorising each split on its own
# (as was done before) can assign different codes to the same value whenever
# the two splits do not contain exactly the same set of values.
gender_train = pd.Categorical(df_train['gender'])
df_train['gender'] = gender_train
df_train['gender_num'] = df_train['gender'].cat.codes

# Values missing from the training categories (and NaN) get the code -1.
df_test['gender'] = pd.Categorical(df_test['gender'], categories=gender_train.categories)
df_test['gender_num'] = df_test['gender'].cat.codes

In [27]:
# Fill missing office ids with a sentinel, then keep the first two characters
# of the id as a coarse office feature.
for frame in (df_train, df_test):
    frame["id_office"] = frame["id_office"].replace(np.nan, "ZZ000ZZ")
    frame["id_office_filter"] = frame["id_office"].str[:2]

In [28]:
# Year the age is measured against.
REFERENCE_YEAR = 2022

# 'bithYear' (sic) is the column name as it appears in the data.
for frame in (df_train, df_test):
    frame['age'] = REFERENCE_YEAR - frame['bithYear']

In [29]:
def split_year(string):
    """Return the first four characters of ``string``."""
    return string[0:4]

def split_month(string):
    """Return characters 5 and 6 of ``string`` (slice ``[4:6]``)."""
    return string[4:6]

def split_month_year(string):
    """Return the first six characters of ``string`` as a ``str``."""
    prefix = string[0:6]
    return str(prefix)

In [30]:
# Keep only the first six characters of the start and end dates (these are
# parsed with the '%Y%m' format in the next step).
for frame in (df_train, df_test):
    frame['from_date_new'] = frame['from_date'].astype('str').apply(split_month_year)
    frame['end_date_new'] = frame['to_date'].astype('str').apply(split_month_year)

In [31]:
# 'exp' = whole days between the parsed start month and the parsed end month.
for frame in (df_train, df_test):
    period_end = pd.to_datetime(frame['end_date_new'].apply(str), format='%Y%m')
    period_start = pd.to_datetime(frame['from_date_new'].apply(str), format='%Y%m')
    frame['exp'] = (period_end - period_start).dt.days

In [32]:
# Raw columns that are removed from both splits at this point.
OBSOLETE_COLUMNS = [
    'bithYear',
    'gender',
    'id_office',
    'from_date',
    'to_date',
    'from_date_new',
    'end_date_new',
    'home_address',
]

df_train = df_train.drop(columns=OBSOLETE_COLUMNS)
df_test = df_test.drop(columns=OBSOLETE_COLUMNS)

In [33]:
# Attach the label tables to the feature frames via the id_bh key.
df_train = pd.merge(df_train, df_train_label, on="id_bh")
df_test = pd.merge(df_test, df_test_label, on="id_bh")

In [34]:
# Plain Python list of the test-set id_bh values, taken after the label merge.
# NOTE(review): test_id is not read again in the visible part of this notebook.
test_id = df_test['id_bh'].tolist()

In [35]:
# Remove the 'id' column from both splits.
df_train = df_train.drop(columns='id')
df_test = df_test.drop(columns='id')

In [36]:
df_train.head(5)

# Mô hình

In [37]:
# `df` is a second name for the same DataFrame object as df_train (no copy is
# made), so columns assigned through `df` are also visible through `df_train`.
df = df_train 

In [38]:
# Integer-encode the remaining text columns.
#
# The encoder is fitted on the TRAINING column only and the test column is
# encoded with that same class list, so a given value maps to the same integer
# in both frames. Calling fit_transform on the test frame (as was done before)
# builds a second, independent mapping, which makes the test codes meaningless
# to a model trained on the training codes.
for column in ('job/role', 'work_address', 'id_office_filter'):
    le = preprocessing.LabelEncoder()
    df[column] = le.fit_transform(df[column])
    # The position in le.classes_ is exactly the code LabelEncoder assigns.
    # Values never seen in training become -1 instead of raising an error.
    df_test[column] = pd.Categorical(df_test[column], categories=le.classes_).codes.astype('int64')

In [39]:
df_train.head()

In [40]:
df_test.head()

In [42]:
# Model inputs, in the column order the model sees them.
FEATURE_COLUMNS = [
    'id_management',
    'company_type',
    'job/role',
    'employee_lv',
    'work_address',
    'gender_num',
    'id_office_filter',
    'age',
    'exp',
]

X_train = df_train[FEATURE_COLUMNS]
Y_train = df[['label']]

X_test = df_test[FEATURE_COLUMNS]

In [43]:
X_test.head(5)

In [44]:
!pip install catboost

In [45]:
import catboost
from catboost import cv, Pool, CatBoostClassifier

In [46]:
# Columns handed to CatBoost as categorical features; with the X_train column
# order used in this notebook they sit at positions 0, 1, 5 and 6.
categorical_columns = ('id_management', 'company_type', 'gender_num', 'id_office_filter')
cat_fts_idx = [X_train.columns.get_loc(name) for name in categorical_columns]

cv_data = Pool(
    data=X_train,
    label=Y_train,
    cat_features=cat_fts_idx,
)

In [47]:
# CatBoost settings for the multi-class model; the two TotalF1 variants are
# tracked as extra metrics in addition to the MultiClass loss.
params = dict(
    iterations=500,
    random_seed=42,
    bagging_temperature=0.1,
    l2_leaf_reg=10,
    leaf_estimation_iterations=5,
    loss_function="MultiClass",
    custom_metric=['TotalF1:average=Macro', 'TotalF1:average=Weighted'],
)

# 5-fold cross-validation; return_models=True also returns the per-fold models.
scores, models = cv(cv_data, params, fold_count=5, return_models=True)

In [48]:
# Class-probability predictions on the test features, one array per CV fold model.
(pred_probs_1, pred_probs_2, pred_probs_3, pred_probs_4, pred_probs_5) = (
    models[fold].predict(X_test, prediction_type='Probability') for fold in range(5)
)

In [49]:
# Unweighted mean of the five fold predictions (summed left to right).
fold_predictions = (pred_probs_1, pred_probs_2, pred_probs_3, pred_probs_4, pred_probs_5)
pred_probs = sum(fold_predictions[1:], fold_predictions[0]) / len(fold_predictions)

In [54]:
# Predicted class = position of the highest averaged probability, shifted by one.
# NOTE(review): the +1 presumably assumes the labels are the integers 1..K in
# the same order as the probability columns -- confirm against the label file.
df_test['label'] = np.argmax(pred_probs, axis=1) + 1

In [55]:
df_test.head()

In [63]:
# One row per id_bh: when an id occurs several times, its last row is kept.
submission_columns = ['id_bh', 'label']
res = df_test[submission_columns].drop_duplicates(subset='id_bh', keep='last', ignore_index=True)

In [64]:
res['label'].value_counts()

In [69]:
import os

# Write the submission file. The target folder is created explicitly so the
# write does not depend on some earlier step having created ./catboost_info.
SUBMISSION_PATH = "./catboost_info/submission.csv"
os.makedirs(os.path.dirname(SUBMISSION_PATH), exist_ok=True)
res.to_csv(SUBMISSION_PATH, index=False)