### Understanding Data

In [3]:
import json
import os
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader

In [2]:
pip install pandas

Collecting pandas
  Downloading pandas-2.2.1-cp310-cp310-macosx_10_9_x86_64.whl.metadata (19 kB)
Collecting pytz>=2020.1 (from pandas)
  Downloading pytz-2024.1-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Downloading tzdata-2024.1-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pandas-2.2.1-cp310-cp310-macosx_10_9_x86_64.whl (12.5 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 12.5/12.5 MB 12.0 MB/s eta 0:00:00
Downloading pytz-2024.1-py2.py3-none-any.whl (505 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 505.5/505.5 kB 6.9 MB/s eta 0:00:00
Downloading tzdata-2024.1-py2.py3-none-any.whl (345 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 345.4/345.4 kB 6.5 MB/s eta 0:00:00
Installing collected packages: pytz, tzdata, pandas
Successfully installed pandas-2.2.1 pytz-2024.1

### Loading Data

In [11]:
# Directory that holds the challenge CSV files.
# The default is the location this notebook was originally run from; set the
# HRFLOW_DATA_DIR environment variable to point somewhere else instead of
# editing the paths below.
DATA_DIR = Path(os.environ.get(
    'HRFLOW_DATA_DIR',
    '/Users/louisedurand-janin/Documents/GitHub/HrFlow_Data_Challenge/data',
))

# index_col=0 makes the first CSV column the DataFrame index instead of a data column.
X_train = pd.read_csv(DATA_DIR / 'X_train.csv', index_col=0)
X_test = pd.read_csv(DATA_DIR / 'X_test.csv', index_col=0)
y_train = pd.read_csv(DATA_DIR / 'y_train.csv', index_col=0)

In [3]:
# Preview the first five rows of the training features.
X_train.head(n=5)

Unnamed: 0,id,employee embedding,company embedding
0,0,"[0.0132625512778759, -0.37616726756095886, -0....","[0.39761704206466675, 0.011816106736660004, 0...."
1,1,"[-0.2827381491661072, -0.012908441945910454, 0...","[0.31051310896873474, -0.33802372217178345, 0...."
2,2,"[0.3999897241592407, -0.2984728515148163, -0.2...","[0.1970466524362564, 0.062399972230196, 0.2565..."
3,3,"[0.3999897241592407, -0.2984728515148163, -0.2...","[0.6501612067222595, -0.17757245898246765, 0.1..."
4,4,"[0.010124210268259048, 0.05761045217514038, -0...","[0.5966811180114746, -0.06162050738930702, 0.0..."


In [4]:
# Preview the first five rows of the training labels.
y_train.head(n=5)

Unnamed: 0,id,position
0,0,Executive
1,1,Executive
2,2,Executive
3,3,Executive
4,4,Executive


In [6]:
# Decode the JSON-encoded embedding strings of both frames into NumPy arrays.
# Values that are no longer strings (the cell, or another parsing cell, has
# already run) are passed through unchanged, so re-running this cell does not
# raise a TypeError from json.loads.
for frame in (X_train, X_test):
    for column in ('employee embedding', 'company embedding'):
        frame[column] = frame[column].apply(
            lambda raw: np.array(json.loads(raw)) if isinstance(raw, str) else np.asarray(raw)
        )

In [12]:
def parse_embedding(raw, dtype=np.float32):
    """Return one embedding cell as a NumPy array of ``dtype``.

    Parameters
    ----------
    raw : str, bytes, bytearray or array-like
        Either the JSON text of a list of numbers, or an already-decoded
        sequence/array (which is the case when a parsing cell has run before).
    dtype : numpy dtype, default ``np.float32``
        Element type of the returned array.

    Returns
    -------
    numpy.ndarray
        The embedding values converted to ``dtype``.
    """
    # Only text needs JSON decoding; anything else is handed to NumPy directly,
    # which keeps this cell safe to run after (or more than) once.
    values = json.loads(raw) if isinstance(raw, (str, bytes, bytearray)) else raw
    return np.asarray(values, dtype=dtype)


# Convert both embedding columns of the train and test frames to float32 arrays.
for frame in (X_train, X_test):
    for column in ('employee embedding', 'company embedding'):
        frame[column] = frame[column].apply(parse_embedding)

In [6]:
# Shapes of the employee and company embeddings stored under index label 0.
tuple(X_train[name][0].shape for name in ('employee embedding', 'company embedding'))

((32,), (32,))

Each employee embedding and each company embedding is a 1-D vector of length 32.

In [7]:
# (rows, columns) of the training and test feature frames, in that order.
tuple(frame.shape for frame in (X_train, X_test))

((29273, 3), (7327, 3))

In [19]:
# Raw label values of the 'position' column as a NumPy array.
y_train.loc[:, 'position'].values

array(['Executive', 'Executive', 'Executive', ..., 'Executive',
       'Executive', 'Executive'], dtype=object)

In [7]:
# Show the first five rows of the training features again.
X_train.head(n=5)

Unnamed: 0,id,employee embedding,company embedding
0,0,"[0.0132625512778759, -0.37616726756095886, -0....","[0.39761704206466675, 0.011816106736660004, 0...."
1,1,"[-0.2827381491661072, -0.012908441945910454, 0...","[0.31051310896873474, -0.33802372217178345, 0...."
2,2,"[0.3999897241592407, -0.2984728515148163, -0.2...","[0.1970466524362564, 0.062399972230196, 0.2565..."
3,3,"[0.3999897241592407, -0.2984728515148163, -0.2...","[0.6501612067222595, -0.17757245898246765, 0.1..."
4,4,"[0.010124210268259048, 0.05761045217514038, -0...","[0.5966811180114746, -0.06162050738930702, 0.0..."


In [13]:
# Per-column dtypes of the training features.
X_train.dtypes

id                     int64
employee embedding    object
company embedding     object
dtype: object

In [14]:
# Persist the columns at positions 1 and 2 of X_train to disk.
# These columns have dtype object, so NumPy pickles the array; reading the
# file back requires np.load(..., allow_pickle=True).
embedding_columns = X_train.iloc[:, 1:3]
np.save('X_train.npy', embedding_columns.values, allow_pickle=True)

In [22]:
# Shape of the slice that is written to 'X_train.npy'.
# Displaying only the shape avoids rendering the whole frame in the notebook.
X_train.iloc[:, 1:3].shape

(29273, 2)

In [16]:
# Integer code for each position label: the code is the label's rank in this tuple.
encoding_map = {
    title: code
    for code, title in enumerate(("Assistant", "Executive", "Manager", "Director"))
}

In [17]:
# Translate every position label into its integer code; a label missing from
# encoding_map raises KeyError.
position_codes = map(encoding_map.__getitem__, y_train['position'])
y_train_arr = np.array(list(position_codes))

# Write the encoded labels to disk.
np.save('y_train.npy', y_train_arr, allow_pickle=True)

Proportion of each class:

In [27]:
# Element-wise comparison of two small integer arrays.
np.equal(np.array([0, 0, 0, 1]), np.array([0, 0, 2, 3]))

array([ True,  True, False, False])

In [8]:
# Share of training rows per position label: label counts divided by the number of rows.
y_train['position'].value_counts().div(len(y_train))

position
Executive    0.607830
Assistant    0.308236
Director     0.046323
Manager      0.037611
Name: count, dtype: float64

Approaches:
- multiclass classification with 4 classes
- RL: treat the career ladder Assistant -> Executive -> Manager -> Director as a sequence of states, with everyone starting as Assistant; a move to the next state is rewarded if it is possible and punished otherwise?