In [1]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from re import search
import joblib
import warnings
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.impute import SimpleImputer

# Lift pandas' display limits so DataFrames are rendered with every column and
# every row instead of being truncated.
pd.set_option("display.max_columns", None)
pd.set_option('display.max_rows', None)
# NOTE(review): this suppresses *all* Python warnings for the rest of the
# session, including any raised by pandas or scikit-learn in the cells below.
warnings.filterwarnings('ignore')

In [2]:
def save_object(obj, filename):
    """Serialize ``obj`` with joblib into ``<cwd>/models/<filename>``.

    The ``models`` directory is created when it does not exist yet, so the
    first save in a fresh working directory does not fail.

    Parameters
    ----------
    obj : object
        Any object joblib can serialize (e.g. a fitted pipeline or encoder).
    filename : str
        Name of the target file; it is joined onto the ``models`` directory
        located under the current working directory.
    """
    models_dir = os.path.join(os.path.abspath(os.getcwd()), 'models')
    # joblib.dump opens the target file directly and does not create missing
    # parent directories, so make sure the folder is there first.
    os.makedirs(models_dir, exist_ok=True)
    file_path = os.path.join(models_dir, filename)
    joblib.dump(obj, file_path)
    
def load_object(filename):
    """Load and return the object stored at ``<cwd>/models/<filename>``.

    Parameters
    ----------
    filename : str
        Name of the file inside the ``models`` directory of the current
        working directory.

    Returns
    -------
    object
        Whatever ``joblib.load`` deserializes from that file.

    Notes
    -----
    joblib files are pickle-based; only load files you trust.
    """
    models_dir = os.path.join(os.path.abspath(os.getcwd()), 'models')
    return joblib.load(os.path.join(models_dir, filename))

In [3]:
# Technology tokens (lower-case) that are turned into individual 0/1 feature
# columns by the pipeline. Spellings are kept exactly as listed.
# NOTE(review): 'nosq' presumably mirrors the spelling found in the raw data -
# confirm against the dataset before "correcting" it.
selected_technologies = [
    'ruby', 'windows', 'oracle', 'mysql', 'hadoop(hdfs)',
    'nosq', 'redshift', 'pig', 'map-reduce', 'hbase',
    'yarn', 'postgresql', 'kafka', 'tableau', 'vertica',
    'mariadb', 'hdfs', 'numpy', 'deep learning', 'ai',
    'scoring', 'matplotlib', 'pycharm', 'scikit-learn', 'tensorflow',
    'teradata', 'anglais', 'big data', 'r', 'machine learning',
    'microsoft azure', 'spss', 'excel', 'sas', 'vba',
    'matlab', 'aws', 'gnu', 'linux',
]

# Columns handed to the classifier: every technology indicator plus 'Experience'.
selected_features = selected_technologies + ['Experience']

# Name of the column holding the label to predict.
target_feature = 'Metier'

# File names (inside the models directory) of the persisted artefacts.
pipeline_filename = 'pipeline.joblib'
target_encoder_filename = 'label_encoder.joblib'

In [4]:
class CreateIndividualTechnologyColumns(BaseEstimator, TransformerMixin):
    """Append one 0/1 indicator column per technology to the input frame.

    For every row, the ``Technologies`` cell is lower-cased, stripped and split
    on ``'/'``. For each name in ``technologies`` a new column is appended that
    holds 1 when the name is among the row's tokens and 0 otherwise.

    Parameters
    ----------
    technologies : sequence of str
        Technology names to look for. They are compared against the
        lower-cased tokens, so they should be lower-case themselves.
    """

    def __init__(self, technologies):
        # Use the constructor argument (not a module-level global) and store
        # it untouched: scikit-learn's get_params()/clone() expect __init__ to
        # keep parameters exactly as passed.
        self.technologies = technologies

    def set_individual_technology(self, row):
        """Return the row's values followed by its technology indicators.

        Parameters
        ----------
        row : pandas.Series
            One row of the input frame; must contain a ``Technologies`` entry.

        Returns
        -------
        numpy.ndarray
            The original row values with one 0/1 flag per configured
            technology appended, in the order of ``technologies``.
        """
        raw_value = row['Technologies']
        # A missing cell (NaN/None) is not a string and has no .lower();
        # treat it as "no technologies listed" instead of crashing.
        tokens = raw_value.lower().strip().split('/') if isinstance(raw_value, str) else []
        # filter(None, ...) drops empty tokens produced by leading/trailing
        # or doubled separators.
        row_technologies = np.array(list(filter(None, tokens)))
        individual_technologies = np.isin(self.technologies, row_technologies).astype(int)
        return np.concatenate((row, individual_technologies))

    def fit(self, X, y=None):
        """No-op: this transformer learns nothing from the data."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` extended with the technology indicator columns.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame containing a ``Technologies`` column.
        y : ignored

        Returns
        -------
        pandas.DataFrame
            Original columns followed by one column per configured technology.
        """
        X_ = X.copy()
        transformed_X = X_.apply(
            self.set_individual_technology,
            axis=1,
            result_type='expand'
        )
        # result_type='expand' yields positional column labels; restore the
        # original names and name the new indicator columns.
        transformed_X.columns = [*X_.columns, *np.asarray(self.technologies).tolist()]
        return transformed_X

In [5]:
class TransformExperienceColumn(BaseEstimator, TransformerMixin):
    """Convert the ``Experience`` column to float.

    Text values written with a decimal comma (e.g. ``'1,5'``) are parsed by
    replacing the comma with a dot before casting. A column that is already
    numeric is cast to float directly.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """No-op: this transformer learns nothing from the data."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` whose ``Experience`` column is float.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame containing an ``Experience`` column.
        y : ignored

        Returns
        -------
        pandas.DataFrame
            Copy of ``X`` with ``Experience`` converted to float.
        """
        X_ = X.copy()
        experience = X_['Experience']
        # The .str accessor raises AttributeError on numeric columns, which is
        # what read_csv produces when no value contains a decimal comma.
        if not pd.api.types.is_numeric_dtype(experience):
            # Literal replacement; no regular expression is needed.
            experience = experience.str.replace(',', '.', regex=False)
        X_['Experience'] = experience.astype(float)
        return X_

In [6]:
class DropNotSelectedColumns(BaseEstimator, TransformerMixin):
    """Keep only the configured feature columns of a DataFrame.

    Parameters
    ----------
    selected_features : list of str
        Column labels to retain, in the order they should appear.
    """

    def __init__(self, selected_features):
        self.selected_features = selected_features

    def fit(self, X, y=None):
        """Stateless transformer: fitting returns the instance unchanged."""
        return self

    def transform(self, X, y=None):
        """Return the ``selected_features`` columns taken from a copy of ``X``."""
        return X.copy()[self.selected_features]

In [7]:
def clean_raw_data(df):
    """Remove duplicate rows and rows without an ``Experience`` value.

    Both steps are chained so that the de-duplicated frame is the one that
    gets filtered; the input frame itself is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data containing an ``Experience`` column.

    Returns
    -------
    pandas.DataFrame
        New frame with exact duplicate rows dropped and every row whose
        ``Experience`` is missing removed.
    """
    return df.drop_duplicates().dropna(subset=['Experience'])

def split_train_data_raw(data, target_feature):
    """Encode the target, split into train/validation sets and save the encoder.

    Parameters
    ----------
    data : pandas.DataFrame
        Cleaned data holding both the features and the target column.
    target_feature : str
        Name of the target column.

    Returns
    -------
    tuple
        ``(X_train, X_val, y_train, y_val)`` from a 75/25 split with
        ``random_state=0``; the ``y`` parts are the label-encoded targets.

    Side effects
    ------------
    The fitted ``LabelEncoder`` is written to ``target_encoder_filename``
    through ``save_object``.
    """
    encoder = LabelEncoder()
    features = data.drop(target_feature, axis=1)
    encoded_target = encoder.fit_transform(data[target_feature])

    parts = train_test_split(
        features,
        encoded_target,
        test_size=0.25,
        random_state=0,
    )
    X_train, X_val, y_train, y_val = parts

    # Persist the encoder so predictions can later be mapped back to labels.
    save_object(encoder, target_encoder_filename)
    return X_train, X_val, y_train, y_val

def predict(data):
    """Predict encoded class ids for ``data`` with the saved pipeline.

    The pipeline is reloaded from ``pipeline_filename`` on every call.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw feature rows in the layout the pipeline was trained on.

    Returns
    -------
    Output of the pipeline's ``predict`` (the encoded class predictions).
    """
    return load_object(pipeline_filename).predict(data)

def train_model(X, y, random_state=0):
    """Fit the preprocessing + random-forest pipeline and save it to disk.

    Parameters
    ----------
    X : pandas.DataFrame
        Raw training features; must contain the ``Experience`` and
        ``Technologies`` columns used by the preprocessing steps.
    y : array-like
        Label-encoded training targets.
    random_state : int or None, default 0
        Seed forwarded to ``RandomForestClassifier`` so that repeated runs
        produce the same model. Pass ``None`` for unseeded training.

    Side effects
    ------------
    The fitted pipeline is written to ``pipeline_filename`` through
    ``save_object``.
    """
    classifier = RandomForestClassifier(
        n_estimators=52,
        # Weights are keyed by encoded class id; class 3 counts three times.
        # NOTE(review): assumes y contains exactly the encoded ids 0-3 -
        # confirm against the label encoder's classes.
        class_weight={0: 1, 1: 1, 2: 1, 3: 3},
        max_depth=12,
        random_state=random_state,
    )
    pipe = Pipeline(
        steps=[
            ('experience_transformer', TransformExperienceColumn()),
            ('individual_technologies', CreateIndividualTechnologyColumns(selected_technologies)),
            ('drop_not_selected_columns', DropNotSelectedColumns(selected_features)),
            # Any value still missing after feature selection is replaced by 0.
            ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
            ('classifier', classifier),
        ]
    )
    pipe.fit(X, y)
    save_object(pipe, pipeline_filename)

In [8]:
def build_model(data_raw):
    """Clean the raw data, train the pipeline and predict on the validation set.

    Parameters
    ----------
    data_raw : pandas.DataFrame
        Raw labelled data, including the target column.

    Returns
    -------
    tuple
        ``(y_val, y_pred)``: the encoded validation targets and the encoded
        predictions of the freshly trained (and saved) pipeline.
    """
    X_train, X_val, y_train, y_val = split_train_data_raw(
        clean_raw_data(data_raw),
        target_feature,
    )
    # Training saves the pipeline to disk; predict() reloads it from there.
    train_model(X_train, y_train)
    return y_val, predict(X_val)

In [9]:
# Read the semicolon-separated labelled dataset, then clean it, train the
# pipeline and obtain the validation targets and predictions (both encoded).
raw = pd.read_csv('./dataset_train_test.csv', sep=';')
y_val, y_pred = build_model(raw)

In [10]:
# Reload the saved label encoder so the report shows the original class names
# instead of the encoded ids.
le = load_object(target_encoder_filename)
print(classification_report(y_val, y_pred, target_names=le.classes_))

                     precision    recall  f1-score   support

    Data architecte       0.66      0.98      0.79       430
      Data engineer       0.98      0.66      0.79       572
     Data scientist       0.90      0.72      0.80       849
Lead data scientist       0.49      0.75      0.59       282

           accuracy                           0.76      2133
          macro avg       0.76      0.78      0.74      2133
       weighted avg       0.82      0.76      0.77      2133



In [11]:
def make_predictions(predict_data_raw):
    """Predict class labels for raw, unlabelled rows.

    Parameters
    ----------
    predict_data_raw : pandas.DataFrame
        Raw feature rows in the layout the saved pipeline was trained on.

    Returns
    -------
    numpy.ndarray
        Predicted labels, decoded back to their original names with the
        saved label encoder.
    """
    encoded_predictions = predict(predict_data_raw)
    label_encoder = load_object(target_encoder_filename)
    return label_encoder.inverse_transform(encoded_predictions)

In [12]:
# Read the semicolon-separated unlabelled dataset and show the decoded
# predictions as the cell's output.
predict_raw = pd.read_csv('./dataset_predict.csv', sep=';')
make_predictions(predict_raw)

array(['Data scientist', 'Data engineer', 'Lead data scientist',
       'Data scientist', 'Data scientist', 'Data architecte',
       'Data scientist', 'Data scientist', 'Lead data scientist',
       'Data architecte', 'Lead data scientist', 'Data architecte',
       'Data engineer', 'Data architecte', 'Data scientist',
       'Data engineer', 'Data scientist', 'Data architecte',
       'Data scientist', 'Lead data scientist'], dtype=object)