In [1]:
# important paths
DATA_PATH = r"..\data\processed\processed_1.pkl"
FEATURES_PATH = r"..\data\processed\cleaned_df.pkl"


In [2]:
# used libraries
import pandas as pd
import mlflow
import os

import numpy as np
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import StandardScaler

from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import GridSearchCV

from sklearn.decomposition import PCA
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import warnings

In [3]:
warnings.filterwarnings("ignore")

In [4]:
# reading data
processed_df = pd.read_pickle(DATA_PATH)
combined_df = pd.read_pickle(FEATURES_PATH)
df = combined_df.copy()

In [5]:
roles_df = combined_df['DevType'].copy()
roles_df.sum(axis=0)

Academic researcher                               1283
Cloud infrastructure engineer                      931
Data or business analyst                           813
Data scientist or machine learning specialist     1474
Database administrator                             245
DevOps specialist                                 1217
Developer, QA or test                              562
Developer, back-end                              13045
Developer, desktop or enterprise applications     3745
Developer, front-end                              4915
Developer, full-stack                            23303
Developer, game or graphics                        824
Developer, mobile                                 2512
Engineer, data                                    1170
Scientist                                          331
Security professional                              416
System administrator                               689
dtype: int64

#### Handling the imbalance

In [6]:
# Resample roles
samples_per_class = 1400
resampled_roles = []

In [7]:
for role_col in roles_df.columns:
    sub_df = roles_df.loc[roles_df[role_col] == 1].copy()

    if len(sub_df) / samples_per_class < 1 :
        # Upsample
        t = len(sub_df) *   (len(sub_df) / samples_per_class) 
        amo = t + samples_per_class
        sub_df = sub_df.sample(int(amo), replace=True, random_state=0)
    else:
        # Downsample
        t = (len(sub_df) / samples_per_class) 
        if t > 2 :
            t/= 2
        amo = samples_per_class * t

        sub_df = sub_df.sample(int(amo), random_state=0)

    resampled_roles.append(sub_df)

In [8]:
# Construct dfs
roles_df = pd.concat(resampled_roles)
df = df.loc[roles_df.index].copy()

In [9]:
roles_df.sum(axis=0)

Academic researcher                               2575
Cloud infrastructure engineer                     2019
Data or business analyst                          1872
Data scientist or machine learning specialist     1474
Database administrator                            1442
DevOps specialist                                 2457
Developer, QA or test                             1625
Developer, back-end                               6522
Developer, desktop or enterprise applications     1872
Developer, front-end                              2457
Developer, full-stack                            11651
Developer, game or graphics                       1884
Developer, mobile                                 2512
Engineer, data                                    2377
Scientist                                         1478
Security professional                             1523
System administrator                              1739
dtype: int64

In [10]:
X=df['Clusters']
y = df['DevType'].idxmax(axis=1)

### Basic RandomForest model

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15,stratify=y,  random_state=42)

In [12]:
numeric_cols = X_train.columns

In [13]:
preprocessor = ColumnTransformer(
                transformers=[
                    ('num', RobustScaler(), numeric_cols)
                ]
)

In [14]:
rf_clf = Pipeline([
    
    ('preprocessor', preprocessor),
    ('pca', PCA(n_components=0.99)),
    ('classifier', RandomForestClassifier(n_jobs=-1, verbose=1, random_state=42))
])

rf_clf.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    1.4s finished


Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num', RobustScaler(),
                                                  Index(['skills_group_0', 'skills_group_1', 'skills_group_10',
       'skills_group_11', 'skills_group_12', 'skills_group_13',
       'skills_group_14', 'skills_group_15', 'skills_group_16',
       'skills_group_17', 'skills_group_18', 'skills_group_19',
       'skills_group_2', 'skills_group_20', 'skills_group_21',
       'skills_group_22', 'skills_group_23', 'skills_group_24',
       'skills_group_3', 'skills_group_4', 'skills_group_5', 'skills_group_6',
       'skills_group_7', 'skills_group_9'],
      dtype='object'))])),
                ('pca', PCA(n_components=0.99)),
                ('classifier',
                 RandomForestClassifier(n_jobs=-1, random_state=42,
                                        verbose=1))])

In [15]:
y_pred = rf_clf.predict(X_test)

[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done 100 out of 100 | elapsed:    0.0s finished


In [16]:
print (classification_report(y_test, y_pred))

                                               precision    recall  f1-score   support

                          Academic researcher       0.76      0.79      0.77       386
                Cloud infrastructure engineer       0.91      0.88      0.90       303
                     Data or business analyst       0.80      0.83      0.81       281
Data scientist or machine learning specialist       0.66      0.57      0.61       221
                       Database administrator       0.85      0.94      0.89       216
                            DevOps specialist       0.90      0.81      0.85       369
                        Developer, QA or test       0.92      0.84      0.88       244
                          Developer, back-end       0.56      0.42      0.48       978
Developer, desktop or enterprise applications       0.40      0.21      0.28       281
                         Developer, front-end       0.61      0.33      0.43       368
                        Developer, full-st

In [17]:
f1=f1_score(y_test, y_pred, average='weighted')
f1

0.7003388489850783

In [18]:
precision=precision_score(y_test, y_pred, average='weighted')

In [19]:
recall=recall_score(y_test, y_pred, average='weighted')

In [20]:
# where to create the mlruns folder
MLFLOW_TRACKING_URI = '../models/mlruns'
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)


In [21]:
experiment_name = "rf_job_predict"
artifact_location = r"..\models\Models"
# experiment = mlflow.create_experiment(experiment_name, artifact_location)
experiment = mlflow.get_experiment_by_name(experiment_name)

In [22]:
# Set MLFLOW_EXPERIMENT_LOCATION to the desired artifact location
os.environ["MLFLOW_EXPERIMENT_LOCATION"] = artifact_location

In [23]:
with mlflow.start_run(experiment_id=experiment.experiment_id):
    # Log parameters
    mlflow.log_param("PCA n_components", 0.99)

    # Log metrics
    mlflow.log_metric("f1_score", f1)
    mlflow.log_metric("recall", recall)
    mlflow.log_metric("precision", precision)
    mlflow.sklearn.log_model(rf_clf, " Basic RandomForestClassifier with PCA")

### Hyperparameter Tuning

In [24]:
param_grid = {
    'pca__n_components': [0.9, 0.95, 0.99], 
    'classifier__n_estimators': [10, 20, 25, 30 ],
    'classifier__max_depth': [None,5, 10],
    'classifier__min_samples_split': [2, 3, 4],
    'classifier__min_samples_leaf': [1, 2, 4]
    
}


In [25]:
hpt_rf_clf = GridSearchCV(rf_clf, param_grid, cv=3, scoring='accuracy', n_jobs=-1)


In [26]:
hpt_rf_clf.fit(X_train, y_train)


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    0.4s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    0.4s finished


GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('num',
                                                                         RobustScaler(),
                                                                         Index(['skills_group_0', 'skills_group_1', 'skills_group_10',
       'skills_group_11', 'skills_group_12', 'skills_group_13',
       'skills_group_14', 'skills_group_15', 'skills_group_16',
       'skills_group_17', 'skills_group_18', 'skills_group_19',
       'skills_group_2...
      dtype='object'))])),
                                       ('pca', PCA(n_components=0.99)),
                                       ('classifier',
                                        RandomForestClassifier(n_jobs=-1,
                                                               random_state=42,
                                                               verbose=1))]),
             n_job

In [27]:
# Get the best parameters and best estimator
best_params = hpt_rf_clf.best_params_
best_estimator = hpt_rf_clf.best_estimator_

# Print the best parameters
print("Best Parameters:", best_params)

Best Parameters: {'classifier__max_depth': None, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 3, 'classifier__n_estimators': 30, 'pca__n_components': 0.99}


In [28]:
best_estimator.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    0.4s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    0.4s finished


Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num', RobustScaler(),
                                                  Index(['skills_group_0', 'skills_group_1', 'skills_group_10',
       'skills_group_11', 'skills_group_12', 'skills_group_13',
       'skills_group_14', 'skills_group_15', 'skills_group_16',
       'skills_group_17', 'skills_group_18', 'skills_group_19',
       'skills_group_2', 'skills_group_20', 'skills_group_21',
       'skills_group_22', 'skills_group_23', 'skills_group_24',
       'skills_group_3', 'skills_group_4', 'skills_group_5', 'skills_group_6',
       'skills_group_7', 'skills_group_9'],
      dtype='object'))])),
                ('pca', PCA(n_components=0.99)),
                ('classifier',
                 RandomForestClassifier(min_samples_split=3, n_estimators=30,
                                        n_jobs=-1, random_state=42,
                                        verbose=1))])

In [29]:
y_pred = best_estimator.predict(X_test)

[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  30 out of  30 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=16)]: Done  30 out of  30 | elapsed:    0.0s finished


In [30]:
f1=f1_score(y_test,y_pred, average='micro')
f1

0.6982589160348217

In [31]:
precision=precision_score(y_test, y_pred, average='weighted')

In [32]:
recall=recall_score(y_test, y_pred, average='weighted')

In [33]:
experiment = mlflow.get_experiment_by_name('rf_job_predict')

In [34]:
artifact_location = r"..\models\Models"
os.environ["MLFLOW_EXPERIMENT_LOCATION"] = artifact_location

In [35]:
with mlflow.start_run(experiment_id=experiment.experiment_id):
    # Log parameters
    mlflow.log_param("PCA n_components", 0.99)
    mlflow.log_param('classifier__max_depth', None)
    mlflow.log_param('min_samples_leaf', 1)
    mlflow.log_param('min_samples_split', 3)
    mlflow.log_param('n_estimators', 30)

    # Log metrics
    mlflow.log_metric("f1_score", f1)
    mlflow.log_metric("recall", recall)
    mlflow.log_metric("precision", precision)
    mlflow.sklearn.log_model(best_estimator, "rf with hyperparameter tuning")