In [1]:
# important paths
DATA_PATH = r"..\data\processed\processed_1.pkl"
FEATURES_PATH = r"..\data\processed\cleaned_df.pkl"


In [2]:
TECH_COLS = ['LanguageHaveWorkedWith', 'DatabaseHaveWorkedWith', 
             'PlatformHaveWorkedWith', 'WebframeHaveWorkedWith', 
             'MiscTechHaveWorkedWith', 'ToolsTechHaveWorkedWith']

In [3]:
# used libraries
import pandas as pd
import mlflow
import os

import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from sklearn.preprocessing import MultiLabelBinarizer
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
import warnings


In [4]:
warnings.filterwarnings("ignore")

In [5]:
# reading data
processed_df = pd.read_pickle(DATA_PATH)
combined_df = pd.read_pickle(FEATURES_PATH)
df = combined_df.copy()

### Balance Role Classes

#### Before Balancing

In [6]:
roles_df = combined_df['DevType'].copy()
roles_df.sum(axis=0)

Academic researcher                               1283
Cloud infrastructure engineer                      931
Data or business analyst                           813
Data scientist or machine learning specialist     1474
Database administrator                             245
DevOps specialist                                 1217
Developer, QA or test                              562
Developer, back-end                              13045
Developer, desktop or enterprise applications     3745
Developer, front-end                              4915
Developer, full-stack                            23303
Developer, game or graphics                        824
Developer, mobile                                 2512
Engineer, data                                    1170
Scientist                                          331
Security professional                              416
System administrator                               689
dtype: int64

In [7]:
X=df['Clusters']
y = df['DevType'].idxmax(axis=1)

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,stratify=y,  random_state=42)

In [9]:
numeric_features = X_train.columns


In [10]:
preprocessor = ColumnTransformer(
                    transformers=[
                        ('numeric', StandardScaler(), numeric_features),
                    ])

In [11]:
clf = make_pipeline(preprocessor, RandomForestClassifier())
clf.fit(X_train, y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('numeric', StandardScaler(),
                                                  Index(['skills_group_0', 'skills_group_1', 'skills_group_10',
       'skills_group_11', 'skills_group_12', 'skills_group_13',
       'skills_group_14', 'skills_group_15', 'skills_group_16',
       'skills_group_17', 'skills_group_18', 'skills_group_19',
       'skills_group_2', 'skills_group_20', 'skills_group_21',
       'skills_group_22', 'skills_group_23', 'skills_group_24',
       'skills_group_3', 'skills_group_4', 'skills_group_5', 'skills_group_6',
       'skills_group_7', 'skills_group_9'],
      dtype='object'))])),
                ('randomforestclassifier', RandomForestClassifier())])

In [12]:
y_pred = clf.predict(X_test)


In [13]:
# f1 = f1_score(y_test, y_pred, average='weighted')
precision = precision_score(y_test, y_pred, average='weighted')
precision

0.4758842275485163

In [14]:
recall = recall_score(y_test, y_pred, average='weighted')
recall

0.5076246157555301

In [15]:
f1 = f1_score(y_test, y_pred, average='weighted')
f1

0.4814628392358536

#### After Balancing

In [16]:
# Resample roles
samples_per_class = 1400
resampled_roles = []

In [17]:
for role_col in roles_df.columns:
    sub_df = roles_df.loc[roles_df[role_col] == 1].copy()

    if len(sub_df) / samples_per_class < 1 :
        # Upsample
        t = len(sub_df) *   (len(sub_df) / samples_per_class) 
        amo = t + samples_per_class
        sub_df = sub_df.sample(int(amo), replace=True, random_state=0)
    else:
        # Downsample
        t = (len(sub_df) / samples_per_class) 
        if t > 2 :
            t/= 2
        amo = samples_per_class * t

        sub_df = sub_df.sample(int(amo), random_state=0)

    resampled_roles.append(sub_df)

In [18]:
# Construct dfs
roles_df = pd.concat(resampled_roles)
df = df.loc[roles_df.index].copy()

In [19]:
roles_df.sum(axis=0)

Academic researcher                               2575
Cloud infrastructure engineer                     2019
Data or business analyst                          1872
Data scientist or machine learning specialist     1474
Database administrator                            1442
DevOps specialist                                 2457
Developer, QA or test                             1625
Developer, back-end                               6522
Developer, desktop or enterprise applications     1872
Developer, front-end                              2457
Developer, full-stack                            11651
Developer, game or graphics                       1884
Developer, mobile                                 2512
Engineer, data                                    2377
Scientist                                         1478
Security professional                             1523
System administrator                              1739
dtype: int64

In [20]:
X=df['Clusters']
y = df['DevType'].idxmax(axis=1)

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,stratify=y,  random_state=42)

In [22]:
clf = make_pipeline(preprocessor, RandomForestClassifier())
clf.fit(X_train, y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('numeric', StandardScaler(),
                                                  Index(['skills_group_0', 'skills_group_1', 'skills_group_10',
       'skills_group_11', 'skills_group_12', 'skills_group_13',
       'skills_group_14', 'skills_group_15', 'skills_group_16',
       'skills_group_17', 'skills_group_18', 'skills_group_19',
       'skills_group_2', 'skills_group_20', 'skills_group_21',
       'skills_group_22', 'skills_group_23', 'skills_group_24',
       'skills_group_3', 'skills_group_4', 'skills_group_5', 'skills_group_6',
       'skills_group_7', 'skills_group_9'],
      dtype='object'))])),
                ('randomforestclassifier', RandomForestClassifier())])

In [23]:
y_pred = clf.predict(X_test)


In [24]:
precision = precision_score(y_test, y_pred, average='weighted')
precision

0.6889262578024073

In [25]:
recall = recall_score(y_test, y_pred, average='weighted')
recall

0.6960825610783488

In [26]:
f1 = f1_score(y_test, y_pred, average='weighted')
f1

0.6856964530395366

### Baseline Model
#### Intializing mlflow and saving our base model

In [27]:
# where to create the mlruns folder
MLFLOW_TRACKING_URI = '../models/mlruns'
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)


In [35]:
experiment_name = "Baseline Model"
artifact_location = r"..\models\Models"
experiment = mlflow.create_experiment(experiment_name, artifact_location)
# experiment = mlflow.get_experiment_by_name(experiment_name)

In [30]:
# Set MLFLOW_EXPERIMENT_LOCATION to the desired artifact location
os.environ["MLFLOW_EXPERIMENT_LOCATION"] = artifact_location

In [36]:
with mlflow.start_run(experiment_id=experiment.experiment_id):
    # Log parameters
    mlflow.log_param("param", 'No parameters specified')

    # Log metrics
    mlflow.log_metric("f1_score", f1)
    mlflow.log_metric("recall", recall)
    mlflow.log_metric("precision", precision)
    mlflow.sklearn.log_model(clf, "RandomForestClassifier")