In [1]:
# important paths
# Forward slashes resolve on Windows, macOS and Linux alike; the previous
# backslash form was only a valid relative path on Windows.
DATA_PATH = "../data/processed/processed_1.pkl"
FEATURES_PATH = "../data/processed/cleaned_df.pkl"


In [2]:
# Names of the "...HaveWorkedWith" technology columns, one entry per line.
TECH_COLS = [
    'LanguageHaveWorkedWith',
    'DatabaseHaveWorkedWith',
    'PlatformHaveWorkedWith',
    'WebframeHaveWorkedWith',
    'MiscTechHaveWorkedWith',
    'ToolsTechHaveWorkedWith',
]

In [3]:
# used libraries
import pandas as pd
import mlflow
import os

import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from sklearn.preprocessing import MultiLabelBinarizer
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
import warnings


In [4]:
warnings.filterwarnings("ignore")

In [5]:
# Load both pickled frames (DATA_PATH first, then FEATURES_PATH).
# NOTE(review): unpickling can execute arbitrary code; only load files you trust.
processed_df, combined_df = (
    pd.read_pickle(pickle_path) for pickle_path in (DATA_PATH, FEATURES_PATH)
)
# `df` is an independent copy of `combined_df`.
df = combined_df.copy()
df = combined_df.copy()

In [6]:
def laterFunction():
    """Placeholder with no behaviour yet; calling it returns ``None``."""


### Balance Role Classes

In [7]:
# Column totals of the 'DevType' block: one total per role column.
# (Read as "rows per role" only if the columns are 0/1 indicators - TODO confirm.)
roles_df = combined_df['DevType'].copy()
roles_df.sum(axis="index")

Academic researcher                               1283
Blockchain                                         294
Cloud infrastructure engineer                      931
Data or business analyst                           813
Data scientist or machine learning specialist     1474
Database administrator                             245
DevOps specialist                                 1217
Developer Advocate                                 192
Developer Experience                               303
Developer, QA or test                              562
Developer, back-end                              13045
Developer, desktop or enterprise applications     3745
Developer, embedded applications or devices       1780
Developer, front-end                              4915
Developer, full-stack                            23303
Developer, game or graphics                        824
Developer, mobile                                 2512
Engineer, data                                    1170
Hardware E

In [8]:
# Column totals of the 'Clusters' block, one value per skills group.
df['Clusters'].sum(axis=0)

skills_group_0      13017
skills_group_1       8403
skills_group_10     20792
skills_group_11     12352
skills_group_12      3364
skills_group_13      4984
skills_group_14      8915
skills_group_15     16944
skills_group_16      6100
skills_group_17    183995
skills_group_18     31274
skills_group_19     60618
skills_group_2     143555
skills_group_20      1902
skills_group_21     40646
skills_group_22      1378
skills_group_23      1767
skills_group_24     28344
skills_group_3     157348
skills_group_4     124210
skills_group_5     241873
skills_group_6     118073
skills_group_7      75503
skills_group_9       5591
dtype: int64

In [9]:
# Features: the 'Clusters' block (one column per skills group).
X = df['Clusters']

# Target: for every row, the label of the 'DevType' column holding the row maximum.
# NOTE(review): idxmax returns the first column on ties, so a row with several
# equal maxima (or no role flagged at all) is assigned the first such column -
# confirm this is the intended single-label reduction.
y = df['DevType'].idxmax(axis="columns")

In [10]:
# Hold out 20% of the rows for testing, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [11]:
# Randomly oversample the training split only (fixed seed); X_test / y_test
# are not passed through the sampler.
ros = RandomOverSampler(random_state=42)
X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)

In [12]:
# Column totals of the oversampled training features, one value per skills group.
X_train_resampled.sum(axis=0)

skills_group_0      80507
skills_group_1      66529
skills_group_10     97848
skills_group_11     44231
skills_group_12     20377
skills_group_13     25886
skills_group_14     48221
skills_group_15     53518
skills_group_16     31821
skills_group_17    948954
skills_group_18    175814
skills_group_19    220949
skills_group_2     630711
skills_group_20     11899
skills_group_21    193879
skills_group_22      4869
skills_group_23      8685
skills_group_24     94060
skills_group_3     742639
skills_group_4     827454
skills_group_5     898323
skills_group_6     776443
skills_group_7     338282
skills_group_9      39416
dtype: int64

In [13]:
# Number of training labels after oversampling.
len(y_train_resampled)

428766

In [14]:
# Column labels of the resampled training features; handed to the
# ColumnTransformer as the columns to scale.
numeric_features = X_train_resampled.columns


In [15]:
# Single preprocessing step: standardise the columns named in `numeric_features`.
preprocessor = ColumnTransformer(
    transformers=[
        ('numeric', StandardScaler(), numeric_features),
    ],
)

In [16]:
# Scaler + logistic regression (library defaults) fitted on the oversampled split.
# NOTE(review): warnings are globally ignored in this notebook, so a solver
# convergence warning would not be displayed here - confirm the fit converged.
# The trailing fit() call is left as the cell's last expression so the fitted
# pipeline is displayed.
clf = make_pipeline(preprocessor, LogisticRegression())
clf.fit(X_train_resampled, y_train_resampled)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('numeric', StandardScaler(),
                                                  Index(['skills_group_0', 'skills_group_1', 'skills_group_10',
       'skills_group_11', 'skills_group_12', 'skills_group_13',
       'skills_group_14', 'skills_group_15', 'skills_group_16',
       'skills_group_17', 'skills_group_18', 'skills_group_19',
       'skills_group_2', 'skills_group_20', 'skills_group_21',
       'skills_group_22', 'skills_group_23', 'skills_group_24',
       'skills_group_3', 'skills_group_4', 'skills_group_5', 'skills_group_6',
       'skills_group_7', 'skills_group_9'],
      dtype='object'))])),
                ('logisticregression', LogisticRegression())])

In [17]:
# Predictions on the held-out test split from the pipeline fitted on oversampled data.
y_pred = clf.predict(X_test)


In [18]:
# Weighted-average precision on the test split (model trained on oversampled data).
perc = precision_score(y_test, y_pred, average='weighted')
perc

0.46384715254676706

In [19]:
# Weighted-average recall on the test split (model trained on oversampled data).
rec = recall_score(y_test, y_pred, average='weighted')
rec

0.22042071002350672

In [20]:
# Weighted-average F1 on the test split (model trained on oversampled data).
f1 = f1_score(y_test, y_pred, average='weighted')
f1

0.22411855258375235

In [21]:
# Same pipeline recipe, now fitted on the original (non-resampled) training split.
# Rebinding `clf` discards the pipeline trained on the oversampled data.
# NOTE(review): the same `preprocessor` object is passed again; it appears to be
# refitted in place rather than cloned - confirm if the earlier fit must be kept.
clf = make_pipeline(preprocessor, LogisticRegression())
clf.fit(X_train, y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('numeric', StandardScaler(),
                                                  Index(['skills_group_0', 'skills_group_1', 'skills_group_10',
       'skills_group_11', 'skills_group_12', 'skills_group_13',
       'skills_group_14', 'skills_group_15', 'skills_group_16',
       'skills_group_17', 'skills_group_18', 'skills_group_19',
       'skills_group_2', 'skills_group_20', 'skills_group_21',
       'skills_group_22', 'skills_group_23', 'skills_group_24',
       'skills_group_3', 'skills_group_4', 'skills_group_5', 'skills_group_6',
       'skills_group_7', 'skills_group_9'],
      dtype='object'))])),
                ('logisticregression', LogisticRegression())])

In [22]:
# Predictions on the test split from the pipeline fitted on the original
# training split; overwrites the earlier `y_pred`.
y_pred = clf.predict(X_test)


In [23]:
# Weighted-average precision on the test split (model trained without oversampling).
perc = precision_score(y_test, y_pred, average='weighted')
perc

0.4321365833587833

In [24]:
# Weighted-average recall on the test split (model trained without oversampling).
rec = recall_score(y_test, y_pred, average='weighted')
rec

0.4730878186968839

In [25]:
# Weighted-average F1 on the test split (model trained without oversampling).
f1 = f1_score(y_test, y_pred, average='weighted')
f1

0.4331869812706021

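#### As we can see, handling the class imbalance (random oversampling) does not seem to help: the weighted F1 on the test split is about 0.22 with oversampling versus about 0.43 without. We will therefore use the data as it is. Ensemble methods should also be less affected by this imbalance, so it does not matter for now.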

### Baseline Model
#### Initializing MLflow and saving our baseline model

In [30]:
# Select the tracking store BEFORE creating or looking up the experiment: an
# experiment id is only meaningful in the store it was created in, so creating
# it first and switching stores afterwards leaves start_run() with an id the
# new store does not know. Same location as the tracking-URI cell further down.
MLFLOW_TRACKING_URI = '../models/mlruns'
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)

experiment_name = "your_experiment_name"
# Forward slashes keep the artifact location valid on every OS.
artifact_location = "../models/Models"

# create_experiment() raises when the name is already taken, which made this
# cell fail on every re-run; reuse the existing experiment in that case.
existing_experiment = mlflow.get_experiment_by_name(experiment_name)
if existing_experiment is None:
    experiment_id = mlflow.create_experiment(experiment_name, artifact_location)
else:
    experiment_id = existing_experiment.experiment_id


In [31]:
# Export the artifact location through the process environment.
# NOTE(review): MLFLOW_EXPERIMENT_LOCATION does not look like a variable MLflow
# itself reads - confirm something actually consumes it, otherwise this is a no-op.
os.environ["MLFLOW_EXPERIMENT_LOCATION"] = artifact_location

In [34]:
# Where to create the mlruns folder (the MLflow tracking store).
# The URI only affects MLflow calls made after it is set.
MLFLOW_TRACKING_URI = '../models/mlruns'
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)


In [32]:
# Record the baseline run: a placeholder parameter, the three weighted test
# metrics currently held in f1 / rec / perc, and the fitted pipeline in `clf`.
with mlflow.start_run(experiment_id=experiment_id):
    # No hyper-parameters were tuned, so only a placeholder is logged.
    mlflow.log_param("param", 'No parameters specified')

    # Metrics are logged one by one, in the same order as before.
    baseline_metrics = (("f1_score", f1), ("recall", rec), ("precision", perc))
    for metric_name, metric_value in baseline_metrics:
        mlflow.log_metric(metric_name, metric_value)

    # Persist the model under the "logisticRegression" artifact path.
    mlflow.sklearn.log_model(clf, "logisticRegression")