<center><span style="color:#b30000;font-size:35px;"><strong>Modeling Phase</strong></span></center>

<span style="color:#2929a3;font-size:20px;">Import Libraries</span>

In [2]:
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, MultiLabelBinarizer, LabelEncoder, FunctionTransformer
from category_encoders import BinaryEncoder
from sklearn.compose import ColumnTransformer
from imblearn.pipeline import Pipeline as imb_Pipeline
from sklearn.pipeline import Pipeline
from imblearn.under_sampling import TomekLinks, RandomUnderSampler
from imblearn.combine import SMOTETomek
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import cross_validate , StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from xgboost import XGBClassifier
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer

<span style="color:#2929a3;font-size:20px;">Constants</span>

In [3]:
TECH_COLS = ['LanguageHaveWorkedWith',
            'DatabaseHaveWorkedWith',
            'PlatformHaveWorkedWith',
            'WebframeHaveWorkedWith',
            'MiscTechHaveWorkedWith',
            'ToolsTechHaveWorkedWith',
            'NEWCollabToolsHaveWorkedWith']

<span style="color:#2929a3;font-size:20px;">Read Dataset </span>

In [4]:
df = pd.read_pickle('Data/Cleaned_original_data.pkl')

In [5]:
df.head()

Unnamed: 0,DevType,LanguageHaveWorkedWith,DatabaseHaveWorkedWith,PlatformHaveWorkedWith,WebframeHaveWorkedWith,MiscTechHaveWorkedWith,ToolsTechHaveWorkedWith,NEWCollabToolsHaveWorkedWith
0,Web Developer,Bash/Shell (all shells);Go,,Amazon Web Services (AWS);Google Cloud;OpenSta...,,,Cargo;Docker;Kubernetes;Make;Nix,Emacs;Helix
1,Web Developer,Bash/Shell (all shells);HTML/CSS;JavaScript;PH...,PostgreSQL;Redis,Cloudflare;Heroku,Node.js;React;Ruby on Rails;Vue.js;WordPress,,Homebrew;npm;Vite;Webpack;Yarn,IntelliJ IDEA;Vim;Visual Studio Code;WebStorm
2,Web Developer,Bash/Shell (all shells);HTML/CSS;JavaScript;Ru...,BigQuery;Cloud Firestore;PostgreSQL;Redis,Amazon Web Services (AWS);Cloudflare;Google Cloud,Angular;Express;NestJS;Node.js,,Docker;Homebrew;Kubernetes;npm;pnpm;Terraform,Helix;Neovim
3,"Developer, QA or test",C;C++;Python;Rust,Redis,,,,Cargo;CMake;Docker;GNU GCC;Make,Code::Blocks;Sublime Text;Vim;Xcode
4,Web Developer,Java;Perl;TypeScript,MySQL,,Fastify;Node.js;React,Spring Framework,Kubernetes;Yarn,Visual Studio Code


### Note
- In this dataset, I retained the null values in their original state due to the logic of the data, which allows for the presence of null values. It is possible that the user did not work with any type of databases or did not have any miscellaneous technologies .. etc

<span style="color:#2929a3;font-size:20px;">Shape of Dataset </span>

In [6]:
df.shape

(38545, 8)

<span style="color:#2929a3;font-size:20px;">Check Target Value Counts</span>

In [7]:
df['DevType'].value_counts()

DevType
Web Developer                           30693
Developer, mobile                        2261
Engineer, data                           1046
DevOps specialist                        1004
Database/System Admininstrator            776
Data or business analyst                  655
Developer, game or graphics               516
Developer, QA or test                     453
Security professional                     305
Blockchain                                287
Cloud infrastructure engineer             286
Developer, Hardware/Embedded Systems      263
Name: count, dtype: int64

<span style="color:#2929a3;font-size:20px;">Encode Target</span>

In [8]:
target_encoder = LabelEncoder()
df['DevType'] = target_encoder.fit_transform(df['DevType'])

<span style="color:#2929a3;font-size:20px;">Encode Features</span>

In [9]:
Objects_dict = {}
def Create_Encoded_Features():
    global Objects_dict
    encoded_dfs = {}
    TF_languages = TfidfVectorizer(stop_words='english')
    df_languages = pd.DataFrame(TF_languages.fit_transform(df['LanguageHaveWorkedWith'].fillna('NaN')).toarray(), columns=TF_languages.get_feature_names_out())
    encoded_dfs['LanguageHaveWorkedWith'] = df_languages
    
    TF_databases = TfidfVectorizer(stop_words='english')
    df_databases = pd.DataFrame(TF_databases.fit_transform(df['DatabaseHaveWorkedWith'].fillna('NaN')).toarray(), columns=TF_databases.get_feature_names_out())
    encoded_dfs['DatabaseHaveWorkedWith'] = df_databases
    
    TF_platforms = TfidfVectorizer(stop_words='english')
    df_platforms = pd.DataFrame(TF_platforms.fit_transform(df['PlatformHaveWorkedWith'].fillna('NaN')).toarray(), columns=TF_platforms.get_feature_names_out())
    encoded_dfs['PlatformHaveWorkedWith'] = df_platforms
    
    TF_webframes = TfidfVectorizer(stop_words='english')
    df_webframes = pd.DataFrame(TF_webframes.fit_transform(df['WebframeHaveWorkedWith'].fillna('NaN')).toarray(), columns=TF_webframes.get_feature_names_out())
    encoded_dfs['WebframeHaveWorkedWith'] = df_webframes
    
    TF_MiscTech = TfidfVectorizer(stop_words='english')
    df_MiscTech = pd.DataFrame(TF_MiscTech.fit_transform(df['MiscTechHaveWorkedWith'].fillna('NaN')).toarray(), columns=TF_MiscTech.get_feature_names_out())
    encoded_dfs['MiscTechHaveWorkedWith'] = df_MiscTech
    
    TF_tools = TfidfVectorizer(stop_words='english')
    df_tools = pd.DataFrame(TF_tools.fit_transform(df['ToolsTechHaveWorkedWith'].fillna('NaN')).toarray(), columns=TF_tools.get_feature_names_out())
    encoded_dfs['ToolsTechHaveWorkedWith'] = df_tools
    
    TF_NEWCollabTools = TfidfVectorizer(stop_words='english')
    df_NEWCollabTools = pd.DataFrame(TF_NEWCollabTools.fit_transform(df['NEWCollabToolsHaveWorkedWith'].fillna('NaN')).toarray(), columns=TF_NEWCollabTools.get_feature_names_out())
    encoded_dfs['NEWCollabToolsHaveWorkedWith'] = df_NEWCollabTools
    
    res = pd.concat(encoded_dfs, axis=1)
    Objects_dict['TF_languages'] = TF_languages
    Objects_dict['TF_databases'] = TF_databases
    Objects_dict['TF_platforms'] = TF_platforms
    Objects_dict['TF_webframes'] = TF_webframes
    Objects_dict['TF_MiscTech'] = TF_MiscTech
    Objects_dict['TF_tools'] = TF_tools
    Objects_dict['TF_NEWCollabTools'] = TF_NEWCollabTools
    return res

In [10]:
encoded_df = Create_Encoded_Features()

In [11]:
encoded_df['DevType'] = df['DevType']

In [12]:
encoded_df

Unnamed: 0_level_0,LanguageHaveWorkedWith,LanguageHaveWorkedWith,LanguageHaveWorkedWith,LanguageHaveWorkedWith,LanguageHaveWorkedWith,LanguageHaveWorkedWith,LanguageHaveWorkedWith,LanguageHaveWorkedWith,LanguageHaveWorkedWith,LanguageHaveWorkedWith,...,NEWCollabToolsHaveWorkedWith,NEWCollabToolsHaveWorkedWith,NEWCollabToolsHaveWorkedWith,NEWCollabToolsHaveWorkedWith,NEWCollabToolsHaveWorkedWith,NEWCollabToolsHaveWorkedWith,NEWCollabToolsHaveWorkedWith,NEWCollabToolsHaveWorkedWith,NEWCollabToolsHaveWorkedWith,DevType
Unnamed: 0_level_1,ada,apex,apl,assembly,bash,basic,clojure,cobol,crystal,css,...,studio,sublime,text,textmate,vim,visual,vscodium,webstorm,xcode,Unnamed: 21_level_1
0,0.0,0.0,0.0,0.0,0.577350,0.0,0.0,0.0,0.0,0.000000,...,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.00000,0.000000,11
1,0.0,0.0,0.0,0.0,0.317944,0.0,0.0,0.0,0.0,0.232598,...,0.195961,0.000000,0.000000,0.0,0.438451,0.202721,0.0,0.62268,0.000000,11
2,0.0,0.0,0.0,0.0,0.345199,0.0,0.0,0.0,0.0,0.252536,...,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.00000,0.000000,11
3,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,...,0.000000,0.371879,0.371879,0.0,0.308942,0.000000,0.0,0.00000,0.401176,6
4,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,...,0.554801,0.000000,0.000000,0.0,0.000000,0.573941,0.0,0.00000,0.000000,11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38540,0.0,0.0,0.0,0.0,0.223604,0.0,0.0,0.0,0.0,0.163582,...,0.000000,0.000000,0.000000,0.0,1.000000,0.000000,0.0,0.00000,0.000000,11
38541,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.511641,...,0.367583,0.000000,0.000000,0.0,0.000000,0.380264,0.0,0.00000,0.000000,11
38542,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,...,0.350208,0.000000,0.000000,0.0,0.000000,0.241526,0.0,0.00000,0.000000,8
38543,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,...,0.214398,0.000000,0.000000,0.0,0.000000,0.221795,0.0,0.00000,0.000000,11


<span style="color:#2929a3;font-size:20px;">Split Data to X and Y</span>

In [13]:
x = encoded_df.drop('DevType', axis=1)
y = encoded_df['DevType']

<span style="color:#2929a3;font-size:20px;">Model Selection Process</span>

In [14]:
Models = []
Models.append(('Logistic Regression',LogisticRegression(max_iter=10000)))
Models.append(('Knn',KNeighborsClassifier()))
Models.append(('MultinomialNB',MultinomialNB()))
Models.append(('GaussianNB',GaussianNB()))
Models.append(('BernoulliNB',BernoulliNB()))
Models.append(('Decision Tree', DecisionTreeClassifier()))
Models.append(('Random Forest',RandomForestClassifier()))
Models.append(('Voting Classifier',VotingClassifier(estimators=[('Logistic Regression',LogisticRegression(max_iter=10000)), ('Knn',KNeighborsClassifier()), ('MultinomialNB',MultinomialNB()), ('GaussianNB',GaussianNB()), ('Decision Tree', DecisionTreeClassifier()), ('Random Forest',RandomForestClassifier())])))
Models.append(('XGBoost',XGBClassifier()))

In [15]:
y.value_counts()

DevType
11    30693
8      2261
9      1046
4      1004
3       776
2       655
7       516
6       453
10      305
0       287
1       286
5       263
Name: count, dtype: int64

In [119]:
for model in Models:
    steps = []
    steps.append(('Scaler', MinMaxScaler()))
    steps.append(('UnderSampling',TomekLinks()))
    steps.append(('UnderSampling2',RandomUnderSampler(sampling_strategy={11:2000, 8:1500})))
    steps.append(('OverSampling', SMOTETomek(sampling_strategy={0:1000, 1:1000, 5:1000, 10:1000, 6:1000, 7:1000, 2:1000, 3:1000})))
    #steps.append(('PCA',PCA(n_components=0.90)))
    steps.append(model)
    pipeline = imb_Pipeline(steps=steps)
    res = cross_validate(pipeline , x, y, scoring='accuracy', cv=5, return_train_score=True, n_jobs=-1)
    print(f"Train Accuracy of {model[0]} is ",res['train_score'].mean())
    print(f"Test Accuracy of {model[0]} is ",res['test_score'].mean())
    print("*" * 70)

Train Accuracy of Logistic Regression is  0.651355558438189
Test Accuracy of Logistic Regression is  0.6332598261771955
**********************************************************************
Train Accuracy of Knn is  0.4078609417563886
Test Accuracy of Knn is  0.3567777921909457
**********************************************************************
Train Accuracy of MultinomialNB is  0.5318523803346739
Test Accuracy of MultinomialNB is  0.5262679984433778
**********************************************************************
Train Accuracy of GaussianNB is  0.07579452587884292
Test Accuracy of GaussianNB is  0.06929562848618499
**********************************************************************
Train Accuracy of BernoulliNB is  0.5573744973407705
Test Accuracy of BernoulliNB is  0.5535867168244909
**********************************************************************
Train Accuracy of Decision Tree is  0.6442923855234142
Test Accuracy of Decision Tree is  0.5276170709560254
********

<span style="color:#2929a3;font-size:20px;">Create RandomForest Pipeline</span>

In [26]:
steps = []
steps.append(('Scaler', MinMaxScaler()))
steps.append(('UnderSampling',TomekLinks()))
steps.append(('UnderSampling2',RandomUnderSampler(sampling_strategy={11:2000, 8:1500})))
steps.append(('OverSampling', SMOTETomek(sampling_strategy={0:1000, 1:1000, 5:1000, 10:1000, 6:1000, 7:1000, 2:1000, 3:1000})))
steps.append(('Model',RandomForestClassifier()))
pipeline = imb_Pipeline(steps=steps)
pipeline.fit(x,y)

<span style="color:#2929a3;font-size:20px;">Hyperparameter Tuning</span>

In [27]:
from sklearn.model_selection import GridSearchCV

In [57]:
param = [
    {'Model__criterion':['gini','entropy'],
     'Model__max_depth':[25,30,35],
     'Model__min_samples_split':[3,4,5],
     'Model__n_estimators':[150,160,170]}
]

In [58]:
grid = GridSearchCV(estimator=pipeline, param_grid=param, cv=5, scoring='accuracy' , return_train_score=True, n_jobs=-1)

In [70]:
grid.fit(x,y)

In [71]:
grid.best_params_

In [74]:
grid.best_score_

<span style="color:#2929a3;font-size:20px;">Create Final Pipeline</span>

In [35]:
steps = []
steps.append(('Scaler', MinMaxScaler()))
steps.append(('UnderSampling',TomekLinks()))
steps.append(('UnderSampling2',RandomUnderSampler(sampling_strategy={11:2000, 8:1500})))
steps.append(('OverSampling', SMOTETomek(sampling_strategy={0:1000, 1:1000, 5:1000, 10:1000, 6:1000, 7:1000, 2:1000, 3:1000})))
steps.append(('Model',RandomForestClassifier(criterion='entropy', max_depth=30, min_samples_split=4, n_estimators=150)))
pipeline = imb_Pipeline(steps=steps)
pipeline.fit(x,y)

<span style="color:#2929a3;font-size:20px;">Dump The Model</span>

In [36]:
import joblib

df_cols_names = pd.DataFrame(columns=encoded_df.columns)
joblib.dump(df_cols_names,'df_cols_names.pkl')
joblib.dump(pipeline,'Model.pkl')
joblib.dump(target_encoder,'target_encoder.pkl')
joblib.dump(Objects_dict,'Objects_dict.pkl')

['Objects_dict.pkl']