### Building Model

- in this step we will make machine learning model to predict a job based on some user skills

- before model step we detected in analysis that we have unbalanced classes in our data so we should resample our data to keep the performance of the model

In [1]:
import pandas as pd 
import numpy as np 
import pickle 
from matplotlib import pyplot as plt 
import plotly.express as px 
import plotly.graph_objects as go 
import seaborn as sns

In [2]:
df=pd.read_pickle('C:\\Users\\PC\\DS_PRO\\Classification\\data\\processed\\3_preprocessed_df.pickle')

In [3]:
df.head()

Unnamed: 0_level_0,LanguageHaveWorkedWith,LanguageHaveWorkedWith,LanguageHaveWorkedWith,LanguageHaveWorkedWith,LanguageHaveWorkedWith,LanguageHaveWorkedWith,LanguageHaveWorkedWith,LanguageHaveWorkedWith,LanguageHaveWorkedWith,LanguageHaveWorkedWith,...,DevType,DevType,DevType,DevType,DevType,DevType,DevType,DevType,DevType,DevType
Unnamed: 0_level_1,APL,Assembly,Bash/Shell,C,C#,C++,COBOL,Clojure,Crystal,Dart,...,"Developer, back-end","Developer, desktop or enterprise applications","Developer, embedded applications or devices","Developer, front-end","Developer, full-stack","Developer, game or graphics","Developer, mobile","Engineer, data",Scientist,System administrator
0,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,0,0,1,0,0,0,0,0,0,0,...,0,1,0,0,1,0,0,1,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
9,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
def calculate_quality(ground_truth,predictions,metric_function) : 
    
    quality_scores={} 
    for col in predictions.columns : 
        role_pred=predictions[col].copy() 
        role_truth=ground_truth[col].copy() 
        quality_scores[col]=round(metric_function(role_truth,role_pred)*100,2) 
    quality_scores=pd.Series(quality_scores.values(),index=quality_scores.keys())
    
    quality_scores=quality_scores.sort_values(ascending=False) 
    
    return quality_scores

In [5]:
freq_roles=df['DevType'].copy().sum(axis=0) 
freq_roles

Academic researcher                               1708
Data or business analyst                          1658
Data scientist or machine learning specialist     2460
Database administrator                            1210
DevOps specialist                                 3056
Developer, QA or test                             1135
Developer, back-end                              17084
Developer, desktop or enterprise applications     4845
Developer, embedded applications or devices       2138
Developer, front-end                              8932
Developer, full-stack                            20655
Developer, game or graphics                        899
Developer, mobile                                 4751
Engineer, data                                    1941
Scientist                                         1046
System administrator                              2069
dtype: int64

- we have unbalanced classes so we can balance classes

**Resample Roles**

In [6]:
roles_df=df['DevType'].copy()

In [7]:
SAMPLES_PER_CLASS=1200 
resampled_df=[]
for role in roles_df.columns : 
    role_mask=(roles_df[role]==1) 
    sub_df=df.loc[role_mask]
    
    if len(sub_df) < SAMPLES_PER_CLASS : 
        sub_df=sub_df.sample(SAMPLES_PER_CLASS,replace=True,random_state=0) 
    else : 
        sub_df=sub_df.sample(SAMPLES_PER_CLASS,random_state=0) 
    resampled_df.append(sub_df)

In [8]:
resampled_df=pd.concat(resampled_df,axis=0)

In [9]:
resampled_df['DevType'].copy().sum(axis=0).sort_values(ascending=False)

Developer, back-end                              5710
Developer, full-stack                            5602
Developer, desktop or enterprise applications    2690
Developer, front-end                             2614
Data scientist or machine learning specialist    2576
Academic researcher                              2280
DevOps specialist                                2170
Developer, mobile                                2155
System administrator                             2110
Engineer, data                                   2046
Data or business analyst                         1965
Scientist                                        1910
Developer, embedded applications or devices      1773
Database administrator                           1765
Developer, QA or test                            1514
Developer, game or graphics                      1441
dtype: int64

- so we have semi balanced dataframe

In [10]:
resampled_df.shape

(19200, 141)

**Split data into train & test**

In [11]:
from sklearn.model_selection import train_test_split

In [12]:
X_train, X_test, Y_train, Y_test = train_test_split(resampled_df.drop('DevType', axis=1),
                                                    resampled_df['DevType'],test_size=0.3,
                                                    random_state=0)

In [13]:
from sklearn.ensemble import RandomForestClassifier 
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
from sklearn.metrics import precision_score ,recall_score ,accuracy_score,precision_recall_curve 
from sklearn.model_selection import cross_val_score

In [14]:
pca=PCA(n_components=0.7)

In [15]:
X_train_trans=pca.fit_transform(X_train)

In [16]:
rf_clf=RandomForestClassifier()

In [17]:
pipe=make_pipeline(PCA(),RandomForestClassifier())

In [18]:
grid_params=[{
    'pca__n_components': [0.7, 0.85, 0.95],
    'randomforestclassifier__n_estimators': [250, 500],
    'randomforestclassifier__max_depth':    [3, 10, None],
}]


In [19]:
grid=GridSearchCV(pipe,param_grid=grid_params,cv=3)

In [20]:
grid.fit(X_train_trans,Y_train)

In [21]:
train_preds=pd.DataFrame(grid.predict(X_train_trans),columns=Y_train.columns)

In [22]:
X_test_trans=pca.transform(X_test)

In [23]:
test_preds=pd.DataFrame(grid.predict(X_test_trans),columns=Y_test.columns)

In [24]:
train_scores = {score.__name__: calculate_quality(Y_train, train_preds, score) 
                for score in [accuracy_score, precision_score, recall_score]}
train_scores = pd.concat(train_scores,axis=1)

In [25]:
train_scores

Unnamed: 0,accuracy_score,precision_score,recall_score
"Developer, front-end",99.99,100.0,99.94
"Developer, full-stack",99.99,100.0,99.97
"Developer, mobile",99.99,99.93,100.0
Data or business analyst,99.98,99.86,99.93
Database administrator,99.98,99.92,99.84
DevOps specialist,99.98,100.0,99.8
"Developer, desktop or enterprise applications",99.98,99.95,99.9
"Developer, embedded applications or devices",99.98,100.0,99.76
"Engineer, data",99.98,100.0,99.79
System administrator,99.98,100.0,99.8


In [26]:
test_scores = {score.__name__: calculate_quality(Y_test, test_preds, score) 
                for score in [accuracy_score, precision_score, recall_score]}
test_scores = pd.concat(test_scores,axis=1)

In [27]:
test_scores

Unnamed: 0,accuracy_score,precision_score,recall_score
"Developer, game or graphics",97.6,98.0,69.01
Scientist,96.93,95.06,73.18
"Developer, QA or test",96.44,99.26,56.81
Database administrator,95.07,99.16,45.56
Academic researcher,94.64,89.77,59.29
"Developer, embedded applications or devices",94.46,91.53,41.94
Data or business analyst,94.44,91.67,49.31
Data scientist or machine learning specialist,94.08,88.06,65.17
"Developer, mobile",93.91,91.47,51.81
"Engineer, data",93.78,96.97,42.24


- that's good we can see that the generalization of precision is very good 