In [1]:
import pandas as pd
# Load the worker profiles and the per-worker skill ratings.
Workers = pd.read_csv("../Workers.csv")
Skills = pd.read_csv("../WSkills.csv")

# Job to recommend workers for, and the skill ratings that job asks for.
Job = {'Worker': 'Data Scientist'}
JobSkills = [
    {'Skill': 'python', 'Skill_Rating': 3},
    {'Skill': 'SQL', 'Skill_Rating': 5},
    {'Skill': 'Cloud', 'Skill_Rating': 2},
    {'Skill': 'HTML', 'Skill_Rating': 3},
]

# Scoring parameters (previously inline magic numbers).
RATING_SCALE_MAX = 5        # divisor that rescales 'Rating' to a percentage (presumably a 0-5 scale; confirm)
WEIGHT_PER_REVIEW = 0.1     # share (in %) of the final score given to the rating, per received review
MAX_RATING_WEIGHT = 30      # upper bound (in %) on the rating's share of the final score
# NOTE(review): the bonus is added to a 0-100 score, so 0.1 barely moves it; confirm 0.1 (not 10) is intended.
PROFESSION_BONUS = 0.1      # added to the score when the profession matches the job title
RECOMMEND_THRESHOLD = 35    # minimum final score for a worker to be labelled as recommended

JobSkillIds = [skill['Skill'] for skill in JobSkills]  # names of the required skills
JobSkillRatingSum = sum(skill['Skill_Rating'] for skill in JobSkills)  # total required rating

# Per worker: sum of their ratings over the required skills, as a percentage of the
# total required rating. The sum is not capped per skill, so a worker rated above the
# requirement can exceed 100.
filtered_skills = Skills[Skills['Skill'].isin(JobSkillIds)]
worker_skill_ratings = filtered_skills.groupby('Worker')['Skill_Rating'].sum().reset_index()
worker_skill_ratings['Skill_Rating'] = worker_skill_ratings['Skill_Rating'].div(JobSkillRatingSum).mul(100)

# Replace the profession text by a 0/1 flag: 1 when it contains the job title
# (case-insensitive, literal substring). Missing professions count as "no match"
# instead of raising on `.lower()`.
Workers['Profession'] = (
    Workers['Profession']
    .str.lower()
    .str.contains(Job['Worker'].lower(), regex=False, na=False)
    .astype(int)
)

# Default (inner) merge: workers without any of the required skills are dropped here.
Workers_Data = (
    pd.merge(Workers, worker_skill_ratings, left_on='id', right_on='Worker')
    .rename(columns={'Skill_Rating': 'Skill_Percentage_Required'})
    .drop(columns='Worker')
)

# Final score = rating share + skills share (+ profession bonus), computed column-wise.
Workers_Data['Rating'] = Workers_Data['Rating'].div(RATING_SCALE_MAX).mul(100)
# The more reviews a worker has, the more the rating counts, up to MAX_RATING_WEIGHT percent.
Workers_Data['P_Rating'] = Workers_Data['Nbr_Rating'].mul(WEIGHT_PER_REVIEW).clip(upper=MAX_RATING_WEIGHT)
Workers_Data['Percentage_Rating'] = (Workers_Data['P_Rating'] * Workers_Data['Rating']).div(100)
Workers_Data['Percentage_Skills'] = (
    (100 - Workers_Data['P_Rating']) * Workers_Data['Skill_Percentage_Required']
).div(100)
Workers_Data['Percentage'] = (
    Workers_Data['Percentage_Rating']
    + Workers_Data['Percentage_Skills']
    + PROFESSION_BONUS * Workers_Data['Profession']
)
# Label used as the training target: 1 = recommended, 0 = not recommended.
Workers_Data['Recommended'] = (Workers_Data['Percentage'] >= RECOMMEND_THRESHOLD).astype(int)
Workers_Data

Unnamed: 0,id,Profession,Rating,Nbr_Rating,Skill_Percentage_Required,P_Rating,Percentage_Rating,Percentage_Skills,Percentage,Recommended
0,1,1,87.280731,86,15.384615,8.6,7.506143,14.061538,21.667681,0
1,3,1,81.091368,366,23.076923,30.0,24.327410,16.153846,40.581257,1
2,4,1,99.896298,210,38.461538,21.0,20.978223,30.384615,51.462838,1
3,9,1,68.456907,459,30.769231,30.0,20.537072,21.538462,42.175534,1
4,10,1,54.867288,388,23.076923,30.0,16.460186,16.153846,32.714033,0
...,...,...,...,...,...,...,...,...,...,...
3095,11993,0,48.985805,446,23.076923,30.0,14.695742,16.153846,30.849588,0
3096,11995,0,47.050126,488,30.769231,30.0,14.115038,21.538462,35.653499,1
3097,11996,0,63.447600,170,15.384615,17.0,10.786092,12.769231,23.555323,0
3098,11998,0,63.265163,417,30.769231,30.0,18.979549,21.538462,40.518010,1


In [2]:
 # Train the model
 # Init
# Remove the intermediate scoring columns so that only the raw features and the
# label remain. Re-assigning with errors='ignore' (instead of an in-place drop)
# keeps this cell re-runnable: a second run no longer fails with a KeyError
# because the columns are already gone.
SCORING_COLUMNS = ['P_Rating', 'Percentage_Rating', 'Percentage_Skills', 'Percentage']
Workers_Data = Workers_Data.drop(columns=SCORING_COLUMNS, errors='ignore')

target = Workers_Data['Recommended']  # label to predict (0/1)
inputs = Workers_Data.drop(columns='Recommended')  # training features
# NOTE(review): 'id' is still one of the columns in `inputs`, so the model is
# trained on the worker identifier as if it were a feature; confirm this is
# intended before changing it, since it defines the model's input schema.
inputs

Unnamed: 0,id,Profession,Rating,Nbr_Rating,Skill_Percentage_Required
0,1,1,87.280731,86,15.384615
1,3,1,81.091368,366,23.076923
2,4,1,99.896298,210,38.461538
3,9,1,68.456907,459,30.769231
4,10,1,54.867288,388,23.076923
...,...,...,...,...,...
3095,11993,0,48.985805,446,23.076923
3096,11995,0,47.050126,488,30.769231
3097,11996,0,63.447600,170,15.384615
3098,11998,0,63.265163,417,30.769231


In [121]:
 # split dataset to training and testing
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

# A fixed seed makes the split, and therefore the reported accuracy, reproducible
# across kernel restarts; without it every run evaluates on a different sample.
RANDOM_STATE = 42
# NOTE(review): only 2% of the rows are held out, so the accuracy is estimated
# from a very small sample; consider a larger test fraction.
TEST_SIZE = 0.02
X_train, X_test, y_train, y_test = train_test_split(
    inputs, target, test_size=TEST_SIZE, random_state=RANDOM_STATE
)

# Create the (still unfitted) Gaussian Naive Bayes classifier.
model = GaussianNB()

In [122]:
 # Train
# Fit the classifier on the training split (the fitted estimator is echoed as the cell output).
model.fit(X=X_train, y=y_train)

GaussianNB()

In [123]:
# Accuracy of the fitted model on the held-out test split.
model.score(X=X_test, y=y_test)

0.9516129032258065

In [39]:
# Class probabilities for every row of `inputs` (training and test rows alike);
# one column per class, ordered as in `model.classes_`.
model.predict_proba(X=inputs)

array([[0.92466244, 0.07533756],
       [0.48162055, 0.51837945],
       [0.01206381, 0.98793619],
       ...,
       [0.92580874, 0.07419126],
       [0.11939097, 0.88060903],
       [0.48662621, 0.51337379]])

In [22]:
# Predicted label for every row of `inputs`: whether the worker is recommended or not.
output = model.predict(X=inputs)

In [23]:
# Attach the predictions to the feature rows. `output` holds one prediction per
# row of `inputs`, in row order, so it is assigned by position. Concatenating a
# freshly built DataFrame on axis=1 would instead align on index labels and
# silently produce misaligned / NaN-padded rows whenever `inputs` does not carry
# a default 0..n-1 index (e.g. after filtering).
concatenated = inputs.assign(predicted_output=output)
concatenated

Unnamed: 0,id,Profession,Rating,Nbr_Rating,Skill_Percentage_Required,predicted_output
0,1,1,87.280731,86,15.384615,0
1,3,1,81.091368,366,23.076923,1
2,4,1,99.896298,210,38.461538,1
3,9,1,68.456907,459,30.769231,1
4,10,1,54.867288,388,23.076923,0
...,...,...,...,...,...,...
3095,11993,0,48.985805,446,23.076923,0
3096,11995,0,47.050126,488,30.769231,1
3097,11996,0,63.447600,170,15.384615,0
3098,11998,0,63.265163,417,30.769231,1


In [11]:
 #Save the model
import pickle

# Persist the fitted model. The context manager guarantees the file is flushed
# and closed even if pickling fails; the bare open() used before left the handle
# open until garbage collection.
# Only load this file back from a trusted location: unpickling executes code.
with open('../model_Recommend_job.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)