# importing dependancies and Downloading/reading data

In [240]:
pip install scikit-multilearn



In [241]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import ast
import re
import nltk
import matplotlib.pyplot as plt
from scipy.sparse import csr_matrix, lil_matrix
from IPython.core.display import HTML
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import hamming_loss
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.naive_bayes import GaussianNB
from skmultilearn.problem_transform import ClassifierChain
from sklearn.linear_model import LogisticRegression
from skmultilearn.problem_transform import LabelPowerset
from skmultilearn.adapt import MLkNN
from sklearn.preprocessing import MultiLabelBinarizer
nltk.download('stopwords')

target_url = "https://drive.google.com/uc?export=download&id=1cnUY_KwtTfY09NcSgg_h0BX8cURAf1CH"
df = pd.read_csv(target_url)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Discovering data and subsetting only the needed columns

In [242]:
# subsetting only title and job function
df = df.ix[:,(1,2)]
print (df.sort_values(by=['title']))

                                        title                                        jobFunction
9149                   .NET Backend Developer  ['Engineering - Telecom/Technology', 'IT/Softw...
1045                   .NET Backend Developer  ['Engineering - Telecom/Technology', 'IT/Softw...
9413                   .NET Backend Developer  ['IT/Software Development', 'Engineering - Tel...
3694   .NET Core Developer - Senior\Team Lead  ['IT/Software Development', 'Engineering - Tel...
5263                           .NET Developer  ['Engineering - Telecom/Technology', 'IT/Softw...
...                                       ...                                                ...
7055         موظفة نشر و سوشيال ميديا اونلاين  ['Media/Journalism/Publishing', 'Marketing/PR/...
10171        موظفة نشر و سوشيال ميديا اونلاين  ['Media/Journalism/Publishing', 'Marketing/PR/...
1668         موظفة نشر و سوشيال ميديا اونلاين  ['Media/Journalism/Publishing', 'Marketing/PR/...
8514         موظفة نشر و سوشيا

# Cleaning  job titles

In [243]:
stemmer = PorterStemmer()
words = stopwords.words("english")
df['title'] = df['title'].apply(lambda x: " ".join([stemmer.stem(i) for i in re.sub("[^a-zA-Z]", " ", x).split() if i not in words]).lower())
df.head()

Unnamed: 0,title,jobFunction
0,full stack php develop,"['Engineering - Telecom/Technology', 'IT/Softw..."
1,cisco collabor specialist engin,"['Installation/Maintenance/Repair', 'IT/Softwa..."
2,senior back end php develop,"['Engineering - Telecom/Technology', 'IT/Softw..."
3,ux design,"['Creative/Design/Art', 'IT/Software Developme..."
4,java technic lead,"['Engineering - Telecom/Technology', 'IT/Softw..."


# cleaning job function Grouping according to title and removing duplicates


In [244]:
# custom function to flatten lists of lists while removing duplicate and missing values with a little cleaning
def flatten(x):
  flat_list = []
  for sublist in x:
    for item in sublist:
      item = item.replace(" " , "")
      item = item.replace("/" , "")
      item = item.replace("-" , "")
      if(item !='nan'):
        flat_list.append(item)
  return list(dict.fromkeys(flat_list))

# seprate the string of each row into lists of each job function
df['jobFunction'] = df['jobFunction'].apply(ast.literal_eval).apply(np.sort)

# grouping job functions for same title into list of lists
Clean_df = df.groupby('title').jobFunction.apply(list).reset_index()

# Applying Flatten
Clean_df['jobFunction'] = Clean_df['jobFunction'].apply(flatten)
b_len = len(Clean_df)

# remove any empty lists(data with missing job functions)
Clean_df = Clean_df[Clean_df['jobFunction'].map(lambda d: len(d)) > 0].reset_index()
Clean_df = Clean_df[Clean_df['title'].map(lambda d: len(d)) > 0].reset_index()


print ("Data before: {}\nData after: {}\nDeleted missing Values: {}".format(b_len , len(Clean_df) , b_len - len(Clean_df)))
Clean_df
#display(HTML(Clean_df.to_html()))


Data before: 3120
Data after: 3053
Deleted missing Values: 67


Unnamed: 0,level_0,index,title,jobFunction
0,1,2,abap consult,"[EngineeringTelecomTechnology, ITSoftwareDevel..."
1,2,3,account,"[AccountingFinance, Administration, HumanResou..."
2,3,4,account account payabl,[AccountingFinance]
3,4,5,account account receiv,[AccountingFinance]
4,5,6,account alexandria,[AccountingFinance]
...,...,...,...,...
3048,3049,3115,workshop manag,"[EngineeringMechanicalElectrical, Installation..."
3049,3050,3116,writer editor,"[MediaJournalismPublishing, WritingEditorial]"
3050,3051,3117,xamarin form develop,"[EngineeringTelecomTechnology, ITSoftwareDevel..."
3051,3052,3118,yard manag prestress precast team leader,"[EngineeringConstructionCivilArchitecture, Eng..."


# Getting Unique Job functions (number of classes)

In [245]:
# Calculating Unique Job function
unique_jobfn = []
for i in Clean_df.jobFunction:
  for x in i:
    unique_jobfn.append(x)
    unique_jobfn = list(set(unique_jobfn))
print("num of unique JobFunctions:{}".format(len(unique_jobfn)))
print(unique_jobfn)

num of unique JobFunctions:37
['TourismTravel', 'Fashion', 'ITSoftwareDevelopment', 'CustomerServiceSupport', 'EducationTeaching', 'Legal', 'Banking', 'CreativeDesignArt', 'BusinessDevelopment', 'EngineeringOil&GasEnergy', 'PurchasingProcurement', 'EngineeringOther', 'HospitalityHotelsFoodServices', 'ManufacturingProduction', 'MedicalHealthcare', 'LogisticsSupplyChain', 'StrategyConsulting', 'MediaJournalismPublishing', 'CLevelExecutiveGMDirector', 'Pharmaceutical', 'TrainingInstructor', 'EngineeringTelecomTechnology', 'AccountingFinance', 'EngineeringMechanicalElectrical', 'WritingEditorial', 'Administration', 'ProjectProgramManagement', 'SalesRetail', 'HumanResources', 'Quality', 'MarketingPRAdvertising', 'InstallationMaintenanceRepair', 'EngineeringConstructionCivilArchitecture', 'OperationsManagement', 'AnalystResearch', 'SportsandLeisure', 'R&DScience']


# Splitting labels into different columns (Multi-labels)

In [246]:
multilabel_binarizer = MultiLabelBinarizer()
multilabel_binarizer.fit_transform(Clean_df['jobFunction'])
y = multilabel_binarizer.transform(Clean_df['jobFunction'])
for idx, job in enumerate(multilabel_binarizer.classes_):
  Clean_df[job] = y[:,idx]

#### remove created index by binarizer
Clean_df = Clean_df.ix[:,2:]
Clean_df.head()

Unnamed: 0,title,jobFunction,AccountingFinance,Administration,AnalystResearch,Banking,BusinessDevelopment,CLevelExecutiveGMDirector,CreativeDesignArt,CustomerServiceSupport,EducationTeaching,EngineeringConstructionCivilArchitecture,EngineeringMechanicalElectrical,EngineeringOil&GasEnergy,EngineeringOther,EngineeringTelecomTechnology,Fashion,HospitalityHotelsFoodServices,HumanResources,ITSoftwareDevelopment,InstallationMaintenanceRepair,Legal,LogisticsSupplyChain,ManufacturingProduction,MarketingPRAdvertising,MediaJournalismPublishing,MedicalHealthcare,OperationsManagement,Pharmaceutical,ProjectProgramManagement,PurchasingProcurement,Quality,R&DScience,SalesRetail,SportsandLeisure,StrategyConsulting,TourismTravel,TrainingInstructor,WritingEditorial
0,abap consult,"[EngineeringTelecomTechnology, ITSoftwareDevel...",0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
1,account,"[AccountingFinance, Administration, HumanResou...",1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,account account payabl,[AccountingFinance],1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,account account receiv,[AccountingFinance],1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,account alexandria,[AccountingFinance],1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


# Our Final Data that will work on the classifer


In [247]:
#Dropping Job function and randomize data
fdata = Clean_df.loc[np.random.choice(Clean_df.index, size=3053)]
fdata = fdata.drop(labels = ['jobFunction'] , axis = 1)
fdata

Unnamed: 0,title,AccountingFinance,Administration,AnalystResearch,Banking,BusinessDevelopment,CLevelExecutiveGMDirector,CreativeDesignArt,CustomerServiceSupport,EducationTeaching,EngineeringConstructionCivilArchitecture,EngineeringMechanicalElectrical,EngineeringOil&GasEnergy,EngineeringOther,EngineeringTelecomTechnology,Fashion,HospitalityHotelsFoodServices,HumanResources,ITSoftwareDevelopment,InstallationMaintenanceRepair,Legal,LogisticsSupplyChain,ManufacturingProduction,MarketingPRAdvertising,MediaJournalismPublishing,MedicalHealthcare,OperationsManagement,Pharmaceutical,ProjectProgramManagement,PurchasingProcurement,Quality,R&DScience,SalesRetail,SportsandLeisure,StrategyConsulting,TourismTravel,TrainingInstructor,WritingEditorial
1499,network technician,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
626,electromechan engin site technic offic,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1593,outdoor market agent,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
2091,sale suppli chain specialist,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
631,elementari english teacher,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
976,hr support execut,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2867,technic pre sale engin lc system secur,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
1316,market research specialist,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
652,english content editor,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1


**Our final Data set is 3053 rows × 38 columns**

# Preparing Train/Test sets

In [248]:
import pickle

train, test = train_test_split(fdata, random_state=42, test_size=0.20, shuffle=True)
train_title  = train['title']
test_title  = test['title']

print(train.shape)
print(test.shape)

vectorizer = TfidfVectorizer(strip_accents='unicode', analyzer='word', ngram_range=(1,3), norm='l2')
vectorizer.fit(train_title)
vectorizer.fit(test_title)

x_train = vectorizer.transform(train_title)
x_test = vectorizer.transform(test_title)

# dumping vectorizer to be used in API
with open('TfidfVectorizer.pk', 'wb') as fin:
  pickle.dump(vectorizer, fin)

y_train = train.drop(labels = ['title' ], axis=1)
y_test = test.drop(labels = ['title'], axis=1)

t_acc = {} #array to keep all accuracy for different classifiers
t_f1 = {}  #array to keep all F1 score for different classifiers

(2442, 38)
(611, 38)


# Clasifiers

### 1st Classifer BinaryRelevance 

In [249]:
# initialize binary relevance multi-label classifier
# with a gaussian naive bayes base classifier
br_classifier = BinaryRelevance(GaussianNB())

# train
br_classifier.fit(x_train, y_train)

# dumping model to be used in API
with open('br_classifier', 'wb') as fin:
  pickle.dump(br_classifier, fin)

# predict
br_predictions  = br_classifier.predict(x_test)

t_f1["BinaryRelevance"] = f1_score(y_test, br_predictions, average='micro')
t_acc["BinaryRelevance"] = accuracy_score(y_test,br_predictions)

print("Accuracy = ",accuracy_score(y_test,br_predictions))
print('f1 Score is : ' , f1_score(y_test, br_predictions, average='micro') )

Accuracy =  0.6955810147299509
f1 Score is :  0.771022934109938


### Trying data

In [0]:
g = "Real Estate"
usr = " ".join([stemmer.stem(i) for i in re.sub("[^a-zA-Z]", " ", g).split() if i not in words]).lower()
x_user = vectorizer.transform(np.array([usr]))
usr_predictions  = br_classifier.predict(x_user)

print(unique_jobfn)
for t in (usr_predictions):
  print(t.indices)
  for i in t.indices:
    print(unique_jobfn[i])


### 2nd Classifer - Label Powerset

In [253]:
lp_classifier = LabelPowerset(LogisticRegression())
lp_classifier.fit(x_train, y_train)
lp_predictions = lp_classifier.predict(x_test)
print("Accuracy = ",accuracy_score(y_test,lp_predictions))
print("F1 score = ",f1_score(y_test,lp_predictions, average="micro"))
print("Hamming loss = ",hamming_loss(y_test,lp_predictions))

t_acc["LabelPowerset"]  = accuracy_score(y_test,lp_predictions) 
t_f1["LabelPowerset"]  = f1_score(y_test, lp_predictions, average='micro')

Accuracy =  0.5286415711947627
F1 score =  0.6888111888111889
Hamming loss =  0.03149466979254213


### Third Classifer - MLKMM

In [254]:
#MLkNN
ml_classifier = MLkNN(k=10)
# to prevent errors when handling sparse matrices.
x_train_ml = lil_matrix(x_train).toarray()
y_train_ml = lil_matrix(y_train).toarray()
x_test_ml = lil_matrix(x_test).toarray()
ml_classifier.fit(x_train_ml, y_train_ml)
# predict
ml_predictions = ml_classifier.predict(x_test_ml)
# accuracy
print("Accuracy = ",accuracy_score(y_test,ml_predictions))
print("F1 score = ",f1_score(y_test,ml_predictions, average="micro"))

t_acc["MLKMM"] = accuracy_score(y_test,ml_predictions)
t_f1["MLKMM"] = f1_score(y_test, ml_predictions, average='micro')

Accuracy =  0.27986906710310966
F1 score =  0.5445263754963131


### Fourth Classifer - Classifier Chain

In [255]:
selected_labels = y_train.columns[y_train.sum(axis = 0, skipna = True) > 0].tolist()
y_train = y_train.filter(selected_labels, axis=1)
y_test = y_test.filter(selected_labels, axis=1)

x_train = vectorizer.transform(train_title)
x_test = vectorizer.transform(test_title)

cc_classifier = ClassifierChain(LogisticRegression(C=1))
cc_classifier.fit(x_train, y_train)
cc_predictions_proba = cc_classifier.predict_proba(x_test)
y_pred_new = (cc_predictions_proba >= 25/100).astype(int)

print("Accuracy = ",accuracy_score(y_test,y_pred_new))
print("F1 score = ",f1_score(y_test,y_pred_new, average="micro"))

t_acc["ClassifierChain"] = accuracy_score(y_test,y_pred_new)
t_f1["ClassifierChain"] = f1_score(y_test, y_pred_new, average='micro')

Accuracy =  0.41734860883797054
F1 score =  0.7384615384615384


### **Comparing Classifiers**


In [256]:
from tabulate import tabulate
models = ['BinaryRelevance' , 'LabelPowerSet' , 'MLKMM' , 'Classifier Chain' ]
results = [models , t_acc.values() ,  t_f1.values()]
print( tabulate (list(map(list, zip(*results))) , headers = ["model" , "Accuracy" , "F1-Score"]))

model               Accuracy    F1-Score
----------------  ----------  ----------
BinaryRelevance     0.695581    0.771023
LabelPowerSet       0.528642    0.688811
MLKMM               0.279869    0.544526
Classifier Chain    0.417349    0.738462


# **Conclusion**
It is clear that Binary Relevance has the best accuracy and F1-Score.It is the best classifier the used one.

The accuracy could be improved using Deep learning methods