In [1]:
import pandas as pd
import string
import re
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

# understanding data

In [2]:
df=pd.read_csv('Job titles and industries.csv')
df.head()
df.info()
# df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8586 entries, 0 to 8585
Data columns (total 2 columns):
job title    8586 non-null object
industry     8586 non-null object
dtypes: object(2)
memory usage: 134.3+ KB


In [3]:
df.head()

Unnamed: 0,job title,industry
0,technical support and helpdesk supervisor - co...,IT
1,senior technical support engineer,IT
2,head of it services,IT
3,js front end engineer,IT
4,network and telephony controller,IT


In [4]:
df['industry'].value_counts()

IT             4746
Marketing      2031
Education      1435
Accountancy     374
Name: industry, dtype: int64

In [5]:
df.dropna(inplace=True)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8586 entries, 0 to 8585
Data columns (total 2 columns):
job title    8586 non-null object
industry     8586 non-null object
dtypes: object(2)
memory usage: 201.2+ KB


In [7]:
df.duplicated().sum()

4618

In [8]:
df.drop_duplicates(subset=['job title'],inplace =True)

In [9]:
df.duplicated().sum()

0

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3890 entries, 0 to 8585
Data columns (total 2 columns):
job title    3890 non-null object
industry     3890 non-null object
dtypes: object(2)
memory usage: 91.2+ KB


In [11]:
#expanding the dispay of text sms column
pd.set_option('display.max_colwidth', -1)
df.head(30)

Unnamed: 0,job title,industry
0,"technical support and helpdesk supervisor - county buildings, ayr soa04086",IT
1,senior technical support engineer,IT
2,head of it services,IT
3,js front end engineer,IT
4,network and telephony controller,IT
5,privileged access management expert,IT
6,devops engineers x 3 - global brand,IT
8,data modeller,IT
9,"php web developer £45,000 based in london",IT
12,solution / technical architect - ethical brand,IT


# data preprocessing

In [12]:
#defining the function to remove punctuation
def remove_punctuation(text):
    punctuationfree="".join([i for i in text if i not in string.punctuation])
    return punctuationfree
#storing the puntuation free text
df['job title']= df['job title'].apply(lambda x:remove_punctuation(x))
df.head(30)

Unnamed: 0,job title,industry
0,technical support and helpdesk supervisor county buildings ayr soa04086,IT
1,senior technical support engineer,IT
2,head of it services,IT
3,js front end engineer,IT
4,network and telephony controller,IT
5,privileged access management expert,IT
6,devops engineers x 3 global brand,IT
8,data modeller,IT
9,php web developer £45000 based in london,IT
12,solution technical architect ethical brand,IT


In [13]:
df['job title']= df['job title'].apply(lambda x: x.lower())
df['industry']= df['industry'].apply(lambda x: x.lower())

In [14]:
df.head(30)

Unnamed: 0,job title,industry
0,technical support and helpdesk supervisor county buildings ayr soa04086,it
1,senior technical support engineer,it
2,head of it services,it
3,js front end engineer,it
4,network and telephony controller,it
5,privileged access management expert,it
6,devops engineers x 3 global brand,it
8,data modeller,it
9,php web developer £45000 based in london,it
12,solution technical architect ethical brand,it


In [15]:
#defining function for tokenization

def tokenization(text):
    tokens = re.split('W+',text)
    tokens=' '.join(tokens)
    tokens=tokens.split(' ')
    if '' in tokens:
        tokens.remove('')
    return tokens
#applying function to the column
df['job title']= df['job title'].apply(lambda x: tokenization(x))
df.head(30)

Unnamed: 0,job title,industry
0,"[technical, support, and, helpdesk, supervisor, county, buildings, ayr, soa04086]",it
1,"[senior, technical, support, engineer]",it
2,"[head, of, it, services]",it
3,"[js, front, end, engineer]",it
4,"[network, and, telephony, controller]",it
5,"[privileged, access, management, expert]",it
6,"[devops, engineers, x, 3, global, brand]",it
8,"[data, modeller]",it
9,"[php, web, developer, £45000, based, in, london]",it
12,"[solution, technical, architect, , ethical, brand]",it


In [16]:
#Stop words present in the library
stopwords = nltk.corpus.stopwords.words('english')
stopwords[0:10]


['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [17]:
#defining the function to remove stopwords from tokenized text
def remove_stopwords(text):
    output= [i for i in text if i not in stopwords]
    return output
#applying the function
df['job title']= df['job title'].apply(lambda x:remove_stopwords(x))

In [18]:
df.head(30)

Unnamed: 0,job title,industry
0,"[technical, support, helpdesk, supervisor, county, buildings, ayr, soa04086]",it
1,"[senior, technical, support, engineer]",it
2,"[head, services]",it
3,"[js, front, end, engineer]",it
4,"[network, telephony, controller]",it
5,"[privileged, access, management, expert]",it
6,"[devops, engineers, x, 3, global, brand]",it
8,"[data, modeller]",it
9,"[php, web, developer, £45000, based, london]",it
12,"[solution, technical, architect, , ethical, brand]",it


In [19]:
#defining the object for stemming
porter_stemmer = PorterStemmer()
#defining a function for stemming
def stemming(text):
    stem_text = [porter_stemmer.stem(word) for word in text]
    return stem_text
df['job title']=df['job title'].apply(lambda x: stemming(x))
df.head(30)

Unnamed: 0,job title,industry
0,"[technic, support, helpdesk, supervisor, counti, build, ayr, soa04086]",it
1,"[senior, technic, support, engin]",it
2,"[head, servic]",it
3,"[js, front, end, engin]",it
4,"[network, telephoni, control]",it
5,"[privileg, access, manag, expert]",it
6,"[devop, engin, x, 3, global, brand]",it
8,"[data, model]",it
9,"[php, web, develop, £45000, base, london]",it
12,"[solut, technic, architect, , ethic, brand]",it


In [20]:
#defining the object for Lemmatization
#nltk.download('wordnet')
wordnet_lemmatizer = WordNetLemmatizer()

In [21]:
#defining the function for lemmatization
def lemmatizer(text):
    lemm_text = [wordnet_lemmatizer.lemmatize(word) for word in text]
    return lemm_text
df['job title']=df['job title'].apply(lambda x:lemmatizer(x))
df.head(30)

Unnamed: 0,job title,industry
0,"[technic, support, helpdesk, supervisor, counti, build, ayr, soa04086]",it
1,"[senior, technic, support, engin]",it
2,"[head, servic]",it
3,"[j, front, end, engin]",it
4,"[network, telephoni, control]",it
5,"[privileg, access, manag, expert]",it
6,"[devop, engin, x, 3, global, brand]",it
8,"[data, model]",it
9,"[php, web, develop, £45000, base, london]",it
12,"[solut, technic, architect, , ethic, brand]",it


In [22]:
df['job title']=[' '.join(i) for i in df['job title']]
df.head(10)

Unnamed: 0,job title,industry
0,technic support helpdesk supervisor counti build ayr soa04086,it
1,senior technic support engin,it
2,head servic,it
3,j front end engin,it
4,network telephoni control,it
5,privileg access manag expert,it
6,devop engin x 3 global brand,it
8,data model,it
9,php web develop £45000 base london,it
12,solut technic architect ethic brand,it


In [23]:
df['industry'].value_counts()

it             1528
marketing      1151
education      953 
accountancy    258 
Name: industry, dtype: int64

In [24]:
#manual ordinal encoding since there are only 4 classes
df['industry'].replace('it',1,inplace=True)
df['industry'].replace('marketing',2,inplace=True)
df['industry'].replace('accountancy',3,inplace=True)
df['industry'].replace('education',4,inplace=True)
df['industry']=df['industry'].astype('int64')
df.head(20)

Unnamed: 0,job title,industry
0,technic support helpdesk supervisor counti build ayr soa04086,1
1,senior technic support engin,1
2,head servic,1
3,j front end engin,1
4,network telephoni control,1
5,privileg access manag expert,1
6,devop engin x 3 global brand,1
8,data model,1
9,php web develop £45000 base london,1
12,solut technic architect ethic brand,1


# Modeling

In [25]:
from sklearn import model_selection
X_train, X_test, Y_train, Y_test = model_selection.train_test_split(df['job title'],df['industry'], test_size=0.2, random_state=0)

In [26]:
# from sklearn import model_selection
# X_train, X_test, Y_train, Y_test = model_selection.train_test_split(df['job title'], pd.get_dummies( df['industry']), test_size=0.25, random_state=0)

In [27]:
Y_train.shape

(3112,)

In [28]:
# TfidfVectorizer is equivilent to using CountVectorizer followed by TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
X_train= vectorizer.fit_transform(X_train)
X_test= vectorizer.transform(X_test)


In [29]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import SGDClassifier,LogisticRegression
from sklearn.metrics import accuracy_score
#clf =OneVsRestClassifier(SGDClassifier())
clf =LogisticRegression(multi_class='ovr')
clf.fit(X_train,Y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='ovr', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [30]:
clf.score(X_train,Y_train)

0.9187017994858612

In [31]:
clf.score(X_test,Y_test)

0.9023136246786633

In [32]:
y_pred=clf.predict(X_test)

In [33]:
from sklearn.metrics import classification_report
print(classification_report(Y_test, y_pred))

              precision    recall  f1-score   support

           1       0.88      0.96      0.92       319
           2       0.90      0.85      0.88       231
           3       0.82      0.71      0.76        51
           4       0.97      0.92      0.94       177

    accuracy                           0.90       778
   macro avg       0.89      0.86      0.87       778
weighted avg       0.90      0.90      0.90       778



In [34]:
from sklearn.metrics import confusion_matrix
confusion_matrix(Y_test, y_pred)

array([[307,  11,   1,   0],
       [ 26, 197,   3,   5],
       [  9,   6,  36,   0],
       [  6,   5,   4, 162]], dtype=int64)

In [35]:
#saving the best model to reuse it in API

In [36]:
from joblib import dump, load
dump(clf, 'model.joblib')