In [1]:
import pandas as pd
from nltk.stem import PorterStemmer, LancasterStemmer, SnowballStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
from string import punctuation
import re 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.svm import SVC 
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score
import joblib

In [2]:
data = pd.read_csv('./input/naukri_com-job_sample.csv')

In [3]:
data.head()

Unnamed: 0,company,education,experience,industry,jobdescription,jobid,joblocation_address,jobtitle,numberofpositions,payrate,postdate,site_name,skills,uniq_id
0,MM Media Pvt Ltd,UG: B.Tech/B.E. - Any Specialization PG:Any Po...,0 - 1 yrs,Media / Entertainment / Internet,Job Description Send me Jobs like this Quali...,210516002263,Chennai,Walkin Data Entry Operator (night Shift),,"1,50,000 - 2,25,000 P.A",2016-05-21 19:30:00 +0000,,ITES,43b19632647068535437c774b6ca6cf8
1,find live infotech,UG: B.Tech/B.E. - Any Specialization PG:MBA/PG...,0 - 0 yrs,Advertising / PR / MR / Event Management,Job Description Send me Jobs like this Quali...,210516002391,Chennai,Work Based Onhome Based Part Time.,60.0,"1,50,000 - 2,50,000 P.A. 20000",2016-05-21 19:30:00 +0000,,Marketing,d4c72325e57f89f364812b5ed5a795f0
2,Softtech Career Infosystem Pvt. Ltd,UG: Any Graduate - Any Specialization PG:Any P...,4 - 8 yrs,IT-Software / Software Services,Job Description Send me Jobs like this - as ...,101016900534,Bengaluru,Pl/sql Developer - SQL,,Not Disclosed by Recruiter,2016-10-13 16:20:55 +0000,,IT Software - Application Programming,c47df6f4cfdf5b46f1fd713ba61b9eba
3,Onboard HRServices LLP,UG: Any Graduate - Any Specialization PG:CA Do...,11 - 15 yrs,Banking / Financial Services / Broking,Job Description Send me Jobs like this - Inv...,81016900536,"Mumbai, Bengaluru, Kolkata, Chennai, Coimbator...",Manager/ad/partner - Indirect Tax - CA,,Not Disclosed by Recruiter,2016-10-13 16:20:55 +0000,,Accounts,115d28f140f694dd1cc61c53d03c66ae
4,Spire Technologies and Solutions Pvt. Ltd.,UG: B.Tech/B.E. - Any Specialization PG:Any Po...,6 - 8 yrs,IT-Software / Software Services,Job Description Send me Jobs like this Pleas...,120916002122,Bengaluru,JAVA Technical Lead (6-8 yrs) -,4.0,Not Disclosed by Recruiter,2016-10-13 16:20:55 +0000,,IT Software - Application Programming,a12553fc03bc7bcced8b1bb8963f97b4


In [4]:
data.isnull().sum()[data.isnull().sum()>0]

company                    4
education               1996
experience                 4
industry                   5
jobdescription             4
joblocation_address      501
numberofpositions      17536
payrate                   97
postdate                  23
site_name              18013
skills                   528
dtype: int64

In [5]:
data.shape

(22000, 14)

In [6]:
print(data.columns)

Index(['company', 'education', 'experience', 'industry', 'jobdescription',
       'jobid', 'joblocation_address', 'jobtitle', 'numberofpositions',
       'payrate', 'postdate', 'site_name', 'skills', 'uniq_id'],
      dtype='object')


In [7]:
data.isnull().sum()[data.isnull().sum() > 0]

company                    4
education               1996
experience                 4
industry                   5
jobdescription             4
joblocation_address      501
numberofpositions      17536
payrate                   97
postdate                  23
site_name              18013
skills                   528
dtype: int64

In [8]:
from sklearn.impute import SimpleImputer

to_fill = ['education', 'skills']
imputer = SimpleImputer(strategy='most_frequent')

data[to_fill] = imputer.fit_transform(data[to_fill])

In [9]:
data.isnull().sum()[data.isnull().sum()>0]

company                    4
experience                 4
industry                   5
jobdescription             4
joblocation_address      501
numberofpositions      17536
payrate                   97
postdate                  23
site_name              18013
dtype: int64

In [10]:
data.drop(['jobid','uniq_id'],axis=1,inplace=True)

In [11]:
data.head()

Unnamed: 0,company,education,experience,industry,jobdescription,joblocation_address,jobtitle,numberofpositions,payrate,postdate,site_name,skills
0,MM Media Pvt Ltd,UG: B.Tech/B.E. - Any Specialization PG:Any Po...,0 - 1 yrs,Media / Entertainment / Internet,Job Description Send me Jobs like this Quali...,Chennai,Walkin Data Entry Operator (night Shift),,"1,50,000 - 2,25,000 P.A",2016-05-21 19:30:00 +0000,,ITES
1,find live infotech,UG: B.Tech/B.E. - Any Specialization PG:MBA/PG...,0 - 0 yrs,Advertising / PR / MR / Event Management,Job Description Send me Jobs like this Quali...,Chennai,Work Based Onhome Based Part Time.,60.0,"1,50,000 - 2,50,000 P.A. 20000",2016-05-21 19:30:00 +0000,,Marketing
2,Softtech Career Infosystem Pvt. Ltd,UG: Any Graduate - Any Specialization PG:Any P...,4 - 8 yrs,IT-Software / Software Services,Job Description Send me Jobs like this - as ...,Bengaluru,Pl/sql Developer - SQL,,Not Disclosed by Recruiter,2016-10-13 16:20:55 +0000,,IT Software - Application Programming
3,Onboard HRServices LLP,UG: Any Graduate - Any Specialization PG:CA Do...,11 - 15 yrs,Banking / Financial Services / Broking,Job Description Send me Jobs like this - Inv...,"Mumbai, Bengaluru, Kolkata, Chennai, Coimbator...",Manager/ad/partner - Indirect Tax - CA,,Not Disclosed by Recruiter,2016-10-13 16:20:55 +0000,,Accounts
4,Spire Technologies and Solutions Pvt. Ltd.,UG: B.Tech/B.E. - Any Specialization PG:Any Po...,6 - 8 yrs,IT-Software / Software Services,Job Description Send me Jobs like this Pleas...,Bengaluru,JAVA Technical Lead (6-8 yrs) -,4.0,Not Disclosed by Recruiter,2016-10-13 16:20:55 +0000,,IT Software - Application Programming


In [12]:
data[data['experience'].eq('Not Mentioned')].shape

(111, 12)

In [13]:
data = data[data['experience'] != 'Not Mentioned']

In [14]:
experience_lower = []
experience_upper = []
invalid = []

for idx, row in data.iterrows():
    try:
        text = re.sub('yrs', '', row['experience'])
        splits = text.split('-')
        experience_lower.append(int(splits[0]))
        experience_upper.append(int(splits[1]))
    except:
        invalid.append(row['experience'])

In [15]:
data = data[~data['experience'].isin(invalid)]

In [16]:
data['experience_lower'] = data['experience'].apply(lambda x: int(x.split('-')[0]))
data['experience_upper'] = data['experience'].apply(lambda x: int(re.sub('yrs', '', x.split('-')[1])))

In [17]:
data.isnull().sum()

company                    0
education                  0
experience                 0
industry                   1
jobdescription             0
joblocation_address      497
jobtitle                   0
numberofpositions      17420
payrate                    1
postdate                  19
site_name              17962
skills                     0
experience_lower           0
experience_upper           0
dtype: int64

In [18]:
data.head()

Unnamed: 0,company,education,experience,industry,jobdescription,joblocation_address,jobtitle,numberofpositions,payrate,postdate,site_name,skills,experience_lower,experience_upper
0,MM Media Pvt Ltd,UG: B.Tech/B.E. - Any Specialization PG:Any Po...,0 - 1 yrs,Media / Entertainment / Internet,Job Description Send me Jobs like this Quali...,Chennai,Walkin Data Entry Operator (night Shift),,"1,50,000 - 2,25,000 P.A",2016-05-21 19:30:00 +0000,,ITES,0,1
1,find live infotech,UG: B.Tech/B.E. - Any Specialization PG:MBA/PG...,0 - 0 yrs,Advertising / PR / MR / Event Management,Job Description Send me Jobs like this Quali...,Chennai,Work Based Onhome Based Part Time.,60.0,"1,50,000 - 2,50,000 P.A. 20000",2016-05-21 19:30:00 +0000,,Marketing,0,0
2,Softtech Career Infosystem Pvt. Ltd,UG: Any Graduate - Any Specialization PG:Any P...,4 - 8 yrs,IT-Software / Software Services,Job Description Send me Jobs like this - as ...,Bengaluru,Pl/sql Developer - SQL,,Not Disclosed by Recruiter,2016-10-13 16:20:55 +0000,,IT Software - Application Programming,4,8
3,Onboard HRServices LLP,UG: Any Graduate - Any Specialization PG:CA Do...,11 - 15 yrs,Banking / Financial Services / Broking,Job Description Send me Jobs like this - Inv...,"Mumbai, Bengaluru, Kolkata, Chennai, Coimbator...",Manager/ad/partner - Indirect Tax - CA,,Not Disclosed by Recruiter,2016-10-13 16:20:55 +0000,,Accounts,11,15
4,Spire Technologies and Solutions Pvt. Ltd.,UG: B.Tech/B.E. - Any Specialization PG:Any Po...,6 - 8 yrs,IT-Software / Software Services,Job Description Send me Jobs like this Pleas...,Bengaluru,JAVA Technical Lead (6-8 yrs) -,4.0,Not Disclosed by Recruiter,2016-10-13 16:20:55 +0000,,IT Software - Application Programming,6,8


In [19]:
data.drop(['experience'], axis=1, inplace=True)

In [20]:
data.columns

Index(['company', 'education', 'industry', 'jobdescription',
       'joblocation_address', 'jobtitle', 'numberofpositions', 'payrate',
       'postdate', 'site_name', 'skills', 'experience_lower',
       'experience_upper'],
      dtype='object')

In [21]:
data['postdate'] = data['postdate'].astype(str).apply(lambda x: x[:-5])

In [22]:
data.head()

Unnamed: 0,company,education,industry,jobdescription,joblocation_address,jobtitle,numberofpositions,payrate,postdate,site_name,skills,experience_lower,experience_upper
0,MM Media Pvt Ltd,UG: B.Tech/B.E. - Any Specialization PG:Any Po...,Media / Entertainment / Internet,Job Description Send me Jobs like this Quali...,Chennai,Walkin Data Entry Operator (night Shift),,"1,50,000 - 2,25,000 P.A",2016-05-21 19:30:00,,ITES,0,1
1,find live infotech,UG: B.Tech/B.E. - Any Specialization PG:MBA/PG...,Advertising / PR / MR / Event Management,Job Description Send me Jobs like this Quali...,Chennai,Work Based Onhome Based Part Time.,60.0,"1,50,000 - 2,50,000 P.A. 20000",2016-05-21 19:30:00,,Marketing,0,0
2,Softtech Career Infosystem Pvt. Ltd,UG: Any Graduate - Any Specialization PG:Any P...,IT-Software / Software Services,Job Description Send me Jobs like this - as ...,Bengaluru,Pl/sql Developer - SQL,,Not Disclosed by Recruiter,2016-10-13 16:20:55,,IT Software - Application Programming,4,8
3,Onboard HRServices LLP,UG: Any Graduate - Any Specialization PG:CA Do...,Banking / Financial Services / Broking,Job Description Send me Jobs like this - Inv...,"Mumbai, Bengaluru, Kolkata, Chennai, Coimbator...",Manager/ad/partner - Indirect Tax - CA,,Not Disclosed by Recruiter,2016-10-13 16:20:55,,Accounts,11,15
4,Spire Technologies and Solutions Pvt. Ltd.,UG: B.Tech/B.E. - Any Specialization PG:Any Po...,IT-Software / Software Services,Job Description Send me Jobs like this Pleas...,Bengaluru,JAVA Technical Lead (6-8 yrs) -,4.0,Not Disclosed by Recruiter,2016-10-13 16:20:55,,IT Software - Application Programming,6,8


In [23]:
data['job_age'] = pd.to_datetime('today') - pd.to_datetime(data['postdate'])
data['job_age'] = data['job_age'].dt.days

In [24]:
data.head()

Unnamed: 0,company,education,industry,jobdescription,joblocation_address,jobtitle,numberofpositions,payrate,postdate,site_name,skills,experience_lower,experience_upper,job_age
0,MM Media Pvt Ltd,UG: B.Tech/B.E. - Any Specialization PG:Any Po...,Media / Entertainment / Internet,Job Description Send me Jobs like this Quali...,Chennai,Walkin Data Entry Operator (night Shift),,"1,50,000 - 2,25,000 P.A",2016-05-21 19:30:00,,ITES,0,1,2886.0
1,find live infotech,UG: B.Tech/B.E. - Any Specialization PG:MBA/PG...,Advertising / PR / MR / Event Management,Job Description Send me Jobs like this Quali...,Chennai,Work Based Onhome Based Part Time.,60.0,"1,50,000 - 2,50,000 P.A. 20000",2016-05-21 19:30:00,,Marketing,0,0,2886.0
2,Softtech Career Infosystem Pvt. Ltd,UG: Any Graduate - Any Specialization PG:Any P...,IT-Software / Software Services,Job Description Send me Jobs like this - as ...,Bengaluru,Pl/sql Developer - SQL,,Not Disclosed by Recruiter,2016-10-13 16:20:55,,IT Software - Application Programming,4,8,2741.0
3,Onboard HRServices LLP,UG: Any Graduate - Any Specialization PG:CA Do...,Banking / Financial Services / Broking,Job Description Send me Jobs like this - Inv...,"Mumbai, Bengaluru, Kolkata, Chennai, Coimbator...",Manager/ad/partner - Indirect Tax - CA,,Not Disclosed by Recruiter,2016-10-13 16:20:55,,Accounts,11,15,2741.0
4,Spire Technologies and Solutions Pvt. Ltd.,UG: B.Tech/B.E. - Any Specialization PG:Any Po...,IT-Software / Software Services,Job Description Send me Jobs like this Pleas...,Bengaluru,JAVA Technical Lead (6-8 yrs) -,4.0,Not Disclosed by Recruiter,2016-10-13 16:20:55,,IT Software - Application Programming,6,8,2741.0


In [25]:
data.columns

Index(['company', 'education', 'industry', 'jobdescription',
       'joblocation_address', 'jobtitle', 'numberofpositions', 'payrate',
       'postdate', 'site_name', 'skills', 'experience_lower',
       'experience_upper', 'job_age'],
      dtype='object')

In [26]:
data['skills'].nunique()

45

In [27]:
replacements = {
   'joblocation_address': {
      r'(Bengaluru/Bangalore)': 'Bangalore',
      r'Bengaluru': 'Bangalore',
      r'Hyderabad / Secunderabad': 'Hyderabad',
      r'Mumbai , Mumbai': 'Mumbai',
      r'Noida': 'NCR',
      r'Delhi': 'NCR',
      r'Gurgaon': 'NCR', 
      r'Delhi/NCR(National Capital Region)': 'NCR',
      r'Delhi , Delhi': 'NCR',
      r'Noida , Noida/Greater Noida': 'NCR',
      r'Ghaziabad': 'NCR',
      r'Delhi/NCR(National Capital Region) , Gurgaon': 'NCR',
      r'NCR , NCR': 'NCR',
      r'NCR/NCR(National Capital Region)': 'NCR',
      r'NCR , NCR/Greater NCR': 'NCR',
      r'NCR/NCR(National Capital Region) , NCR': 'NCR', 
      r'NCR , NCR/NCR(National Capital Region)': 'NCR', 
      r'Bangalore , Bangalore / Bangalore': 'Bangalore',
      r'Bangalore , karnataka': 'Bangalore',
      r'NCR/NCR(National Capital Region)': 'NCR',
      r'NCR/Greater NCR': 'NCR',
      r'NCR/NCR(National Capital Region) , NCR': 'NCR'
       
   }
}

data.replace(replacements, regex=True, inplace=True)

In [28]:
data.head()

Unnamed: 0,company,education,industry,jobdescription,joblocation_address,jobtitle,numberofpositions,payrate,postdate,site_name,skills,experience_lower,experience_upper,job_age
0,MM Media Pvt Ltd,UG: B.Tech/B.E. - Any Specialization PG:Any Po...,Media / Entertainment / Internet,Job Description Send me Jobs like this Quali...,Chennai,Walkin Data Entry Operator (night Shift),,"1,50,000 - 2,25,000 P.A",2016-05-21 19:30:00,,ITES,0,1,2886.0
1,find live infotech,UG: B.Tech/B.E. - Any Specialization PG:MBA/PG...,Advertising / PR / MR / Event Management,Job Description Send me Jobs like this Quali...,Chennai,Work Based Onhome Based Part Time.,60.0,"1,50,000 - 2,50,000 P.A. 20000",2016-05-21 19:30:00,,Marketing,0,0,2886.0
2,Softtech Career Infosystem Pvt. Ltd,UG: Any Graduate - Any Specialization PG:Any P...,IT-Software / Software Services,Job Description Send me Jobs like this - as ...,Bangalore,Pl/sql Developer - SQL,,Not Disclosed by Recruiter,2016-10-13 16:20:55,,IT Software - Application Programming,4,8,2741.0
3,Onboard HRServices LLP,UG: Any Graduate - Any Specialization PG:CA Do...,Banking / Financial Services / Broking,Job Description Send me Jobs like this - Inv...,"Mumbai, Bangalore, Kolkata, Chennai, Coimbator...",Manager/ad/partner - Indirect Tax - CA,,Not Disclosed by Recruiter,2016-10-13 16:20:55,,Accounts,11,15,2741.0
4,Spire Technologies and Solutions Pvt. Ltd.,UG: B.Tech/B.E. - Any Specialization PG:Any Po...,IT-Software / Software Services,Job Description Send me Jobs like this Pleas...,Bangalore,JAVA Technical Lead (6-8 yrs) -,4.0,Not Disclosed by Recruiter,2016-10-13 16:20:55,,IT Software - Application Programming,6,8,2741.0


In [29]:
data['industry'] = data['industry'].astype(str).apply(lambda x: x.split('/')[0])

In [30]:
data['industry'] = data['industry'].str.strip()

In [31]:
data['Education'] = data['education'].str.split(' ').apply(lambda x: x[1] if len(x) > 1 else x[0])

data['Education'] = data['Education'].replace(('B.Tech/B.E.','Graduation','Other','-','Not','B.Tech/B.E.,','Postgraduate',
                                               'PG:CA','Diploma,','B.Com,','B.Pharma,','B.A,','BCA,','B.Sc,','MBA/PGDM','B.B.A,',
                                               'PG:Other','Doctorate:Doctorate','Post'),
                                              ('B.Tech','Graduation','Other','Other','Not Specified','B.Tech','Postgraduate',
                                               'CA','Diploma','B.Com','B.Pharma','B.A','BCA','B.Sc','MBA','BBA',
                                               'Other','Doctorate','Post'))

data['Skills'] = data['skills'].str.split(" - ").apply(lambda x: x[1] if len(x) > 1 else x[0])

In [32]:
majority_industries = data['industry'].value_counts()[data['industry'].value_counts() >= 10].index
data = data[data['industry'].isin(majority_industries)]
data.isnull().sum()[data.isnull().sum() > 0]

joblocation_address      497
numberofpositions      17402
site_name              17938
job_age                   19
dtype: int64

In [33]:
most_frequent_value = data['joblocation_address'].mode()[0]
data['joblocation_address'].fillna(most_frequent_value, inplace=True)

In [34]:
data.isnull().sum()

company                    0
education                  0
industry                   0
jobdescription             0
joblocation_address        0
jobtitle                   0
numberofpositions      17402
payrate                    0
postdate                   0
site_name              17938
skills                     0
experience_lower           0
experience_upper           0
job_age                   19
Education                  0
Skills                     0
dtype: int64

In [35]:
data.head()

Unnamed: 0,company,education,industry,jobdescription,joblocation_address,jobtitle,numberofpositions,payrate,postdate,site_name,skills,experience_lower,experience_upper,job_age,Education,Skills
0,MM Media Pvt Ltd,UG: B.Tech/B.E. - Any Specialization PG:Any Po...,Media,Job Description Send me Jobs like this Quali...,Chennai,Walkin Data Entry Operator (night Shift),,"1,50,000 - 2,25,000 P.A",2016-05-21 19:30:00,,ITES,0,1,2886.0,B.Tech,ITES
1,find live infotech,UG: B.Tech/B.E. - Any Specialization PG:MBA/PG...,Advertising,Job Description Send me Jobs like this Quali...,Chennai,Work Based Onhome Based Part Time.,60.0,"1,50,000 - 2,50,000 P.A. 20000",2016-05-21 19:30:00,,Marketing,0,0,2886.0,B.Tech,Marketing
2,Softtech Career Infosystem Pvt. Ltd,UG: Any Graduate - Any Specialization PG:Any P...,IT-Software,Job Description Send me Jobs like this - as ...,Bangalore,Pl/sql Developer - SQL,,Not Disclosed by Recruiter,2016-10-13 16:20:55,,IT Software - Application Programming,4,8,2741.0,Any,Application Programming
3,Onboard HRServices LLP,UG: Any Graduate - Any Specialization PG:CA Do...,Banking,Job Description Send me Jobs like this - Inv...,"Mumbai, Bangalore, Kolkata, Chennai, Coimbator...",Manager/ad/partner - Indirect Tax - CA,,Not Disclosed by Recruiter,2016-10-13 16:20:55,,Accounts,11,15,2741.0,Any,Accounts
4,Spire Technologies and Solutions Pvt. Ltd.,UG: B.Tech/B.E. - Any Specialization PG:Any Po...,IT-Software,Job Description Send me Jobs like this Pleas...,Bangalore,JAVA Technical Lead (6-8 yrs) -,4.0,Not Disclosed by Recruiter,2016-10-13 16:20:55,,IT Software - Application Programming,6,8,2741.0,B.Tech,Application Programming


In [36]:
from sklearn.model_selection import train_test_split

X = data['jobdescription']
y = data['industry']

test_size = 0.1
val_size = 0.1

X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=(test_size + val_size), stratify=y)
X_test, X_val, y_test, y_val = train_test_split(X_temp, y_temp, test_size=val_size / (test_size + val_size), stratify=y_temp)

In [37]:
print(X_train.iloc[0])

Job Description   Send me Jobs like this Requirement : Hands on experience in developing RIA applications using Flex 3. 0 and Flash Ability to design exceptionally well Flash interfaces and expertise on building Flex applications. Master level knowledge on Action Script is required Deep development and issue resolution skills Salary: Not Disclosed by Recruiter Industry: IT-Software / Software Services Functional Area: IT Software - Application Programming , Maintenance Role Category:Programming & Design Role:Software Developer Keyskills development flex interfaces ria issue resolution flex 3 0 flash action script Desired Candidate Profile Education- UG: B.Tech/B.E. PG:MCA Doctorate:Any Doctorate - Any Specialization, Doctorate Not Required Please refer to the Job description above Company Profile: Bloom Systems Pvt. Ltd. Bloom Systems is the Next-Generation HR Services provider, collaborating with client teams to offer solutions in areas including Recruitment Process Outsourcing (RPO) 

In [38]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, LancasterStemmer, SnowballStemmer, WordNetLemmatizer
from string import punctuation
import re
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

class Preprocess:
    def __init__(self, method='WordNetLemmatizer'):
        self.method = method
        self.stemmers = {
            'PorterStemmer': PorterStemmer(),
            'LancasterStemmer': LancasterStemmer(),
            'SnowballStemmer': SnowballStemmer(language='english'),
            'WordNetLemmatizer': WordNetLemmatizer()
        }
        self.stemmer = self.stemmers[self.method]
        self.stopWords = list(punctuation) + list(stopwords.words('english'))
        self.moreStopWords = ['job', 'description', 'requirement', 'skill', 'qualification']
        self.stopWords.extend(self.moreStopWords)
        self.encoder = LabelEncoder()
        self.vectorizer = TfidfVectorizer()
        self.isFitted = False
    
    def preprocess(self, message):
        message = message.lower()
        message = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'\
                        '(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', message)
        message = re.sub(' +', ' ', message)
        message = re.sub("(@[A-Za-z0-9_]+)", "", message)
        message = re.sub('#[A-Za-z0-9_]+', '', message)
        message = re.sub("^[A-Za-z0-9_-]*$", "", message)
        emoji_pattern = re.compile(
            "["
            u"\U0001F600-\U0001F64F"  
            u"\U0001F300-\U0001F5FF"  
            u"\U0001F680-\U0001F6FF"  
            u"\U0001F1E0-\U0001F1FF"  
            u"\U00002702-\U000027B0"
            u"\U000024C2-\U0001F251"
            "]+",
            flags=re.UNICODE,
        )
        message = emoji_pattern.sub('', message)
        if self.method == 'WordNetLemmatizer':
            message = ' '.join([self.stemmer.lemmatize(word) for word in message.split() if word not in self.moreStopWords])
        else:
            message = ' '.join([self.stemmer.stem(word) for word in message.split() if word not in self.moreStopWords])
        return message 

    def fit(self, X, y=None):
        self.vectorizer.fit(X)
        if y is not None:
            self.encoder.fit(y)
        self.isFitted = True

    def transform(self, X, y=None):
        if not self.isFitted:
            raise NotImplementedError('Please fit first by calling the fit function')
        X = self.vectorizer.transform(X)
        if y is not None:
            y = self.encoder.transform(y)
            return X, y
        else:
            return X 

    def fit_transform(self, X, y=None):
        self.fit(X, y)
        X, y = self.transform(X, y)
        return X, y

In [39]:
preprocessor = Preprocess()
print(preprocessor.preprocess(X_train.iloc[0]))

send me job like this : hand on experience in developing ria application using flex 3. 0 and flash ability to design exceptionally well flash interface and expertise on building flex applications. master level knowledge on action script is required deep development and issue resolution skill salary: not disclosed by recruiter industry: it-software / software service functional area: it software - application programming , maintenance role category:programming & design role:software developer keyskills development flex interface ria issue resolution flex 3 0 flash action script desired candidate profile education- ug: b.tech/b.e. pg:mca doctorate:any doctorate - any specialization, doctorate not required please refer to the above company profile: bloom system pvt. ltd. bloom system is the next-generation hr service provider, collaborating with client team to offer solution in area including recruitment process outsourcing (rpo) services, executive search and selection; staff augmentatio

In [40]:
preprocessor = Preprocess()
X_train, y_train = preprocessor.fit_transform(X_train, y_train)
X_val, y_val = preprocessor.transform(X_val, y_val)
X_test, y_test = preprocessor.transform(X_test, y_test)

In [41]:
import time
from sklearn.metrics import accuracy_score, classification_report

class Models:
    def __init__(self, models=['dt', 'rf', 'xgb']):
        self.model_dict = {
            'dt': DecisionTreeClassifier(),
            'rf': RandomForestClassifier(n_jobs=-1),
            'xgb': XGBClassifier(use_label_encoder=False, eval_metric='logloss', n_jobs=-1),
            'logistic': LogisticRegression(),
            'gb': GradientBoostingClassifier(),
            'adaboost': AdaBoostClassifier(),
            'svc': SVC()
        }
        self.models = models 
        self.selected_models = {}
        for model in self.models:
            self.selected_models[model] = self.model_dict[model]
        self.isFitted = False 
    
    def fit(self, X, y):
        if pd.Series(y).nunique() > 2 and 'xgb' in self.selected_models:
            self.model_dict['xgb'].objective = 'multi:softmax'
        elif pd.Series(y).nunique == 2 and 'xgb' in self.selected_models:
            self.model_dict['xgb'].objective = 'binary:logistic'
        for name, model in self.selected_models.items():
            print(f'Training model {name}')
            start = time.time()
            model.fit(X, y)
            print(f'Took {time.time() - start}s to train')
        self.isFitted = True 
        print('Finished training all models')
    
    def evaluate(self, X_train, y_train, X_val, y_val, X_test, y_test):
        if not self.isFitted:
            raise NotImplementedError('Please fit the models first by calling the fit function')
        for name, model in self.selected_models.items():
            print(f'Evaluating model {name}')
            train_preds = model.predict(X_train)
            val_preds = model.predict(X_val)
            test_preds = model.predict(X_test)
            train_acc = accuracy_score(y_true=y_train, y_pred=train_preds)
            val_acc = accuracy_score(y_true=y_val, y_pred=val_preds)
            test_acc = accuracy_score(y_true=y_test, y_pred=test_preds)
            print(f'Accuracy on train set is {train_acc:.3%}')
            print(f'Accuracy on validation set is {val_acc:.3%}')
            print(f'Accuracy on test set is {test_acc:.3%}')
            print('Classification report for train set')
            print(classification_report(y_true=y_train, y_pred=train_preds))
            print('Classification report for validation set')
            print(classification_report(y_true=y_val, y_pred=val_preds))
            print('Classification report for test set')
            print(classification_report(y_true=y_test, y_pred=test_preds))
        print('Finished evaluating the models')

In [42]:
models = Models()
models.fit(X_train, y_train)

Training model dt
Took 47.47252082824707s to train
Training model rf
Took 18.174415111541748s to train
Training model xgb
Took 480.132283449173s to train
Finished training all models


In [43]:
models.evaluate(X_train, y_train, X_val, y_val, X_test, y_test)

Evaluating model dt
Accuracy on train set is 99.960%
Accuracy on validation set is 93.638%
Accuracy on test set is 92.952%
Classification report for train set
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       182
           1       1.00      1.00      1.00       201
           2       1.00      1.00      1.00        35
           3       1.00      1.00      1.00        19
           4       1.00      1.00      1.00        53
           5       0.99      1.00      0.99       382
           6       1.00      1.00      1.00        25
           7       1.00      1.00      1.00      1003
           8       1.00      1.00      1.00       990
           9       1.00      1.00      1.00        17
          10       1.00      1.00      1.00        72
          11       1.00      1.00      1.00       359
          12       1.00      1.00      1.00       125
          13       1.00      1.00      1.00        84
          14       1.00      1

Accuracy on train set is 99.960%
Accuracy on validation set is 83.112%
Accuracy on test set is 82.975%
Classification report for train set
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       182
           1       1.00      1.00      1.00       201
           2       1.00      1.00      1.00        35
           3       1.00      1.00      1.00        19
           4       1.00      1.00      1.00        53
           5       1.00      1.00      1.00       382
           6       1.00      1.00      1.00        25
           7       1.00      1.00      1.00      1003
           8       1.00      1.00      1.00       990
           9       1.00      1.00      1.00        17
          10       1.00      1.00      1.00        72
          11       1.00      1.00      1.00       359
          12       1.00      1.00      1.00       125
          13       1.00      1.00      1.00        84
          14       1.00      1.00      1.00       

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Accuracy on train set is 99.954%
Accuracy on validation set is 98.307%
Accuracy on test set is 97.849%
Classification report for train set
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       182
           1       1.00      1.00      1.00       201
           2       1.00      1.00      1.00        35
           3       1.00      1.00      1.00        19
           4       1.00      1.00      1.00        53
           5       1.00      1.00      1.00       382
           6       1.00      1.00      1.00        25
           7       1.00      1.00      1.00      1003
           8       1.00      1.00      1.00       990
           9       1.00      1.00      1.00        17
          10       1.00      1.00      1.00        72
          11       1.00      1.00      1.00       359
          12       1.00      1.00      1.00       125
          13       1.00      1.00      1.00        84
          14       1.00      1.00      1.00       

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [44]:
import joblib

best_model = models.selected_models['xgb']
joblib.dump(best_model, 'best_model.pkl')

['best_model.pkl']

In [45]:
loaded_model = joblib.load('./best_model.pkl')

In [46]:
def predict_new(message, preprocessor, model):
    message = preprocessor.transform([message])
    pred = model.predict(message)
    decoded_pred = preprocessor.encoder.inverse_transform(pred)[0]
    return decoded_pred

In [47]:
random_job = data.sample(n=1, random_state=1234)

In [48]:
print(random_job['jobdescription'].values[0])
print(random_job['industry'].values[0])

Job Description   Send me Jobs like this Job Roles & Responsibilities:- Abi lity to provide turnkey solution in Automation and Test Equipment Relevant experience in Defence/Space/ Process Industry will be preferred Other Requirements: Responsible for achieving the sales targets set for the region Excellent in written, verbal communication and presentation skills In depth technical knowledge in field of instrumentation Job Description: Participate in trade shows, seminars to generate leads. Prepare technically correct, competent turn-key proposals adhering to quality norms. Create and maintain ISO documents. Demonstration and technical presentation. Salary: Not Disclosed by Recruiter Industry: Construction / Engineering / Cement / Metals Functional Area: Sales , Retail , Business Development Role Category:Retail Sales Role:Sales/Business Development Manager Keyskills Business Development Desired Candidate Profile Education- UG: Any Graduate PG:Any Postgraduate Doctorate:Any Doctorate - 

In [49]:
pred = predict_new(message=random_job['jobdescription'].values[0], preprocessor=preprocessor, model=loaded_model)
print(pred)

Construction


In [50]:
recommendation_df = data[data['industry'] == pred].sort_values(by='postdate', ascending=True)
n_recommendations = 200
recommendations = recommendation_df.iloc[:n_recommendations]

In [52]:
recommendations

Unnamed: 0,company,education,industry,jobdescription,joblocation_address,jobtitle,numberofpositions,payrate,postdate,site_name,skills,experience_lower,experience_upper,job_age,Education,Skills
4905,Acetech Solutions,"UG: Any Graduate - Any Specialization, Graduat...",Construction,Job Description Send me Jobs like this Track...,"Bangalore , Mumbai , Chennai , Pune , Hyderaba...",Junior Officer Stores,8.0,"1,75,000 - 3,75,000 P.A",2015-11-21 01:33:33,,Sales,0,4,3069.0,Any,Sales
4846,sumitkumar55456@gmail.com,"UG: Any Graduate - Any Specialization, Graduat...",Construction,Job Description Send me Jobs like this Shoul...,"Bangalore , Mumbai , Chennai , Pune , Hyderaba...",Assistant Construction Manager - Freshers Also...,,Not Disclosed by Recruiter,2015-11-22 01:33:31,,Site Engineering,0,2,3068.0,Any,Site Engineering
4965,sumitkumar55456@gmail.com,"UG: Any Graduate - Any Specialization, Graduat...",Construction,Job Description Send me Jobs like this Diplo...,"Bangalore , Chennai , Hyderabad",Civil Engineers - (quantity Survey / Billing ),,Not Disclosed by Recruiter,2015-11-22 01:33:34,,Site Engineering,1,3,3068.0,Any,Site Engineering
18990,Prateek Group,UG: Any Graduate PG:MBA/PGDM Doctorate:Any Doc...,Construction,Job Description Send me Jobs like this Job D...,"NCR , NCR/Greater NCR",Corporate Communication,,Not Disclosed by Recruiter,2015-11-22 22:18:04,,Marketing,6,8,3067.0,Any,Marketing
19017,Hill International (Middle East) Ltd.,UG: B.Tech/B.E. PG:M.Tech Doctorate:Any Doctor...,Construction,Job Description Send me Jobs like this Posit...,"NCR , India , NCR",QA/QC Engineer( Civil),,Not Disclosed by Recruiter,2015-11-22 22:18:05,,Site Engineering,7,12,3067.0,B.Tech,Site Engineering
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20380,Z Star Consultancy,"UG: Graduation Not Required, B.Tech/B.E. - Any...",Construction,Job Description Send me Jobs like this The p...,"Bangalore, Mumbai, Chennai, Pune, Hyderabad, B...",Auto Cad Draftsman - Civil - Freshers,,Not Disclosed by Recruiter,2016-12-25 18:18:35,www.naukri.com,Production,0,4,2668.0,Graduation,Production
21037,TCA Consultants pvt ltd,UG: B.Tech/B.E. - Electrical PG:Post Graduatio...,Construction,Job Description Send me Jobs like this BE/B ...,"Hyderabad, Kakinada",Jr. Engineer - Electrical,,Not Disclosed by Recruiter,2017-01-03 18:19:11,www.naukri.com,Site Engineering,4,6,2659.0,B.Tech,Site Engineering
20769,Aparna Constructions,"UG: Diploma - Civil, B.Tech/B.E. - Civil PG:An...",Construction,Job Description Send me Jobs like this B.E /...,Hyderabad,Engineer (or) Assistant Engineer - Civil,,Not Disclosed by Recruiter,2017-01-07 18:18:55,www.naukri.com,Site Engineering,2,7,2655.0,Diploma,Site Engineering
21086,Myk Laticrete India Pvt Ltd,"UG: Any Graduate - Any Specialization, Graduat...",Construction,Job Description Send me Jobs like this a) In...,Hyderabad,Requirement for Channel Sales,5.0,Not Disclosed by Recruiter,2017-01-07 18:19:14,www.naukri.com,Sales,1,6,2655.0,Any,Sales
