In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
from gensim.models import Word2Vec
import datetime
from dateutil.relativedelta import relativedelta
from collections import Counter



In [2]:
# Load the training data from "Training_set.csv" (path is relative to the working directory).
train=pd.read_csv("Training_set.csv")

In [3]:
# Load the test data from "Test_set.csv" (path is relative to the working directory).
test=pd.read_csv("Test_set.csv")

In [4]:
# Show the first training row to see which columns are available.
train.iloc[0]

Unnamed: 0                                                                              0
user_id                                                                             98304
job_id                                                                               3163
employer_id                                                                           208
job_opening_date                                                               2017-10-01
job_closing_date                                                               2018-11-19
job_description                         <h2 style="margin-left: 0px !important;">Overv...
job_type                                                                                1
job_title                                                      UK Sales Graduate Job 2019
employer_title                                                                        P&G
employer_description                    <p>Nearly five billion times a day, P&amp;G br...
employer_s

In [5]:
def get_job_info(df):
    """Build one free-text string per row from the job and employer text fields.

    For every row the six columns below are concatenated in this order, each
    followed by a single space (so every result ends with a space):
    ``job_title``, ``job_sector_title``, ``job_description``,
    ``employer_title``, ``employer_sector_title``, ``employer_description``.

    Missing values (NaN / None) contribute an empty string and non-string
    values are converted with ``str``; plain ``+`` concatenation would raise
    ``TypeError`` on the first such cell.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the six text columns listed above.

    Returns
    -------
    list of str
        One concatenated string per row, in row order.

    Raises
    ------
    KeyError
        If any of the six columns is absent from ``df``.
    """
    text_columns = (
        "job_title",
        "job_sector_title",
        "job_description",
        "employer_title",
        "employer_sector_title",
        "employer_description",
    )

    def as_text(value):
        # Missing cells are read back from CSV as NaN (a float), not as "".
        return "" if pd.isna(value) else str(value)

    # Pull each column out once instead of materialising a Series per row
    # with df.iloc[i]; zip() then walks the rows in order.
    column_values = [df[name].tolist() for name in text_columns]
    return [
        "".join(as_text(value) + " " for value in row)
        for row in zip(*column_values)
    ]

In [6]:
# One concatenated text string per training row (see get_job_info).
train_job_info=get_job_info(train)

In [7]:
def clean(x):
    """Turn raw HTML-ish job texts into lists of lowercase word tokens.

    Processing per text:

    1. HTML tags (``<...>``, possibly spanning lines) are replaced by a space,
       so words on either side of a tag boundary such as ``</li><li>`` stay
       separate. Words split by inline markup (``<b>w</b>ord``) are therefore
       split as well.
    2. A fixed set of HTML entities and punctuation marks is either deleted
       (e.g. ``&#39;s``, ``,``, ``!``, ``:``, parentheses) or turned into a
       space (e.g. ``&nbsp;``, ``.``, ``?``, ``/``). ``&amp;`` collapses to
       ``&`` so names like ``P&G`` survive as one token.
    3. The text is split on any run of whitespace (spaces, tabs, ``\\r``,
       ``\\n``); whitespace is never deleted outright, so neighbouring words
       are not glued together.
    4. Tokens are lowercased and tokens of length 0 or 1 are dropped. This
       also discards free-standing separators such as ``-`` or ``/``.

    Parameters
    ----------
    x : iterable of str
        Raw texts.

    Returns
    -------
    list of list of str
        One token list per input text, in input order.
    """
    tag_pattern = re.compile('<.*?>', flags=re.DOTALL)

    # Applied in order. "&#39;s" must come before "&#39;" so possessives are
    # dropped as a whole rather than leaving a stray "s" attached to the word.
    replacements = (
        ("...", " "),
        ("&#39;s", ""),
        (":", ""),
        ("&eacute;", "e"),
        ("&lsquo;", ""),
        ("&#39;", ""),
        ("&nbsp;", " "),
        ("amp;", ""),
        ("&middot;", ""),
        (".", " "),
        (",", ""),
        ("?", " "),
        ("!", ""),
        ("&rsquo;", ""),
        ("&ndash;", " "),
        ("(", ""),
        (")", ""),
        ("/", " "),
    )

    res = []
    for content in x:
        # Strip markup first and leave a space behind so that text from
        # adjacent elements does not merge into a single token.
        content = tag_pattern.sub(" ", content)
        for old, new in replacements:
            content = content.replace(old, new)
        # split() without arguments treats every whitespace run as one
        # separator, which covers tabs and all line-break variants.
        res.append([token.lower() for token in content.split() if len(token) > 1])
    return res

In [8]:
# Tokenize every training text into a list of lowercase words (see clean).
cleaned_train_job_info=clean(train_job_info)

In [9]:
# Spot-check the tokens produced for the text at position 100.
cleaned_train_job_info[100]

['graduate',
 'consultant',
 'climate',
 'change',
 'cities',
 'technology',
 'consulting',
 'project',
 'management',
 'at',
 'arup',
 'we',
 'are',
 'passionate',
 'about',
 'designing',
 'and',
 'delivering',
 'ground-breaking',
 'work',
 'in',
 'infrastructure',
 'building',
 'design',
 'and',
 'specialist',
 'technical',
 'services',
 'independent',
 'in',
 'ownership',
 'and',
 'spirit',
 'we',
 'are',
 'global',
 'firm',
 'of',
 'designers',
 'planners',
 'engineers',
 'consultants',
 'and',
 'technical',
 'experts',
 'using',
 'our',
 'skills',
 'to',
 'help',
 'shape',
 'better',
 'world',
 'our',
 'energy',
 'climate',
 'change',
 'and',
 'cities',
 'consulting',
 'team',
 'in',
 'london',
 'is',
 'multi-disciplinary',
 'team',
 'of',
 'engineers',
 'policy',
 'experts',
 'and',
 'environmental',
 'scientists',
 'who',
 'focus',
 'on',
 'the',
 'challenges',
 'and',
 'opportunities',
 'of',
 'energy',
 'climate',
 'change',
 'and',
 'cities',
 'our',
 'areas',
 'of',
 'busine

In [10]:
def remove_stopwords(x, stopwords_path='stopwords.txt'):
    """Drop stop words from every token list.

    Parameters
    ----------
    x : iterable of iterable of str
        Token lists, one per document.
    stopwords_path : str, optional
        Path of a text file holding whitespace-separated stop words.
        Defaults to ``'stopwords.txt'`` in the current working directory.

    Returns
    -------
    list of list of str
        New token lists with the same order and the same number of documents
        as ``x``; the input lists are not modified.

    Raises
    ------
    OSError
        If the stop-word file cannot be opened (e.g. ``FileNotFoundError``).
    """
    with open(stopwords_path) as stopfile:
        # A set gives constant-time membership tests; a list would be scanned
        # in full for every single token.
        stop = set(stopfile.read().split())
    return [[word for word in job if word not in stop] for job in x]

In [11]:
# Filter stop words out of the tokenized training texts (reads stopwords.txt).
train_clean=remove_stopwords(cleaned_train_job_info)

In [12]:
# Spot-check the remaining tokens of the first training text.
train_clean[0]

['uk',
 'sales',
 'graduate',
 'job',
 '2019',
 'sales',
 'commercial',
 'overview',
 'want',
 'work',
 'brands',
 'millions',
 'consumers',
 'live',
 'without',
 'want',
 'work',
 'like-minded',
 'talented',
 'colleagues',
 'motivated',
 'challenging',
 'problems',
 'real',
 'responsibility',
 'winning',
 'perfect',
 'opportunity',
 'apply',
 'now',
 'join',
 'p&g',
 'sales',
 'team',
 'expect',
 'working',
 'sales',
 'p&g',
 'means',
 'getting',
 'multi',
 'million',
 'pound',
 'business',
 'budget',
 'lead',
 'day',
 'means',
 'working',
 'great',
 'brands',
 'like',
 'pampers',
 'fairy',
 'gillette',
 'uk',
 'biggest',
 'grocery',
 'retailers',
 'will',
 'key',
 'contact',
 'business',
 'retailer',
 'responsible',
 'building',
 'maintaining',
 'relationship',
 'will',
 'build',
 'design',
 'ideas',
 'strategies',
 'using',
 'analytics',
 'shopper',
 'insights',
 'use',
 'relationship',
 'understanding',
 'market',
 'sell',
 'negotiate',
 'execute',
 'ideas',
 'full',
 'time',
 'hir

In [13]:
def get_occurence(x):
    """Count how often each token appears within each document.

    Parameters
    ----------
    x : iterable of iterable
        Token sequences, one per document.

    Returns
    -------
    list of dict
        For every document, a plain ``dict`` mapping token -> number of
        occurrences in that document, in input order.
    """
    return [dict(Counter(tokens)) for tokens in x]

In [14]:
# Per-document token counts for the stop-word-filtered training texts.
occurences=get_occurence(train_clean)

In [15]:
# Spot-check the token counts of the first training text.
occurences[0]

{'uk': 2,
 'sales': 8,
 'graduate': 2,
 'job': 3,
 '2019': 1,
 'commercial': 4,
 'overview': 1,
 'want': 3,
 'work': 6,
 'brands': 3,
 'millions': 1,
 'consumers': 1,
 'live': 1,
 'without': 1,
 'like-minded': 1,
 'talented': 1,
 'colleagues': 1,
 'motivated': 1,
 'challenging': 2,
 'problems': 1,
 'real': 3,
 'responsibility': 2,
 'winning': 1,
 'perfect': 1,
 'opportunity': 2,
 'apply': 3,
 'now': 1,
 'join': 1,
 'p&g': 13,
 'team': 2,
 'expect': 1,
 'working': 3,
 'means': 2,
 'getting': 1,
 'multi': 1,
 'million': 1,
 'pound': 1,
 'business': 5,
 'budget': 1,
 'lead': 1,
 'day': 4,
 'great': 2,
 'like': 1,
 'pampers': 1,
 'fairy': 2,
 'gillette': 2,
 'biggest': 1,
 'grocery': 1,
 'retailers': 1,
 'will': 6,
 'key': 1,
 'contact': 1,
 'retailer': 2,
 'responsible': 1,
 'building': 2,
 'maintaining': 1,
 'relationship': 2,
 'build': 1,
 'design': 1,
 'ideas': 3,
 'strategies': 1,
 'using': 1,
 'analytics': 1,
 'shopper': 1,
 'insights': 1,
 'use': 1,
 'understanding': 1,
 'market': 1

In [None]:
def get_vector(model, occurences, jobs):
    for job in jobs:
        for word in job:
            try:
                