In [40]:
import pandas as pd
import numpy as np
from tqdm import tqdm

import os
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import nltk.corpus # sample text for performing tokenization
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/yunjaecho/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
def parsing(df):
    columns = ["company_profile","description","requirements","benefits"]
    one_hot_encoding_columns = ['employment_type', "required_education", "required_experience","function.", "industry"]
    length = []
    for column in columns:
        df[(str(column) + "_length")] = df[column].apply(lambda x: len(x) if x == x else 0)
    df = pd.get_dummies(df, columns = one_hot_encoding_columns)
    return df.iloc[:, 9:]



# text length

In [22]:
def length_counter(df,columns = ["company_profile","description","requirements","benefits"]):
    length = []
    for column in columns:
        df[(str(column) + "_length")] = df[column].apply(lambda x: len(x) if x == x else 0)
    return df

In [23]:
jobs = pd.read_csv("job_training_data.csv")
jobs = length_counter(jobs)

In [44]:
nf_jobs = jobs[jobs["fraudulent"] == 0]
f_jobs = jobs[jobs["fraudulent"] == 1]

# frequency selection

In [58]:
stop_words = [
    ".",
    'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 're',
 's',
 'same',
 'shan',
 "shan't",
 'she',
 "she's",
 'should',
 "should've",
 'shouldn',
 "shouldn't",
 'so',
 'some',
 'such',
 't',
 'than',
 'that',
 "that'll",
 'the',
 'their',
 'theirs',
 'them',
 'themselves',
 'then',
 'there',
 'these',
 'they',
 'this',
 'those',
 'through',
 'to',
 'too',
 'under',
 'until',
 'up',
 've',
 'very',
 'was',
 'wasn',
 "wasn't",
 'we',
 'were',
 'weren',
 "weren't",
 'what',
 'when',
 'where',
 'which',
 'while',
 'who',
 'whom',
 'why',
 'will',
 'with',
 'won',
 "won't",
 'wouldn',
 "wouldn't",
 'y',
 'you',
 "you'd",
 "you'll",
 "you're",
 "you've",
 'your',
 'yours',
 'yourself',
 'yourselves',"us"]


In [83]:
def strips(x):
    return (x.strip().strip("'").strip('"').strip(",").strip("(").strip(")").strip(".").strip(";").strip(":"))

def merge_and_tokenize(df,column = "company_profile",k = 0):
    merged_text = []
    text_dict = {}
    parsed_row = []
    for sentence in df[column]:
        if sentence == sentence:
            words = sentence.lower().split()
            words = [strips(w) for w in words if strips(w) not in stop_words]
            for word in words: # remove stop words
                if "url" in str(word):
                    words.remove(word)

            parsed_row.append(words)
            merged_text.extend(words)
        else:
            parsed_row.append(np.nan)
            
    fdist = FreqDist(merged_text)
    
    keys = list(fdist.keys())
    
    for key in keys:
        if fdist[key] <= k:
            del fdist[key]
    return fdist,parsed_row

def parsed_frequency(freq_dic,parsed_text):
    parsed_dict = freq_dic.copy()
    freq_list = []
    
    for text in tqdm(parsed_text):
        parsed_dict = freq_dic.copy()
        if text == text:
            for word in text:
                if word in parsed_dict:
                    parsed_dict[word] += 1/len(text)
        else:
            freq_list.append(freq_dic)
            continue
        
        freq_list.append(parsed_dict)
        
    return freq_list

In [89]:
nf_company_dict

FreqDist({'team': 2075, 'work': 2022, 'company': 1988, 'services': 1974, 'people': 1856, 'business': 1513, 'solutions': 1499, 'new': 1419, 'companies': 1265, '&amp': 1240, ...})

In [88]:
f_company_dict

FreqDist({'business': 59, 'candidates': 59, 'recruiting': 49, 'bonus': 49, 'services': 48, 'solutions': 47, 'experience': 44, 'products': 41, 'company': 40, 'financing': 40, ...})

In [87]:
nf_company_dict,nf_parsed_company = merge_and_tokenize(nf_jobs,"company_profile",2)

f_company_dict,f_parsed_company = merge_and_tokenize(f_jobs,"company_profile",2)
#company_text_freq = parsed_frequency(company_dict,parsed_company)

In [None]:
description_dict,parsed_description = merge_and_tokenize(jobs,"description",2)
description_text_freq = parsed_frequency(description_dict,parsed_description) # list of freq for each row

In [None]:
requirements_dict,parsed_requirements = merge_and_tokenize(jobs,"requirements",2)
requirements_text_freq = parsed_frequency(requirements_dict,parsed_requirements)

In [31]:
benefits_dict,parsed_benefits = merge_and_tokenize(jobs,"benefits",2)
benefits_text_freq = parsed_frequency(benefits_dict,parsed_benefits)

NameError: name 'stop_words' is not defined

In [70]:
s = ["a","a","b"]
s.remove("a")
s

['a', 'b']