In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Loading in Dataset

In [7]:
# from google.colab import drive
# drive.mount('/content/drive')
# !ls /content/drive/MyDrive/BT4012
# url = '/content/drive/MyDrive/BT4012/fake_job_postings.csv'

# import from github repo
url = 'https://raw.githubusercontent.com/LordZhiHao/BT4012_Fraud_Analytics_Project/main/fake_job_postings.csv'

# read data
data = pd.read_csv(url)

# Data Preprocessing

In [None]:
df = data.drop(["job_id",],axis=1)

In [None]:
df['location'] = df['location'].fillna('na, unknown')
df['country'] = df['location'].apply(lambda x : x.strip()[:2])
category_counts = df['country'].value_counts()
categories_to_replace = category_counts[category_counts <= 10].index
df['country'] = df['country'].replace(categories_to_replace, 'other')

In [None]:
df['country'].nunique()

50

In [None]:
df['city'] = df['location'].apply(lambda x : x.split(',')[-1])
category_counts = df['city'].value_counts()
categories_to_replace = category_counts[category_counts <= 10].index
df['city'] = df['city'].replace(categories_to_replace, 'other')

In [None]:
df['city'].nunique()

230

In [None]:
# to concat sections below into one column
df['company_profile'] = df['company_profile'].fillna('')
df['description'] = df['description'].fillna('')
df['requirements'] = df['requirements'].fillna('')
df['benefits'] = df['benefits'].fillna('')

# to do ohe later
df['employment_type'] = df['employment_type'].fillna('na')
df['required_experience'] = df['required_experience'].fillna('na')
df['required_education'] = df['required_education'].fillna('Unspecified')

In [None]:
df["industry"] = df["industry"].fillna('na')
category_counts = df['industry'].value_counts()
categories_to_replace = category_counts[category_counts <= 10].index
df['industry'] = df['industry'].replace(categories_to_replace, 'other')

In [None]:
df["function"] = df["function"].fillna('na')
category_counts = df['function'].value_counts()
categories_to_replace = category_counts[category_counts <= 10].index
df['function'] = df['function'].replace(categories_to_replace, 'other')

In [None]:
# department, salary_range, 0-0
df["salary_range"] = df["salary_range"].fillna('na').apply(lambda x: 0 if x == 'na' else 1)

In [None]:
df["department"] = df["department"].fillna('na')
category_counts = df['department'].value_counts()
categories_to_replace = category_counts[category_counts <= 10].index
df['department'] = df['department'].replace(categories_to_replace, 'other')

In [None]:
df['texts'] = df['title'] + ' ' + df['company_profile'] + ' ' + df['description'] + ' ' + df['requirements'] + ' ' + df['benefits']

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17880 entries, 0 to 17879
Data columns (total 20 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   title                17880 non-null  object
 1   location             17880 non-null  object
 2   department           17880 non-null  object
 3   salary_range         17880 non-null  int64 
 4   company_profile      17880 non-null  object
 5   description          17880 non-null  object
 6   requirements         17880 non-null  object
 7   benefits             17880 non-null  object
 8   telecommuting        17880 non-null  int64 
 9   has_company_logo     17880 non-null  int64 
 10  has_questions        17880 non-null  int64 
 11  employment_type      17880 non-null  object
 12  required_experience  17880 non-null  object
 13  required_education   17880 non-null  object
 14  industry             17880 non-null  object
 15  function             17880 non-null  object
 16  frau

In [None]:
# do ohe: location, department, employment_type, required_experience, required_education, industry, function
# city, country, department, emp_type, req_ex,req_edu,industry,function
to_ohe_columns = ['country', 'city', 'department', 'employment_type', 'industry', 'function','required_experience','required_education']
df = pd.get_dummies(df, columns=to_ohe_columns)

# Perform word2vec & flattening out

In [10]:
import gensim
from gensim.models import word2vec
from gensim.models.word2vec import Word2Vec
import spacy
import string

In [None]:
# import gensim.downloader as api
# wv = api.load('glove-wiki-gigaword-50')
# wv.save('/content/drive/MyDrive/BT4012/glove-wiki-gigaword-50.kv')

In [11]:
from gensim.models import KeyedVectors
wv = KeyedVectors.load('/content/drive/MyDrive/BT4012/glove-wiki-gigaword-50.kv')

FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/BT4012/glove-wiki-gigaword-50.kv'

In [None]:
def sent_vec(sent):
    vector_size = wv.vector_size
    wv_res = np.zeros(vector_size)
    # print(wv_res)
    ctr = 1
    for w in sent:
        if w in wv:
            ctr += 1
            wv_res += wv[w]
    wv_res = wv_res/ctr
    return wv_res

In [None]:
nlp = spacy.load("en_core_web_sm")
stop_words = nlp.Defaults.stop_words
punctuations = string.punctuation

def spacy_tokenizer(sentence):
    # Creating our token object, which is used to create documents with linguistic annotations.
    doc = nlp(sentence)



    # print(doc)
    # print(type(doc))

    # Lemmatizing each token and converting each token into lowercase
    mytokens = [ word.lemma_.lower().strip() for word in doc ]

    # print(mytokens)

    # Removing stop words
    mytokens = [ word for word in mytokens if word not in stop_words and word not in punctuations ]

    # return preprocessed list of tokens
    return mytokens

In [None]:
df['tokens'] = df['texts'].apply(spacy_tokenizer)

In [None]:
df['vec'] = df['tokens'].apply(sent_vec)

In [None]:
df['vec'][0]

array([ 1.36916772e-01,  2.10944164e-01, -2.94024886e-03,  1.01867614e-01,
        1.41162147e-01, -7.47642205e-02, -5.93363908e-01, -2.33841143e-01,
        1.27959302e-01,  4.33842794e-02, -1.22974486e-01,  1.08006433e-01,
       -1.22143874e-01,  5.62082449e-02,  2.34243027e-01, -9.23100792e-03,
        7.49562187e-04,  1.18035723e-01,  4.05803668e-02, -2.33993228e-01,
        3.46334604e-01,  1.31203658e-01,  8.97378732e-03,  2.09037828e-01,
        2.50926664e-02, -9.48967772e-01, -2.60836187e-01, -1.45637211e-01,
        8.67212701e-02, -9.60440547e-02,  2.36396102e+00,  1.37084405e-01,
       -9.85044479e-02, -4.16008639e-01, -1.30632072e-01,  1.39579180e-01,
       -1.34021036e-01,  1.90052007e-01,  9.40797058e-02, -2.36223188e-01,
        1.32741970e-01,  1.17731370e-02, -1.36504664e-01,  1.23594398e-01,
        4.10689708e-02,  1.01238713e-01, -7.35612548e-02,  1.49082636e-01,
        1.33786493e-02,  3.24865197e-01])

In [None]:
columns_to_drop = ['title', 'company_profile', 'description', 'requirements', 'benefits',
                   'required_experience', 'required_education', 'texts', 'tokens','country','city','location']

df = df.drop(columns=columns_to_drop, axis=1, errors='ignore')

In [None]:
vector_df = pd.DataFrame(df['vec'].tolist())
df = pd.concat([df.drop('vec', axis=1), vector_df], axis=1)
df.columns = [str(col) for col in df.columns]

In [None]:
df.to_csv('/content/drive/MyDrive/BT4012/ohe_encoded_word2vec.csv', sep='\t', encoding='utf-8',index=False)