In [174]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

# Create the SparkSession (or reuse one that already exists) under the
# application name "Fake-job-prediction"; the CSV read below goes through it.
spark = SparkSession.builder.appName("Fake-job-prediction").getOrCreate()

In [175]:
%%html
<style>
div.output_area pre {
    white-space: pre;
}
</style>

In [215]:
# Load the job postings CSV: the first line is the header, the double quote is
# used both as the quote and as the escape character, and fields are
# comma-separated. No schema is supplied, so the columns are not typed here.
df = (
    spark.read.format("csv")
    .options(header="true", escape="\"", quote="\"")
    .load("fake_job_postings.csv", sep=",")
)

# Preview the first 100 rows.
df.show(100)

+------+--------------------+--------------------+--------------------+-------------+--------------------+--------------------+--------------------+--------------------+-------------+----------------+-------------+---------------+-------------------+--------------------+--------------------+--------------------+----------+
|job_id|               title|            location|          department| salary_range|     company_profile|         description|        requirements|            benefits|telecommuting|has_company_logo|has_questions|employment_type|required_experience|  required_education|            industry|            function|fraudulent|
+------+--------------------+--------------------+--------------------+-------------+--------------------+--------------------+--------------------+--------------------+-------------+----------------+-------------+---------------+-------------------+--------------------+--------------------+--------------------+----------+
|     1|    Marketing Int

In [216]:
# List the (column name, type) pairs of the loaded frame to check how the CSV
# columns were typed.
df.dtypes

[('job_id', 'string'),
 ('title', 'string'),
 ('location', 'string'),
 ('department', 'string'),
 ('salary_range', 'string'),
 ('company_profile', 'string'),
 ('description', 'string'),
 ('requirements', 'string'),
 ('benefits', 'string'),
 ('telecommuting', 'string'),
 ('has_company_logo', 'string'),
 ('has_questions', 'string'),
 ('employment_type', 'string'),
 ('required_experience', 'string'),
 ('required_education', 'string'),
 ('industry', 'string'),
 ('function', 'string'),
 ('fraudulent', 'string')]

In [217]:
from pyspark.sql.types import StringType

def get_country(location):
    """Return the lower-cased country token of a location string.

    The country is taken to be the text before the first comma.

    Args:
        location: Raw location text, or None.

    Returns:
        The first comma-separated token, stripped of surrounding whitespace
        and lower-cased, or None when ``location`` is None or that token is
        blank.
    """
    if location is None:
        return None
    # Strip the token itself (not just the whole string) so that
    # "US , NY" yields "us" rather than "us ".
    country = location.split(",")[0].strip().lower()
    # Report a blank country as missing instead of as an empty string.
    return country or None

def get_state(location):
    """Return the lower-cased state token of a location string.

    The state is taken to be the text between the first and second comma.

    Args:
        location: Raw location text, or None.

    Returns:
        The second comma-separated token, stripped of surrounding whitespace
        and lower-cased, or None when ``location`` is None, has no second
        token, or that token is blank.
    """
    if location is None:
        return None
    parts = location.split(",")
    if len(parts) < 2:
        return None
    # Strip the token so "US, NY" yields "ny" rather than " ny".
    state = parts[1].strip().lower()
    # An empty token ("US,,") and a whitespace-only token ("US, ,") are both
    # reported as missing.
    return state or None

# Wrap the two location parsers as Spark UDFs that return strings.
udf_country = F.udf(get_country, returnType=StringType())
udf_state = F.udf(get_state, returnType=StringType())


In [218]:
# Derive the country and state columns from the location column, then drop
# the source column. Due to the poor form of the location column, country and
# state are the only useful columns we can get.
df = (
    df.withColumn("country", udf_country(F.col('location')))
    .withColumn("state", udf_state(F.col('location')))
    .drop(F.col('location'))
)

# Preview the first 100 rows of the reshaped frame.
df.show(100)

+------+--------------------+--------------------+-------------+--------------------+--------------------+--------------------+--------------------+-------------+----------------+-------------+---------------+-------------------+--------------------+--------------------+--------------------+----------+-------+-----+
|job_id|               title|          department| salary_range|     company_profile|         description|        requirements|            benefits|telecommuting|has_company_logo|has_questions|employment_type|required_experience|  required_education|            industry|            function|fraudulent|country|state|
+------+--------------------+--------------------+-------------+--------------------+--------------------+--------------------+--------------------+-------------+----------------+-------------+---------------+-------------------+--------------------+--------------------+--------------------+----------+-------+-----+
|     1|    Marketing Intern|           Market

## Salary ranges may be quoted in different currencies, so we look up the currency for each country

In [223]:
# Count the distinct values in the country column.
df.select("country").distinct().count()

91

In [224]:
from bs4 import BeautifulSoup
import requests

# Page that is scraped for the country-code / currency-code table.
URL = "https://unece.org/fileadmin/DAM/cefact/recommendations/bkup_htm/cocucod.htm"

# A timeout keeps the cell from hanging indefinitely on an unresponsive server.
r = requests.get(URL, timeout=30)
# Stop here on an HTTP error instead of going on to parse an error page.
r.raise_for_status()

# NOTE(review): no parser is named, so BeautifulSoup uses whichever parser is
# installed; confirm that any position-dependent extraction downstream does
# not depend on that choice.
soup = BeautifulSoup(r.content)

In [316]:
import warnings

# Collect every <font> tag that carries exactly these attributes.
info = soup.find_all("font", attrs={'color':'#000000','size':'1','face':'Verdana'})

# Only the first MAX_CELLS matches are scanned. Slicing, rather than indexing
# up to a fixed bound, avoids an IndexError when the page yields fewer matches.
# NOTE(review): 1526 is specific to the page layout -- confirm it still fits.
MAX_CELLS = 1526
cell_texts = [tag.text for tag in info[:MAX_CELLS]]

# Cells of exactly two characters are taken as country codes.
country_codes = [text.lower() for text in cell_texts if len(text) == 2]

# Cells of exactly three characters that are upper-case and not numeric are
# taken as currency codes.
currency_codes = [text.lower() for text in cell_texts
                  if len(text) == 3
                  and not text.isnumeric()
                  and text.isupper()]

#There are three countries with no official currency in this table, so we have to drop them.
remove_list = ['gs','ps','aq']

country_codes = [x for x in country_codes if x not in remove_list]

# zip() pairs the two lists by position and silently stops at the shorter one,
# so a length mismatch means some countries may be paired with the wrong
# currency. Surface that instead of hiding it.
if len(country_codes) != len(currency_codes):
    warnings.warn(
        "country/currency code lists differ in length (%d vs %d); "
        "pairs may be misaligned" % (len(country_codes), len(currency_codes))
    )

# (country_code, currency_code) pairs, both lower-cased.
CC = list(zip(country_codes,currency_codes))

len(CC)

250

In [331]:
def get_currencycode(country, pairs=None):
    """Look up the currency code paired with a country code.

    Args:
        country: Country code to look up; may be None.
        pairs: Optional iterable of ``(country_code, currency_code)`` tuples
            to search. Defaults to the module-level ``CC`` list.

    Returns:
        The currency code of the first pair whose country code equals
        ``country``, or None when no pair matches.
    """
    if pairs is None:
        pairs = CC
    # An unknown or missing country yields None rather than indexing one past
    # the end of the list, which would raise IndexError.
    return next((currency for code, currency in pairs if code == country), None)
    

# Wrap the currency lookup as a Spark UDF that returns strings.
udf_currency = F.udf(get_currencycode, returnType=StringType())

# Add a salary_currency column computed from each row's country column.
salary_currency_col = udf_currency(F.col('country'))
df = df.withColumn('salary_currency', salary_currency_col)

In [332]:
from pyspark.sql.types import FloatType

def get_minRange(salary):
    """Return the lower bound of a "min-max" salary string as a float.

    Args:
        salary: Salary range text such as "20000-28000", or None.

    Returns:
        The text before the first "-" converted to float, or None when
        ``salary`` is None or that text is not a number (e.g. "Oct-15").
    """
    if salary is None:
        return None
    lower = salary.split("-")[0]
    try:
        return float(lower)
    except ValueError:
        # A non-numeric token is reported as missing instead of raising,
        # which would otherwise fail the whole Spark job.
        return None

def get_maxRange(salary):
    """Return the upper bound of a "min-max" salary string as a float.

    Args:
        salary: Salary range text such as "20000-28000", or None.

    Returns:
        The text between the first and second "-" converted to float, or
        None when ``salary`` is None, contains no "-", or that text is not a
        number (e.g. "10-Nov").
    """
    if salary is None:
        return None
    parts = salary.split("-")
    # A value without a "-" has no upper bound to report.
    if len(parts) < 2:
        return None
    try:
        return float(parts[1])
    except ValueError:
        # A non-numeric token is reported as missing instead of raising,
        # which would otherwise fail the whole Spark job.
        return None

# Wrap the two salary parsers as Spark UDFs that return floats.
udf_minRange = F.udf(get_minRange, returnType=FloatType())
udf_maxRange = F.udf(get_maxRange, returnType=FloatType())

# Split salary_range into numeric salary_min / salary_max columns, then drop
# the original text column.
df = (
    df.withColumn('salary_min', udf_minRange(F.col('salary_range')))
    .withColumn('salary_max', udf_maxRange(F.col('salary_range')))
    .drop(F.col('salary_range'))
)

# Preview the result (show() with no arguments prints the first 20 rows).
df.show()

+------+--------------------+----------+--------------------+--------------------+--------------------+--------------------+-------------+----------------+-------------+---------------+-------------------+--------------------+--------------------+--------------------+----------+-------+-----+---------------+----------+----------+
|job_id|               title|department|     company_profile|         description|        requirements|            benefits|telecommuting|has_company_logo|has_questions|employment_type|required_experience|  required_education|            industry|            function|fraudulent|country|state|salary_currency|salary_min|salary_max|
+------+--------------------+----------+--------------------+--------------------+--------------------+--------------------+-------------+----------------+-------------+---------------+-------------------+--------------------+--------------------+--------------------+----------+-------+-----+---------------+----------+----------+
|   