#CS226_BigDataManagement_JobPostingsData_(Mis)Fortune-500
## This notebook is used for Visualization and Analytics.
## It provides distribution of job postings across different job locations.
## The job locations are initially for only the US states and then is expanded to countries across the world.
###Library imports, installing pyspark and mounting google drive

In [2]:
from google.colab import drive
drive.mount('/content/drive/')
!pip install pyspark
!pip install pycountry
import pycountry
import pandas as pd
import numpy
import matplotlib.pyplot as plt
from pyspark.sql import SparkSession, dataframe
import plotly.express as px
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, LongType, ArrayType
from pyspark.sql.functions import col, concat_ws, concat, when, lit

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


##Defining the Schema of the data and creating the Spark Session

In [3]:
# create sparksession
spark = SparkSession.builder.appName("CS226").config("spark.memory.offHeap.enabled","true").config("spark.memory.offHeap.size","12g").getOrCreate()

schema = StructType([StructField('dateUploaded', StructType([StructField('$date', StringType(), True)]), True),
                     StructField('orgAddress', StructType([StructField('addressLine', StringType(), True), StructField('city', StringType(), True),
                                                           StructField('companyName', StringType(), True), StructField('country', StringType(), True),
                                                           StructField('countryCode', StringType(), True), StructField('county', StringType(), True),
                                                           StructField('district', StringType(), True), StructField('formatted', StringType(), True),
                                                           StructField('houseNumber', StringType(), True), StructField('level', LongType(), True),
                                                           StructField('quarter', StringType(), True), StructField('state', StringType(), True),
                                                           StructField('street', StringType(), True)]), True),
                     StructField('orgCompany', StructType([StructField('description', StringType(), True), StructField('idInSource', StringType(), True),
                                                           StructField('ids', StructType([StructField('aarp_us', StringType(), True)]), True),
                                                           StructField('imgCover', StringType(), True), StructField('imgLogo', StringType(), True),
                                                           StructField('name', StringType(), True), StructField('nameCode', StringType(), True),
                                                           StructField('nameOrg', StringType(), True), StructField('registryID', StringType(), True)]), True),
                     StructField('orgTags', StructType([StructField('CATEGORIES', ArrayType(StringType(), True), True),
                                                        StructField('QUALIFICATIONS', ArrayType(StringType(), True), True),
                                                        StructField('WORK_TYPES', ArrayType(StringType(), True), True)]), True),
                     StructField('position', StructType([StructField('department', StringType(), True), StructField('name', StringType(), True),
                                                         StructField('workType', StringType(), True)]), True)])

###The initial Read into a spark dataframe with the options defined for dropping malformed records and parsing the data according to the schema

In [4]:
import time
start = time.time()
sdf = spark.read.option("mode", "DROPMALFORMED").schema(schema).json("/content/drive/Shareddrives/F23_CS226_DataWarehouse_18/International_Job_PostingsBig_2021/techmap-jobs-dump-2021-09.json")
end = time.time()
print(f"Time to read: {(end - start):.2f} seconds")
#Past Experimentation
#rddJSON = sc.textFile("/content/drive/Shareddrives/CS226_Dataset/International_Job_PostingsBig_2021/techmap-jobs-dump-2021-09.json")

Time to read: 10.50 seconds


#Selecting columns relevant to the project and renaming them accordingly

In [5]:
cleaned_df = sdf.withColumn("dateUploaded", col("dateUploaded.$date")).withColumn("orgAddress", concat_ws(", ", col("orgAddress.city"), col("orgAddress.state"))).withColumn(
    "position", col("position.name")).withColumn("orgCompany", col("orgCompany.nameOrg")).withColumn("orgTags", col("orgTags.CATEGORIES")[0])
cleaned_df = cleaned_df.withColumnRenamed("orgAddress", "JobLocation").withColumnRenamed("orgCompany", "HiringCompany").withColumnRenamed(
    "orgTags", "JobCategory").withColumnRenamed("position", "JobTitle").withColumnRenamed("dateUploaded", "JobDate")

cleaned_df.show(5, truncate=False)
#Past Experimentation
#firstNF_RDD = sdfRDD.mapPartitions(lambda row: row[0] if isinstance(row, (str, list, tuple, dict)) else row)

+------------------------+-----------------+-------------------------------+----------------------------+---------------------------------------+
|JobDate                 |JobLocation      |HiringCompany                  |JobCategory                 |JobTitle                               |
+------------------------+-----------------+-------------------------------+----------------------------+---------------------------------------+
|2021-09-02T05:11:20.352Z|New York, NY     |Brinks                         |Advertising/Public Relations|Account Executive Sr                   |
|2021-09-02T05:11:21.117Z|San Francisco, CA|Konica Minolta                 |Advertising/Public Relations|Global Client Executive                |
|2021-09-02T05:11:21.267Z|Stevens Point, WI|Sentry                         |Advertising/Public Relations|Retirement Plan Sales Support Analyst  |
|2021-09-02T05:11:21.688Z|Malvern, PA      |Vanguard                       |Advertising/Public Relations|Digital Channel Pro

In [6]:
cleaned_df.cache()

DataFrame[JobDate: string, JobLocation: string, HiringCompany: string, JobCategory: string, JobTitle: string]

#Count of Job Postings per company

In [7]:
start = time.time()
cleaned_df.createOrReplaceTempView('JobData')
roleBased_df = spark.sql(
'''
SELECT HiringCompany, SUM(JobCount) AS TotalJobCount
FROM (
    SELECT HiringCompany, COUNT(*) AS JobCount
    FROM JobData
    WHERE JobTitle LIKE "%Software Developer%" OR
          JobTitle LIKE "%Data Engineer%" OR
          JobTitle LIKE "%Data Analyst%" OR
          JobTitle LIKE "%Data Scientist%"
    GROUP BY HiringCompany
) AS Subquery
GROUP BY HiringCompany
ORDER BY TotalJobCount DESC;

'''
)
display(roleBased_df.toPandas())
end = time.time()
print(f"Time to query: {(end - start)/60:.2f} minutes")

Unnamed: 0,HiringCompany,TotalJobCount
0,General Motors LLC,493
1,Jobot,379
2,Revature,280
3,eFinancial Careers,215
4,Robert Half,188
...,...,...
6762,"NetSource, Inc.",1
6763,Noon -The Social Learning Platform,1
6764,JK Moving Services,1
6765,Newrest,1


Time to query: 14.95 minutes


#Count of Job Postings grouped by US-States as shown in the Choropleth

In [21]:
state_df = sdf.withColumn("dateUploaded", col("dateUploaded.$date")).withColumn("orgAddress", col("orgAddress.state")).withColumn(
    "position", col("position.name")).withColumn("orgCompany", col("orgCompany.nameOrg")).withColumn("orgTags", col("orgTags.CATEGORIES")[0])
cleaned_df = state_df.withColumnRenamed("orgAddress", "JobLocation").withColumnRenamed("orgCompany", "HiringCompany").withColumnRenamed(
    "orgTags", "JobCategory").withColumnRenamed("position", "JobTitle").withColumnRenamed("dateUploaded", "JobDate")
cleaned_df.cache()

DataFrame[JobDate: string, JobLocation: string, HiringCompany: string, JobCategory: string, JobTitle: string]

In [22]:
import plotly.express as px
geojson = px.data.gapminder()
import matplotlib as plt

start = time.time()
cleaned_df.createOrReplaceTempView('JobDataByState')
stateBased_df = spark.sql(
'''
SELECT JobLocation, SUM(JobCount) AS TotalJobCount
FROM (
    SELECT JobLocation, COUNT(*) AS JobCount
    FROM JobDataByState
    WHERE JobTitle LIKE "%Software Developer%" OR
          JobTitle LIKE "%Data Engineer%" OR
          JobTitle LIKE "%Data Analyst%" OR
          JobTitle LIKE "%Data Scientist%"
    GROUP BY JobLocation
) AS Subquery
GROUP BY JobLocation
ORDER BY TotalJobCount DESC;

'''
)
pandasStateDF = spark.createDataFrame(stateBased_df.collect()).toPandas()
display(pandasStateDF)
end = time.time()
print(f"Time to query: {end - start:.2f} seconds")

Unnamed: 0,JobLocation,TotalJobCount
0,,7908
1,TX,887
2,,886
3,CA,817
4,NY,753
...,...,...
435,Donegal,1
436,ST,1
437,Missouri,1
438,Niger,1


Time to query: 11.83 seconds


In [23]:
fig = px.choropleth(pandasStateDF.iloc[1:, :],
                    locations='JobLocation',
                    color='TotalJobCount',
                    color_continuous_scale='blues',
                    locationmode='USA-states',
                    scope='usa')
fig.update_geos(
    visible=True,
    scope="usa",
)
fig.show()

#Count of Job Postings by country across the world as shown by the Chloropleth

In [11]:
country_df = sdf.withColumn("dateUploaded", col("dateUploaded.$date")).withColumn("orgAddress", col("orgAddress.country")).withColumn(
    "position", col("position.name")).withColumn("orgCompany", col("orgCompany.nameOrg")).withColumn("orgTags", col("orgTags.CATEGORIES")[0])
cleaned_df = country_df.withColumnRenamed("orgAddress", "JobLocation").withColumnRenamed("orgCompany", "HiringCompany").withColumnRenamed(
    "orgTags", "JobCategory").withColumnRenamed("position", "JobTitle").withColumnRenamed("dateUploaded", "JobDate")
cleaned_df = cleaned_df.withColumn("JobLocation", when(col("JobLocation") == "USA", lit("US")).otherwise(col("JobLocation")))
cleaned_df.cache()

DataFrame[JobDate: string, JobLocation: string, HiringCompany: string, JobCategory: string, JobTitle: string]

In [12]:
start = time.time()
cleaned_df.createOrReplaceTempView('JobDataByCountry')
countryBased_df = spark.sql(
'''
SELECT JobLocation, SUM(JobCount) AS TotalCount
FROM (
    SELECT JobLocation, COUNT(*) AS JobCount
    FROM JobDataByCountry
    WHERE JobTitle LIKE "%Software Developer%" OR
          JobTitle LIKE "%Data Engineer%" OR
          JobTitle LIKE "%Data Analyst%" OR
          JobTitle LIKE "%Data Scientist%"
    GROUP BY JobLocation
) AS Subquery
GROUP BY JobLocation
ORDER BY TotalCount DESC;

'''
)
pandasCountryDF = spark.createDataFrame(countryBased_df.collect()).toPandas()
display(pandasCountryDF)
end = time.time()
print(f"Time to query: {end - start:.2f} seconds")

Unnamed: 0,JobLocation,TotalCount
0,US,7106
1,,4620
2,England,1833
3,DE,1013
4,us,788
...,...,...
165,Norge,1
166,Italy,1
167,România,1
168,RJ,1


Time to query: 832.30 seconds


In [13]:
pandasCountryDF["JobLocation"].unique()

array(['US', '', 'England', 'DE', 'us', 'Australia', 'UK', 'CA',
       'New Zealand', 'de', 'IE', 'GB', 'United Kingdom', ' CA',
       ' Canada', ' Österreich', ' Belgium', ' Schweiz',
       ' United Kingdom', ' WA', ' Ireland', 'NL', ' TX', 'Scotland',
       ' NY', 'Việt Nam', 'SE', 'China', 'Singapore', 'CH', 'AT',
       'Deutschland', ' Luxembourg', ' Costa Rica', ' Lithuania', 'FR',
       'United States', ' Estonia', ' NC', 'IT', ' Portugal', 'ES', 'GR',
       'Czech Republic', ' Cyprus', ' Finland', ' IL', 'Northern Ireland',
       ' VA', ' United States', 'Wales', ' FL', ' Latvia', ' Slovenia',
       ' Slovakia', ' Guatemala', ' GA', 'Germany', ' MA', ' Bulgaria',
       ' Pakistan', ' Uruguay', ' South Africa', ' MD', ' NJ', ' Hungary',
       ' Hong Kong SAR', ' Kenya', ' AZ', ' PA', ' México', ' Venezuela',
       ' Colombia', ' OR', 'Hong Kong', ' New Zealand', ' Argentina',
       ' India', ' CO', ' OH', ' DC', ' MI', 'Österreich', ' Panamá',
       ' Bolivia', ' TN

In [20]:
# Mapping country names to ISO Alpha-3 codes
import copy
plotCountryDF = copy.deepcopy(pandasCountryDF)

def standardize_country_name(country_name):
    countryCodeMap = {"england":"GBR", "hong kong sar": "HKG", "deutschland" : "DEU", "tx" : "USA", "ny" : "USA", "việt nam" : "VNM", "northern ireland" : "GBR", "nj" : "USA",
                      "fl" : "USA", "méxico" : "MEX", "or" : "USA", "oh" : "USA", "dc" : "USA", "mi" : "USA", "wi" : "USA", "uk" : "GBR", "danmark" : "DNK", "italia" : "ITL",
                      "jerman" : "DEU", "österreich" : "AUT", "wa" : "USA", "wales" : "GBR", "texas metropolitan area" : "USA"}
    if not country_name:
      return ""
    country_name = country_name.lower().strip()
    if country_name in countryCodeMap:
      country_name = countryCodeMap[country_name]
    try:
        mapped_country = pycountry.countries.lookup(country_name).alpha_3
        return mapped_country
    except:
        return country_name

# Apply the mapping
plotCountryDF['CountryCode'] = plotCountryDF['JobLocation'].apply(standardize_country_name)

# Filter out rows with invalid entries
filtered_df = plotCountryDF[plotCountryDF['CountryCode'].notnull()]
#display(filtered_df)

# Group by 'CountryCode' and sum up 'TotalCount'
summed_df = plotCountryDF.groupby('CountryCode')['TotalCount'].sum().reset_index()

# Rename the columns for clarity
summed_df.columns = ['CountryCode', 'TotalCount']
display(summed_df)
# Plotly Express choropleth map
fig = px.choropleth(summed_df,
                    locations='CountryCode',
                    color='TotalCount',
                    color_continuous_scale='ylgnbu',
                    locationmode="ISO-3",
                    title='Total Job Counts by Country')

fig.show()

Unnamed: 0,CountryCode,TotalCount
0,,4629
1,ALB,3
2,ARG,17
3,AUS,681
4,AUT,172
...,...,...
107,česko,3
108,ประเทศไทย,1
109,中国,2
110,台灣,5
