In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType
from pyspark.sql.functions import explode, col, count, collect_list, array
from pyspark.sql.window import Window
from pyspark.sql.types import StructType, StructField, StringType
from pyspark.sql.functions import lower
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

# Reuse the active Spark session if there is one; otherwise build a new one
# whose application name is "Assignment".
spark = (
    SparkSession.builder
    .appName("Assignment")
    .getOrCreate()
)

In [0]:
#Read file skill2vec_50K_csv.gz
filepath = "dbfs:/FileStore/tables/"
filename_skills = "skill2vec_50K.csv.gz"

# Total number of columns declared in the schema, JD_id included.
# (The original note mentions 961 -- presumably the widest row in the file; verify.)
num_cols = 1000

# Build the schema programmatically: one id column followed by
# skill_1 .. skill_{num_cols - 1}, every field a nullable string.
skill_fields = [StructField(f"skill_{i}", StringType(), True) for i in range(1, num_cols)]
schema = StructType([StructField("JD_id", StringType(), True)] + skill_fields)

# Load the headerless, comma-separated file using the explicit schema.
skills_reader = spark.read.option("header", False).option("sep", ",").schema(schema)
df_skills = skills_reader.csv(filepath + filename_skills)

# Reshape wide -> long: pack all skill columns into one array and explode it,
# giving one (JD_id, skill) row per skill slot. Empty slots come through as
# null skills here; no filtering happens in this cell.
skill_columns = [col(f"skill_{i}") for i in range(1, num_cols)]
df_skills = df_skills.select("JD_id", explode(array(skill_columns)).alias("skill"))




In [0]:
#Read file Technology_Skills
filename_tech = "Technology_Skills.txt"

# Tab-delimited file with a header row. Column types are inferred from the
# data, and mode=DROPMALFORMED tells the reader to discard rows it cannot
# parse instead of failing or keeping them.
tech_reader = spark.read.options(
    header=True,
    delimiter="\t",
    inferSchema=True,
    mode="DROPMALFORMED",
)
df_tech_skills = tech_reader.csv(filepath + filename_tech)


In [0]:
#Preprocessing of the file skill2vec_50K_csv.gz

# Cleaning pipeline, applied in this order:
#   1. drop rows whose skill is null,
#   2. drop fully duplicated (JD_id, skill) rows,
#   3. cast JD_id to int and skill to string.
#
# NOTE(review): trimming of leading/trailing spaces was planned for this step
# but is NOT applied by this cell, so values that differ only by surrounding
# whitespace remain distinct. TODO: decide whether to trim the skill column.
df_skills = (
    df_skills
    .dropna(subset="skill")
    .dropDuplicates()
    .withColumn("JD_id", col("JD_id").cast("int"))
    .withColumn("skill", col("skill").cast("string"))
)

In [0]:
#Preprocessing of the file Technology_Skills

#Specify the datatypes (source column name -> Spark SQL type).
# withColumn with an existing column name replaces that column, so each
# iteration casts one column of the running DataFrame.
tech_column_types = {
    'O*NET-SOC Code': 'string',
    'Example': 'string',
    'Commodity Code': 'int',
    'Commodity Title': 'string',
    'Hot Technology': 'string',
    'In Demand': 'string',
}
for column_name, column_type in tech_column_types.items():
    df_tech_skills = df_tech_skills.withColumn(
        column_name, df_tech_skills[column_name].cast(column_type)
    )

#Rename Columns to appropriate names
df_tech_skills = df_tech_skills.withColumnRenamed('O*NET-SOC Code', 'Code')
df_tech_skills = df_tech_skills.withColumnRenamed('Example', 'skill')

#Convert skills column to lower case
# Use the exact name produced by the rename above ('skill'). Referring to it as
# "Skill" only resolves while spark.sql.caseSensitive is false, and would also
# change the column's name to "Skill" as a side effect.
df_tech_skills = df_tech_skills.withColumn("skill", lower(col("skill")))

In [0]:
# Register both DataFrames as temporary views so they can be queried with Spark SQL
for view_name, frame in (
    ("TBL_JD_SKILLS", df_skills),
    ("TBL_TECH_SKILLS", df_tech_skills),
):
    frame.createOrReplaceTempView(view_name)

In [0]:
#Q1 Number of job descriptions (distinct JD_ID values in TBL_JD_SKILLS)
# Keep the query result in the variable and display it in a separate call:
# DataFrame.show() prints the rows and returns None, so chaining it into the
# assignment would leave the variable bound to None.
job_description_count = spark.sql("SELECT COUNT(DISTINCT JD_ID) AS COUNT FROM TBL_JD_SKILLS ")
job_description_count.show()

+-----+
|COUNT|
+-----+
|50000|
+-----+



In [0]:
#Q2 Ten most frequently mentioned skills (exact, case-sensitive skill strings)
# Keep the query result in the variable and display it in a separate call:
# DataFrame.show() prints the rows and returns None, so chaining it into the
# assignment would leave the variable bound to None.
top_skills = spark.sql("SELECT SKILL, COUNT(SKILL) AS COUNT FROM TBL_JD_SKILLS GROUP BY SKILL ORDER BY COUNT DESC LIMIT 10 ")
top_skills.show()

+--------------------+-----+
|               SKILL|COUNT|
+--------------------+-----+
|                Java| 1911|
|          Javascript| 1770|
|               Sales| 1705|
|Business Development| 1545|
|    Web Technologies| 1313|
|Communication Skills| 1305|
|         development| 1238|
|           Marketing| 1184|
|             Finance| 1078|
|                HTML| 1067|
+--------------------+-----+



In [0]:
#Q3 Distribution of the number of skills per job description:
#   inner query  -> number of skills listed for each JD_id
#   outer query  -> how many JDs share each skill count, 5 most common first
# Convert Dataframe to table
# (This registers the same DataFrame under a second view name, jd_skills,
# which is the name the query below uses.)
df_skills.createOrReplaceTempView("jd_skills")

# Keep the query result in the variable and display it in a separate call:
# DataFrame.show() prints the rows and returns None, so chaining it into the
# assignment would leave the variable bound to None.
jdSkill = spark.sql("""SELECT count_skill as `num skills`, COUNT(JD_id) as Freq FROM (SELECT count(skill) as count_skill, JD_id FROM jd_skills group by JD_id) group by count_skill order by Freq desc Limit 5 """)
jdSkill.show()




+----------+-----+
|num skills| Freq|
+----------+-----+
|        10|10477|
|         5| 3432|
|         6| 3405|
|         1| 3386|
|         7| 3345|
+----------+-----+



In [0]:
#Q4 Frequencies with which distinct skills are mentioned in JDs, top 10 in descending order (case-insensitive)
# Skills are grouped by their lower-cased form, so spellings that differ only
# in letter case are counted together.

# Keep the query result in the variable and display it in a separate call:
# DataFrame.show() prints the rows and returns None, so chaining it into the
# assignment would leave the variable bound to None.
top_skills_caseinsensitive = spark.sql("SELECT LOWER(SKILL) as SKILL, COUNT(SKILL) AS COUNT FROM TBL_JD_SKILLS GROUP BY LOWER(SKILL)  ORDER BY COUNT DESC LIMIT 10 ")
top_skills_caseinsensitive.show()

+--------------------+-----+
|               SKILL|COUNT|
+--------------------+-----+
|                java| 2759|
|          javascript| 2738|
|               sales| 2680|
|business development| 2108|
|           marketing| 1809|
|                 sql| 1564|
|              jquery| 1547|
|                html| 1539|
|communication skills| 1537|
|                 bpo| 1530|
+--------------------+-----+



In [0]:
#Q5 Row counts before and after joining JD skills to the technology skills list
# The join matches on the lower-cased skill text. An inner join emits one row
# per matching pair, so the joined count can exceed the input count if a skill
# value occurs on more than one row of TBL_TECH_SKILLS.

# Keep each query result in its variable and display it in a separate call:
# DataFrame.show() prints the rows and returns None, so chaining it into the
# assignment would leave the variables bound to None.
Before_Join = spark.sql("SELECT COUNT(SKILL) as `BEFORE JOIN` FROM TBL_JD_SKILLS")
Before_Join.show()

After_Join = spark.sql("SELECT COUNT(*) as `AFTER JOIN` FROM TBL_JD_SKILLS INNER JOIN TBL_TECH_SKILLS on LOWER(TBL_JD_SKILLS.SKILL) = LOWER(TBL_TECH_SKILLS.SKILL)")
After_Join.show()




+-----------+
|BEFORE JOIN|
+-----------+
|     463803|
+-----------+

+----------+
|AFTER JOIN|
+----------+
|   1101498|
+----------+



In [0]:
#Q6 Ten commodity titles with the most joined (JD skill, technology skill) rows
# NOTE(review): this key does not appear among the documented
# spark.sql.repl.eagerEval.* settings (enabled, maxNumRows, truncate), so it
# may have no effect; the untruncated output below comes from
# show(truncate=False). TODO confirm and remove if unused.
spark.conf.set("spark.sql.repl.eagerEval.maxNumChars", 10000)  # Set a large value

# Keep the query result in the variable and display it in a separate call:
# DataFrame.show() prints the rows and returns None, so chaining it into the
# assignment would leave the variable bound to None.
top_Commodity_title = spark.sql("SELECT `COMMODITY TITLE`, COUNT(*) as COUNT FROM TBL_JD_SKILLS INNER JOIN TBL_TECH_SKILLS on LOWER(TBL_JD_SKILLS.SKILL) = LOWER(TBL_TECH_SKILLS.SKILL) GROUP BY `COMMODITY TITLE` ORDER BY COUNT(*) DESC LIMIT 10")
top_Commodity_title.show(truncate=False)


+-------------------------------------------------+------+
|COMMODITY TITLE                                  |COUNT |
+-------------------------------------------------+------+
|Object or component oriented development software|324521|
|Web platform development software                |298754|
|Operating system software                        |190926|
|Development environment software                 |53013 |
|Data base management system software             |44132 |
|Analytical or scientific software                |33552 |
|Web page creation and editing software           |31682 |
|Data base user interface and query software      |29436 |
|Spreadsheet software                             |18568 |
|File versioning software                         |13846 |
+-------------------------------------------------+------+

