In [1]:
from pyspark.sql import SparkSession

# Start (or reuse) the Spark session used by this transform notebook
spark = (
    SparkSession.builder
    .appName("recruitment_transform2")
    .getOrCreate()
)

# Load every JSON file under the extract directory into one DataFrame
df = spark.read.json(path="../recruitment_extract")


In [2]:
# NOTE(review): cache() looks like the documented workaround for Spark refusing
# queries over raw JSON that reference only the internal _corrupt_record column
# - confirm before removing it.
df.cache()

# Keep only the rows that parsed cleanly. Schema inference adds _corrupt_record
# only when at least one input line failed to parse, so guard the column access:
# on a fully valid extract there is nothing to filter and df is used as-is.
if "_corrupt_record" in df.columns:
    clean_corrupt_df = df.filter(df["_corrupt_record"].isNull())
else:
    clean_corrupt_df = df

# Count the number of records that survived the filter
record_count = clean_corrupt_df.count()

# Print the result
print(f"Number of records in clean_df: {record_count}")


Number of records in clean_df: 987


In [3]:
# Drop the _corrupt_record column from clean_corrupt_df; rows with a non-null
# value in it were already filtered out, so it carries no information here
clean_corrupt_df = clean_corrupt_df.drop("_corrupt_record")

# Show the schema and verify the column is removed
clean_corrupt_df.printSchema()


root
 |-- job_company: string (nullable = true)
 |-- job_description: string (nullable = true)
 |-- job_expire_date: string (nullable = true)
 |-- job_link: string (nullable = true)
 |-- job_location: string (nullable = true)
 |-- job_salary: string (nullable = true)
 |-- job_schedule: string (nullable = true)
 |-- job_title: string (nullable = true)
 |-- job_yoe: string (nullable = true)



In [8]:
# Drop the job_schedule column; the result is kept under the new name clean_df
clean_df = clean_corrupt_df.drop("job_schedule")

# Show the schema to verify the column is removed, then preview the first 20 rows
clean_df.printSchema()
clean_df.show(20)

root
 |-- job_company: string (nullable = true)
 |-- job_description: string (nullable = true)
 |-- job_expire_date: string (nullable = true)
 |-- job_link: string (nullable = true)
 |-- job_location: string (nullable = true)
 |-- job_salary: string (nullable = true)
 |-- job_title: string (nullable = true)
 |-- job_yoe: string (nullable = true)

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-------------+
|         job_company|     job_description|     job_expire_date|            job_link|        job_location|          job_salary|           job_title|      job_yoe|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-------------+
|Công Ty TNHH Qorv...|Mô tả công việc:\...|Hết hạn trong 7 ngày|https://www.vietn...|5th Floor, CIC To...|        Thương lượng|IC TEST Developme...|            5|

In [10]:
# Regex character class for Korean text: the range U+AC00-U+D7A3 (Hangul syllables)
korean_pattern = "[\uAC00-\uD7A3]"  # This range includes Hangul syllables

# Remove records with Korean characters in the job_description.
# rlike() evaluates to NULL for a null description and filter() discards NULL
# predicates, so null descriptions are kept explicitly instead of being
# silently dropped (and miscounted) as if they were Korean entries.
clean_df2 = clean_df.filter(
    clean_df.job_description.isNull()
    | ~clean_df.job_description.rlike(korean_pattern)
)

# Count remaining records
remaining_count = clean_df2.count()
print(f"Number of records after removing Korean entries: {remaining_count}")

# Show the cleaned DataFrame (optional)
clean_df2.show(20)


Number of records after removing Korean entries: 986
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-------------+
|         job_company|     job_description|     job_expire_date|            job_link|        job_location|          job_salary|           job_title|      job_yoe|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-------------+
|Công Ty TNHH Qorv...|Mô tả công việc:\...|Hết hạn trong 7 ngày|https://www.vietn...|5th Floor, CIC To...|        Thương lượng|IC TEST Developme...|            5|
|         Techcombank|Mô tả công việc:\...|Hết hạn trong 21 ...|https://www.vietn...|Head Office: 6 Qu...|$ 1,000-2,500 /tháng|Senior Expert, Bu...|           10|
|Công Ty Cổ Phần T...|Mô tả công việc:\...|Hết hạn trong 7 ngày|https://www.vietn...|Tầng 4, Toà nhà 6...|  Tới $ 1,800 /tháng|SAP L

In [11]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import split, col, when, concat, lit

In [12]:
from pyspark.sql import functions as F

def split_job_description(clean_df, markers=("Yêu cầu công việc", "Kinh nghiệm / Kỹ năng chi tiết")):
    """Split ``job_description`` into a description part and a requirements part.

    The description is cut at the first occurrence of the first heading in
    ``markers`` that it contains. Two columns are appended:

    * ``"Mo ta cv"`` - the text before the heading, or the whole description
      when none of the headings is present.
    * ``"YC CV"`` - the heading together with everything that follows it, or
      null when none of the headings is present.

    Args:
        clean_df: DataFrame with a string ``job_description`` column.
        markers: Section headings to look for, in priority order; the first
            one found in a row decides where that row is cut. Headings are
            matched literally and case-sensitively.

    Returns:
        ``clean_df`` with the two extra columns; existing columns are unchanged.
    """
    description = F.col("job_description")
    summary_case = None
    requirements_case = None

    for marker in markers:
        has_marker = description.contains(marker)
        # Cut at the FIRST literal occurrence of the heading. Splitting on the
        # heading and taking item 1 would lose everything after a second
        # occurrence, and would also interpret the heading as a regex.
        before_marker = F.substring_index(description, marker, 1)
        # instr() is 1-based; using the full length as the substring length
        # simply means "up to the end of the string".
        from_marker = description.substr(F.instr(description, marker), F.length(description))

        if summary_case is None:
            summary_case = F.when(has_marker, before_marker)
            requirements_case = F.when(has_marker, from_marker)
        else:
            summary_case = summary_case.when(has_marker, before_marker)
            requirements_case = requirements_case.when(has_marker, from_marker)

    if summary_case is None:
        # No headings configured: nothing to cut on.
        summary_col = description
        requirements_col = F.lit(None).cast("string")
    else:
        summary_col = summary_case.otherwise(description)
        requirements_col = requirements_case.otherwise(None)

    # Return the input DataFrame extended with the two new columns
    return clean_df.withColumn("Mo ta cv", summary_col).withColumn("YC CV", requirements_col)

# Apply the function to clean_df2 (the DataFrame with Korean-language postings
# removed); passing clean_df here would silently discard that filtering step.
split_df = split_job_description(clean_df2)

# Preview a few rows only: the description columns hold long multi-line text,
# so printing 20 untruncated rows floods the notebook output.
split_df.show(5, truncate=200)


+---------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------