In [58]:
import sys; 
sys.path.insert(0, '..')

In [59]:
import findspark
findspark.init()

In [60]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("Nyc-Jobs-data-exploration") \
    .config("spark.sql.shuffle.partitions", 200) \
    .config("spark.sql.parquet.mergeSchema", "true") \
    .getOrCreate()

In [255]:
import re
import json
import unittest
from unittest.mock import patch
from math import floor

import numpy as np

from pyspark.sql.types import StructType, StringType
from pyspark.sql import DataFrame
import pyspark.sql.functions as F
from pyspark.sql.window import Window

### Read data

In [62]:
"""
Input dataframe schema is defined in JSON file 
"""

with open("/dataset/input_schema/nyc_jobs.json") as schema_file:
    schema = schema_file.read()

nyc_jobs_json_schema = StructType.fromJson(json.loads(schema))

In [90]:
nyc_jobs_df = spark.read.schema(nyc_jobs_json_schema).\
        option("quote", "\"").\
        option("escape", "\"").\
        option("timestampFormat", "yyyy-MM-dd'T'HH:mm:ss.SSS").\
        csv("/dataset/nyc-jobs.csv", header=True)

nyc_jobs_df.printSchema()
print(nyc_jobs_df.columns)

root
 |-- job_id: integer (nullable = true)
 |-- agency: string (nullable = true)
 |-- posting_type: string (nullable = true)
 |-- num_of_positions: integer (nullable = true)
 |-- business_title: string (nullable = true)
 |-- civil_service_title: string (nullable = true)
 |-- title_code_no: string (nullable = true)
 |-- level: string (nullable = true)
 |-- job_category: string (nullable = true)
 |-- ft_pt_indicator: string (nullable = true)
 |-- salary_range_from: double (nullable = true)
 |-- salary_range_to: double (nullable = true)
 |-- salary_frequency: string (nullable = true)
 |-- work_location: string (nullable = true)
 |-- division_work_unit: string (nullable = true)
 |-- job_description: string (nullable = true)
 |-- min_qual_requirements: string (nullable = true)
 |-- preferred_skills: string (nullable = true)
 |-- additiona_information: string (nullable = true)
 |-- to_apply: string (nullable = true)
 |-- hours_shift: string (nullable = true)
 |-- work_location_1: string (nu

In [65]:
def calculate_mod(df: DataFrame, column: str):
    grouped_data = df.groupBy(column).agg(sf.count(column).alias("count"))
    sorted_grouped_data = grouped_data.sort(grouped_data["count"].desc())
    return sorted_grouped_data.first()[column]
    
def calculate_median(df: DataFrame, column: str):
    num_rows = df.count()
    median_index = floor(num_rows / 2)
    return df.sort(column).take(median_index + 1)[-1][column]

### Numerical Data Exploration

In [92]:
# Data Exploration
# Provide a detailed analysis of source data: Column values


analytical_columns = ['num_of_positions', 'salary_range_from', 'salary_range_to']

try:
    avg_df = nyc_jobs_df.agg(*[F.avg(F.col(c)).alias('avg_{}'.format(c)) for c in analytical_columns])
    min_df = nyc_jobs_df.agg(*[F.min(F.col(c)).alias('min_{}'.format(c)) for c in analytical_columns])
    max_df = nyc_jobs_df.agg(*[F.max(F.col(c)).alias('max_{}'.format(c)) for c in analytical_columns])
    
    
    mode_dict = dict()
    median_dict = dict()

    # Calculate mode and median
    mode_dict = {col: calculate_mod(nyc_jobs_df, col) for col in analytical_columns}
    median_dict = {col: calculate_median(nyc_jobs_df, col) for col in analytical_columns}

    # Convert mode and median dictionaries to DataFrames
    mod_df = spark.createDataFrame([mode_dict])
    median_df = spark.createDataFrame([median_dict])


        
except Exception as e:
    print(f"Error calculating min: {e}")

In [93]:
print("Minimum values for the selected numerical columns ->")
min_df.show()

print("Maximum values for the selected numerical columns ->")
max_df.show()

print("Average values for the selected numerical columns ->")
avg_df.show()

print("Mod values for the selected numerical columns ->")
mod_df.show()

print("Median values for the selected numerical columns ->")
median_df.show()

Minimum values for the selected numerical columns ->
+--------------------+---------------------+-------------------+
|min_num_of_positions|min_salary_range_from|min_salary_range_to|
+--------------------+---------------------+-------------------+
|                   1|                  0.0|              10.36|
+--------------------+---------------------+-------------------+

Maximum values for the selected numerical columns ->
+--------------------+---------------------+-------------------+
|max_num_of_positions|max_salary_range_from|max_salary_range_to|
+--------------------+---------------------+-------------------+
|                 200|             218587.0|           234402.0|
+--------------------+---------------------+-------------------+

Average values for the selected numerical columns ->
+--------------------+---------------------+-------------------+
|avg_num_of_positions|avg_salary_range_from|avg_salary_range_to|
+--------------------+---------------------+---------------

### String Data Exploration

In [101]:
"""
For each string column, get the distinct values count
"""

string_columns_frequency = dict()

for i, column in enumerate(nyc_jobs_df.columns):
    if nyc_jobs_df.dtypes[i][1] == "string":
        unique_count = nyc_jobs_df.select(column).distinct().count()
        string_columns_frequency[column] = unique_count


print(string_columns_frequency)

{'agency': 52, 'posting_type': 2, 'business_title': 1244, 'civil_service_title': 312, 'title_code_no': 323, 'level': 14, 'job_category': 131, 'ft_pt_indicator': 3, 'salary_frequency': 3, 'work_location': 226, 'division_work_unit': 678, 'job_description': 1608, 'min_qual_requirements': 337, 'preferred_skills': 1283, 'additiona_information': 682, 'to_apply': 894, 'hours_shift': 182, 'work_location_1': 228, 'recruitment_contact': 1, 'residency_requirement': 51}


In [115]:
"""
For each string column, get the Most Common value and its count
"""

for i, column in enumerate(nyc_jobs_df.columns):
    if nyc_jobs_df.dtypes[i][1] == "string":
        print("Column -> {}, Most Common Value:".format(column))
        df = nyc_jobs_df.groupBy(column).agg(F.count(column).alias("count")).orderBy(F.desc("count")).limit(1)
        df.show()
    

Column -> agency, Most Common Value:
+--------------------+-----+
|              agency|count|
+--------------------+-----+
|DEPT OF ENVIRONME...|  655|
+--------------------+-----+

Column -> posting_type, Most Common Value:
+------------+-----+
|posting_type|count|
+------------+-----+
|    Internal| 1684|
+------------+-----+

Column -> business_title, Most Common Value:
+--------------------+-----+
|      business_title|count|
+--------------------+-----+
|Assistant Civil E...|   33|
+--------------------+-----+

Column -> civil_service_title, Most Common Value:
+--------------------+-----+
| civil_service_title|count|
+--------------------+-----+
|COMMUNITY COORDIN...|  182|
+--------------------+-----+

Column -> title_code_no, Most Common Value:
+-------------+-----+
|title_code_no|count|
+-------------+-----+
|        56058|  182|
+-------------+-----+

Column -> level, Most Common Value:
+-----+-----+
|level|count|
+-----+-----+
|    0| 1112|
+-----+-----+

Column -> job_categ

In [119]:
nyc_jobs_df.createOrReplaceTempView("nyc_jobs")

In [125]:
"""
Data Exploration - Top 10 job categories according to the number of job postings.
"""

category_distribution_df = nyc_jobs_df.groupBy("job_category").agg(F.countDistinct("job_id").alias("num_job_postings"))
category_distribution_df = category_distribution_df.sort(F.desc("num_job_postings")).limit(10)

category_distribution_df.show(truncate=False)

cd_df = spark.sql("""
SELECT
job_category,
count(distinct job_id) as num_job_postings
FROM nyc_jobs
GROUP BY 1
ORDER BY num_job_postings DESC
LIMIT 10
""").show(truncate=False)

+-----------------------------------------+----------------+
|job_category                             |num_job_postings|
+-----------------------------------------+----------------+
|Engineering, Architecture, & Planning    |260             |
|Technology, Data & Innovation            |182             |
|Legal Affairs                            |120             |
|Building Operations & Maintenance        |99              |
|Finance, Accounting, & Procurement       |98              |
|Public Safety, Inspections, & Enforcement|98              |
|Administration & Human Resources         |88              |
|Health                                   |71              |
|Constituent Services & Community Programs|68              |
|Policy, Research & Analysis              |64              |
+-----------------------------------------+----------------+

+-----------------------------------------+----------------+
|job_category                             |num_job_postings|
+----------------------

In [136]:
"""
Data Exploration-Salary distribution per job category
"""

min_salary_df = nyc_jobs_df.groupBy("job_category").agg(F.min("salary_range_from").\
    alias("min_salary_from"), F.max("salary_range_from").\
    alias("max_salary_from"), F.avg("salary_range_from").\
    alias("avg_salary_from"))

max_salary_df = nyc_jobs_df.groupBy("job_category").agg(F.min("salary_range_to").\
    alias("min_salary_to"), F.max("salary_range_to").\
    alias("max_salary_to"), F.avg("salary_range_to").\
    alias("avg_salary_to"))

distribution_df = min_salary_df.join(max_salary_df, on='job_category', how='outer').fillna({
    "min_salary_from": 0,
    "min_salary_to": 0,
    "max_salary_from": 0,
    "max_salary_to":0,
    "avg_salary_from":0,
    "avg_salary_to":0
}).select(
    "job_category",
    "min_salary_from",
    "min_salary_to",
    "max_salary_from",
    "max_salary_to",
    "avg_salary_from",
    "avg_salary_to",
)


# min_salary_df.show(truncate=True)
# max_salary_df.show(truncate=True)
distribution_df.show(truncate=True)

+--------------------+---------------+-------------+---------------+-------------+------------------+------------------+
|        job_category|min_salary_from|min_salary_to|max_salary_from|max_salary_to|   avg_salary_from|     avg_salary_to|
+--------------------+---------------+-------------+---------------+-------------+------------------+------------------+
|Administration & ...|        90000.0|     100000.0|        90000.0|     100000.0|           90000.0|          100000.0|
|Administration & ...|        54100.0|      83981.0|        54100.0|      83981.0|           54100.0|           83981.0|
|Health Policy, Re...|        82008.0|     107770.0|       145000.0|     180000.0|          113504.0|          143885.0|
|Finance, Accounti...|        55659.0|      70390.0|        55659.0|      70390.0|           55659.0|           70390.0|
|Information Techn...|        68239.0|      85644.0|        68239.0|      85644.0|           68239.0|           85644.0|
|Engineering, Arch...|         5

In [150]:
"""
Job posting having the highest salary per agency
"""

window  = Window.partitionBy(F.col("agency")).orderBy(F.col("salary_range_to").desc())

job_rnk_per_agency_df = nyc_jobs_df.withColumn("job_rnk_per_agency", F.dense_rank().over(window)) \
    .where(F.col("job_rnk_per_agency") == 1) \
    .select("job_id", "agency") \
    .distinct()
job_rnk_per_agency_df.show(truncate=False)

spark.sql("""
with base AS (
SELECT
job_id,
agency,
DENSE_RANK() OVER (partition by agency order by salary_range_to desc) AS rank
from nyc_jobs
)
select distinct
job_id,
agency
FROM base
WHERE rank = 1
""").show(truncate=False)

+------+------------------------------+
|job_id|agency                        |
+------+------------------------------+
|425347|LANDMARKS PRESERVATION COMM   |
|170989|OFFICE OF COLLECTIVE BARGAININ|
|420216|FIRE DEPARTMENT               |
|399933|ADMIN FOR CHILDREN'S SVCS     |
|369120|MANHATTAN COMMUNITY BOARD #8  |
|423630|TAX COMMISSION                |
|424997|HRA/DEPT OF SOCIAL SERVICES   |
|413804|TAXI & LIMOUSINE COMMISSION   |
|420065|EQUAL EMPLOY PRACTICES COMM   |
|97899 |DEPARTMENT OF BUSINESS SERV.  |
|416442|DEPT OF DESIGN & CONSTRUCTION |
|420740|TEACHERS RETIREMENT SYSTEM    |
|423322|DEPARTMENT OF CORRECTION      |
|424356|FINANCIAL INFO SVCS AGENCY    |
|417137|OFFICE OF EMERGENCY MANAGEMENT|
|423210|HOUSING PRESERVATION & DVLPMNT|
|421027|CIVILIAN COMPLAINT REVIEW BD  |
|423972|OFFICE OF MANAGEMENT & BUDGET |
|420294|MAYORS OFFICE OF CONTRACT SVCS|
|420306|MAYORS OFFICE OF CONTRACT SVCS|
+------+------------------------------+
only showing top 20 rows

+------+------

In [165]:
"""
Data Exploration - Average salary per agency for the last 2 years
"""

maximum_date = nyc_jobs_df.agg(F.max("posting_date")).collect()[0][0]
last_2_years = maximum_date - F.expr("INTERVAL 2 YEARS")
filtered_df = nyc_jobs_df.where(F.col("posting_date") >= last_2_years)
avg_salary_per_agency_df = filtered_df.groupBy("agency").agg(F.avg("salary_range_to").alias("avg_salary_per_agency")).orderBy(F.col("avg_salary_per_agency").desc())
avg_salary_per_agency_df.show(truncate=False)

spark.sql("""
with cut_off as (
select 
date(max(posting_date)) - INTERVAL 2 YEAR as cutoff_date
from nyc_jobs
)
select
agency,
avg(salary_range_to) as avg_salary_per_agency
from nyc_jobs
where posting_date >= (select cutoff_date from cut_off)
group by 1
order by avg_salary_per_agency desc
""").show(truncate=False)

+------------------------------+---------------------+
|agency                        |avg_salary_per_agency|
+------------------------------+---------------------+
|CONFLICTS OF INTEREST BOARD   |170000.0             |
|NYC EMPLOYEES RETIREMENT SYS  |118241.22222222222   |
|DEPT OF DESIGN & CONSTRUCTION |114836.08450704225   |
|DEPT OF INFO TECH & TELECOMM  |113832.51733333332   |
|FINANCIAL INFO SVCS AGENCY    |111769.48387096774   |
|NYC HOUSING AUTHORITY         |107087.58299065419   |
|BOARD OF CORRECTION           |102936.0             |
|MAYORS OFFICE OF CONTRACT SVCS|99357.14285714286    |
|DEPARTMENT OF PROBATION       |94618.10347826086    |
|LAW DEPARTMENT                |92246.99042500001    |
|NYC DEPT OF VETERANS' SERVICES|92001.0              |
|DEPARTMENT OF FINANCE         |90428.91666666667    |
|BUSINESS INTEGRITY COMMISSION |87857.14285714286    |
|OFFICE OF THE COMPTROLLER     |87436.25             |
|DEPT OF ENVIRONMENT PROTECTION|87164.74768662675    |
|HOUSING P

In [168]:
"""
Data exploration - highest paid skills in the US market
"""

highest_paying_skills_df = nyc_jobs_df.groupby("preferred_skills").agg(avg("salary_range_to").alias("avg_salary")).orderBy(F.col("avg_salary").desc()).limit(10)
highest_paying_skills_df.show(truncate=True)


spark.sql("""
select
preferred_skills,
avg(salary_range_to) as avg_salary
from nyc_jobs
group by 1
order by avg_salary desc
limit 10
""").show(truncate=True)

+--------------------+----------+
|    preferred_skills|avg_salary|
+--------------------+----------+
|Valid holder of t...|  234402.0|
|â€¢	Minimum of fi...|  225217.0|
|â€¢	A minimum of ...|  224749.0|
|â€¢	A Masterâ€™s ...|  224749.0|
|The Deputy Commis...|  218587.0|
|Preferred Educati...|  217244.0|
|Preferred Educati...|  217244.0|
|Candidate must ha...|  217244.0|
|The Deputy Commis...|  209585.0|
|â€¢	10+ years of ...|  208826.0|
+--------------------+----------+

+--------------------+----------+
|    preferred_skills|avg_salary|
+--------------------+----------+
|Valid holder of t...|  234402.0|
|â€¢	Minimum of fi...|  225217.0|
|â€¢	A Masterâ€™s ...|  224749.0|
|â€¢	A minimum of ...|  224749.0|
|The Deputy Commis...|  218587.0|
|Candidate must ha...|  217244.0|
|Preferred Educati...|  217244.0|
|Preferred Educati...|  217244.0|
|The Deputy Commis...|  209585.0|
|â€¢	10+ years of ...|  208826.0|
+--------------------+----------+



### Data Processing

In [172]:
"""
Removing the duplicate records and filling the placeholder values where Null
"""

distinct_data_df = nyc_jobs_df.distinct().fillna({
    "job_id": 0 ,
    "num_of_positions": 0 ,
    "salary_range_from": 0 ,
    "salary_range_to": 0 ,
})



In [200]:
def get_highest_degree(degrees_string):
    # Define the precedence of degrees in order
    degree_precedence = ["Ph.D.", "master", "baccalaureate", "graduate", "undergraduate", "university", "high school"]
    
    degrees = degrees_string.split(',')
    
    highest_degree = None
    for degree in degrees:
        degree = degree.strip()  # Remove any extra whitespace
        if degree in degree_precedence:
            if highest_degree is None or degree_precedence.index(degree) < degree_precedence.index(highest_degree):
                highest_degree = degree
    
    return highest_degree

In [221]:
"""
Tokenize the minimum qualification descriptive column to derive the degree.

This function takes a descriptive text related to the minimum qualification 
and tokenizes it to identify educational degrees mentioned in the text. 
It extracts the highest degree found based on a predefined list of degree keywords.

Parameters:
-----------
text : str
    A string containing descriptive text about the minimum qualifications for a job.

Returns:
--------
str
    Returns the highest degree found in the text. If no degree is found, it returns 'N/A'.
"""

def extract_qualification_degree(text):
    degree_keywords = ["baccalaureate", "graduate", "master","Ph.D.", "high school", "university","undergraduate"]
    
    # Tokenizing the text to find out the qualification
    tokens = re.findall(r'\w+', text.lower())    
    degree_tokens = {word for word in tokens if any(keyword in word for keyword in degree_keywords)}
    
    degree = 'N/A'
    
    if degree_tokens is not None and len(degree_tokens) >= 1:
        degree = get_highest_degree(",".join(degree_tokens)) 
            
    return degree
     
# UDF function registration
extract_qualification_udf = F.udf(extract_qualification_degree, StringType())

filtered_qual_df = nyc_jobs_df.filter(F.col("min_qual_requirements").isNotNull())
df_with_degree = filtered_qual_df.withColumn("degree", extract_qualification_udf(F.col("min_qual_requirements")))

+-------------+
|degree       |
+-------------+
|baccalaureate|
|baccalaureate|
|N/A          |
|N/A          |
|N/A          |
|N/A          |
|baccalaureate|
|baccalaureate|
|master       |
|N/A          |
|baccalaureate|
|baccalaureate|
|baccalaureate|
|graduate     |
|graduate     |
|N/A          |
|baccalaureate|
|baccalaureate|
|N/A          |
|baccalaureate|
+-------------+
only showing top 20 rows



In [228]:
"""
To find correlation between the higher degree and the salary?
"""

# Defining degree precedence
degree_precedence = {
    "Ph.D.": 7,
    "master": 6,
    "baccalaureate": 5,
    "graduate": 4,
    "undergraduate": 3,
    "university": 3,
    "high school": 1,
    "N/A": 0,
    
}

# Create a UDF to map degrees to their numerical values
degree_to_numeric = F.udf(lambda degree: degree_precedence.get(degree, -1))

# Apply the UDF to create a new column with numerical degree values
df_with_degree_numeric = df_with_degree.withColumn("degree_numeric", degree_to_numeric(F.col("degree")).cast("int"))

correlation = df_with_degree_numeric.stat.corr("degree_numeric", "salary_range_to")
print(f"Correlation between degree and salary_range_to: {correlation}")


"""{} is a small positive correlation, indicating there is a slight tendency for higher degrees to be associated with higher Salary""".format(correlation)

Correlation between degree and salary_range_to: 0.14170815180585042


'0.14170815180585042 is a small positive correlation, indicating there is a slight tendency for higher degrees to be associated with higher Salary'

In [239]:
"""
Applying the binning and salary range distribution on the salary column
"""
binned_salary = F.when(F.col("salary_range_from") < 20000, "<20,000").\
                when((F.col("salary_range_from") >= 20000) & (F.col("salary_range_from") <= 40000), "20,000-40,000").\
                when((F.col("salary_range_from") >= 40001) & (F.col("salary_range_from") <= 75000), "40,001-75,000").\
                otherwise(">75,000")

salary_distribution_df = nyc_jobs_df.withColumn("binned_salary_range_from", binned_salary)

salary_distribution_df.groupBy(["job_category", "binned_salary_range_from"]).agg(F.countDistinct("job_id").alias("number_of_jobs")).show()


+--------------------+------------------------+--------------+
|        job_category|binned_salary_range_from|number_of_jobs|
+--------------------+------------------------+--------------+
|Communications & ...|                 <20,000|             1|
|Maintenance & Ope...|                 <20,000|             1|
|Finance, Accounti...|                 >75,000|             1|
|Engineering, Arch...|           40,001-75,000|             7|
|Administration & ...|           40,001-75,000|             1|
|Administration & ...|           40,001-75,000|             2|
|Finance, Accounti...|           40,001-75,000|             1|
|Constituent Servi...|                 <20,000|             1|
|Engineering, Arch...|           40,001-75,000|             2|
|Constituent Servi...|                 >75,000|             1|
|Policy, Research ...|                 >75,000|            11|
|Constituent Servi...|           40,001-75,000|             1|
|Constituent Servi...|                 <20,000|        

In [240]:
"""
Dropping the irrelevant features
"""

irrelevant_features = ["recruitment_contact", "process_date"]
output_nyc_df = salary_distribution_df.drop(*irrelevant_features)
output_nyc_df.printSchema()

root
 |-- job_id: integer (nullable = true)
 |-- agency: string (nullable = true)
 |-- posting_type: string (nullable = true)
 |-- num_of_positions: integer (nullable = true)
 |-- business_title: string (nullable = true)
 |-- civil_service_title: string (nullable = true)
 |-- title_code_no: string (nullable = true)
 |-- level: string (nullable = true)
 |-- job_category: string (nullable = true)
 |-- ft_pt_indicator: string (nullable = true)
 |-- salary_range_from: double (nullable = true)
 |-- salary_range_to: double (nullable = true)
 |-- salary_frequency: string (nullable = true)
 |-- work_location: string (nullable = true)
 |-- division_work_unit: string (nullable = true)
 |-- job_description: string (nullable = true)
 |-- min_qual_requirements: string (nullable = true)
 |-- preferred_skills: string (nullable = true)
 |-- additiona_information: string (nullable = true)
 |-- to_apply: string (nullable = true)
 |-- hours_shift: string (nullable = true)
 |-- work_location_1: string (nu

In [241]:
"""
Saving the transformed data at the output location
"""
output_nyc_df.write.option("header", "true").option("quote", "\"").option("escape", "\"").option("mode", "overwrite").partitionBy("agency").parquet("/dataset/curated/")

### Test Cases

In [264]:
class TestQualificationDegree(unittest.TestCase):

    def test_single_degree(self):
        result = extract_qualification_degree("A baccalaureate degree from an accredited university.")
        self.assertEqual(result, 'baccalaureate')

    def test_multiple_degrees(self):
        result = extract_qualification_degree("The candidate must have a baccalaureate degree or a master degree.")
        self.assertEqual(result, 'master')

    def test_no_degree(self):
        result = extract_qualification_degree("Experience in business management is required.")
        self.assertEqual(result, 'N/A')

# if __name__ == '__main__':
unittest.main(argv=[''], exit=False)

...........
----------------------------------------------------------------------
Ran 11 tests in 0.010s

OK


<unittest.main.TestProgram at 0x7fe64b642208>

In [263]:
class TestGetHighestDegree(unittest.TestCase):

    def test_single_degree(self):
        result = get_highest_degree("master")
        self.assertEqual(result, "master")

    def test_multiple_degrees(self):
        result = get_highest_degree("graduate, university, baccalaureate")
        self.assertEqual(result, "baccalaureate")

    def test_whitespace_handling(self):
        result = get_highest_degree("   Ph.D.  ,  master ")
        self.assertEqual(result, "Ph.D.")

    def test_no_valid_degree(self):
        result = get_highest_degree("certificate, diploma")
        self.assertIsNone(result)

    def test_empty_input(self):
        result = get_highest_degree("")
        self.assertIsNone(result)

# if __name__ == '__main__':
unittest.main(argv=[''], exit=False)

...........
----------------------------------------------------------------------
Ran 11 tests in 0.011s

OK


<unittest.main.TestProgram at 0x7fe64b6764e0>