In [None]:
import pandas as pd
from datetime import datetime, timedelta
import requests
import io 
#from transformers import pipeline # Might crash the kernel; needs lots of disk space and RAM

# Lift pandas' display limits so every column, every row, and the full text of
# each cell are rendered instead of being truncated.
for display_option in ('display.max_columns', 'display.max_rows', 'display.max_colwidth'):
    pd.set_option(display_option, None)


In [None]:
# Download every record from the NYC GOV Jobs JSON data API, one page at a
# time, and collect them into `full_df`.
base_url = "https://data.cityofnewyork.us/resource/kpav-sd4t.json"

limit = 1000   # rows requested per page
offset = 0     # position of the first row of the next page
all_data = []  # records accumulated across all pages

# Keep requesting pages until the API answers with an empty one.
while True:
    params = {
        "$limit": limit,
        "$offset": offset,
        # Paging without an explicit order is not guaranteed to be stable;
        # ordering by the row identifier avoids skipped or duplicated rows.
        "$order": ":id",
    }

    try:
        # The timeout keeps a stalled connection from hanging the cell forever.
        response = requests.get(base_url, params=params, timeout=30)
    except requests.RequestException as exc:
        print(f"Failed to retrieve data: {exc}")
        break

    # Any non-200 answer stops the download; rows fetched so far are kept.
    if response.status_code != 200:
        print(f"Failed to retrieve data: {response.status_code}")
        break

    chunk_data = response.json()
    if not chunk_data:
        break

    all_data.extend(chunk_data)
    offset += limit

# Without any rows the frame has no 'job_id' column and every later cell would
# fail with an unrelated KeyError, so stop here with a clear message instead.
if not all_data:
    raise RuntimeError("No job postings were retrieved from the NYC jobs API.")

# Convert the list of dictionaries to a DataFrame
full_df = pd.DataFrame(all_data)

# Drop rows where 'job_id' is missing
full_df = full_df.dropna(subset=['job_id'])

# Show the first few rows of the cleaned data
full_df.head()


In [None]:
# Inspect the column labels available on the fetched frame.
full_df.columns

In [None]:
# Display the first few rows of the DataFrame
#print(full_df.head())

In [None]:
# Keep only external postings whose business title mentions a role of interest.
# The entries are joined with '|' and used as a case-insensitive regular
# expression, so a title matches when it contains any one of them.
titles_to_match = ['Data Scientist', 'Software Engineer', 'Data Analyst', 'Software Developer', 'Data', 'Software']
title_pattern = '|'.join(titles_to_match)

is_external = full_df['posting_type'] == 'External'
# na=False treats postings without a title as non-matching.
has_matching_title = full_df['business_title'].str.contains(title_pattern, case=False, na=False)

# Take an explicit copy: a boolean-mask selection is a slice of full_df, and
# sorting it in place or assigning columns to it later would otherwise trigger
# pandas' SettingWithCopyWarning.
df = full_df[is_external & has_matching_title].copy()

#df.head()


In [None]:
# Parse the date columns, order postings newest-first, and drop stale ones.
# New frames are built at each step rather than mutating `df` in place, so the
# cell never writes into a slice of another frame.
df = df.assign(
    posting_date=pd.to_datetime(df['posting_date']),
    # Unparseable or missing update timestamps become NaT instead of raising.
    last_updated=pd.to_datetime(df['posting_updated'], errors='coerce'),
)

# Sort after parsing so the order is based on datetime values.
df = df.sort_values(by='posting_date', ascending=False)

# Cut-off date: four months ago, approximated as 4 * 30 days.
four_months_ago = datetime.now() - timedelta(days=4*30)

# A posting is stale when it was posted on or before the cut-off and has no
# recorded update; everything else is kept.
is_stale = (df['posting_date'] <= four_months_ago) & df['last_updated'].isna()
df = df[~is_stale]


In [None]:
# Add a 'Job Description Summarized' column holding a short model-generated
# summary of each posting's job description.
#
# The transformers import is local and guarded because the package is heavy
# and may not be installed; without it the column is still created, but empty.
summaries = pd.Series(index=df.index, dtype='object')

try:
    from transformers import pipeline
except ImportError:
    print("transformers is not installed; skipping job description summaries.")
else:
    # Only rows that actually have a description are sent to the model.
    descriptions = df['job_description'].dropna().astype(str)

    if not descriptions.empty:
        summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
        # The pipeline takes a list of strings and returns one dict per input;
        # truncation=True clips descriptions longer than the model accepts.
        results = summarizer(
            descriptions.tolist(),
            max_length=130,
            min_length=30,
            truncation=True,
        )
        summaries.loc[descriptions.index] = [result['summary_text'] for result in results]

# assign() builds a new frame, so no slice of an earlier frame is modified.
df = df.assign(**{'Job Description Summarized': summaries})

#df['Job Description Summarized'].head(5)

In [None]:
# Title-cased column labels. NOTE(review): these do not match the snake_case
# labels used on `df` elsewhere in this notebook and the list is not used in
# this cell; presumably intended for a renamed view. TODO confirm.
cols = [
    'Job ID',
    '# Of Positions',
    'Business Title',
    'Career Level',
    'Division/Work Unit',
    'Job Description Summarized',
    'Minimum Qual Requirements',
    'Preferred Skills',
    'Additional Information',
    'To Apply',
    'Posting Date',
    'Post Until',
    'Posting Updated',
]

# snake_case labels selected from `df` for the detailed view below.
cols_more_info = [
    'job_id',
    'number_of_positions',
    'business_title',
    'job_category',
    'full_time_part_time_indicator',
    'career_level',
    'salary_range_from',
    'salary_range_to',
    'division_work_unit',
    'job_description',
    'minimum_qual_requirements',
    'preferred_skills',
    'additional_information',
    'to_apply',
    'posting_date',
    'post_until',
    'posting_updated',
]

# Show at most the first 50 postings, restricted to the detailed column set.
df.loc[:, cols_more_info].head(50)
