In [1]:
# Importing Libraries
import pandas as pd
from datasets import load_dataset
import matplotlib.pyplot as plt

# Fetch the 'lukebarousse/data_jobs' dataset.
dataset = load_dataset('lukebarousse/data_jobs')

# Convert the 'train' entry to a pandas DataFrame and, in the same chain,
# replace job_posted_date with its pd.to_datetime() conversion.
df = (
    dataset['train']
    .to_pandas()
    .assign(job_posted_date=lambda frame: pd.to_datetime(frame['job_posted_date']))
)

In [2]:
# 1. Convert the job_posted_date column to a string format 'YYYY-MM-DD' and create a new column job_posted_date_str.

# Format the whole column through the vectorized .dt accessor rather than a
# per-row apply/lambda: it is a single call, and missing dates (NaT) come out
# as missing values instead of raising inside the lambda.
df['job_posted_date_str'] = df['job_posted_date'].dt.strftime('%Y-%m-%d')
df[['job_posted_date', 'job_posted_date_str']].head()

Unnamed: 0,job_posted_date,job_posted_date_str
0,2023-06-16 13:44:15,2023-06-16
1,2023-01-14 13:18:07,2023-01-14
2,2023-10-10 13:14:55,2023-10-10
3,2023-07-04 13:01:41,2023-07-04
4,2023-08-07 14:29:36,2023-08-07


In [3]:
# 2. Calculate the number of days since each job was posted.
# Create a new column days_since_posted that contains this value. Use the job_posted_date column.

import datetime as dt  # kept in this cell: this is where `dt` is brought into scope

# Read the clock once and subtract the whole column from that single instant.
# The earlier per-row lambda called datetime.now() again for every row, so each
# row was measured against a slightly different "now".
# .dt.days is the whole-day component of each resulting timedelta.
df['days_since_posted'] = (dt.datetime.now() - df['job_posted_date']).dt.days

df['days_since_posted'].head()





0    761
1    914
2    645
3    743
4    709
Name: days_since_posted, dtype: int64

In [4]:
# 2. (second version) Days since each job was posted, with the reference time
# stored in a variable first.

# Snapshot "now" once so every row is measured against the same instant.
current_date = dt.datetime.now()

# Subtract the whole column at once instead of looping with apply/lambda;
# .dt.days is the whole-day component of each resulting timedelta.
df['days_since_posted'] = (current_date - df['job_posted_date']).dt.days
df[['job_posted_date', 'days_since_posted']].head()

Unnamed: 0,job_posted_date,days_since_posted
0,2023-06-16 13:44:15,761
1,2023-01-14 13:18:07,914
2,2023-10-10 13:14:55,645
3,2023-07-04 13:01:41,743
4,2023-08-07 14:29:36,709


In [5]:
# 3. Build df_filtered: a copy of df without the rows whose salary_year_avg is NaN.
# Add a salary_category column that labels salary_year_avg as 'Low' (below 60,000),
# 'Medium' (60,000 up to and including 100,000) or 'High' (above 100,000).
# Finally display the salary_year_avg and salary_category columns of df_filtered.

def _salary_band(salary):
    """Return 'Low' below 60000, 'Medium' up to and including 100000, otherwise 'High'."""
    if salary < 60000:
        return 'Low'
    if salary <= 100000:
        return 'Medium'
    return 'High'


# Copy first, then drop the rows that have no yearly salary.
df_filtered = df.copy().dropna(subset=['salary_year_avg'])
df_filtered['salary_category'] = df_filtered['salary_year_avg'].apply(_salary_band)

df_filtered[['salary_year_avg', 'salary_category']].head()


Unnamed: 0,salary_year_avg,salary_category
28,109500.0,High
77,140000.0,High
92,120000.0,High
100,228222.0,High
109,89000.0,Medium


In [None]:
# 4. Create a new column year_posted that extracts the year from job_posted_date.

# Read the year for the whole column through the .dt accessor instead of a
# per-row apply/lambda.
# NOTE(review): depending on the pandas version, .dt.year may yield a narrower
# integer dtype than the lambda version did; the values are the same.
df['year_posted'] = df['job_posted_date'].dt.year
df[['year_posted', 'job_posted_date']]



Unnamed: 0,year_posted,job_posted_date
0,2023,2023-06-16 13:44:15
1,2023,2023-01-14 13:18:07
2,2023,2023-10-10 13:14:55
3,2023,2023-07-04 13:01:41
4,2023,2023-08-07 14:29:36
...,...,...
785736,2023,2023-03-13 06:16:16
785737,2023,2023-03-12 06:18:18
785738,2023,2023-03-12 06:32:36
785739,2023,2023-03-12 06:32:15


In [8]:
# 5. Create a seniority_level column based on job_title_short.

short_title = df['job_title_short']

# Default every row to 'Mid', then overwrite where a keyword appears in the title.
# 'Senior' is written last so it wins if both keywords occur, which keeps the
# precedence of the original Senior -> Junior -> Mid check.
# regex=False makes this a plain, case-sensitive substring test; na=False keeps
# rows with a missing title at 'Mid' instead of raising as the lambda would.
df['seniority_level'] = 'Mid'
df.loc[short_title.str.contains('Junior', regex=False, na=False), 'seniority_level'] = 'Junior'
df.loc[short_title.str.contains('Senior', regex=False, na=False), 'seniority_level'] = 'Senior'

# Fixed seed so the displayed sample is the same on every run.
df.sample(10, random_state=42)

Unnamed: 0,job_title_short,job_title,job_location,job_via,job_schedule_type,job_work_from_home,search_location,job_posted_date,job_no_degree_mention,job_health_insurance,...,salary_rate,salary_year_avg,salary_hour_avg,company_name,job_skills,job_type_skills,job_posted_date_str,days_since_posted,year_posted,seniority_level
148437,Data Analyst,Analista de Modelamiento Predictivo de Datos,"Santiago, Chile",via BeBee,Full-time,False,Chile,2023-10-16 08:39:11,True,False,...,,,,Fashion's Park,,,2023-10-16,640,2023,Mid
334296,Data Engineer,Alation Data Engineer,Anywhere,via ZipRecruiter,Full-time and Temp work,True,Georgia,2023-10-26 00:34:37,True,False,...,,,,"RIT Solutions, Inc.",['snowflake'],{'cloud': ['snowflake']},2023-10-26,630,2023,Mid
413101,Data Scientist,Data Scientist - Hybrid,"Scottsdale, AZ",via SmartRecruiters Job Search,Full-time,False,"California, United States",2023-06-21 16:04:00,False,True,...,,,,Spear Education,,,2023-06-21,756,2023,Mid
343193,Data Analyst,Científico/a de datos - Real Estate,"Madrid, Spain",via Wijobs.es,Full-time,False,Spain,2023-02-11 00:19:34,True,False,...,,,,Deloitte,"['python', 'r', 'aws', 'spark']","{'cloud': ['aws'], 'libraries': ['spark'], 'pr...",2023-02-11,887,2023,Mid
63098,Data Analyst,Data Analyst Assc,"Fort Wayne, IN",via LinkedIn,Full-time,False,"Illinois, United States",2023-09-01 06:18:41,False,False,...,,,,Pacer Staffing LLC,"['oracle', 'hadoop', 'cognos', 'alteryx', 'pow...","{'analyst_tools': ['cognos', 'alteryx', 'power...",2023-09-01,685,2023,Mid
124504,Data Scientist,"VP, Data Scientist, Finance COO - Analytics & ...",Singapore,via EFinancialCareers,Full-time,False,Singapore,2023-05-26 08:26:13,False,False,...,,,,DBS Bank Limited,"['sql', 'excel']","{'analyst_tools': ['excel'], 'programming': ['...",2023-05-26,783,2023,Mid
56767,Business Analyst,Operations Analyst,"Montevideo, Montevideo Department, Uruguay","via Trabajo.org - Vacantes De Empleo, Trabajo",Full-time,False,Uruguay,2023-07-28 06:36:40,False,False,...,,,,Directa24,['excel'],{'analyst_tools': ['excel']},2023-07-28,720,2023,Mid
732698,Senior Data Analyst,Senior Digital Analyst (m/f/d),Germany,via XING,Full-time,False,Germany,2023-11-13 21:56:05,True,False,...,,,,TUI AG,"['excel', 'flow']","{'analyst_tools': ['excel'], 'other': ['flow']}",2023-11-13,611,2023,Senior
665329,Data Engineer,Data Engineer II,Anywhere,via LinkedIn,Full-time,True,India,2023-07-06 10:13:03,False,False,...,,,,GoGuardian,"['python', 'sql', 'nosql', 'redshift', 'snowfl...","{'analyst_tools': ['tableau'], 'cloud': ['reds...",2023-07-06,742,2023,Mid
247612,Senior Data Scientist,Senior Data Scientist,"Wokingham, UK",via Big Country Jobs,Full-time,False,United Kingdom,2023-06-23 07:30:09,False,False,...,,,,National Grid Careers,"['python', 'r', 'sql']","{'programming': ['python', 'r', 'sql']}",2023-06-23,755,2023,Senior


In [12]:
# 6. Count the length of the job description if a job_description column is available
# (if it is not, another text column can be used). The exercise asks for a desc_length
# column holding the number of characters; job_title is the text column used here, so
# the resulting columns are named title_length and title_word_count.

job_titles = df['job_title']

# Number of characters per title. The stored output of this cell shows a
# title_length column, but no code visible in this notebook creates it, so it
# is defined here to keep the result reproducible on a fresh kernel.
# Missing titles count as 0.
df['title_length'] = job_titles.str.len().fillna(0).astype(int)

# Number of whitespace-separated words per title; missing titles count as 0.
df['title_word_count'] = job_titles.str.split().str.len().fillna(0).astype(int)
df.head()

Unnamed: 0,job_title_short,job_title,job_location,job_via,job_schedule_type,job_work_from_home,search_location,job_posted_date,job_no_degree_mention,job_health_insurance,...,salary_hour_avg,company_name,job_skills,job_type_skills,job_posted_date_str,days_since_posted,year_posted,seniority_level,title_length,title_word_count
0,Senior Data Engineer,Senior Clinical Data Engineer / Principal Clin...,"Watertown, CT",via Work Nearby,Full-time,False,"Texas, United States",2023-06-16 13:44:15,False,False,...,,Boehringer Ingelheim,,,2023-06-16,761,2023,Senior,68,10
1,Data Analyst,Data Analyst,"Guadalajara, Jalisco, Mexico",via BeBee México,Full-time,False,Mexico,2023-01-14 13:18:07,False,False,...,,Hewlett Packard Enterprise,"['r', 'python', 'sql', 'nosql', 'power bi', 't...","{'analyst_tools': ['power bi', 'tableau'], 'pr...",2023-01-14,914,2023,Mid,12,2
2,Data Engineer,"Data Engineer/Scientist/Analyst, Mid or Senior...","Berlin, Germany",via LinkedIn,Full-time,False,Germany,2023-10-10 13:14:55,False,False,...,,ALPHA Augmented Services,"['python', 'sql', 'c#', 'azure', 'airflow', 'd...","{'analyst_tools': ['dax'], 'cloud': ['azure'],...",2023-10-10,645,2023,Mid,54,6
3,Data Engineer,LEAD ENGINEER - PRINCIPAL ANALYST - PRINCIPAL ...,"San Antonio, TX",via Diversity.com,Full-time,False,"Texas, United States",2023-07-04 13:01:41,True,False,...,,Southwest Research Institute,"['python', 'c++', 'java', 'matlab', 'aws', 'te...","{'cloud': ['aws'], 'libraries': ['tensorflow',...",2023-07-04,743,2023,Mid,65,10
4,Data Engineer,Data Engineer- Sr Jobs,"Washington, DC",via Clearance Jobs,Full-time,False,Sudan,2023-08-07 14:29:36,False,False,...,,Kristina Daniel,"['bash', 'python', 'oracle', 'aws', 'ansible',...","{'cloud': ['oracle', 'aws'], 'other': ['ansibl...",2023-08-07,709,2023,Mid,22,4
