In [1]:
from google.colab import drive

In [6]:
drive.mount('/content/drive')

Mounted at /content/drive


load and import necessary libraries:

In [78]:
!pip install ydata_profiling
!pip install hijri_converter



In [126]:
import pandas as pd
import numpy as np
import ast
import plotly.express as px
from ydata_profiling import ProfileReport
from hijri_converter import Hijri, Gregorian  # Importing Hijri converter

read dataframe:

In [107]:
df = pd.read_csv('/content/drive/MyDrive/usecase5/Jadarat_data.csv')

# Data profiling

In [81]:
profile=ProfileReport(df,title='jadarat profile')

In [82]:
profile.to_notebook_iframe()

Output hidden; open in https://colab.research.google.com to view.

####relevance

In [108]:
# no need these columns in the analysis
df.drop(columns=['comp_no','qualif'],inplace=True)

####uniqueness:

In [109]:
# drop city already there's region column
df.drop(columns='city',inplace=True)

In [110]:
# drop duplicated rows
df.drop_duplicates(inplace=True)

In [111]:
# check
df.duplicated().sum()

0

job id shouldnt be duplicated

In [112]:
# job id shouldnt be duplicated drop first
df.drop_duplicates(subset='job_post_id',keep='last',inplace=True)

####completeness:

In [113]:
df.isnull().sum()

Unnamed: 0,0
job_title,0
job_date,0
job_desc,0
job_tasks,0
comp_name,0
comp_type,0
comp_size,14
eco_activity,71
region,0
benefits,0


In [114]:
# replace eco _activity by غبر معروف because unknown their eco_activity
df['eco_activity'].fillna('غير معروف', inplace=True)


A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.





In [115]:
# replace comp_size by لم يحدد because unknown their comp_size
df['comp_size'].fillna('لم يحدد', inplace=True)


A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.





##check accurcy:

In [116]:
df.dtypes

Unnamed: 0,0
job_title,object
job_date,object
job_desc,object
job_tasks,object
comp_name,object
comp_type,object
comp_size,object
eco_activity,object
region,object
benefits,object


In [94]:
df.shape

(1381, 15)

In [117]:
# exper looks like 0 years i want to remove years
df['exper']=df['exper'].apply(lambda x:x.split(' Years')[0])
df['exper']=df['exper'].astype(int)

In [118]:
# postion like 2/3 i want split them into two columns better
df['num_pos_filled']=df['positions'].apply(lambda x:x.split('/')[0]).astype('int')
df['num_pos_reqiured']=df['positions'].apply(lambda x:x.split('/')[1]).astype('int')

In [133]:
df.drop(columns=['positions'],inplace=True)

In [119]:
def extract_salary(cell):
    # Convert the string representation to a list
    parsed_list = ast.literal_eval(cell)
    # Find the salary and convert it to float
    salary_index = parsed_list.index('Salary') + 1
    return float(parsed_list[salary_index])


In [120]:
# extract salary from benefits
df['salary'] = df['benefits'].apply(extract_salary)

In [121]:
def extract_benfit(cell):
    # Convert the string representation to a list
    parsed_list = ast.literal_eval(cell)
    # Find the benfit and convert it to float
    if 'Benefits' in parsed_list:
        benefits_index = parsed_list.index('Benefits') + 1
        benefits = parsed_list[benefits_index]
    else:
        benefits = 'unknown'  # Return NaN if benefits do not exist
    return  benefits

In [122]:
# benefits without salary from benefits

df['benefits'] = df['benefits'].apply(extract_benfit)

In [123]:
# put it in another dataframe because i think i dont need it
df_benifit=df[df['benefits'].notnull()]
df.drop(columns='benefits',inplace=True)

In [125]:
# since 22/04/1444 most frequency i replace publish date by it because it may be that date
df.loc[df['job_date'] == 'Publish date', 'job_date'] = '22/04/1444'

In [127]:
# Function to convert Hijri dates to Gregorian
def convert_hijri_to_gregorian(hijri_date):
    day, month, year = map(int, hijri_date.split('/'))
    hijri_date = Hijri(year, month, day)
    gregorian_date = hijri_date.to_gregorian()
    return pd.Timestamp(year=gregorian_date.year, month=gregorian_date.month, day=gregorian_date.day)

In [128]:
df['job_date'] = df['job_date'].apply(convert_hijri_to_gregorian)

In [134]:
df.dtypes

Unnamed: 0,0
job_title,object
job_date,datetime64[ns]
job_desc,object
job_tasks,object
comp_name,object
comp_type,object
comp_size,object
eco_activity,object
region,object
contract,object


check outlier:


In [40]:
profile=ProfileReport(df,title='jadarat profile')
profile.to_notebook_iframe()

Output hidden; open in https://colab.research.google.com to view.

i wont remove outlier in salary because i think salary normal not over

##answer questions:

 What proportion of job postings is attributed to each region within the
 kingdom?

In [135]:
regionn=df['region'].value_counts()

In [136]:
fig = px.pie(df,
             values=regionn.values,
             names=regionn.index,
             title='region',
             width=1000,
             height=800)
fig.show()

تُشكل الرياض 42% من إجمالي الوظائف في المملكة، وذلك بفضل كونها العاصمة وما شهدته من تطوير كبير في مختلف المجالات.

**Is there a gender preference indicated in the job postings?**

In [137]:
genterr=df['gender'].value_counts()

In [138]:
fig = px.pie(df,
             values=genterr.values,
             names=genterr.index,
             title='Genders',
             width=1000,
             height=800)
fig.show()

٣٩.٦ % من الوظائف المعلن عنها ليس لها تفضيلات لأننا حاليا في عصر دعم كل من النساء والرجال

 What is the expected salary range for fresh graduates?

Are job opportunities predominantly targeted at individuals with experience, or is there room for fresh graduates as well?

In [139]:
experr=df['exper'].value_counts()

In [140]:
fig = px.pie(df,
             values=experr.values,
             names=experr.index,
             title='exper',
             width=1000,
             height=800)
fig.show()

**the code below its for storytelling in streamlit**

المناطق الأكثر توظيفاً في المملكة بين عامي 2022-2023



In [141]:
# Create a new column with the date as an integer in YYYYMMDD format
df['jobdate_int'] = df['job_date'].dt.year

In [142]:
r2022=df[df['jobdate_int']==2022]['region'].value_counts()

In [143]:
import plotly.express as px

fig = px.pie(df[df['jobdate_int']==2022],
             values=r2022.values,
             names=r2022.index,
             title='Employed regions 2022',
             width=1000,
             height=800)
fig.show()

In [144]:
r2023=df[df['jobdate_int']==2023]['region'].value_counts()

In [145]:
import plotly.express as px

fig = px.pie(df[df['jobdate_int']==2023],
             values=r2023.values,
             names=r2023.index,
             title='Employed regions 2023',
             width=1000,
             height=800)
fig.show()