<a href="https://colab.research.google.com/github/MohamedashickR/Ai-Jobs-Datasets/blob/main/AI_Jobs_Dataset_Data_Understanding_and_Ensure_Business.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import csv

def readcsvtodict(filepath):

  data = []
  with open(filepath,'r') as file:
    reader = csv.reader(file)
    headers = next(reader)
    print(headers)
    for row in reader:
      row_dict = {}
      for i,header in enumerate(headers):
        try:
          if header in['salary_usd','remote_ratio','years_experience','job_description_length']:
            row_dict[header] = int(row[i])
          elif header == 'benefits_score':
            row_dict[header] = float(row[i])
          else:
            row_dict[header] = row[i]
        except ValueError:
          row_dict[header] = row[i]
      data.append(row_dict)
  return data



jobdata = readcsvtodict('/content/ai_job_dataset.csv')
#Calculating Average Salary in USD

total_salary = 0
for job in jobdata:
  total_salary += job['salary_usd']
average_salary = total_salary / len(jobdata)
print(f"The Average Salary (USD) is: ${average_salary:,.2f}\n")


['job_id', 'job_title', 'salary_usd', 'salary_currency', 'experience_level', 'employment_type', 'company_location', 'company_size', 'employee_residence', 'remote_ratio', 'required_skills', 'education_required', 'years_experience', 'industry', 'posting_date', 'application_deadline', 'job_description_length', 'benefits_score', 'company_name']
The Average Salary (USD) is: $115,348.97



In [None]:
#Finding the most common job title

job_title_counts = {}

for job in jobdata:
  title = job['job_title']
  job_title_counts[title] = job_title_counts.get(title,0) + 1

most_common_title = None
max_count = 0

for title,count in job_title_counts.items():
  if count > max_count:
    most_common_title = title
    max_count = count
print(f"The most common title is : '{most_common_title}'appearing {max_count} times.\n")


The most common title is : 'Machine Learning Researcher'appearing 808 times.



In [None]:
#Count the number of unique company location values
unique_locations = set()

for job in jobdata:
  unique_locations.add(job['company_location'])
print(f"There are {len(unique_locations)} unique company locations.\n")

There are 20 unique company locations.



In [None]:
#Filtering Jobs based on experience Levels
senior_jobs = []

for job in jobdata:
  if job['experience_level'] == 'SE':
     senior_jobs.append(job)
print(f" There are {len(senior_jobs)} 'senior level' jobs in the dataset.\n")

 There are 3741 'senior level' jobs in the dataset.



In [None]:
#Display a sample of senior jobs (First 3)

if senior_jobs:
  for i,job in enumerate(senior_jobs[:3]):
    print(f"Job {i+1}: Title = {job['job_title']},salary =${job['salary_usd']:,.0f},company={job['company_name']}\n")
else:
  print("No senior level jobs found.\n")

Job 1: Title = AI Research Scientist,salary =$90,376,company=Smart Analytics

Job 2: Title = NLP Engineer,salary =$80,215,company=Future Systems

Job 3: Title = AI Architect,salary =$123,574,company=Neural Networks Co



In [None]:
#Calculate the median Salary

salaries = []
for job in jobdata:
  salaries.append(job['salary_usd'])
salaries.sort()

n = len(salaries)

if n % 2 == 0:
  median_salary = (salaries[n // 2 - 1] + salaries[n // 2])/2
  print(f"The median Salary in USD using even is : ${median_salary:,.2f}\n")
else:
  median_salary = salaries[n//2]
print(f"The median Salary in USD is : ${median_salary:,.2f}\n")

The median Salary in USD using even is : $99,705.00

The median Salary in USD is : $99,705.00



In [None]:
#Finding average years of experience of each experience level

experience_level_data = {}

for job in jobdata:
  level = job['experience_level']
  years = job['years_experience']
  if level not in experience_level_data:
    experience_level_data[level] = [0,0] #totalyears,count
    experience_level_data[level][0] +=years
    experience_level_data[level][1] +=1

    #Average years of experience by experience level
for level,data in experience_level_data.items():
     avg_years = data[0] / data[1]
     print(f"The average years of experience for {level} is : {avg_years:.2f}\n")


The average years of experience for SE is : 9.00

The average years of experience for EN is : 1.00

The average years of experience for MI is : 2.00

The average years of experience for EX is : 15.00



In [None]:
#Determine the distrobution of remote ratio

remote_ratio_counts = {}
for job in jobdata:
  ratio = job['remote_ratio']
  remote_ratio_counts[ratio] = remote_ratio_counts.get(ratio,0)+1

print("Distribution of remote ratios")

sorted_remote_ratios = sorted(remote_ratio_counts.items())
for ratio,count in sorted_remote_ratios:
    percentage = (count/len(jobdata))*100
    print(f"{ratio}: {count} ({percentage:.2f}%)")
    print("\n")

Distribution of remote ratios
0: 5075 (33.83%)


50: 5005 (33.37%)


100: 4920 (32.80%)




In [None]:
# Identify top 5 most skills:

skill_counts = {}
for job in jobdata:
  skills_string = job['required_skills']
  #Split skills by comma and strip whitespace from each skill
  skills = [s.strip() for s in skills_string.split(',')]
  for skill in skills:
       if skill: #Ensure skill is not an empty string
            skill_counts[skill] = skill_counts.get(skill,0) + 1

    #Convert dictionary to a list of (skill,count) tuples for sorting
sorted_skills = sorted(skill_counts.items(),key=lambda item: item[1],reverse=True)

   #Printing top 5 most frequent skills

for i,(skill,count) in enumerate(sorted_skills[:5]):
        print(f"{i + 1}.{skill}:{count} times")

1.Python:4450 times
2.SQL:3407 times
3.TensorFlow:3022 times
4.Kubernetes:3009 times
5.Scala:2794 times


In [None]:
# Identify the top 5 companies with the most job postingsob_data = [
job_data = [
    {"company_name": "Tech Corp", "title": "Software Engineer", "location": "Remote"},
    {"company_name": "Data Solutions", "title": "Data Analyst", "location": "New York"},
    {"company_name": "Tech Corp", "title": "DevOps Engineer", "location": "London"},
    {"company_name": "Innovate Inc", "title": "Product Manager", "location": "San Francisco"},
    {"company_name": "Data Solutions", "title": "Data Scientist", "location": "New York"},
    {"company_name": "Tech Corp", "title": "QA Engineer", "location": "Remote"},
]
print("--- Task 8: Identify the top 5 companies with the most job postings ---")
print("Explanation: This task counts the number of job postings made by each company.")
company_counts = {}
for job in job_data: # This line will now work if job_data is defined as above
    company = job['company_name']
    company_counts[company] = company_counts.get(company, 0) + 1

# Sort companies by their job posting count in descending order
sorted_companies = sorted(company_counts.items(), key=lambda item: item[1], reverse=True) # Corrected reverse=

print("Top 5 companies with the most job postings:")
for i, (company, count) in enumerate(sorted_companies[:5]):
    print(f"{i+1}. {company}: {count} postings")
print("\n")


--- Task 8: Identify the top 5 companies with the most job postings ---
Explanation: This task counts the number of job postings made by each company.
Top 5 companies with the most job postings:
1. Tech Corp: 3 postings
2. Data Solutions: 2 postings
3. Innovate Inc: 1 postings


