In [35]:
import pandas as pd
import chardet
import re
import numpy as np
import plotly.graph_objects as go

In [36]:
with open('data_science_job.csv', 'rb') as f:
    result = chardet.detect(f.read())
    
df = pd.read_csv('data_science_job.csv', encoding=result['encoding'])
df

Unnamed: 0,Company,Job Title,Location,Job Type,Experience level,Salary,Requirment of the company,Facilities
0,SGS,Clinical Data Analyst,"Richardson, TX, United States",Full Time,Entry-level,48K+ *,"Computer Science,Data quality,Genetics,Mathema...",",,,,"
1,Ocorian,AML/CFT & Data Analyst,"Ebène, Mauritius",Full Time,Entry-level,48K+ *,"Agile,Data management,Finance,Security,,",",,,,"
2,Cricut,Machine Learning Engineer,"South Jordan, UT, United States",Full Time,,90K+ *,"Agile,Architecture,AWS,Computer Science,Comput...","Career development,,,,"
3,Bosch Group,Application Developer & Data Analyst,"Nonantola, Italy",Full Time,Entry-level,48K+ *,"Engineering,Industrial,Oracle,Power BI,R,R&D",",,,,"
4,Publicis Groupe,Data Engineer Full time (Public Sector) USA,"Arlington, VA, United States",Full Time,Mid-level,108K+,"AWS,Azure,Computer Science,Consulting,Dataflow...","Flex hours,Flex vacation,Parental leave,Unlimi..."
...,...,...,...,...,...,...,...,...
3193,Western Digital,Data Scientist - New College Graduate,"Biñan, Philippines",Full Time,Entry-level,39K+ *,"APIs,Clustering,Computer Science,Data visualiz...","Career development,,,,"
3194,Experian,Cloud Data Analyst,"Heredia, Costa Rica",Full Time,Senior-level,92K+ *,"AWS,Big Data,Computer Science,GCP,Snowflake,SQL","Equity,Medical leave,Salary bonus,,"
3195,Locus Robotics,"Robotics Engineer, Sensors","Wilmington, MA, United States",Full Time,Senior-level,62K+ *,"E-commerce,Engineering,Linux,Python,Robotics,S...",",,,,"
3196,ATB Financial,Data Scientist,"Edmonton, Alberta, Canada",Full Time,Entry-level,39K+ *,"Computer Science,Data Analytics,Data Mining,Ec...","Career development,Startup environment,,,"


In [37]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3198 entries, 0 to 3197
Data columns (total 8 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   Company                     3197 non-null   object
 1   Job Title                   3197 non-null   object
 2   Location                    3197 non-null   object
 3   Job Type                    3197 non-null   object
 4   Experience level            2962 non-null   object
 5   Salary                      3009 non-null   object
 6   Requirment of the company   3198 non-null   object
 7   Facilities                  3198 non-null   object
dtypes: object(8)
memory usage: 200.0+ KB


# Data Cleaning

In [38]:
# Count null values
nulls = df.isnull().sum()

# Print column-wise null count and percentage
for column, count in nulls[nulls != 0].items():
    percentage = (count / df.shape[0]) * 100
    print(f"{column}: {count}  -->  {percentage:.2f}%")

Company: 1  -->  0.03%
Job Title: 1  -->  0.03%
Location: 1  -->  0.03%
Job Type: 1  -->  0.03%
Experience level: 236  -->  7.38%
Salary: 189  -->  5.91%


#### Handling null salaries based on the experience level

To address null values in the "Salary" column, the code employs the experience level as a reference. This approach involves filling the missing values with the average salary corresponding to the respective experience level.

In [39]:
# Convert the "Salary" column to numeric values and handle missing values
df['Salary'] = df['Salary'].replace('[^\d.]', '', regex=True).replace('NaN', np.nan, regex=True).astype(float).apply(lambda x: x*1000)

# Calculate the average salary for each experience level
mean_salary_by_level = df.groupby('Experience level')['Salary'].mean()

# Fill the missing values in the "Salary" column with the mean of the corresponding experience level
df['Salary'].fillna(df['Experience level'].map(mean_salary_by_level), inplace=True)

#### Handling null experience levels based on the mean salary of each level
To handle null values in the "Experience level" column, the code utilizes the mean salary of each level. This approach allows for filling the missing values with the experience level that corresponds to the closest mean salary.

In [40]:
# Function to find the nearest experience level based on salary
def find_nearest_level(row):
    salary = row['Salary']
    distances = abs(mean_salary_by_level - salary)
    closest_level = distances.idxmin()
    return closest_level

In [41]:
# Fill the missing values in the "Experience level" column with the nearest experience level based on salary
df['Experience level'].fillna(df.apply(find_nearest_level, axis=1), inplace=True)

#### Changing the empty values in the columns "Requirements of the company" and "Facilities"

In [42]:
# Rename the column "Requirment of the company" to "Requirements of the company"
df.rename(columns={'Requirment of the company ': 'Requirements of the company'}, inplace=True)

# Remove empty values in the "Requirements of the company" column
df['Requirements of the company'] = df['Requirements of the company'].replace(',,', '', regex=True)

# Replace empty values with "No Requirements" in the "Requirements of the company" column
for line in df['Requirements of the company']:
    if line == '':
        df['Requirements of the company'] = df['Requirements of the company'].replace('', 'No Requirements', regex=True)

In [43]:
# Remove empty values in the "Facilities" column
df['Facilities'] = df['Facilities'].replace(',,', '', regex=True)

# Replace empty values with "No Facilities" in the "Facilities" column
for line in df['Facilities']:
    if line == '':
        df['Facilities'] = df['Facilities'].replace('', 'No Facilities', regex=True)

#### Remaining Data Treatment

In [44]:
# Drop the null values
df.dropna(inplace=True)

# Add "City" column based on the "Location" column
df['City'] = df['Location'].apply(lambda x: x.split(',')[0])

df['City'] = df['City'].replace('Dublin 1', 'Dublin', regex=True)
 
# Add "Country" column based on the "Location" column
df['Country'] = df['Location'].apply(lambda x: x.split(',')[-1])

# Reorder the columns to match the desired order
df = df[['Company', 'Job Title', 'City', 'Country', 'Job Type', 'Experience level', 'Salary', 'Requirements of the company', 'Facilities']]

# Reset the index after dropping rows
df.reset_index(drop=True, inplace=True)

df.head()

Unnamed: 0,Company,Job Title,City,Country,Job Type,Experience level,Salary,Requirements of the company,Facilities
0,SGS,Clinical Data Analyst,Richardson,United States,Full Time,Entry-level,48000.0,"Computer Science,Data quality,Genetics,Mathema...",No Facilities
1,Ocorian,AML/CFT & Data Analyst,Ebène,Mauritius,Full Time,Entry-level,48000.0,"Agile,Data management,Finance,Security",No Facilities
2,Cricut,Machine Learning Engineer,South Jordan,United States,Full Time,Executive-level,90000.0,"Agile,Architecture,AWS,Computer Science,Comput...",Career development
3,Bosch Group,Application Developer & Data Analyst,Nonantola,Italy,Full Time,Entry-level,48000.0,"Engineering,Industrial,Oracle,Power BI,R,R&D",No Facilities
4,Publicis Groupe,Data Engineer Full time (Public Sector) USA,Arlington,United States,Full Time,Mid-level,108000.0,"AWS,Azure,Computer Science,Consulting,Dataflow...","Flex hours,Flex vacation,Parental leave,Unlimi..."


# Exploratory Data Analysis

### Which cities have the highest salaries?

In [54]:
# Group the data by city and calculate the average salary for each city
avg_salary_by_city = df.groupby('City')['Salary'].mean()

# Filter the cities that have at least 3 job openings
cities_with_min_3_jobs = df['City'].value_counts()[df['City'].value_counts() >= 3].index

# Calculate the average salary for the cities with at least 3 job openings
avg_salary_by_city = avg_salary_by_city[cities_with_min_3_jobs]

# Sort the cities based on the average salary in descending order
top_cities = avg_salary_by_city.sort_values(ascending=False)

# Select the top 5 cities with the highest salaries
top_cities = top_cities.head(5)

In [55]:

# Define background color and generate gradient colors
background_color = 'rgb(0, 22, 33)'
font_color = 'rgb(240, 240, 240)'

# Define manual colors for the bars
bar_colors = ['rgb(0, 0, 24)', 'rgb(0, 10, 56)', 'rgb(0, 67, 100)', 'rgb(0, 147, 110)', 'rgb(62, 196, 67)']

# Create a bar chart for the top cities and their average salaries
fig = go.Figure(data=go.Bar(x=top_cities.index, y=top_cities.values, marker=dict(color=bar_colors)))



# Update layout with desired colors
fig.update_layout(
    title="Cities with the Highest Salaries",
    xaxis_title="City",
    yaxis_title="Average Salary",
    plot_bgcolor=background_color,
    paper_bgcolor=background_color,
    font_color=font_color,
    template='plotly_dark'
)

fig.show()


In [59]:
df[df.City == 'San Mateo']

Unnamed: 0,Company,Job Title,City,Country,Job Type,Experience level,Salary,Requirements of the company,Facilities
596,Roblox,"Principal Engineer, Datacenter Software Systems",San Mateo,United States,Full Time,Senior-level,283000.0,"APIs,Engineering,Genetics,Golang,Kubernetes,","Equity,Flex hours,Flex vacation,Health care,Un..."
665,Roblox,"Principal Software Engineer, Applied ML",San Mateo,United States,Full Time,Senior-level,283000.0,"Architecture,Content creation,Deep Learning,Di...","Career development,Conferences,Equity,Flex hou..."
688,Roblox,Senior Data Scientist- Creator Content,San Mateo,United States,Full Time,Senior-level,295000.0,"A/B testing,Airflow,Big Data,Causal inference,...","Career development,Equity,Flex hours,Flex vaca..."
706,Roblox,Research Scientist,San Mateo,CA,Full Time,Entry-level,63000.0,"Computer Science,Content creation,Distributed ...","Conferences,Flex hours,Flex vacation,Health ca..."
720,Roblox,Principal Machine Learning Engineer - Personal...,San Mateo,United States,Full Time,Senior-level,283000.0,"Computer Science,Engineering,Genetics,Machine ...","Career development,Equity,Flex hours,Flex vaca..."
731,Roblox,Research Engineer - Research,San Mateo,United States,Full Time,Senior-level,139000.0,"APIs,Computer Science,Distributed Systems,Engi...","Career development,Equity,Flex hours,Flex vaca..."
732,Roblox,"Principal Software Engineer, Data Engineering",San Mateo,United States,Full Time,Senior-level,283000.0,"Airflow,Architecture,AWS,Azure,Dagster,Data pi...","Career development,Equity,Flex hours,Flex vaca..."
736,Roblox,Principal Machine Learning Engineer- Economy,San Mateo,United States,Full Time,Senior-level,267000.0,"Computer Science,Data analysis,Data pipelines,...","Career development,Equity,Flex hours,Flex vaca..."
744,Roblox,Principal Deep Learning Engineer - Computer Vi...,San Mateo,United States,Full Time,Senior-level,283000.0,"Computer Vision,Deep Learning,Genetics,Machine...","Career development,Equity,Flex hours,Flex vaca..."
747,Roblox,"Technical Director, Machine Learning (Individu...",San Mateo,United States,Full Time,Executive-level,98803.278689,"Architecture,Deep Learning,Distributed Systems...","Career development,Equity,Flex hours,Flex vaca..."


In [53]:
#Print the top cities with their corresponding average salaries
print("Cities with the highest salaries:")
for city, salary in top_cities.items():
    print(f"{city}: ${salary:.2f}")

Cities with the highest salaries:


TypeError: unsupported format string passed to Series.__format__