# Import Libraries and Dataset

In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [11]:
# Load the salaries dataset. The import cell binds pandas to `pd` only, so the
# bare name `pandas` raised NameError on a fresh kernel.
# NOTE(review): the path is absolute and machine-specific — consider a
# configurable data directory so the notebook runs elsewhere.
salary = pd.read_csv('C:/Users/HamzaArham/Downloads/DataSet_OF_Salaries.csv')

In [10]:
# Preview only the first rows instead of rendering the whole frame.
salary.head(10)

Unnamed: 0.1,Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,0,2020,MI,FT,Data Scientist,70000,EUR,79833,DE,0,DE,L
1,1,2020,SE,FT,Machine Learning Scientist,260000,USD,260000,JP,0,JP,S
2,2,2020,SE,FT,Big Data Engineer,85000,GBP,109024,GB,50,GB,M
3,3,2020,MI,FT,Product Data Analyst,20000,USD,20000,HN,0,HN,S
4,4,2020,SE,FT,Machine Learning Engineer,150000,USD,150000,US,50,US,L
5,5,2020,EN,FT,Data Analyst,72000,USD,72000,US,100,US,L
6,6,2020,SE,FT,Lead Data Scientist,190000,USD,190000,US,100,US,S
7,7,2020,MI,FT,Data Scientist,11000000,HUF,35735,HU,50,HU,L
8,8,2020,MI,FT,Business Data Analyst,135000,USD,135000,US,100,US,L
9,9,2020,SE,FT,Lead Data Engineer,125000,USD,125000,NZ,50,NZ,S


---

## Drop unwanted column

In [None]:
# The loaded frame carries two leftover columns, 'Unnamed: 0.1' and
# 'Unnamed: 0' (they appear to be saved row indexes); drop both.
# Rebinding instead of `inplace=True`, together with errors='ignore', keeps
# this cell re-runnable: a second run no longer raises KeyError.
salary = salary.drop(columns=['Unnamed: 0.1', 'Unnamed: 0'], errors='ignore')

In [None]:
# Confirm the column drop on a small preview rather than the whole frame.
salary.head()

---

# Checking Dataset

In [None]:
# Count missing values per column.
(
    salary
    .isna()
    .sum()
)

In [None]:
# Print a concise summary of the frame: columns, non-null counts and dtypes.
salary.info()

In [None]:
# (number of rows, number of columns) of the salary frame.
salary.shape

---

**Average of Data Science Job Salaries (USD)**

In [None]:
# Mean of the `salary_in_usd` column over all records.
(
    salary['salary_in_usd']
    .mean()
)

**Maximum Data Science Job Salaries (USD)**

In [None]:
# Largest value in the `salary_in_usd` column.
(
    salary['salary_in_usd']
    .max()
)

**Minimum Data Science Job Salaries (USD)**

In [None]:
# Smallest value in the `salary_in_usd` column.
(
    salary['salary_in_usd']
    .min()
)

**Records with the highest salary (USD)**

In [None]:
# Every row whose USD salary equals the dataset maximum (ties are all kept).
salary.loc[
    salary['salary_in_usd'].eq(salary['salary_in_usd'].max())
]

**Records with the lowest salary (USD)**

In [None]:
# Every row whose USD salary equals the dataset minimum (ties are all kept).
salary.loc[
    salary['salary_in_usd'].eq(salary['salary_in_usd'].min())
]

# Additional analysis with graphs

**Number of records per work year**

In [None]:
# Number of records for each work year, most frequent first.
(
    salary['work_year']
    .value_counts()
)

In [None]:
# Bar chart of how many records fall in each work year.
sns.countplot(
    data=salary,
    x='work_year',
)

**Data Science Job Average Salaries Per Year (USD)**

In [None]:
# Average USD salary per work year, sorted from lowest to highest average.
# The salary column is selected *before* aggregating: calling .mean() on the
# whole grouped frame also tries to average the string columns, which raises
# TypeError on pandas >= 2.0 (numeric_only now defaults to False).
salary_per_year = (
    salary.groupby('work_year')['salary_in_usd']
    .mean()
    .reset_index()
    .sort_values(by='salary_in_usd')
)
salary_per_year

In [None]:
# Bar chart of the yearly averages held in `salary_per_year`.
# Explicit figure/axes are used instead of the pyplot state machine, and the
# y-axis label is corrected: the axis shows an average amount, not a count.
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(data=salary_per_year, x='work_year', y='salary_in_usd', ax=ax)
ax.set_title("Average Salary of Data Science Jobs per Year")
ax.set_xlabel("Year")
ax.set_ylabel("Average Salary (USD)")
plt.show()

In [None]:
# Distribution of USD salaries for each work year, drawn as violins.
sns.violinplot(
    data=salary,
    x='work_year',
    y='salary_in_usd',
    palette='rainbow',
)

Salaries are rising every year.

In [None]:
# Distribution of USD salaries: density-scaled histogram with a KDE overlay.
# `sns.distplot` is deprecated and scheduled for removal; `histplot` with
# kde=True and stat='density' is its documented replacement.
sns.histplot(data=salary, x='salary_in_usd', kde=True, stat='density')

The average Data Science job salary is around 100,000 USD.

**Amount of Data Science Job Titles**

In [None]:
# How many distinct job titles appear in the dataset.
(
    salary['job_title']
    .nunique()
)

**List of Data Science Job Titles**

In [None]:
# Array of the distinct job titles, in order of first appearance.
(
    salary['job_title']
    .unique()
)

**Top 10 most common Data Science job titles**

In [None]:
# The ten job titles with the most records.
(
    salary['job_title']
    .value_counts()
    .head(10)
)

**Top 10 Data Science job titles by average salary (USD)**

In [None]:
# Ten job titles with the highest average USD salary.
# Selecting `salary_in_usd` before .mean() avoids averaging the string columns,
# which raises TypeError on pandas >= 2.0.
(
    salary.groupby('job_title')['salary_in_usd']
    .mean()
    .reset_index()
    .sort_values(['salary_in_usd'], ascending=False)
    .head(10)
)

**Number of countries in which Data Science employers are located**

In [None]:
# How many distinct company-location country codes appear in the dataset.
(
    salary['company_location']
    .nunique()
)

**Top 10 company-location countries by number of Data Science employees**

In [None]:
# The ten company locations with the most records.
(
    salary['company_location']
    .value_counts()
    .head(10)
)

**Top 10 Data Science Job Titles in US**

In [None]:
# Ten most frequent job titles among records whose company is located in the US.
# The section heading promises a top 10, but a bare .head() returns only 5
# rows. A single .loc selection replaces the chained indexing.
(
    salary.loc[salary['company_location'] == 'US', 'job_title']
    .value_counts()
    .head(10)
)

In [None]:
# The ten employee-residence country codes with the most records.
(
    salary['employee_residence']
    .value_counts()
    .head(10)
)

**Number of records per experience level**

In [None]:
# Number of records for each experience level, most frequent first.
(
    salary['experience_level']
    .value_counts()
)

In [None]:
# Bar chart of how many records fall in each experience level.
sns.countplot(
    data=salary,
    x='experience_level',
)

**Average Salary (USD) Based on Experience Level**

In [None]:
# Average USD salary per experience level, sorted from lowest to highest.
# Selecting `salary_in_usd` before .mean() avoids averaging the string columns,
# which raises TypeError on pandas >= 2.0.
explvl_sal = (
    salary.groupby('experience_level')['salary_in_usd']
    .mean()
    .reset_index()
    .sort_values(['salary_in_usd'])
)
explvl_sal

In [None]:
# Bar chart of the per-level averages held in `explvl_sal`.
# Explicit figure/axes are used instead of the pyplot state machine, and the
# y-axis label is corrected: the axis shows an average amount, not a count.
fig, ax = plt.subplots(figsize=(9, 6))
sns.barplot(data=explvl_sal, x='experience_level', y='salary_in_usd', ax=ax)
ax.set_title("Average Salary of Data Science Jobs by Experience Level")
ax.set_xlabel("Experience Level")
ax.set_ylabel("Average Salary (USD)")
plt.show()

**Number of records per employment type**

In [None]:
# Number of records for each employment type, most frequent first.
(
    salary['employment_type']
    .value_counts()
)

In [None]:
# Bar chart of how many records fall in each employment type.
sns.countplot(
    data=salary,
    x='employment_type',
)

**Number of records per company size**

In [None]:
# Number of records for each company size, most frequent first.
(
    salary['company_size']
    .value_counts()
)

In [None]:
# Bar chart of how many records fall in each company size.
sns.countplot(
    data=salary,
    x='company_size',
)

**Average Salary (USD) based on Company Size**

In [None]:
# Average USD salary per company size.
# Selecting `salary_in_usd` before .mean() avoids averaging the string columns,
# which raises TypeError on pandas >= 2.0.
mean_sal_com = (
    salary.groupby('company_size')['salary_in_usd']
    .mean()
    .reset_index()
)
mean_sal_com

In [None]:
# Bar chart of the per-size averages held in `mean_sal_com`.
# Explicit figure/axes are used instead of the pyplot state machine, and the
# y-axis label is corrected: the axis shows an average amount, not a count.
fig, ax = plt.subplots(figsize=(8, 6))
sns.barplot(data=mean_sal_com, x='company_size', y='salary_in_usd', ax=ax)
ax.set_title("Average Salary of Data Science Jobs by Company Size")
ax.set_xlabel("Company Size")
ax.set_ylabel("Average Salary (USD)")
plt.show()

**Top 10 countries with the highest average Data Science job salaries**

In [None]:
# Ten company locations with the highest average USD salary.
# Selecting `salary_in_usd` before .mean() avoids averaging the string columns,
# which raises TypeError on pandas >= 2.0.
ds_cntry_sal = (
    salary.groupby('company_location')['salary_in_usd']
    .mean()
    .reset_index()
    .sort_values(['salary_in_usd'], ascending=False)
    .head(10)
)
ds_cntry_sal

According to the ISO 3166-1 country codes, these are:
1. Russia
2. USA
3. New Zealand
4. Israel
5. Japan
6. Australia
7. United Arab Emirates
8. Algeria
9. Iraq
10. Canada

In [None]:
# Bar chart of the ten highest country averages held in `ds_cntry_sal`.
# Explicit figure/axes are used instead of the pyplot state machine, and the
# y-axis label is corrected: the axis shows an average amount, not a count.
fig, ax = plt.subplots(figsize=(14, 6))
sns.barplot(data=ds_cntry_sal, x='company_location', y='salary_in_usd', ax=ax)
ax.set_title("High Average Salary of Data Science Jobs by Country")
ax.set_xlabel("Country")
ax.set_ylabel("Average Salary (USD)")
plt.show()

**Top 10 countries with the lowest average Data Science job salaries**

In [None]:
# Ten company locations with the lowest average USD salary.
# Selecting `salary_in_usd` before .mean() avoids averaging the string columns,
# which raises TypeError on pandas >= 2.0.
ds_cntry_sal2 = (
    salary.groupby('company_location')['salary_in_usd']
    .mean()
    .reset_index()
    .sort_values(['salary_in_usd'], ascending=True)
    .head(10)
)
ds_cntry_sal2

According to the ISO 3166-1 country codes, these are:
1. Vietnam
2. Iran
3. Kenya
4. Pakistan
5. Ukraine
6. Moldova
7. American Samoa
8. Brazil
9. Honduras
10. Turkey

In [None]:
# Bar chart of the ten lowest country averages held in `ds_cntry_sal2`.
# Explicit figure/axes are used instead of the pyplot state machine, and the
# y-axis label is corrected: the axis shows an average amount, not a count.
fig, ax = plt.subplots(figsize=(14, 6))
sns.barplot(data=ds_cntry_sal2, x='company_location', y='salary_in_usd', ax=ax)
ax.set_title("Least Average Salary of Data Science Jobs by Country")
ax.set_xlabel("Country")
ax.set_ylabel("Average Salary (USD)")
plt.show()

**Number of records per remote-work ratio**

In [None]:
# Number of records for each remote-ratio value, most frequent first.
(
    salary['remote_ratio']
    .value_counts()
)

In [None]:
# Bar chart of how many records fall in each remote-ratio value.
sns.countplot(
    data=salary,
    x='remote_ratio',
)