# Exploratory analysis of 2020 salary data for European IT specialists

In [1]:
#Importing the required libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt


## Step 1: Importing the datasets

In [2]:
# Read the three yearly survey exports (2018, 2019, 2020) into separate DataFrames.
# The file names are reproduced exactly as given.
# NOTE(review): the 2019 name starts with "T" (not "IT") and the 2020 name has two
# spaces before "2020" - confirm both match the files on disk before "correcting" them.
salarySurvey2018, salarySurvey2019, salarySurvey2020 = (
    pd.read_csv(csv_name)
    for csv_name in (
        "IT Salary Survey EU 2018.csv",
        "T Salary Survey EU 2019.csv",
        "IT Salary Survey EU  2020.csv",
    )
)

## Step 2: Data Exploration
Let's start with the 2020 dataset.

In [3]:
# Preview the top five rows of the 2020 survey to get a feel for the raw values.
salarySurvey2020.head(n=5)

Unnamed: 0,Timestamp,Age,Gender,City,Position,Total years of experience,Years of experience in Germany,Seniority level,Your main technology / programming language,Other technologies/programming languages you use often,...,Annual bonus+stocks one year ago. Only answer if staying in same country,Number of vacation days,Employment status,Сontract duration,Main language at work,Company size,Company type,Have you lost your job due to the coronavirus outbreak?,"Have you been forced to have a shorter working week (Kurzarbeit)? If yes, how many hours per week","Have you received additional monetary support from your employer due to Work From Home? If yes, how much in 2020 in EUR"
0,24/11/2020 11:14:15,26.0,Male,Munich,Software Engineer,5,3,Senior,TypeScript,"Kotlin, Javascript / Typescript",...,10000.0,30,Full-time employee,Unlimited contract,English,51-100,Product,No,,
1,24/11/2020 11:14:16,26.0,Male,Berlin,Backend Developer,7,4,Senior,Ruby,,...,5000.0,28,Full-time employee,Unlimited contract,English,101-1000,Product,No,,
2,24/11/2020 11:14:21,29.0,Male,Berlin,Software Engineer,12,6,Lead,Javascript / Typescript,"Javascript / Typescript, Docker",...,100000.0,30,Self-employed (freelancer),Temporary contract,English,101-1000,Product,Yes,,
3,24/11/2020 11:15:24,28.0,Male,Berlin,Frontend Developer,4,1,Junior,Javascript,,...,,24,Full-time employee,Unlimited contract,English,51-100,Startup,No,,
4,24/11/2020 11:15:46,37.0,Male,Berlin,Backend Developer,17,6,Senior,C# .NET,".NET, SQL, AWS, Docker",...,,29,Full-time employee,Unlimited contract,English,101-1000,Product,No,,


In [4]:
# List the raw column labels before renaming; the Index repr quotes every label,
# so stray whitespace inside a label is visible in the output.
salarySurvey2020.columns

Index(['Timestamp', 'Age', 'Gender', 'City', 'Position ',
       'Total years of experience', 'Years of experience in Germany',
       'Seniority level', 'Your main technology / programming language',
       'Other technologies/programming languages you use often',
       'Yearly brutto salary (without bonus and stocks) in EUR',
       'Yearly bonus + stocks in EUR',
       'Annual brutto salary (without bonus and stocks) one year ago. Only answer if staying in the same country',
       'Annual bonus+stocks one year ago. Only answer if staying in same country',
       'Number of vacation days', 'Employment status', 'Сontract duration',
       'Main language at work', 'Company size', 'Company type',
       'Have you lost your job due to the coronavirus outbreak?',
       'Have you been forced to have a shorter working week (Kurzarbeit)? If yes, how many hours per week',
       'Have you received additional monetary support from your employer due to Work From Home? If yes, how much in 20

Many of the column names are very long, and some contain stray whitespace (for example `'Position '` has a trailing space), which makes them hard to reference during analysis. To solve this, we will rename the columns.

### Renaming the columns

In [6]:
# Replace the 23 raw survey headers with short labels and turn every space into an
# underscore. The assignment is positional, so the labels below must stay in the same
# left-to-right order as the raw columns.
# Note: "Year" is applied to the raw 'Timestamp' column, whose values are full date-times.
# NOTE(review): the first letter of "Сontract_duration" appears to be the Cyrillic "С"
# carried over from the raw header rather than a Latin "C" - confirm before typing the
# column name by hand, since a Latin spelling would not match.
salarySurvey2020.columns = [
    label.replace(" ", "_")
    for label in (
        "Year",
        "Age",
        "Gender",
        "City",
        "Position",
        "Years of experience",
        "Germany Experience",
        "Seniority level",
        "Tech program language",
        "Other Language",
        "Yearly salary",
        "Yearly bonus and stocks",
        "Salary one year ago",
        "Bonus and stocks last year",
        "Vacation days",
        "Employment_status",
        "Сontract_duration",
        "Language",
        "Company size",
        "Company type",
        "Job loss COVID",
        "Kurzarbeit",
        "Monetary Support",
    )
]

# Show the first five rows again to verify the new labels.
salarySurvey2020.head(n=5)

Unnamed: 0,Year,Age,Gender,City,Position,Years_of_experience,Germany_Experience,Seniority_level,Tech_program_language,Other_Language,...,Bonus_and_stocks_last_year,Vacation_days,Employment_status,Сontract_duration,Language,Company_size,Company_type,Job_loss_COVID,Kurzarbeit,Monetary_Support
0,24/11/2020 11:14:15,26.0,Male,Munich,Software Engineer,5,3,Senior,TypeScript,"Kotlin, Javascript / Typescript",...,10000.0,30,Full-time employee,Unlimited contract,English,51-100,Product,No,,
1,24/11/2020 11:14:16,26.0,Male,Berlin,Backend Developer,7,4,Senior,Ruby,,...,5000.0,28,Full-time employee,Unlimited contract,English,101-1000,Product,No,,
2,24/11/2020 11:14:21,29.0,Male,Berlin,Software Engineer,12,6,Lead,Javascript / Typescript,"Javascript / Typescript, Docker",...,100000.0,30,Self-employed (freelancer),Temporary contract,English,101-1000,Product,Yes,,
3,24/11/2020 11:15:24,28.0,Male,Berlin,Frontend Developer,4,1,Junior,Javascript,,...,,24,Full-time employee,Unlimited contract,English,51-100,Startup,No,,
4,24/11/2020 11:15:46,37.0,Male,Berlin,Backend Developer,17,6,Senior,C# .NET,".NET, SQL, AWS, Docker",...,,29,Full-time employee,Unlimited contract,English,101-1000,Product,No,,


### Checking for missing data

In [8]:
# Report the frame's (rows, columns) shape, then the number of missing values per column.
print(salarySurvey2020.shape, salarySurvey2020.isna().sum(), sep="\n")

(1253, 23)
Year                            0
Age                            27
Gender                         10
City                            0
Position                        6
Years_of_experience            16
Germany_Experience             32
Seniority_level                12
Tech_program_language         127
Other_Language                157
Yearly_salary                   0
Yearly_bonus_and_stocks       424
Salary_one_year_ago           368
Bonus_and_stocks_last_year    639
Vacation_days                  68
Employment_status              17
Сontract_duration              29
Language                       16
Company_size                   18
Company_type                   25
Job_loss_COVID                 20
Kurzarbeit                    880
Monetary_Support              791
dtype: int64


### Removing duplicates and rows with missing values in selected columns

In [9]:
# Keep only the rows that have a value in every one of the columns listed below,
# then discard rows that are exact duplicates across all columns.
# The result is stored back under the same name, replacing the uncleaned frame.
salarySurvey2020 = (
    salarySurvey2020
    .dropna(
        subset=[
            'Age',
            'Gender',
            'Position',
            'Years_of_experience',
            'Seniority_level',
            'Salary_one_year_ago',
            'Language',
        ]
    )
    .drop_duplicates()
)

# (rows, columns) remaining after cleaning.
salarySurvey2020.shape

(860, 23)