In [18]:
import pandas as pd
# Load the public survey results and the accompanying schema file.
results_path = './Downloads/survey_results_public.csv'
schema_path = './Downloads/survey_results_schema.csv'
df = pd.read_csv(results_path)
schema = pd.read_csv(schema_path)

In [None]:
# Preview the first 5 rows of the results frame, then of the schema frame.
print(df.head(), schema.head(), sep='\n')

In [38]:
# 1. How many respondents completed the survey?
# Counted as the number of distinct, non-null ResponseId values.
distinct_response_ids = df['ResponseId'].dropna().unique()
nr_resp = len(distinct_response_ids)
print(f"Cati respondenti au completat sondajul: {nr_resp}")

Cati respondenti au completat sondajul: 65437


In [40]:
# 2. How many respondents answered every question?
# A column is treated as a question when its name exactly equals a `qname`
# value from the schema frame.
# NOTE(review): schema entries whose qname is not itself a column name are
# silently ignored by this intersection - confirm that is the intended scope.
questions = set(schema.qname.unique()) & set(df.columns)
# `dropna(subset=...)` is documented as taking a label or a sequence of labels;
# a set is unordered and not a sequence, so hand it an ordered list that
# follows the frame's own column order.
question_columns = [column for column in df.columns if column in questions]
nr_resp_cmpl = df.dropna(subset=question_columns).shape[0]
print(f"Cati respondenti au raspuns la toate intrebarile?: {nr_resp_cmpl}")

Cati respondenti au raspuns la toate intrebarile?: 6306


In [42]:
# 3. What are the measures of central tendency for respondents' work
#    experience (WorkExp)?
work_exp = df.WorkExp
# Series.mode() returns a Series (there may be several modes). Interpolating it
# directly prints the Series repr (index, name, dtype) inside the message, so
# unpack it into plain comma-separated values first.
mode_values = ', '.join(str(value) for value in work_exp.mode())
print(f'Media: {work_exp.mean()}')
print(f'Modulul: {mode_values}')
print(f'Mediana: {work_exp.median()}')

Media: 11.46695663901814
Modulul: 0    3.0
Name: WorkExp, dtype: float64
Mediana: 9.0


In [44]:
# 4. How many respondents work remotely?
# Rows whose RemoteWork value is exactly 'Remote' are counted.
is_remote = df['RemoteWork'] == 'Remote'
nr_resp_remote = len(df[is_remote])
print(f"Cati respondenti lucreaza de la distanta?: {nr_resp_remote}")

Cati respondenti lucreaza de la distanta?: 20831


In [46]:
# 5. What percentage of respondents program in Python?
# Match "python" only as a standalone word (no letter directly before or after
# it). A plain substring search would also flag any other entry whose name
# merely contains "python" (for instance a name such as "MicroPython").
python_pattern = r'(?<![a-z])python(?![a-z])'
df['worked_with_python'] = df.LanguageHaveWorkedWith.str.contains(
    python_pattern, case=False, na=False, regex=True
)
# Missing answers count as "not Python" (na=False); the denominator is the
# number of distinct respondents. perc_python stays a fraction rounded to two
# decimals.
perc_python = round(df.worked_with_python.sum() / df.ResponseId.nunique(), 2)
# The question asks for a percentage, so render the fraction as a percent.
print(f"Ce procent de respondenti programeaza in Python?: {perc_python:.0%}")

Ce procent de respondenti programeaza in Python?: 0.47


In [48]:
# 6. How many respondents learned to code through online courses?
# Flag rows whose LearnCode text contains 'online courses' (case-insensitive);
# missing answers are treated as False.
learn_code = df['LearnCode']
df['learned_with_online_courses'] = learn_code.str.contains('online courses', case=False, na=False)
resp_online_courses = df['learned_with_online_courses'].sum()
print(f"Cati respondenti au invatat sa programeze prin cursuri online? {resp_online_courses}")

Cati respondenti au invatat sa programeze prin cursuri online? 30271


In [50]:
# 7. Among respondents who program in Python, grouped by country, what are the
#    mean and median compensation (ConvertedCompYearly) in each country?
# Match "python" only as a standalone word (no letter directly before or after
# it). A plain substring search would also select respondents whose answer
# merely contains "python" inside another name (for instance "MicroPython").
python_pattern = r'(?<![a-z])python(?![a-z])'
uses_python = df["LanguageHaveWorkedWith"].str.contains(
    python_pattern, case=False, na=False, regex=True
)
python_progr = df[uses_python]
# Mean and median compensation per country.
py_salary_stats = python_progr.groupby("Country")["ConvertedCompYearly"].agg(["mean", "median"])
print(py_salary_stats)

                                               mean    median
Country                                                      
Afghanistan                             4543.000000    4768.5
Albania                                56295.000000   56295.0
Algeria                                 9053.285714    6230.0
Andorra                               193331.000000  193331.0
Angola                                     6.000000       6.0
...                                             ...       ...
Venezuela, Bolivarian Republic of...   21500.000000    7100.0
Viet Nam                               14014.562500   10180.0
Yemen                                  10297.333333    5333.0
Zambia                                 28123.666667   22803.0
Zimbabwe                               37500.000000   18000.0

[173 rows x 2 columns]


In [54]:
# 8. What education level do the 5 respondents with the highest compensation have?
top_columns = ['ResponseId', 'EdLevel', 'ConvertedCompYearly', 'Country']
# Sort by compensation, highest first, and keep the first five rows.
resp_education_renum = (
    df[top_columns]
    .sort_values(by='ConvertedCompYearly', ascending=False)
    .head(5)
)
# Start the table on its own line: printing it right after the label shifts the
# header row to the right and misaligns it with the data rows.
print(f"Ce nivel de educatie au cei 5 respondenti cu cea mai mare compensatie?:\n{resp_education_renum}")

Ce nivel de educatie au cei 5 respondenti cu cea mai mare compensatie?:        ResponseId                                         EdLevel  \
15837       15838    Bachelor’s degree (B.A., B.S., B.Eng., etc.)   
12723       12724  Professional degree (JD, MD, Ph.D, Ed.D, etc.)   
28379       28380  Professional degree (JD, MD, Ph.D, Ed.D, etc.)   
17593       17594    Bachelor’s degree (B.A., B.S., B.Eng., etc.)   
17672       17673  Professional degree (JD, MD, Ph.D, Ed.D, etc.)   

       ConvertedCompYearly       Country  
15837           16256603.0      Ethiopia  
12723           13818022.0  South Africa  
28379            9000000.0        Taiwan  
17593            6340564.0        Brazil  
17672            4936778.0       Ukraine  


In [60]:
# Within each age group, what share of respondents program in Python?
# Relies on the boolean `worked_with_python` column already being present on df.
age_groups = df.groupby('Age', as_index=False)
py_progres_share_by_age = age_groups.agg({'ResponseId': 'count', 'worked_with_python': 'sum'})
# Share = Python users / respondents in the group, rounded to two decimals.
python_share = py_progres_share_by_age['worked_with_python'] / py_progres_share_by_age['ResponseId']
py_progres_share_by_age['work_with_python_share'] = python_share.round(2)
py_progres_share_by_age

Unnamed: 0,Age,ResponseId,worked_with_python,work_with_python_share
0,18-24 years old,14098,7884,0.56
1,25-34 years old,23911,10945,0.46
2,35-44 years old,14942,6204,0.42
3,45-54 years old,6249,2619,0.42
4,55-64 years old,2575,1041,0.4
5,65 years or older,772,290,0.38
6,Prefer not to say,322,146,0.45
7,Under 18 years old,2568,1666,0.65


In [62]:
# Among remote respondents in the top compensation quartile, which industries
# are the most frequent?
# The threshold is the 0.75 quantile of ConvertedCompYearly over all rows, and
# only rows strictly above it are kept.
comp_threshold = df['ConvertedCompYearly'].quantile(0.75)
above_threshold = df['ConvertedCompYearly'] > comp_threshold
works_remote = df['RemoteWork'] == 'Remote'
df[above_threshold & works_remote]['Industry'].value_counts().reset_index()

Unnamed: 0,Industry,count
0,Software Development,768
1,Other:,239
2,Healthcare,156
3,Fintech,156
4,"Internet, Telecomm or Information Services",145
5,Retail and Consumer Services,106
6,Media & Advertising Services,103
7,Banking/Financial Services,69
8,Government,69
9,Computer Systems Design and Services,69
