In [16]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

In [17]:
results= pd.read_csv("/home/maxi/SoyHenry/stack-overflow-developer-survey-2021/survey_results_public.csv", index_col="ResponseId")


In [18]:
resultsschema= pd.read_csv("/home/maxi/SoyHenry/stack-overflow-developer-survey-2021/survey_results_schema.csv")

In [19]:
#Asignamos variables a los nombres de las columnas que vamos a utilizar:
lenguages="LanguageHaveWorkedWith"
salary="ConvertedCompYearly"
#renombramos las columnas,con inplace hacemos que se aplique al df original
results.rename(columns={lenguages:"Languages", salary:"Salary"}, inplace=True)

In [20]:
#Eliminamos todas las filas que tengan NaN en las columnas elegidas:
results.dropna(subset=["Salary","Languages"], inplace=True)

In [21]:
#Seleccionamos las columnas con las que queremos trabajar:
# [:,] significa que queremos todas las filas de las columnas[]

results= results.loc[:,["Country","Salary","Languages"]].sort_values(by="Salary")

In [22]:
#realizamos un filtro para limpiar los datos de las puntas y aplicamos 
filtro = (results["Salary"] >= 8000) & (results["Salary"]<= 1e6)

results= results[filtro]

In [23]:
results.head()

Unnamed: 0_level_0,Country,Salary,Languages
ResponseId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
20963,United States of America,8000.0,Go;Java;JavaScript;Node.js;Python;Ruby;Swift;T...
17454,United States of America,8000.0,HTML/CSS;JavaScript;LISP
25123,South Africa,8004.0,HTML/CSS;JavaScript;Python;SQL
17979,South Africa,8004.0,C#;C++;Dart;Erlang;HTML/CSS;JavaScript;Node.js...
27869,Bangladesh,8006.0,Bash/Shell;C#;HTML/CSS;Java;Matlab;PHP;PowerSh...


In [24]:
#Tenemos que realizar una lista con los lenguajes unicos.
lg = results["Languages"]
allLanguage = lg[20963].split(";")

#una vez creada la lista, realizamos un for para ir seleccionando los lenguajes.
for row in lg:
    for leng in row.split(";"):
        allLanguage.append(leng)

#con list(set()) hacemos una lista con los lenguajes sin repetirse.
allLanguage = list(set(allLanguage))

#ordenamos alfabeticamente
allLanguage.sort()
allLanguage

['APL',
 'Assembly',
 'Bash/Shell',
 'C',
 'C#',
 'C++',
 'COBOL',
 'Clojure',
 'Crystal',
 'Dart',
 'Delphi',
 'Elixir',
 'Erlang',
 'F#',
 'Go',
 'Groovy',
 'HTML/CSS',
 'Haskell',
 'Java',
 'JavaScript',
 'Julia',
 'Kotlin',
 'LISP',
 'Matlab',
 'Node.js',
 'Objective-C',
 'PHP',
 'Perl',
 'PowerShell',
 'Python',
 'R',
 'Ruby',
 'Rust',
 'SQL',
 'Scala',
 'Swift',
 'TypeScript',
 'VBA']

In [25]:
#Vamos a crear nuestro DataFrame para los salarios x lenguaje
#data = los valores que va a tomar x defecto, len= al rango , index= los lenguajes , columns = salarios
meanSalary= pd.DataFrame(data = np.zeros(len(allLanguage)), index = allLanguage, columns = ["Salary"])
meanSalary

Unnamed: 0,Salary
APL,0.0
Assembly,0.0
Bash/Shell,0.0
C,0.0
C#,0.0
C++,0.0
COBOL,0.0
Clojure,0.0
Crystal,0.0
Dart,0.0


In [26]:
#Vamos a crear un for para poder comprobar cada lenguage y su salario y agregar a nuestro df vacio.
#Usamos import re - porque  la caracteristica "contains" de pandas toma como valor especial C++ y tira error.

import re

for leng in allLanguage:
    #Creamos una mascara, que pase por la columna lenguajes y compruebe que el lenguaje esta en dicha fila.
    mask= results["Languages"].str.contains(re.escape(leng)) #re.escape ignora el detalle especial C++ y permite contarlo
    
    #realizamos el promedio del sueldo x lenguaje
    avarage= results.loc[mask,"Salary"].mean()
    #agregamos el promedio a la posicion correspondiente
    meanSalary.loc[leng,"Salary"] = avarage

meanSalary = meanSalary.sort_values(by="Salary")


In [27]:
meanSalary

Unnamed: 0,Salary
PHP,67130.923527
Dart,67654.700593
Delphi,68800.341237
Matlab,75097.573001
HTML/CSS,81471.737883
VBA,81871.93154
C#,82400.588896
JavaScript,82587.724304
Java,83087.28031
SQL,83590.871942


In [28]:
colors = [
    "#03071E",
    "#370617",
    "#6A040F",
    "#9D0208",
    "#D00000",
    "#DC2F02",
    "#E85D04",
    "#F48C06",
    "#FAA307",
    "#FFBA08"
]

fontdict = {
    'family': 'serif',
    'color':  'darkred',
    'weight': 'normal',
    'size': 16,
}


In [29]:
%matplotlib qt5

In [30]:
plt.style.use("seaborn")
plt.barh(meanSalary.index, meanSalary["Salary"], color=colors)
plt.title("Averange annual Salary in 2021")
plt.xlabel("Mean Salary in US$")
plt.ylabel("Language programmation")
plt.vlines(meanSalary["Salary"].mean(), 0, len(meanSalary["Salary"]),color="red", linestyles="dashed", label= "Mean Salary")
plt.legend()
plt.tight_layout()
#colocar simbolo de $ en los elementos del eje x
plt.gca().xaxis.set_major_formatter("${x:1.0f}")

plt.show()
