# Plotly Tutorial - 1
https://www.kaggle.com/hakkisimsek/plotly-tutorial-1

In [9]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
from plotly import tools
import plotly.figure_factory as ff

import warnings
warnings.filterwarnings('ignore')

# Load the multiple-choice survey responses, decoding the file as ISO-8859-1.
responses_path = './input/multipleChoiceResponses.csv'
mcr = pd.read_csv(responses_path, encoding='ISO-8859-1')

# Preview the first rows of the loaded frame.
mcr.head()

Unnamed: 0,GenderSelect,Country,Age,EmploymentStatus,StudentStatus,LearningDataScience,CodeWriter,CareerSwitcher,CurrentJobTitleSelect,TitleFit,...,JobFactorExperienceLevel,JobFactorDepartment,JobFactorTitle,JobFactorCompanyFunding,JobFactorImpact,JobFactorRemote,JobFactorIndustry,JobFactorLeaderReputation,JobFactorDiversity,JobFactorPublishingOpportunity
0,"Non-binary, genderqueer, or gender non-conforming",,,Employed full-time,,,Yes,,DBA/Database Engineer,Fine,...,,,,,,,,,,
1,Female,United States,30.0,"Not employed, but looking for work",,,,,,,...,,,,,,,,Somewhat important,,
2,Male,Canada,28.0,"Not employed, but looking for work",,,,,,,...,Very Important,Very Important,Very Important,Very Important,Very Important,Very Important,Very Important,Very Important,Very Important,Very Important
3,Male,United States,56.0,"Independent contractor, freelancer, or self-em...",,,Yes,,Operations Research Practitioner,Poorly,...,,,,,,,,,,
4,Male,Taiwan,38.0,Employed full-time,,,Yes,,Computer Scientist,Fine,...,,,,,,,,,,


In [10]:
# Number of respondents per gender option, most frequent first.
gender_counts = mcr['GenderSelect'].value_counts()
gender_counts

Male                                                 13610
Female                                                2778
A different identity                                   159
Non-binary, genderqueer, or gender non-conforming       74
Name: GenderSelect, dtype: int64

In [11]:
# Slice colors for the pie chart.
colors = ['aqua', 'lightgrey', 'lightgreen', '#D0F9B1', 'khaki', 'grey']

# Shorten 'Non-binary, genderqueer, or gender non-conforming' to 'Non-binary';
# every other value is kept exactly as it is.
long_label = 'Non-binary, genderqueer, or gender non-conforming'
mcr['GenderSelect'] = np.where(
    mcr['GenderSelect'] == long_label,
    'Non-binary',
    mcr['GenderSelect'],
)

# Count respondents per (shortened) gender label.
gender = mcr['GenderSelect'].value_counts()
label, size = gender.index, gender.values

# One pie trace, wrapped in a titled figure.
trace = go.Pie(labels=label, values=size, marker={'colors': colors})
data = [trace]
layout = go.Layout(title='Gender Distribution')

fig = go.Figure(data=data, layout=layout)
py.iplot(fig)

대부분의 캐글 사용자는 남성입니다.

In [12]:
# Build a two-column frame (respondent count, country name) from the
# per-country value counts.
df = pd.DataFrame(mcr['Country'].value_counts())
df['country'] = df.index
df.columns = ['number', 'country']

# Discard the old (country) index directly. The previous
# reset_index().drop('index', axis=1) only worked when the restored column was
# literally named 'index'; it raises KeyError as soon as the index carries a
# name, which is the case for value_counts() results in pandas >= 2.0.
df = df.reset_index(drop=True)
df.head()

Unnamed: 0,number,country
0,4197,United States
1,2704,India
2,1023,Other
3,578,Russia
4,535,United Kingdom


In [13]:
# Respondent counts per country as columns ('number', 'country').
# reset_index(drop=True) is used instead of reset_index().drop('index', axis=1)
# so the cell does not depend on the restored index column being named 'index'
# (it is not when the value_counts index carries a name, as in pandas >= 2.0).
df = pd.DataFrame(mcr['Country'].value_counts())
df['country'] = df.index
df.columns = ['number', 'country']
df = df.reset_index(drop=True)

# World map shaded by the number of respondents from each country.
data = [dict(
    type = 'choropleth',
    locations = df['country'],
    locationmode = 'country names',
    z = df['number'],
    text = df['country'],
    colorscale = [[0, 'rgb(5, 10, 172)'], [0.35, 'rgb(40, 60, 190)'],
                  [0.5, 'rgb(70, 100, 245)'],
                  [0.6, 'rgb(90, 120, 245)'],
                  [0.7, 'rgb(106, 137, 247)'],
                  [1, 'rgb(220, 220, 220)']],
    autocolorscale = False,
    reversescale = True,
    marker = dict(
        line = dict(
            color = 'rgb(180, 180, 180)',
            width = 0.5
        )
    ),
    # The color bar shows head counts, so it carries no '$' tick prefix.
    # The obsolete 'autotick' attribute was dropped as well; it is not part of
    # the current colorbar schema.
    colorbar = dict(
        title = 'Survey Respondents'
    )
)]

layout = dict(
    title = 'The Nationality of Respondents',
    geo = dict(
        showframe = False,
        showcoastlines = True,
        projection = dict(
            # Projection names are lowercase enumerated values; the former
            # 'Mercator' spelling is not one of them.
            type = 'mercator'
        )
    )
)

fig = dict(data = data, layout=layout)
py.iplot(fig, validate=False)

**참여자 수 상위 5개국** ('Other'로 집계된 1,023명은 특정 국가가 아니므로 순위에서 제외)
1. 미국 - 4197
2. 인도 - 2704
3. 러시아 - 578
4. 영국 - 535
5. 중국 - 471

In [14]:
# Keep respondents older than 18 and younger than 75. The explicit copy makes
# the filtered frame independent of the original, so later cells can assign
# columns on `mcr` without writing into a slice.
mcr = mcr[(mcr['Age'] < 75) & (mcr['Age'] > 18)].copy()

# Share of respondents at each age (fractions of the filtered total, 4 decimals).
age = round(mcr['Age'].value_counts(normalize=True), 4)
trace = go.Bar(
    x = age.index,
    y = age.values,
    marker = dict(
        color = age.values,
        colorscale = 'Reds',
        showscale = True
    )
)

data = [trace]
# The bars are normalized shares, not raw counts, so the y axis is labelled
# accordingly (it used to read '# of Respondents').
layout = go.Layout(title = 'Age distribution',
                   xaxis = dict(title = 'Age', tickfont = dict(size=12)),
                   yaxis = dict(title = 'Share of Respondents'))

fig = go.Figure(data=data, layout=layout)
py.iplot(fig)

**연령 분포**
- 20-30대가 제일 많음

In [15]:
# How respondents first learned data science: one count per answer option.
train = mcr['FirstTrainingSelect'].value_counts()
label, size = train.index, train.values

# Slice colors for the pie chart.
colors = ['aqua', 'lightgrey', 'lightgreen', '#D0F9B1', 'khaki', 'grey']

trace = go.Pie(labels=label, values=size, marker={'colors': colors})
data = [trace]

# Horizontal legend under a titled chart.
layout = go.Layout(
    title='First Training Platform',
    legend={'orientation': 'h'},
)

fig = go.Figure(data=data, layout=layout)
py.iplot(fig)

**데이터 과학을 처음 배운 경로**
1. 온라인 교육과정(36%)
2. 대학교 과정(28.6%)
3. 독학(24.8%)
4. 일, 실무(6.79%)
5. 기타(2.02%)
6. 캐글(1.74%)

In [16]:
def _top_shares(column, top_n=5):
    """Return the share of the ``top_n`` most frequent picks in a multi-select column.

    Every non-null entry of ``column`` is split on ',' and each piece counts as
    one pick. The result is a Series indexed by pick, holding that pick's
    fraction of all picks rounded to 4 decimals, most frequent first.
    """
    picks = [choice
             for choices in column.dropna().str.split(',')
             for choice in choices]
    return pd.Series(picks).value_counts(normalize=True).head(top_n).round(4)


# The shares are kept as Series and read through .index / .values. The earlier
# to_frame()[0] lookup depended on the frame's single column being named 0,
# which no longer holds once value_counts() names its result.
courses = _top_shares(mcr['CoursePlatformSelect'])

trace1 = go.Bar(
    x = courses.index,
    y = courses.values,
    name = 'course',
    marker = dict(
        color = courses.values,
        colorscale = 'Jet'
    )
)

learn = _top_shares(mcr['LearningPlatformSelect'])

trace2 = go.Bar(
    x = learn.index,
    y = learn.values,
    name = 'platform',
    marker = dict(
        color = learn.values,
        colorscale = 'Jet'
    )
)

# Two bar charts side by side: course platforms (left), learning platforms (right).
fig = tools.make_subplots(rows=1, cols=2, subplot_titles=('Course Platforms', 'Learning Platforms'))
fig.append_trace(trace1, 1, 1)
fig.append_trace(trace2, 1, 2)
fig['layout'].update(height=500, width=820,
                     title='Where to start & How to continue in DS?',
                     showlegend=False)
py.iplot(fig)

- 코세라는 아마 앤드류 응 교수님 덕에 압도적 1위인 것으로 보입니다.
- 기초적인 학습 뒤 사람들은 캐글에서 학습을 진행합니다.
- 온라인 과정과 스택오버플로우가 책이나 학교 수업보다 선호됩니다.

In [20]:
# Flatten the comma-separated hardware answers into one list of picks.
hardware = mcr['HardwarePersonalProjectsSelect'].str.split(',')
hardware_set = []
for i in hardware.dropna():
    hardware_set.extend(i)

# Count once, after the loop. This line used to sit inside the loop body, so
# the counts were recomputed on every iteration and `hware` was never defined
# when there were no answers to iterate over.
hware = pd.Series(hardware_set).value_counts()[:6]

label = hware.index
size = hware.values

color = ['#FEBFB3', 'skyblue', '#96D38C', '#D0F9B1', 'tan', 'lightgrey']

# Use the palette defined in this cell. The trace previously read `colors`,
# a leftover from an earlier cell, so `color` was unused and the cell depended
# on kernel state from elsewhere.
trace = go.Pie(labels = label,
               values = size,
               marker = dict(colors=color))

data = [trace]
layout = go.Layout(
    title = 'Hardware Requirements',
    legend = dict(orientation = 'h')
)

fig = go.Figure(data=data, layout=layout)
py.iplot(fig)

데이터 과학을 다루는 데는 기본적인 노트북으로 충분해 보입니다.

이번에는 급여에 대해 살펴보겠습니다.

In [30]:
# Strip thousands separators and dashes from the raw amount strings.
# regex=False makes both replacements plain-text substitutions regardless of
# the pandas version's default for `regex`.
mcr['CompensationAmount'] = (
    mcr['CompensationAmount']
    .str.replace(',', '', regex=False)
    .str.replace('-', '', regex=False)
)

# Rows that have every field needed by the salary analysis.
salary = mcr[['CompensationAmount', 'CompensationCurrency', 'Country', 'JobSatisfaction', 'CurrentJobTitleSelect', 'Age', 'GenderSelect']].dropna()

# Attach the conversion rate for each respondent's currency and scale the
# reported amount by it.
crates = pd.read_csv('./input/conversionRates.csv')
crates.drop('Unnamed: 0', axis=1, inplace=True)
salary = salary.merge(crates, left_on='CompensationCurrency', right_on='originCountry', how='left')
salary['Salary'] = pd.to_numeric(salary['CompensationAmount'])*salary['exchangeRate']

# Salaries in the open interval (100, 500000), split by country of residence.
us_salary = salary[(salary['Salary'] > 100) & (salary['Salary'] < 500000) & (salary['Country'] == 'United States')]
non_us_salary = salary[(salary['Salary'] > 100) & (salary['Salary'] < 500000) & (~(salary['Country'] == 'United States'))]

# Median salary per country, rounded to the nearest hundred; top 16 countries.
sal_coun = salary.groupby('Country')['Salary'].median().round(-2).sort_values(ascending=False)[:16].to_frame()

trace = go.Bar(
    x = sal_coun.index,
    y = sal_coun.Salary,
    marker = dict(
        color = sal_coun.Salary,
        colorscale = 'Reds'
    )
)

data = [trace]
layout = go.Layout(
    # Title typo fixed ('Highes' -> 'Highest').
    title = 'Top Countries with Highest Median Salaries',
    yaxis = dict(title = 'Salary ($)')
)

fig = go.Figure(data=data, layout=layout)
py.iplot(fig)

US와 non-US 간 임금 격차가 존재하기 때문에 따로 분석해보겠습니다.

In [33]:
# Median salary at each age, computed separately for US and non-US respondents.
us_group = us_salary.groupby('Age')['Salary'].median().to_frame()
non_us_group = non_us_salary.groupby('Age')['Salary'].median().to_frame()

# Marker-only scatter traces; salaries are rounded to the nearest hundred.
trace0 = go.Scatter(
    x=us_group.index,
    y=us_group['Salary'].round(-2),
    name='US',
    mode='markers',
    marker={'size': 9, 'color': 'aqua'},
)
trace1 = go.Scatter(
    x=non_us_group.index,
    y=non_us_group['Salary'].round(-2),
    name='non-US',
    mode='markers',
    marker={'size': 9, 'color': 'navy'},
)

data = [trace0, trace1]

# The x-axis tick font is set directly in the layout instead of being patched
# in after the figure dict is built; the resulting layout is the same.
layout = {
    'title': 'The Median Salary by Age in US and Non-US Countries',
    'xaxis': {'title': 'Age', 'tickfont': {'size': 12}},
    'yaxis': {'title': 'Salary ($)'},
}

fig = {'data': data, 'layout': layout}
py.iplot(fig)

In [34]:
# Normalize the labelled ends of the satisfaction scale to plain digits and
# turn the opt-out answer into a missing value.
#  - '1 - Highly Dissatisfied' maps to '1', matching its own label (it was
#    previously mapped to '0').
#  - np.nan replaces the np.NaN alias, which NumPy 2.0 removed.
#  - The result is assigned back to the column. An in-place replace on the
#    selected column is chained assignment and does not reliably update the
#    frame.
salary['JobSatisfaction'] = salary['JobSatisfaction'].replace({
    '10 - Highly Satisfied': '10',
    '1 - Highly Dissatisfied': '1',
    'I prefer not to share': np.nan,
})

# Drop respondents without a usable score, then store the score as an integer.
salary = salary.dropna(subset=['JobSatisfaction'])
salary['JobSatisfaction'] = salary['JobSatisfaction'].astype(int)

In [35]:
# Median salary per job-satisfaction score. The chart title and y axis
# announce a median, so median() is used here; the earlier mean() did not
# match the labels and is pulled around by extreme salaries.
# Note: the split is by compensation currency (originCountry == 'USD'), not by
# country of residence.
salary_us = salary[salary.originCountry=='USD'].groupby('JobSatisfaction').Salary.median().to_frame()
salary_non_us = salary[salary.originCountry!='USD'].groupby('JobSatisfaction').Salary.median().to_frame()

trace0 = go.Scatter(
    x = salary_us.index,
    y = salary_us['Salary'].round(-2),
    name = 'US',
    mode = 'markers',
    marker = dict(
        size = 11,
        color = ('navy')
    )
)

trace1 = go.Scatter(
    x = salary_non_us.index,
    y = salary_non_us['Salary'].round(-2),
    name = 'non-US',
    mode = 'markers',
    marker = dict(
        size = 11,
        color = ('aqua')
    )
)

data = [trace0, trace1]
# Title typo fixed: 'US $ non-US' -> 'US & non-US'.
layout = dict(title = 'The Median Salary & Satisfaction in US & non-US Countries',
              xaxis = dict(title = 'Job Satisfaction',
                           tickmode = 'linear',   # one tick per satisfaction score
                           tickfont = dict(size=10)),
              yaxis = dict(title = 'Salary ($)'))

fig = dict(data=data, layout=layout)
py.iplot(fig)

상관관계가 인과관계를 나타내진 않지만 직업 만족도와 임금 사이에 어느 정도 연관이 있어 보입니다.

이번엔 연령과 직업 만족도, 임금을 결합해보겠습니다.

In [36]:
# Drop salaries of one million or more before plotting.
salary = salary[salary.Salary<1000000]

# Age, satisfaction and salary plus a running row number used as the matrix
# index. .assign() returns a new frame, so the extra column is not written
# into a slice of `salary` (the previous dat['index'] = ... assignment was).
dat = salary[['Age', 'JobSatisfaction', 'Salary']].assign(index=np.arange(len(salary)))

fig = ff.create_scatterplotmatrix(dat, diag='box', index='index',
                                  colormap_type='cat', colormap='Jet',
                                  height=800, width=800)
py.iplot(fig)

In [37]:
# Salary rows for men and for women (both frames are reused by the box-plot cell).
male_salary = salary[salary['GenderSelect'] == 'Male']
female_salary = salary[salary['GenderSelect'] == 'Female']

# Median salary at each age. The chart is titled 'Median', so median() is used
# here; the earlier mean() did not match the title.
male = male_salary.groupby('Age').Salary.median().to_frame()
female = female_salary.groupby('Age').Salary.median().to_frame()

trace0 = go.Scatter(
    x = male.index,
    y = male['Salary'].round(-2),
    name = 'male',
    line = dict(
        color = 'aqua',
        width = 2,
        dash = 'dash'
    )
)

trace1 = go.Scatter(
    x = female.index,
    y = female['Salary'].round(-2),
    name = 'female',
    line = dict(
        color = 'navy',
        width = 2,
        dash = 'dash'
    )
)

data = [trace0, trace1]
layout = dict(title = 'The Median Salary of Men & Women by Age',
              xaxis = dict(title = 'Age', tickfont = dict(size=12)),
              yaxis = dict(title = 'Salary ($)'))

fig = dict(data=data, layout=layout)
py.iplot(fig)

여성과 남성 간의 임금 격차는 없어 보입니다.

In [38]:
# Horizontal box plots of the salary distribution for men and for women.
data = [
    go.Box(x=male_salary.Salary, name='Male', fillcolor='navy'),
    go.Box(x=female_salary.Salary, name='Female', fillcolor='lime'),
]
trace0, trace1 = data
py.iplot(data)

그러나 남성의 중위 임금이 여성보다 조금 높습니다.

미국과 미국이 아닌 국가의 성 불평등을 확인해보겠습니다.

In [43]:
def _median_salary_by_age(frame):
    """Return a one-column frame with the median of ``Salary`` for each ``Age`` in ``frame``."""
    return frame.groupby('Age').Salary.median().to_frame()


def _gender_scatter(by_age, name, color):
    """Build a marker-only scatter of salary (rounded to the nearest hundred) against age."""
    return go.Scatter(
        x = by_age.index,
        y = by_age['Salary'].round(-2),
        name = name,
        mode = 'markers',
        marker = dict(size = 8, color = color)
    )


def _gender_age_figure(male_by_age, female_by_age, title):
    """Assemble the male/female salary-by-age figure dict with the given title."""
    traces = [_gender_scatter(male_by_age, 'male', 'grey'),
              _gender_scatter(female_by_age, 'female', 'red')]
    layout = dict(title = title,
                  xaxis = dict(title = 'Age', tickfont = dict(size=12)),
                  yaxis = dict(title = 'Salary ($)'))
    return dict(data=traces, layout=layout)


# Split by gender and by compensation currency (USD vs anything else).
is_male = salary['GenderSelect'] == 'Male'
is_female = salary['GenderSelect'] == 'Female'
paid_in_usd = salary.originCountry == 'USD'

male_us_salary = salary[is_male & paid_in_usd]
male_non_us_salary = salary[is_male & ~paid_in_usd]
female_us_salary = salary[is_female & paid_in_usd]
female_non_us_salary = salary[is_female & ~paid_in_usd]

# Both charts are titled 'Median', so the per-age statistic is the median
# (the earlier mean() did not match the titles).
male_us = _median_salary_by_age(male_us_salary)
male_nus = _median_salary_by_age(male_non_us_salary)
female_us = _median_salary_by_age(female_us_salary)
female_nus = _median_salary_by_age(female_non_us_salary)

fig = _gender_age_figure(male_us, female_us,
                         'The Median Salary of Men & Women by Age in US')
py.iplot(fig)

fig = _gender_age_figure(male_nus, female_nus,
                         'The Median Salary of Men & Women by Age in non-US Countries')
py.iplot(fig)

미국이 아닌 국가에서는 35세 이후에 여성이 훨씬 적게 받는 경향이 있고 50대 후반부터는 거의 여성이 없습니다.

전공과 직무에 대해 알아보겠습니다.

In [45]:
# Drop respondents whose major is 'Other'; copy so the column assignments
# below do not write into a slice.
mcr = mcr[~(mcr['MajorSelect'] == 'Other')].copy()

# Shorten long major names for the axis labels. The IT label is mapped
# straight to its final short form: dict replacements are not applied one
# after another, so the former two-step mapping only reached
# 'IT-Network-System' when the cell was run twice. The intermediate label is
# still mapped in case the column was already shortened by an earlier run.
# The result is assigned back instead of using inplace=True on a selected
# column, which is chained assignment.
mcr['MajorSelect'] = mcr['MajorSelect'].replace({
    'Information technology, networking, or system administration': 'IT-Network-System',
    'IT, Network, System Admin': 'IT-Network-System',
    'Mathematics or statistics': 'Math or stats',
    'Engineering (non-computer focused)': 'Engineering (non-CS)',
})

# Share of the eight most common majors. Kept as a Series and read through
# .index / .values, so the code does not depend on how value_counts() names
# its result (ms.MajorSelect fails when the result is not named after the column).
ms = mcr['MajorSelect'].value_counts(normalize=True).head(8).round(4)
trace1 = go.Bar(
    x = ms.index,
    y = ms.values,
    marker = dict(color='orange')
)

# Same treatment for current job titles.
mcr['CurrentJobTitleSelect'] = mcr['CurrentJobTitleSelect'].replace({
    'Software Developer/Software Engineer': 'Software Developer',
    'Machine Learning Engineer': 'ML Engineer',
})
cs = mcr['CurrentJobTitleSelect'].value_counts(normalize=True).head(8).round(4)
trace2 = go.Bar(
    x = cs.index,
    y = cs.values,
    marker = dict(color='navy')
)

# Majors on the left, job titles on the right.
fig = tools.make_subplots(rows=1, cols=2, subplot_titles=('Majors', 'Titles'))
fig.append_trace(trace1, 1, 1)
fig.append_trace(trace2, 1, 2)
fig['layout'].update(height=500, width=820, title='Majors & Titles in Data Science World', showlegend=False)
py.iplot(fig)

컴퓨터 과학, 수학 또는 통계, CS가 아닌 공학분야 순으로 전공이 많습니다.

직무는 데이터 사이언티스트, 소프트웨어 개발자, 데이터 분석가 순입니다.

In [27]:
# Display the per-country median salary table (`sal_coun`) built in the compensation cell.
sal_coun

Unnamed: 0_level_0,Salary
Country,Unnamed: 1_level_1
United States,108000.0
Switzerland,104300.0
Australia,93500.0
Norway,87900.0
Denmark,80400.0
Israel,74900.0
Netherlands,74100.0
Germany,71700.0
Canada,70000.0
Ireland,66700.0
