# Plotly Tutorial - 1
https://www.kaggle.com/hakkisimsek/plotly-tutorial-1

In [8]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
from plotly import tools
import plotly.figure_factory as ff

import warnings
warnings.filterwarnings('ignore')

# Read the multiple-choice survey responses, decoding the file as ISO-8859-1.
mcr = pd.read_csv(
    './input/multipleChoiceResponses.csv',
    encoding='ISO-8859-1',
)
# Display the first rows of the loaded frame.
mcr.head()

Unnamed: 0,GenderSelect,Country,Age,EmploymentStatus,StudentStatus,LearningDataScience,CodeWriter,CareerSwitcher,CurrentJobTitleSelect,TitleFit,...,JobFactorExperienceLevel,JobFactorDepartment,JobFactorTitle,JobFactorCompanyFunding,JobFactorImpact,JobFactorRemote,JobFactorIndustry,JobFactorLeaderReputation,JobFactorDiversity,JobFactorPublishingOpportunity
0,"Non-binary, genderqueer, or gender non-conforming",,,Employed full-time,,,Yes,,DBA/Database Engineer,Fine,...,,,,,,,,,,
1,Female,United States,30.0,"Not employed, but looking for work",,,,,,,...,,,,,,,,Somewhat important,,
2,Male,Canada,28.0,"Not employed, but looking for work",,,,,,,...,Very Important,Very Important,Very Important,Very Important,Very Important,Very Important,Very Important,Very Important,Very Important,Very Important
3,Male,United States,56.0,"Independent contractor, freelancer, or self-em...",,,Yes,,Operations Research Practitioner,Poorly,...,,,,,,,,,,
4,Male,Taiwan,38.0,Employed full-time,,,Yes,,Computer Scientist,Fine,...,,,,,,,,,,


In [9]:
# Number of respondents per gender answer; missing answers are not counted.
mcr['GenderSelect'].value_counts(dropna=True)

Male                                                 13610
Female                                                2778
A different identity                                   159
Non-binary, genderqueer, or gender non-conforming       74
Name: GenderSelect, dtype: int64

In [10]:
# Colour palette handed to the pie marker.
colors = ['aqua', 'lightgrey', 'lightgreen', '#D0F9B1', 'khaki', 'grey']

# Shorten the answer 'Non-binary, genderqueer, or gender non-conforming'
# to 'Non-binary'; every other value is kept as it is.
is_long_label = mcr['GenderSelect'] == 'Non-binary, genderqueer, or gender non-conforming'
mcr['GenderSelect'] = np.where(is_long_label, 'Non-binary', mcr['GenderSelect'])

# Respondents per gender answer: index -> slice labels, counts -> slice sizes.
gender = mcr['GenderSelect'].value_counts()
label, size = gender.index, gender.values

trace = go.Pie(
    labels=label,
    values=size,
    marker={'colors': colors},
)
data = [trace]
layout = go.Layout(title='Gender Distribution')

fig = go.Figure(data=data, layout=layout)
py.iplot(fig)

캐글 설문 응답자의 대부분은 남성입니다 (성별 응답 16,621명 중 13,610명, 약 82%).

In [23]:
# Respondents per country as a two-column frame (number, country) with a
# fresh 0..n-1 index, ordered from most to fewest respondents.
#
# The frame is built straight from the counts instead of going through
# reset_index().drop('index', ...): the name reset_index() gives the old index
# column depends on the pandas version (it is 'index' only when the
# value_counts() index is unnamed), so dropping 'index' can raise KeyError.
country_counts = mcr['Country'].value_counts()
df = pd.DataFrame(
    {'number': country_counts.values, 'country': country_counts.index.values},
    columns=['number', 'country'],
)
df.head()

Unnamed: 0,number,country
0,4197,United States
1,2704,India
2,1023,Other
3,578,Russia
4,535,United Kingdom


In [15]:
# Respondents per country as a (number, country) frame with a fresh 0..n-1
# index. Built straight from the counts so it does not depend on the column
# name reset_index() would give the old index (which varies across pandas
# versions and made the former drop('index', ...) fragile).
country_counts = mcr['Country'].value_counts()
df = pd.DataFrame(
    {'number': country_counts.values, 'country': country_counts.index.values},
    columns=['number', 'country'],
)

# World map coloured by number of respondents per country.
# NOTE(review): locations are matched by country name; the 'Other' answer is
# not a country name, so presumably it is not drawn on the map - confirm.
data = [dict(
    type = 'choropleth',
    locations = df['country'],
    locationmode = 'country names',
    z = df['number'],
    text = df['country'],
    colorscale = [[0, 'rgb(5, 10, 172)'], [0.35, 'rgb(40, 60, 190)'],
                  [0.5, 'rgb(70, 100, 245)'],
                  [0.6, 'rgb(90, 120, 245)'],
                  [0.7, 'rgb(106, 137, 247)'],
                  [1, 'rgb(220, 220, 220)']],
    autocolorscale = False,
    reversescale = True,
    marker = dict(
        line = dict(
            color = 'rgb(180, 180, 180)',
            width = 0.5
        )
    ),
    colorbar = dict(
        # The scale encodes a head count, so ticks carry no '$' prefix; the
        # obsolete 'autotick' attribute is left out as well.
        title = 'Survey Respondents'
    )
)]

layout = dict(
    title = 'The Nationality of Respondents',
    geo = dict(
        showframe = False,
        showcoastlines = True,
        projection = dict(
            # Projection type names are lowercase in the plotly schema.
            type = 'mercator'
        )
    )
)

# The figure is a plain dict and is passed through without schema validation.
fig = dict(data = data, layout=layout)
py.iplot(fig, validate=False)

**참여자 수 상위 5개국** (국가가 아닌 'Other' 응답 1,023명은 순위에서 제외)
1. 미국 - 4197
2. 인도 - 2704
3. 러시아 - 578
4. 영국 - 535
5. 중국 - 471

In [27]:
# Keep only respondents whose age is strictly between 18 and 75.
# NOTE: this rebinds `mcr`, so every later cell works on the filtered rows.
mcr = mcr[(mcr['Age'] < 75) & (mcr['Age'] > 18)]

# Share (not count) of the remaining respondents at each age, 4 decimals.
age = mcr['Age'].value_counts(normalize=True).round(4)

trace = go.Bar(
    x = age.index,
    y = age.values,
    marker = dict(
        color = age.values,   # bar colour follows the bar height
        colorscale = 'Reds',
        showscale = True
    )
)

data = [trace]
# The y values are normalized shares, so the axis is labelled as a share
# rather than as a respondent count. The x-axis settings are part of the
# layout itself instead of being patched onto the figure afterwards.
layout = go.Layout(title = 'Age distribution',
                   xaxis = dict(title = 'Age',
                                tickfont = dict(size=12)),
                   yaxis = dict(title = 'Share of Respondents'))

fig = go.Figure(data=data, layout=layout)
py.iplot(fig)

**연령 분포**
- 20-30대가 제일 많음

In [32]:
# Respondents per 'FirstTrainingSelect' answer: index -> labels, counts -> sizes.
train = mcr['FirstTrainingSelect'].value_counts()
label, size = train.index, train.values
colors = ['aqua', 'lightgrey', 'lightgreen', '#D0F9B1', 'khaki', 'grey']

trace = go.Pie(
    labels=label,
    values=size,
    marker={'colors': colors},
)
data = [trace]
# The legend is laid out horizontally.
layout = go.Layout(
    title='First Training Platform',
    legend={'orientation': 'h'},
)

fig = go.Figure(data=data, layout=layout)
py.iplot(fig)

**데이터 과학을 처음 배운 경로**
1. 온라인 교육과정(36%)
2. 대학교 과정(28.6%)
3. 독학(24.8%)
4. 일, 실무(6.79%)
5. 기타(2.02%)
6. 캐글(1.74%)

In [42]:
def _top_shares(column, n=5):
    """Return the `n` most frequent answers of a multi-select column as shares.

    Each non-missing cell of `column` is split on ',' and all resulting items
    are pooled. The share of each item among the pooled items is computed,
    the `n` largest are kept, rounded to 4 decimals and sorted descending.

    Parameters
    ----------
    column : pandas.Series of str
        Comma-separated multi-select answers; missing cells are skipped.
    n : int, default 5
        Number of most frequent items to keep.

    Returns
    -------
    pandas.DataFrame
        One column labelled ``0`` holding the shares, indexed by item.
    """
    answers = [item for parts in column.str.split(',').dropna() for item in parts]
    shares = pd.Series(answers).value_counts(normalize=True).iloc[:n]
    # The column label is pinned to 0: the name value_counts() gives its
    # result differs between pandas versions, and callers read `frame[0]`.
    return shares.round(4).sort_values(ascending=False).to_frame(name=0)


# Left panel: top 5 answers in 'CoursePlatformSelect'.
courses = _top_shares(mcr['CoursePlatformSelect'])

trace1 = go.Bar(
    x = courses.index,
    y = courses[0],
    name = 'course',
    marker = dict(
        color = courses[0],
        colorscale = 'Jet'
    )
)

# Right panel: top 5 answers in 'LearningPlatformSelect'.
learn = _top_shares(mcr['LearningPlatformSelect'])

trace2 = go.Bar(
    x = learn.index,
    y = learn[0],
    name = 'platform',
    marker = dict(
        color = learn[0],
        colorscale = 'Jet'
    )
)

# One row, two columns: course platforms on the left, learning platforms on the right.
fig = tools.make_subplots(rows=1, cols=2, subplot_titles=('Course Platforms', 'Learning Platforms'))
fig.append_trace(trace1, 1, 1)
fig.append_trace(trace2, 1, 2)
fig['layout'].update(height=500, width=820,
                     title='Where to start & How to continue in DS?',
                     showlegend=False)
py.iplot(fig)

In [41]:
# Show the share column (labelled 0) of the course-platform frame.
courses.loc[:, 0]

Coursera    0.4088
Udacity     0.1889
edX         0.1667
DataCamp    0.1508
Other       0.0847
Name: 0, dtype: float64