Plotly with pandas-backend

\- pandas에서 plotly를 이용하여 플롯을 그려보자.

## 1. 라이브러리 imports

In [1]:
import pandas as pd
import numpy as np
import plotly.io as pio

In [2]:
# Make plotly the backend used by every pandas ``.plot`` call in this session.
pd.options.plotting.backend = 'plotly'
# Set the default plotly template (theme) to the white-background one.
pio.templates.default = 'plotly_white'
# Print the templates configuration: current default and available names.
print(pio.templates)

Templates configuration
-----------------------
    Default template: 'plotly_white'
    Available templates:
        ['ggplot2', 'seaborn', 'simple_white', 'plotly',
         'plotly_white', 'plotly_dark', 'presentation', 'xgridoff',
         'ygridoff', 'gridon', 'none']



> 기본적으로 pandas의 plotting backend는 matplotlib로 지정되어 있다. 이것을 plotly로 바꾸고, 템플릿을 하얀색 배경(`plotly_white`)으로 바꿨다.

\- 이렇게 전역으로 설정해 두면, 플롯을 그릴 때마다 `backend = 'plotly'`를 입력하지 않아도 됨.

In [3]:
# Tiny one-column frame used only to check which backend draws the plot.
_df = pd.DataFrame(data={"x": [1, 2, 3]})

In [5]:
# No backend argument is given here; the globally configured plotting
# backend is the one that renders this line chart.
_df.plot.line()

> 알아서 `plotly`로 산출해주는 모습

\- 단, plotly backend에서는 pie chart 같은 일부 플롯은 지원하지 않음.

## 2. 여러 가지 플롯


### **A. `.plot.bar()`**
---




\- 예제 1 : 성별 합격률 시각화

In [6]:
# The CSV is read with its first column as the index and its first two rows
# as a two-level header (header=[0, 1]). Resetting the index and melting
# turns it into long format, one row per (department, gender, result).
df = (
    pd.read_csv(
        "https://raw.githubusercontent.com/guebin/DV2022/master/posts/Simpson.csv",
        index_col=0,
        header=[0, 1],
    )
    .reset_index()
    .melt(id_vars="index")
    .set_axis(["department", "gender", "result", "count"], axis=1)
)
df

Unnamed: 0,department,gender,result,count
0,A,male,fail,314
1,B,male,fail,208
2,C,male,fail,204
3,D,male,fail,279
4,E,male,fail,137
5,F,male,fail,149
6,A,male,pass,511
7,B,male,pass,352
8,C,male,pass,121
9,D,male,pass,138


In [9]:
# Total applicant count per gender (rows) and result (columns).
df.pivot_table(
    index="gender",
    columns="result",
    values="count",
    aggfunc="sum",
)

result,fail,pass
gender,Unnamed: 1_level_1,Unnamed: 2_level_1
female,1063,772
male,1291,1400


In [11]:
# Add the per-gender total and the pass rate (pass / total).
(
    df.pivot_table(index="gender", columns="result", values="count", aggfunc="sum")
    .assign(total=lambda t: t["fail"] + t["pass"])
    .assign(rate=lambda t: t["pass"] / t["total"])
)

result,fail,pass,total,rate
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
female,1063,772,1835,0.420708
male,1291,1400,2691,0.520253


In [14]:
# Same pass-rate table, reduced to the gender and rate columns.
(
    df.pivot_table(index="gender", columns="result", values="count", aggfunc="sum")
    .assign(total=lambda t: t["fail"] + t["pass"])
    .assign(rate=lambda t: t["pass"] / t["total"])
    .reset_index()
    .drop(["fail", "pass", "total"], axis=1)
)

result,gender,rate
0,female,0.420708
1,male,0.520253


\- 이상태에서 바로 시각화

In [30]:
# Pass rate per gender, rounded to two decimals and drawn as a bar chart.
# The rate is rounded with the vectorised Series.round in the same step that
# computes it, instead of a second assign doing a per-element apply(round).
# `text` labels each bar with its value; `width`/`height` set the figure size.
(
    df.pivot_table(index="gender", columns="result", values="count", aggfunc="sum")
    .assign(total=lambda t: t["fail"] + t["pass"])
    .assign(rate=lambda t: (t["pass"] / t["total"]).round(2))
    .reset_index()
    .drop(["fail", "pass", "total"], axis=1)
    .plot.bar(x="gender", y="rate", color="gender", text="rate", width=600)
)

\- 예제 2 : (성별, 학과) 별 지원자 수 시각화

In [33]:
# Total applicant count for every (department, gender) pair.
df.pivot_table(
    index=["department", "gender"],
    values="count",
    aggfunc="sum",
)

Unnamed: 0_level_0,Unnamed: 1_level_0,count
department,gender,Unnamed: 2_level_1
A,female,108
A,male,825
B,female,25
B,male,560
C,female,593
C,male,325
D,female,375
D,male,417
E,female,393
E,male,191


In [36]:
# One bar panel per department (facet_col), bars split and coloured by gender.
# Faceting is supported as well; facet_row is best avoided here.
(
    df.pivot_table(index=["department", "gender"], values="count", aggfunc="sum")
    .reset_index()
    .plot.bar(
        x="gender",
        y="count",
        color="gender",
        facet_col="department",
        text="count",
        width=800,
    )
)

### **B. `.plot.line()`**
---




\- 예제 1 : 핸드폰 판매량

In [37]:
# Load the phone sales CSV; the frame is in wide form, with a Date column
# followed by one column per manufacturer.
df = pd.read_csv('https://raw.githubusercontent.com/guebin/2021DV/master/_notebooks/phone.csv')
df

Unnamed: 0,Date,Samsung,Apple,Huawei,Xiaomi,Oppo,Mobicel,Motorola,LG,Others,Realme,Google,Nokia,Lenovo,OnePlus,Sony,Asus
0,2019-10,461,324,136,109,76,81,43,37,135,28,39,14,22,17,20,17
1,2019-11,461,358,167,141,86,61,29,36,141,27,29,20,23,10,19,27
2,2019-12,426,383,143,105,53,45,51,48,129,30,20,26,28,18,18,19
3,2020-01,677,494,212,187,110,79,65,49,158,23,13,19,19,22,27,22
4,2020-02,593,520,217,195,112,67,62,71,157,25,18,16,24,18,23,20
5,2020-03,637,537,246,187,92,66,59,67,145,21,16,24,18,31,22,14
6,2020-04,647,583,222,154,98,59,48,64,113,20,23,25,19,19,23,21
7,2020-05,629,518,192,176,91,87,50,66,150,43,27,15,18,19,19,13
8,2020-06,663,552,209,185,93,69,54,60,140,39,16,16,17,29,25,16
9,2020-07,599,471,214,193,89,78,65,59,130,40,27,25,21,18,18,12


In [40]:
# Wide -> long: one row per (date, company), columns renamed to Korean labels.
(
    df.melt(id_vars="Date")
    .set_axis(["날짜", "회사", "판매량"], axis=1)
)

Unnamed: 0,날짜,회사,판매량
0,2019-10,Samsung,461
1,2019-11,Samsung,461
2,2019-12,Samsung,426
3,2020-01,Samsung,677
4,2020-02,Samsung,593
...,...,...,...
203,2020-06,Asus,16
204,2020-07,Asus,12
205,2020-08,Asus,20
206,2020-09,Asus,15


In [42]:
# Sales over time, one line per company.
(
    df.melt(id_vars="Date")
    .set_axis(["날짜", "회사", "판매량"], axis=1)
    .plot.line(x="날짜", y="판매량", color="회사", width=800)
)

### **C. `.plot.scatter()`**
---




\- 예제 1 : FIFA22 data

In [43]:
# Coarse role -> set of detailed position codes belonging to that role.
position_dict = {
    "GOALKEEPER": {"GK"},
    "DEFENDER": {"CB", "RCB", "LCB", "RB", "LB", "RWB", "LWB"},
    "MIDFIELDER": {"CM", "RCM", "LCM", "CDM", "RDM", "LDM", "CAM", "RAM", "LAM", "RM", "LM"},
    "FORWARD": {"ST", "CF", "RF", "LF", "RW", "LW", "RS", "LS"},
    "SUB": {"SUB"},
    "RES": {"RES"},
}
df = (
    pd.read_csv(
        "https://raw.githubusercontent.com/guebin/DV2021/master/_notebooks/2021-10-25-FIFA22_official_data.csv"
    )
    # Keep columns whose share of missing values is below 50%, then drop
    # every row that still contains a missing value.
    .loc[:, lambda d: d.isna().mean() < 0.5]
    .dropna()
    # Position: take the text after the last ">" and replace it with the
    # role whose code set contains it.
    .assign(
        Position=lambda d: d["Position"]
        .str.split(">")
        .str[-1]
        .apply(
            lambda code: [
                role for role, codes in position_dict.items() if code in codes
            ].pop()
        )
    )
    # Wage: drop the first character, expand "K" to "000", cast to int.
    .assign(
        Wage=lambda d: d["Wage"].str[1:].str.replace("K", "000").astype(int)
    )
)
df

Unnamed: 0,ID,Name,Age,Photo,Nationality,Flag,Overall,Potential,Club,Club Logo,...,SlidingTackle,GKDiving,GKHandling,GKKicking,GKPositioning,GKReflexes,Best Position,Best Overall Rating,Release Clause,DefensiveAwareness
0,212198,Bruno Fernandes,26,https://cdn.sofifa.com/players/212/198/22_60.png,Portugal,https://cdn.sofifa.com/flags/pt.png,88,89,Manchester United,https://cdn.sofifa.com/teams/11/30.png,...,65.0,12.0,14.0,15.0,8.0,14.0,CAM,88.0,€206.9M,72.0
1,209658,L. Goretzka,26,https://cdn.sofifa.com/players/209/658/22_60.png,Germany,https://cdn.sofifa.com/flags/de.png,87,88,FC Bayern München,https://cdn.sofifa.com/teams/21/30.png,...,77.0,13.0,8.0,15.0,11.0,9.0,CM,87.0,€160.4M,74.0
2,176580,L. Suárez,34,https://cdn.sofifa.com/players/176/580/22_60.png,Uruguay,https://cdn.sofifa.com/flags/uy.png,88,88,Atlético de Madrid,https://cdn.sofifa.com/teams/240/30.png,...,38.0,27.0,25.0,31.0,33.0,37.0,ST,88.0,€91.2M,42.0
3,192985,K. De Bruyne,30,https://cdn.sofifa.com/players/192/985/22_60.png,Belgium,https://cdn.sofifa.com/flags/be.png,91,91,Manchester City,https://cdn.sofifa.com/teams/10/30.png,...,53.0,15.0,13.0,5.0,10.0,13.0,CM,91.0,€232.2M,68.0
4,224334,M. Acuña,29,https://cdn.sofifa.com/players/224/334/22_60.png,Argentina,https://cdn.sofifa.com/flags/ar.png,84,84,Sevilla FC,https://cdn.sofifa.com/teams/481/30.png,...,82.0,8.0,14.0,13.0,13.0,14.0,LB,84.0,€77.7M,80.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16703,259718,F. Gebhardt,19,https://cdn.sofifa.com/players/259/718/22_60.png,Germany,https://cdn.sofifa.com/flags/de.png,52,66,FC Basel 1893,https://cdn.sofifa.com/teams/896/30.png,...,10.0,53.0,45.0,47.0,52.0,57.0,GK,52.0,€361K,6.0
16704,251433,B. Voll,20,https://cdn.sofifa.com/players/251/433/22_60.png,Germany,https://cdn.sofifa.com/flags/de.png,58,69,F.C. Hansa Rostock,https://cdn.sofifa.com/teams/27/30.png,...,10.0,59.0,60.0,56.0,55.0,61.0,GK,58.0,€656K,5.0
16706,262846,�. Dobre,20,https://cdn.sofifa.com/players/262/846/22_60.png,Romania,https://cdn.sofifa.com/flags/ro.png,53,63,FC Academica Clinceni,https://cdn.sofifa.com/teams/113391/30.png,...,12.0,57.0,52.0,53.0,48.0,58.0,GK,53.0,€279K,5.0
16707,241317,21 Xue Qinghao,19,https://cdn.sofifa.com/players/241/317/21_60.png,China PR,https://cdn.sofifa.com/flags/cn.png,47,60,Shanghai Shenhua FC,https://cdn.sofifa.com/teams/110955/30.png,...,9.0,49.0,48.0,45.0,38.0,52.0,GK,47.0,€223K,21.0


In [59]:
# Shot power vs. standing tackle for defenders and forwards only.
# Transparency is set with `opacity` (not `alpha`); `hover_data` adds extra
# columns to the tooltip shown when hovering over a point.
(
    df.query('Position == "DEFENDER" or Position == "FORWARD"')
    .plot.scatter(
        x="ShotPower",
        y="StandingTackle",
        color="Position",
        size="Wage",
        hover_data=["Name", "Age"],
        opacity=0.5,
        width=800,
    )
)

### **D. `.plot.box()`**
---




\- 예제 1 : 전북고등학교

In [60]:
# Statistics scores of the students taught by teacher A.
y1 = [75, 75, 76, 76, 77, 77, 78, 79, 79, 98]
# Statistics scores of the students taught by teacher B.
y2 = [76, 76, 77, 77, 78, 78, 79, 80, 80, 81]

In [61]:
# Long-format frame: a class label for every score, A's scores first, then B's.
df = pd.DataFrame(
    {
        "Class": ["A" for _ in y1] + ["B" for _ in y2],
        "Score": [*y1, *y2],
    }
)
df

Unnamed: 0,Class,Score
0,A,75
1,A,75
2,A,76
3,A,76
4,A,77
5,A,77
6,A,78
7,A,79
8,A,79
9,A,98


In [66]:
# Score distribution per class; points="all" also draws every individual
# observation alongside each box.
df.plot.box(
    x="Class",
    y="Score",
    color="Class",
    points="all",
    width=500,
)

\- 예제 2 : (년도, 시도)별 전기에너지 사용량

In [67]:
# One CSV per (province, year); the file name is "<province><year>.csv".
url = 'https://raw.githubusercontent.com/guebin/DV2022/main/posts/Energy/{}.csv'
prov = ['Seoul', 'Busan', 'Daegu', 'Incheon',
        'Gwangju', 'Daejeon', 'Ulsan', 'Sejongsi',
        'Gyeonggi-do', 'Gangwon-do', 'Chungcheongbuk-do',
        'Chungcheongnam-do', 'Jeollabuk-do', 'Jeollanam-do',
        'Gyeongsangbuk-do', 'Gyeongsangnam-do', 'Jeju-do']
df = (
    pd.concat(
        [
            # Tag every file's rows with the year and province it came from.
            pd.read_csv(url.format(p + y)).assign(년도=y, 시도=p)
            for p in prov
            for y in ['2018', '2019', '2020', '2021']
        ]
    )
    .reset_index(drop=True)
    .assign(년도=lambda d: d.년도.astype(int))
    # Move the key columns into the index so only value columns are converted.
    .set_index(['년도', '시도', '지역'])
    # Remove thousands separators and cast to int, one column at a time.
    # DataFrame.applymap is deprecated since pandas 2.1; the vectorised string
    # methods give the same result without a Python call per cell.
    .apply(lambda col: col.astype(str).str.replace(',', '', regex=False).astype(int))
    .reset_index()
)
df.head()

Unnamed: 0,년도,시도,지역,건물동수,연면적,에너지사용량(TOE)/전기,에너지사용량(TOE)/도시가스,에너지사용량(TOE)/지역난방
0,2018,Seoul,종로구,17929,9141777,64818,82015,111
1,2018,Seoul,중구,10598,10056233,81672,75260,563
2,2018,Seoul,용산구,17201,10639652,52659,85220,12043
3,2018,Seoul,성동구,14180,11631770,60559,107416,0
4,2018,Seoul,광진구,21520,12054796,70609,130308,0


In [77]:
# The frame is already tidy, so it can be plotted directly:
# one row of boxes per year (facet_row), one box per province.
df.plot.box(
    x="시도",
    y="에너지사용량(TOE)/전기",
    color="시도",
    facet_row="년도",
    height=1600,
    width=800,
    hover_data=["지역", "연면적"],
)

### **E. `.plot.hist()`**
---




\- 예제 1 : 타이타닉 (연령, 성별) 생존자

In [78]:
# Load the Titanic passenger CSV and display it.
df = pd.read_csv("https://raw.githubusercontent.com/guebin/DV2023/main/posts/titanic.csv")
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,logFare
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S,1.981001
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,4.266662
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S,2.070022
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S,3.972177
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S,2.085672
...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S,2.564949
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S,3.401197
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S,3.154870
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C,3.401197


In [81]:
# A histogram can also be drawn straight from a Series.
df.Age.hist()

In [85]:
# Age histograms in a grid: one row per sex, one column per survival outcome.
df.plot.hist(x="Age", color="Sex", facet_row="Sex", facet_col="Survived")

> 성별 효과가 15세 이상에서는 있었고, 그 아래에서는 없는 것 같다.

### **F. `.plot.area()`**
---




\- 예제 1 : 핸드폰 판매량

In [89]:
# Load the phone sales CSV again; the frame is in wide form, with a Date
# column followed by one column per manufacturer.
df = pd.read_csv('https://raw.githubusercontent.com/guebin/2021DV/master/_notebooks/phone.csv')
df

Unnamed: 0,Date,Samsung,Apple,Huawei,Xiaomi,Oppo,Mobicel,Motorola,LG,Others,Realme,Google,Nokia,Lenovo,OnePlus,Sony,Asus
0,2019-10,461,324,136,109,76,81,43,37,135,28,39,14,22,17,20,17
1,2019-11,461,358,167,141,86,61,29,36,141,27,29,20,23,10,19,27
2,2019-12,426,383,143,105,53,45,51,48,129,30,20,26,28,18,18,19
3,2020-01,677,494,212,187,110,79,65,49,158,23,13,19,19,22,27,22
4,2020-02,593,520,217,195,112,67,62,71,157,25,18,16,24,18,23,20
5,2020-03,637,537,246,187,92,66,59,67,145,21,16,24,18,31,22,14
6,2020-04,647,583,222,154,98,59,48,64,113,20,23,25,19,19,23,21
7,2020-05,629,518,192,176,91,87,50,66,150,43,27,15,18,19,19,13
8,2020-06,663,552,209,185,93,69,54,60,140,39,16,16,17,29,25,16
9,2020-07,599,471,214,193,89,78,65,59,130,40,27,25,21,18,18,12


In [92]:
# Stacked area chart of sales over time, one band per company.
(
    df.melt(id_vars="Date")
    .set_axis(["날짜", "회사", "판매량"], axis=1)
    .plot.area(x="날짜", y="판매량", color="회사")
)

> 전체적인 판매량과 비중을 알 수 있음과 동시에 경향성을 알 수 있음

\- 예제 2 : 에너지 사용량

In [93]:
# One CSV per (province, year); the file name is "<province><year>.csv".
url = 'https://raw.githubusercontent.com/guebin/DV2022/main/posts/Energy/{}.csv'
prov = ['Seoul', 'Busan', 'Daegu', 'Incheon',
        'Gwangju', 'Daejeon', 'Ulsan', 'Sejongsi',
        'Gyeonggi-do', 'Gangwon-do', 'Chungcheongbuk-do',
        'Chungcheongnam-do', 'Jeollabuk-do', 'Jeollanam-do',
        'Gyeongsangbuk-do', 'Gyeongsangnam-do', 'Jeju-do']
df = (
    pd.concat(
        [
            # Tag every file's rows with the year and province it came from.
            pd.read_csv(url.format(p + y)).assign(년도=y, 시도=p)
            for p in prov
            for y in ['2018', '2019', '2020', '2021']
        ]
    )
    .reset_index(drop=True)
    .assign(년도=lambda d: d.년도.astype(int))
    # Move the key columns into the index so only value columns are converted.
    .set_index(['년도', '시도', '지역'])
    # Remove thousands separators and cast to int, one column at a time.
    # DataFrame.applymap is deprecated since pandas 2.1; the vectorised string
    # methods give the same result without a Python call per cell.
    .apply(lambda col: col.astype(str).str.replace(',', '', regex=False).astype(int))
    .reset_index()
)
df.head()

Unnamed: 0,년도,시도,지역,건물동수,연면적,에너지사용량(TOE)/전기,에너지사용량(TOE)/도시가스,에너지사용량(TOE)/지역난방
0,2018,Seoul,종로구,17929,9141777,64818,82015,111
1,2018,Seoul,중구,10598,10056233,81672,75260,563
2,2018,Seoul,용산구,17201,10639652,52659,85220,12043
3,2018,Seoul,성동구,14180,11631770,60559,107416,0
4,2018,Seoul,광진구,21520,12054796,70609,130308,0


In [97]:
# Stack the three energy columns into (energy type, usage) rows, then keep
# only the part of the column name after the last "/" as the energy type.
(
    df.set_index(["년도", "시도", "지역", "건물동수", "연면적"])
    .stack()
    .reset_index()
    .rename({'level_5': '에너지종류', 0: '에너지사용량'}, axis=1)
    .assign(에너지종류=lambda t: t["에너지종류"].str.split("/").str[-1])
)

Unnamed: 0,년도,시도,지역,건물동수,연면적,에너지종류,에너지사용량
0,2018,Seoul,종로구,17929,9141777,전기,64818
1,2018,Seoul,종로구,17929,9141777,도시가스,82015
2,2018,Seoul,종로구,17929,9141777,지역난방,111
3,2018,Seoul,중구,10598,10056233,전기,81672
4,2018,Seoul,중구,10598,10056233,도시가스,75260
...,...,...,...,...,...,...,...
2995,2021,Jeju-do,제주시,67053,20275738,도시가스,25689
2996,2021,Jeju-do,제주시,67053,20275738,지역난방,0
2997,2021,Jeju-do,서귀포시,35230,7512206,전기,37884
2998,2021,Jeju-do,서귀포시,35230,7512206,도시가스,2641


In [108]:
# Long-format energy usage summed per (energy type, province, year).
(
    df.set_index(["년도", "시도", "지역", "건물동수", "연면적"])
    .stack()
    .reset_index()
    .rename({'level_5': '에너지종류', 0: '에너지사용량'}, axis=1)
    .assign(에너지종류=lambda t: t["에너지종류"].str.split("/").str[-1])
    .pivot_table(
        index=["에너지종류", "시도", "년도"],
        values="에너지사용량",
        aggfunc="sum",
    )
    .reset_index()
)

Unnamed: 0,에너지종류,시도,년도,에너지사용량
0,도시가스,Busan,2018,708240
1,도시가스,Busan,2019,675882
2,도시가스,Busan,2020,690015
3,도시가스,Busan,2021,878874
4,도시가스,Chungcheongbuk-do,2018,288927
...,...,...,...,...
199,지역난방,Seoul,2021,546491
200,지역난방,Ulsan,2018,0
201,지역난방,Ulsan,2019,0
202,지역난방,Ulsan,2020,0


In [107]:
# Stacked area per province over the years, one panel per energy type.
(
    df.set_index(["년도", "시도", "지역", "건물동수", "연면적"])
    .stack()
    .reset_index()
    .rename({'level_5': '에너지종류', 0: '에너지사용량'}, axis=1)
    .assign(에너지종류=lambda t: t["에너지종류"].str.split("/").str[-1])
    .pivot_table(
        index=["에너지종류", "시도", "년도"],
        values="에너지사용량",
        aggfunc="sum",
    )
    .reset_index()
    .plot.area(
        x="년도",
        y="에너지사용량",
        facet_col="에너지종류",
        color="시도",
        width=600,
    )
)

> 연도가 겹치는 문제 해결을 위한 미세조정

In [109]:
# Keep the figure in a variable so its layout can be adjusted afterwards.
fig = (
    df.set_index(["년도", "시도", "지역", "건물동수", "연면적"])
    .stack()
    .reset_index()
    .rename({'level_5': '에너지종류', 0: '에너지사용량'}, axis=1)
    .assign(에너지종류=lambda t: t["에너지종류"].str.split("/").str[-1])
    .pivot_table(
        index=["에너지종류", "시도", "년도"],
        values="에너지사용량",
        aggfunc="sum",
    )
    .reset_index()
    .plot.area(
        x="년도",
        y="에너지사용량",
        facet_col="에너지종류",
        color="시도",
        width=600,
    )
)

# Set the domain of each of the three facet x-axes explicitly; the ranges
# leave gaps between panels, presumably so the year labels of neighbouring
# panels no longer overlap.
fig.update_layout(
    xaxis_domain=[0.0, 0.25],
    xaxis2_domain=[0.35, 0.60],
    xaxis3_domain=[0.70, 0.95],
)