In [206]:
import io
import os

import altair as alt
import pandas as pd
import requests
from scipy.stats import pearsonr

# データのダウンロード

In [2]:
# ISSP 2015 (Work Orientations IV) data, which contains the job-satisfaction item.
# The file location can be overridden with the ISSP2015_WO_CSV environment variable
# so the notebook is not tied to a single machine; the default is the original local path.
ISSP2015_WO_CSV = os.environ.get(
    'ISSP2015_WO_CSV',
    '/Users/MitaShunosuke/Library/CloudStorage/OneDrive-TheUniversityofTokyo/ダウンロード/修士課程/M1A/情報可視化の理論と実践/infovis-notebooks/final_report/ISSP 2015 - Work Orientations IV/issp2015_wo.csv',
)
df_wo = pd.read_csv(ISSP2015_WO_CSV)

In [42]:
# Download the OECD gender wage gap dataset (SDMX REST endpoint, CSV with labels).
url = 'https://sdmx.oecd.org/public/rest/data/OECD.ELS.SAE,DSD_EARNINGS@GENDER_WAGE_GAP,1.0/all?dimensionAtObservation=AllDimensions&format=csvfilewithlabels'
# A timeout keeps the cell from hanging forever if the server stops responding.
response = requests.get(url, timeout=60)
# Fail loudly on an HTTP error instead of trying to parse an error page as CSV.
response.raise_for_status()
df_gengap = pd.read_csv(io.BytesIO(response.content))

# データセットの前処理

## ISSP2015

### 国名

In [7]:
# OECD member countries as of June 2017 (when all surveys had finished), plus Taiwan.
# Keys are the country codes found in the `c_alphan` column; values are English names.
_COUNTRY_NAME_BY_CODE = {
    "AT": "Austria",
    "AU": "Australia",
    "BE": "Belgium",
    "CH": "Switzerland",
    "CL": "Chile",
    "CZ": "Czech Republic",
    "DK": "Denmark",
    "EE": "Estonia",
    "ES": "Spain",
    "FI": "Finland",
    "FR": "France",
    "HU": "Hungary",
    "IS": "Iceland",
    "JP": "Japan",
    "LV": "Latvia",
    "MX": "Mexico",
    "NZ": "New Zealand",
    "NO": "Norway",
    "PL": "Poland",
    "SE": "Sweden",
    "SI": "Slovenia",
    "SK": "Slovakia",
    "TR": "Turkey",
    "TW": "Taiwan",
    "US": "United States",
    "DE": "Germany",
    "IL": "Israel",
    "GB-GBN": "Great Britain",
}


def Country_OECD_full(x):
    """Return the English country name for a country code, or None.

    Parameters
    ----------
    x : country code such as "JP" or "GB-GBN".

    Returns
    -------
    str or None
        The mapped name; None for any code that is not in the table
        (including missing values).
    """
    return _COUNTRY_NAME_BY_CODE.get(x)
# Add the English country name for every respondent; codes that the mapping
# function does not recognise come back as None and end up missing.
df_wo['Country_OECD_full'] = df_wo['c_alphan'].map(Country_OECD_full)

In [24]:
# List the distinct country codes present in the survey data.
pd.unique(df_wo['c_alphan'])

array(['AT', 'AU', 'BE', 'CH', 'CL', 'CN', 'CZ', 'DE', 'DK', 'EE', 'ES',
       'FI', 'FR', 'GB-GBN', 'GE', 'HR', 'HU', 'IL', 'IN', 'IS', 'JP',
       'LT', 'LV', 'MX', 'NO', 'NZ', 'PH', 'PL', 'RU', 'SE', 'SI', 'SK',
       'SR', 'TW', 'US', 'VE', 'ZA'], dtype=object)

In [None]:
# Check the conversion: cross-tabulate the original codes against the mapped names,
# keeping missing values so unmapped codes stay visible.
pd.crosstab(index=df_wo['c_alphan'], columns=df_wo['Country_OECD_full'], dropna=False)

In [28]:
# Check the conversion via the size of the cross-table (rows = codes, columns = names).
pd.crosstab(index=df_wo['c_alphan'], columns=df_wo['Country_OECD_full']).shape

(28, 28)

### 性別

#### 女性ダミー

In [3]:
def func_female_dummy(x):
    """Code SEX as a female dummy.

    Returns 1 when x equals 2 (female), 0 when x equals 1 (male),
    and None for every other value, including missing ones.
    """
    if x == 2:  # Female
        return 1
    return 0 if x == 1 else None  # Male -> 0, anything else -> missing
# Female dummy derived from SEX (1 = female, 0 = male, missing otherwise).
df_wo['fem_dummy'] = df_wo['SEX'].map(func_female_dummy)

In [4]:
# Check the conversion: SEX against the derived dummy, missing values included.
pd.crosstab(index=df_wo['SEX'], columns=df_wo['fem_dummy'], dropna=False)

fem_dummy,0.0,1.0,NaN
SEX,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1.0,23540,0,0
2.0,0,27983,0
,0,0,145


#### 性別ラベル

In [51]:
def func_sex_label(x):
    """Turn the SEX code into a text label.

    Returns 'Male' when x equals 1, 'Female' when x equals 2, and None for
    every other value (including missing ones).

    Named differently from func_female_dummy on purpose: reusing that name
    here would silently replace the 0/1 dummy coder, so re-running the dummy
    cell afterwards would fill `fem_dummy` with labels instead of 0/1.
    """
    if x == 1:  # Male
        return 'Male'
    elif x == 2:  # Female
        return 'Female'
    else:
        return None
df_wo['SEX_label'] = df_wo['SEX'].apply(func_sex_label)

In [52]:
# Check the conversion: the text label against the female dummy, missing values included.
pd.crosstab(index=df_wo['SEX_label'], columns=df_wo['fem_dummy'], dropna=False)

fem_dummy,0.0,1.0,NaN
SEX_label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Female,0,27983,0
Male,23540,0,0
,0,0,145


### 最高学歴

In [53]:
def func_deg(x):
    """Code DEGREE as a dummy for the two highest categories.

    Returns 1 when x equals 5 or 6, 0 when x is 4 or lower, and None for
    anything else (values above 6 and missing values).
    """
    if x in (5, 6):
        return 1
    return 0 if x <= 4 else None
# Dummy for the top DEGREE categories (1 for codes 5-6, 0 for codes up to 4, missing otherwise).
df_wo['ter_DEG_dummy'] = df_wo['DEGREE'].map(func_deg)

In [54]:
# Check the conversion: DEGREE against the derived dummy, missing values included.
pd.crosstab(index=df_wo['DEGREE'], columns=df_wo['ter_DEG_dummy'], dropna=False)

ter_DEG_dummy,0.0,1.0,NaN
DEGREE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0,3032,0,0
1.0,3388,0,0
2.0,11421,0,0
3.0,12775,0,0
4.0,6797,0,0
5.0,0,8582,0
6.0,0,4978,0
,0,0,695


### 仕事満足度

In [55]:
# v44: "How satisfied are you with your current job?"
# Reverse the item so that 1 <-> 7, 2 <-> 6, ... (new value = 8 - old value).
# Vectorised instead of a row-wise lambda, and only codes inside the 1-7 scale
# are kept: the previous `x < 8` test would have turned a code of 0 or below
# into a value above 7. Everything else (including missing) becomes NaN.
v44_raw = df_wo['v44']
df_wo['satisfy_in_job'] = (8 - v44_raw).where(v44_raw.between(1, 7))

In [56]:
# Check the conversion: v44 against the reversed item, missing values included.
pd.crosstab(index=df_wo['v44'], columns=df_wo['satisfy_in_job'], dropna=False)

satisfy_in_job,1.0,2.0,3.0,4.0,5.0,6.0,7.0,NaN
v44,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1.0,0,0,0,0,0,0,4304,0
2.0,0,0,0,0,0,8248,0,0
3.0,0,0,0,0,10333,0,0,0
4.0,0,0,0,3183,0,0,0,0
5.0,0,0,1322,0,0,0,0,0
6.0,0,435,0,0,0,0,0,0
7.0,217,0,0,0,0,0,0,0
,0,0,0,0,0,0,0,23626


# 分析用データの作成

In [193]:
# ISSP 2015: mean job satisfaction of male respondents, one row per country.
is_male = df_wo['SEX_label'] == 'Male'
df_wo_analysis = (
    df_wo[is_male]
    .groupby(['Country_OECD_full'])['satisfy_in_job']
    .mean()
    .reset_index()
)

# OECD data: keep the rows for 2014 whose aggregation operation is MEDIAN.
is_2014_median = (df_gengap['TIME_PERIOD'] == 2014) & (df_gengap['AGGREGATION_OPERATION'] == 'MEDIAN')
df_gengap_analysis = (
    df_gengap[is_2014_median]
    .loc[:, ['STRUCTURE_NAME', 'REF_AREA', 'Reference area', 'OBS_VALUE']]
    .reset_index()
)

# Join the two sources side by side on the country name.
# NOTE(review): this is an inner join on names, so a country whose OECD
# "Reference area" label differs from the name produced by Country_OECD_full
# is dropped without warning - confirm the matched set is the expected one.
df_concat = pd.merge(
    df_wo_analysis,
    df_gengap_analysis,
    left_on="Country_OECD_full",
    right_on="Reference area",
)

# Keep only the columns used in the analysis and give them readable names.
df_analysis = (
    df_concat
    .loc[:, ['Country_OECD_full', 'satisfy_in_job', 'OBS_VALUE']]
    .rename(columns={"Country_OECD_full": "Country", "OBS_VALUE": "Gender wage gap"})
)

In [189]:
# Z-score standardisation of both analysis variables.
# Start from a fresh copy of df_analysis: the frame was previously never created
# in the notebook, so this cell raised NameError on a clean kernel (and otherwise
# wrote into whatever frame was left over from an earlier run).
df_analysis_std = df_analysis.copy()
df_analysis_std['satisfy_in_job'] = (df_analysis['satisfy_in_job'] - df_analysis['satisfy_in_job'].mean()) / df_analysis['satisfy_in_job'].std()
df_analysis_std['Gender wage gap'] = (df_analysis['Gender wage gap'] - df_analysis['Gender wage gap'].mean()) / df_analysis['Gender wage gap'].std()

# グラフの描画

In [200]:
# Gender wage gap (x) against men's mean job satisfaction (y):
# raw values on the left, z-scores on the right.
xy_encoding = dict(x='Gender wage gap', y='satisfy_in_job')
label_style = dict(align='left', baseline='middle', dx=7)

# Raw values: points, country labels, fitted regression line.
chart = (
    alt.Chart(df_analysis)
    .mark_point()
    .encode(**xy_encoding)
    .properties(title="男女間平均賃金格差×男性の仕事満足度")
)
text = chart.mark_text(**label_style).encode(text='Country')
line = chart.transform_regression(on='Gender wage gap', regression='satisfy_in_job').mark_line()

# Standardised values: same three layers.
chart_std = (
    alt.Chart(df_analysis_std)
    .mark_point()
    .encode(**xy_encoding)
    .properties(title="男女間平均賃金格差×男性の仕事満足度(標準化)")
)
text_std = chart_std.mark_text(**label_style).encode(text='Country')
line_std = chart_std.transform_regression(on='Gender wage gap', regression='satisfy_in_job').mark_line()

# `+` binds tighter than `|`: two layered charts placed side by side.
(chart + chart.interactive() + text + line) | (chart_std + chart_std.interactive() + text_std + line_std)

In [199]:
# Same plots with Japan left out.
df_analysis_nonj = df_analysis[df_analysis['Country'] != 'Japan']
df_analysis_std_nonj = df_analysis_std[df_analysis_std['Country'] != 'Japan']

xy_encoding = dict(x='Gender wage gap', y='satisfy_in_job')
label_style = dict(align='left', baseline='middle', dx=7)

# Raw values: points, country labels, fitted regression line.
chart = (
    alt.Chart(df_analysis_nonj)
    .mark_point()
    .encode(**xy_encoding)
    .properties(title="男女間平均賃金格差×男性の仕事満足度(日本除く)")
)
text = chart.mark_text(**label_style).encode(text='Country')
line = chart.transform_regression(on='Gender wage gap', regression='satisfy_in_job').mark_line()

# Standardised values: same three layers.
chart_std = (
    alt.Chart(df_analysis_std_nonj)
    .mark_point()
    .encode(**xy_encoding)
    .properties(title="男女間平均賃金格差×男性の仕事満足度(標準化・日本除く)")
)
text_std = chart_std.mark_text(**label_style).encode(text='Country')
line_std = chart_std.transform_regression(on='Gender wage gap', regression='satisfy_in_job').mark_line()

# `+` binds tighter than `|`: two layered charts placed side by side.
(chart + chart.interactive() + text + line) | (chart_std + chart_std.interactive() + text_std + line_std)

In [194]:
# Restrict to male respondents with WRKSUP == 1 (the "managers" subset of the chart title).
# ---------------------------------------------------------------------------------------
# ISSP 2015: mean job satisfaction of that subset, one row per country.
is_male_manager = (df_wo['SEX_label'] == 'Male') & (df_wo['WRKSUP'] == 1)
df_wo_analysis = (
    df_wo[is_male_manager]
    .groupby(['Country_OECD_full'])['satisfy_in_job']
    .mean()
    .reset_index()
)

# OECD data: keep the rows for 2014 whose aggregation operation is MEDIAN.
# (The earlier note said 2015, but the filter has always been 2014.)
is_2014_median = (df_gengap['TIME_PERIOD'] == 2014) & (df_gengap['AGGREGATION_OPERATION'] == 'MEDIAN')
df_gengap_analysis = (
    df_gengap[is_2014_median]
    .loc[:, ['STRUCTURE_NAME', 'REF_AREA', 'Reference area', 'OBS_VALUE']]
    .reset_index()
)

# Join the two sources side by side on the country name.
df_concat = pd.merge(
    df_wo_analysis,
    df_gengap_analysis,
    left_on="Country_OECD_full",
    right_on="Reference area",
)

# Keep only the columns used in the analysis and give them readable names.
df_analysis = (
    df_concat
    .loc[:, ['Country_OECD_full', 'satisfy_in_job', 'OBS_VALUE']]
    .rename(columns={"Country_OECD_full": "Country", "OBS_VALUE": "Gender wage gap"})
)

# Z-scores for THIS subset. Without this step the standardised panel below
# plotted whatever df_analysis_std an earlier cell had left behind, i.e. data
# from a different subset under the managers title.
df_analysis_std = df_analysis.copy()
df_analysis_std['satisfy_in_job'] = (df_analysis['satisfy_in_job'] - df_analysis['satisfy_in_job'].mean()) / df_analysis['satisfy_in_job'].std()
df_analysis_std['Gender wage gap'] = (df_analysis['Gender wage gap'] - df_analysis['Gender wage gap'].mean()) / df_analysis['Gender wage gap'].std()

# Plots ---------------------------------------------------------------------------------
xy_encoding = dict(x='Gender wage gap', y='satisfy_in_job')
label_style = dict(align='left', baseline='middle', dx=7)

# Raw values: points, country labels, fitted regression line.
chart = (
    alt.Chart(df_analysis)
    .mark_point()
    .encode(**xy_encoding)
    .properties(title="男女間平均賃金格差×男性管理職の仕事満足度")
)
text = chart.mark_text(**label_style).encode(text='Country')
line = chart.transform_regression(on='Gender wage gap', regression='satisfy_in_job').mark_line()

# Standardised values: same three layers.
chart_std = (
    alt.Chart(df_analysis_std)
    .mark_point()
    .encode(**xy_encoding)
    .properties(title="男女間平均賃金格差×男性管理職の仕事満足度（標準化）")
)
text_std = chart_std.mark_text(**label_style).encode(text='Country')
line_std = chart_std.transform_regression(on='Gender wage gap', regression='satisfy_in_job').mark_line()

# `+` binds tighter than `|`: two layered charts placed side by side.
(chart + chart.interactive() + text + line) | (chart_std + chart_std.interactive() + text_std + line_std)


In [201]:
# Restrict to male respondents with PARTLIV == 1 (per the original note: men who
# have a spouse/partner and live together with them).
# ---------------------------------------------------------------------------------------
# ISSP 2015: mean job satisfaction of that subset, one row per country.
is_partnered_male = (df_wo['SEX_label'] == 'Male') & (df_wo['PARTLIV'] == 1)
df_wo_analysis = (
    df_wo[is_partnered_male]
    .groupby(['Country_OECD_full'])['satisfy_in_job']
    .mean()
    .reset_index()
)

# OECD data: keep the rows for 2014 whose aggregation operation is MEDIAN.
# (The earlier note said 2015, but the filter has always been 2014.)
is_2014_median = (df_gengap['TIME_PERIOD'] == 2014) & (df_gengap['AGGREGATION_OPERATION'] == 'MEDIAN')
df_gengap_analysis = (
    df_gengap[is_2014_median]
    .loc[:, ['STRUCTURE_NAME', 'REF_AREA', 'Reference area', 'OBS_VALUE']]
    .reset_index()
)

# Join the two sources side by side on the country name.
df_concat = pd.merge(
    df_wo_analysis,
    df_gengap_analysis,
    left_on="Country_OECD_full",
    right_on="Reference area",
)

# Keep only the columns used in the analysis and give them readable names.
df_analysis = (
    df_concat
    .loc[:, ['Country_OECD_full', 'satisfy_in_job', 'OBS_VALUE']]
    .rename(columns={"Country_OECD_full": "Country", "OBS_VALUE": "Gender wage gap"})
)

# Z-score standardisation on a fresh copy of this subset. Writing the columns
# into a leftover df_analysis_std (as before) kept the old 'Country' column and
# aligned values by row index, so labels could end up on the wrong rows when the
# set of countries differs - and the cell failed outright on a clean kernel.
df_analysis_std = df_analysis.copy()
df_analysis_std['satisfy_in_job'] = (df_analysis['satisfy_in_job'] - df_analysis['satisfy_in_job'].mean()) / df_analysis['satisfy_in_job'].std()
df_analysis_std['Gender wage gap'] = (df_analysis['Gender wage gap'] - df_analysis['Gender wage gap'].mean()) / df_analysis['Gender wage gap'].std()

# Plots ---------------------------------------------------------------------------------
xy_encoding = dict(x='Gender wage gap', y='satisfy_in_job')
label_style = dict(align='left', baseline='middle', dx=7)

# Raw values: points, country labels, fitted regression line.
chart = (
    alt.Chart(df_analysis)
    .mark_point()
    .encode(**xy_encoding)
    .properties(title="男女間平均賃金格差×男性有配偶者の仕事満足度")
)
text = chart.mark_text(**label_style).encode(text='Country')
line = chart.transform_regression(on='Gender wage gap', regression='satisfy_in_job').mark_line()

# Standardised values: same three layers.
chart_std = (
    alt.Chart(df_analysis_std)
    .mark_point()
    .encode(**xy_encoding)
    .properties(title="男女間平均賃金格差×男性有配偶者の仕事満足度（標準化）")
)
text_std = chart_std.mark_text(**label_style).encode(text='Country')
line_std = chart_std.transform_regression(on='Gender wage gap', regression='satisfy_in_job').mark_line()

# `+` binds tighter than `|`: two layered charts placed side by side.
(chart + chart.interactive() + text + line) | (chart_std + chart_std.interactive() + text_std + line_std)


In [204]:
# Male respondents with PARTLIV == 1 (per the original note: men who have a
# spouse/partner and live together with them), this time with Japan left out.
# ---------------------------------------------------------------------------------------
# ISSP 2015: mean job satisfaction of that subset, one row per country.
is_partnered_male = (df_wo['SEX_label'] == 'Male') & (df_wo['PARTLIV'] == 1)
df_wo_analysis = (
    df_wo[is_partnered_male]
    .groupby(['Country_OECD_full'])['satisfy_in_job']
    .mean()
    .reset_index()
)

# OECD data: keep the rows for 2014 whose aggregation operation is MEDIAN.
# (The earlier note said 2015, but the filter has always been 2014.)
is_2014_median = (df_gengap['TIME_PERIOD'] == 2014) & (df_gengap['AGGREGATION_OPERATION'] == 'MEDIAN')
df_gengap_analysis = (
    df_gengap[is_2014_median]
    .loc[:, ['STRUCTURE_NAME', 'REF_AREA', 'Reference area', 'OBS_VALUE']]
    .reset_index()
)

# Join the two sources side by side on the country name.
df_concat = pd.merge(
    df_wo_analysis,
    df_gengap_analysis,
    left_on="Country_OECD_full",
    right_on="Reference area",
)

# Keep only the columns used in the analysis and give them readable names.
df_analysis = (
    df_concat
    .loc[:, ['Country_OECD_full', 'satisfy_in_job', 'OBS_VALUE']]
    .rename(columns={"Country_OECD_full": "Country", "OBS_VALUE": "Gender wage gap"})
)

# Z-score standardisation on a fresh copy of this subset. Writing the columns
# into a leftover df_analysis_std (as before) kept the old 'Country' column and
# aligned values by row index, so labels could end up on the wrong rows when the
# set of countries differs - and the cell failed outright on a clean kernel.
# As before, the z-scores are computed with Japan included; Japan is removed afterwards.
df_analysis_std = df_analysis.copy()
df_analysis_std['satisfy_in_job'] = (df_analysis['satisfy_in_job'] - df_analysis['satisfy_in_job'].mean()) / df_analysis['satisfy_in_job'].std()
df_analysis_std['Gender wage gap'] = (df_analysis['Gender wage gap'] - df_analysis['Gender wage gap'].mean()) / df_analysis['Gender wage gap'].std()

# Drop Japan from both the raw and the standardised frame.
df_analysis_m_nonj = df_analysis[df_analysis['Country'] != 'Japan']
df_analysis_m_std_nonj = df_analysis_std[df_analysis_std['Country'] != 'Japan']

# Plots ---------------------------------------------------------------------------------
xy_encoding = dict(x='Gender wage gap', y='satisfy_in_job')
label_style = dict(align='left', baseline='middle', dx=7)

# Raw values: points, country labels, fitted regression line.
chart = (
    alt.Chart(df_analysis_m_nonj)
    .mark_point()
    .encode(**xy_encoding)
    .properties(title="男女間平均賃金格差×男性有配偶者の仕事満足度（日本を除く）")
)
text = chart.mark_text(**label_style).encode(text='Country')
line = chart.transform_regression(on='Gender wage gap', regression='satisfy_in_job').mark_line()

# Standardised values: same three layers.
chart_std = (
    alt.Chart(df_analysis_m_std_nonj)
    .mark_point()
    .encode(**xy_encoding)
    .properties(title="男女間平均賃金格差×男性有配偶者の仕事満足度（標準化・日本を除く）")
)
text_std = chart_std.mark_text(**label_style).encode(text='Country')
line_std = chart_std.transform_regression(on='Gender wage gap', regression='satisfy_in_job').mark_line()

# `+` binds tighter than `|`: two layered charts placed side by side.
(chart + chart.interactive() + text + line) | (chart_std + chart_std.interactive() + text_std + line_std)


In [207]:
# Test of no correlation on the data set of partnered men with Japan excluded
# (df_analysis_m_nonj).
satisfy_in_job = df_analysis_m_nonj["satisfy_in_job"].to_numpy()
gap = df_analysis_m_nonj["Gender wage gap"].to_numpy()

# Pearson correlation coefficient and its p-value.
res = pearsonr(satisfy_in_job, gap)
r_value, p_value = res

print('相関係数：', r_value)
print('p値：', p_value)

相関係数： 0.2583438331926193
p値： 0.2714371025597681
