<a href="https://colab.research.google.com/github/Kazuyasus/Econometrics/blob/main/1st.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np

# Raw CSV (resume.csv) served from the course's GitHub repository.
url = "https://raw.githubusercontent.com/Kazuyasus/Econometrics/main/resume.csv"

# Fetch the file and parse it into a DataFrame.
df = pd.read_csv(url)

# Preview the first five rows; as the last expression of the cell it is
# rendered as the cell's output.
df.head(n=5)


Unnamed: 0,firstname,sex,race,call
0,Allison,female,white,0
1,Kristen,female,white,0
2,Lakisha,female,black,0
3,Latonya,female,black,0
4,Carrie,female,white,0


In [None]:

# Report the size of the data set as (number of rows, number of columns).
n_rows, n_cols = df.shape
print((n_rows, n_cols))


(4870, 4)


In [None]:
# Print the first few rows as plain text.
first_rows = df.head()
print(first_rows)


  firstname     sex   race  call
0   Allison  female  white     0
1   Kristen  female  white     0
2   Lakisha  female  black     0
3   Latonya  female  black     0
4    Carrie  female  white     0


In [None]:
# Print descriptive statistics.
# include="all" makes describe() cover every column, so the text columns get
# count/unique/top/freq and the numeric column gets mean/std/min/quartiles/max.
summary_stats = df.describe(include="all")
print(summary_stats)

       firstname     sex   race         call
count       4870    4870   4870  4870.000000
unique        36       2      2          NaN
top       Tamika  female  white          NaN
freq         256    3746   2435          NaN
mean         NaN     NaN    NaN     0.080493
std          NaN     NaN    NaN     0.272083
min          NaN     NaN    NaN     0.000000
25%          NaN     NaN    NaN     0.000000
50%          NaN     NaN    NaN     0.000000
75%          NaN     NaN    NaN     0.000000
max          NaN     NaN    NaN     1.000000


In [None]:
# Contingency table: race (rows) by whether a call-back was received (columns).
race_call_tab = pd.crosstab(index=df["race"], columns=df["call"])
print(race_call_tab)

call      0    1
race            
black  2278  157
white  2200  235


In [None]:
# Build a copy of the contingency table with row and column totals added,
# leaving race_call_tab itself unchanged.
total = race_call_tab.copy()

# Append a "Total" row holding the sum of each column.
total.loc["Total"] = total.sum(axis="index")

# Append a "Total" column holding the sum of each row (the new "Total" row is
# included, so its entry is the grand total).
total["Total"] = total.sum(axis="columns")

print(total)


call      0    1  Total
race                   
black  2278  157   2435
white  2200  235   2435
Total  4478  392   4870


In [None]:

# Overall call-back rate: total number of call-backs divided by the sample
# size (392 / 4870).
#
# How to read iloc[:, 1]:
#   ":" selects every row;
#   "1" selects the column at positional index 1 (the second column).
# len() returns the length of an object; for a DataFrame that is the number of
# rows, i.e. the sample size.
n_called = race_call_tab.iloc[:, 1].sum()
n_sample = len(df)
pass_rate = n_called / n_sample
print(pass_rate)


0.08049281314168377


In [None]:
# Call-back rate for each race.
# Rows and the call == 1 column are selected by label rather than by position,
# so each rate stays tied to the group it is named after even if the table's
# row or column order changes; a missing label raises KeyError instead of
# silently reading the wrong cell.
black_row = race_call_tab.loc["black"]
white_row = race_call_tab.loc["white"]

black_pass_rate = black_row.loc[1] / black_row.sum()  # call-backs in the "black" row (157) / row total (2435)
white_pass_rate = white_row.loc[1] / white_row.sum()  # call-backs in the "white" row (235) / row total (2435)

print("Black Pass Rate:", black_pass_rate)
print("White Pass Rate:", white_pass_rate)

Black Pass Rate: 0.06447638603696099
White Pass Rate: 0.09650924024640657


In [None]:
# Boolean masks for each race and sex value, reused to build the subsets below.
is_black = df["race"] == "black"
is_white = df["race"] == "white"
is_female = df["sex"] == "female"
is_male = df["sex"] == "male"

# Subsets for every race x sex combination.
dfBf = df[is_black & is_female]  # black and female
dfBm = df[is_black & is_male]    # black and male
dfWf = df[is_white & is_female]  # white and female
dfWm = df[is_white & is_male]    # white and male

# Racial gap among women: white women's mean call rate minus black women's.
white_women_rate = dfWf["call"].mean()
black_women_rate = dfBf["call"].mean()
diff_women = white_women_rate - black_women_rate
print("Difference in call rates (White Women - Black Women):", diff_women)

Difference in call rates (White Women - Black Women): 0.0326468944913853
