In [1]:
import pandas as pd
import os

# 外部データのzipファイルの読み込みに必要
import zipfile
import requests
import io

# まとめて表示できるようにする
from IPython.display import display

# DataFrameの表示数を変更
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 500)



In [2]:
# カレントディレクトリの参照
path = os.getcwd()

### 外部データの読込(zipファイルの展開含む）

In [3]:
# データがあるurlの指定
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00356/student.zip'

# データをurlから取得
r = requests.get(url, stream=True)

# zipファイル内のstudent-mat.csvをカレントディレクトリ内に展開
with zipfile.ZipFile(io.BytesIO(r.content)) as existing_zip:
    existing_zip.extract('student-mat.csv', './')

In [4]:
# データの読込
df = pd.read_csv(path + "\student-mat.csv", sep = ';')

### データの解説は下記サイト参照
https://archive.ics.uci.edu/ml/datasets/student+performance

In [5]:
# まとめて df を表示
display(df.head())
display(df.tail())
display(df.dtypes)

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,reason,guardian,traveltime,studytime,failures,schoolsup,famsup,paid,activities,nursery,higher,internet,romantic,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,course,mother,2,2,0,yes,no,no,no,yes,yes,no,no,4,3,4,1,1,3,6,5,6,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,course,father,1,2,0,no,yes,no,no,no,yes,yes,no,5,3,3,1,1,3,4,5,5,6
2,GP,F,15,U,LE3,T,1,1,at_home,other,other,mother,1,2,3,yes,no,yes,no,yes,yes,yes,no,4,3,2,2,3,3,10,7,8,10
3,GP,F,15,U,GT3,T,4,2,health,services,home,mother,1,3,0,no,yes,yes,yes,yes,yes,yes,yes,3,2,2,1,1,5,2,15,14,15
4,GP,F,16,U,GT3,T,3,3,other,other,home,father,1,2,0,no,yes,yes,no,yes,yes,no,no,4,3,2,1,2,5,4,6,10,10


Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,reason,guardian,traveltime,studytime,failures,schoolsup,famsup,paid,activities,nursery,higher,internet,romantic,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
390,MS,M,20,U,LE3,A,2,2,services,services,course,other,1,2,2,no,yes,yes,no,yes,yes,no,no,5,5,4,4,5,4,11,9,9,9
391,MS,M,17,U,LE3,T,3,1,services,services,course,mother,2,1,0,no,no,no,no,no,yes,yes,no,2,4,5,3,4,2,3,14,16,16
392,MS,M,21,R,GT3,T,1,1,other,other,course,other,1,1,3,no,no,no,no,no,yes,no,no,5,5,3,3,3,3,3,10,8,7
393,MS,M,18,R,LE3,T,3,2,services,other,course,mother,3,1,0,no,no,no,no,no,yes,yes,no,4,4,1,3,4,5,0,11,12,10
394,MS,M,19,U,LE3,T,1,1,other,at_home,course,father,1,1,0,no,no,no,no,yes,yes,yes,no,3,2,3,3,3,5,5,8,9,9


school        object
sex           object
age            int64
address       object
famsize       object
Pstatus       object
Medu           int64
Fedu           int64
Mjob          object
Fjob          object
reason        object
guardian      object
traveltime     int64
studytime      int64
failures       int64
schoolsup     object
famsup        object
paid          object
activities    object
nursery       object
higher        object
internet      object
romantic      object
famrel         int64
freetime       int64
goout          int64
Dalc           int64
Walc           int64
health         int64
absences       int64
G1             int64
G2             int64
G3             int64
dtype: object

In [6]:
# 集計
df.groupby('sex').agg(
    count=('sex', 'count'),     # sex の個数をカウント
    age_mean=('age', 'mean'),   # age の平均値
    G1_mean=('G1', 'mean'),     # G1  の平均値
    G1_median=('G1', 'median'), # G1  の中央値
    G2_mean=('G2', 'sum'),      # G2  の合計
    G2_median=('G2', 'max'),    # G2  の最大値
    G3_mean=('G3', 'min'),      # G3  の最小値
    G3_median=('G3', 'std')     # G3  の標準偏差
)

Unnamed: 0_level_0,count,age_mean,G1_mean,G1_median,G2_mean,G2_median,G3_mean,G3_median
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
F,208,16.730769,10.620192,10,2161,18,0,4.622338
M,187,16.657754,11.229947,11,2071,19,0,4.495297
