In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns

# 1セルでまとめて.head()、.tail()等を入力しても大丈夫になる
from IPython.display import display

# ホールドアウト法
from sklearn.model_selection import train_test_split

# 正解率
from sklearn.metrics import accuracy_score

# 適合率
from sklearn.metrics import precision_score

# 再現率
from sklearn.metrics import recall_score

# F1値
from sklearn.metrics import f1_score

# pytorch
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# DataFrameの表示数を変更
pd.set_option('max_columns', 500)
pd.set_option('max_rows', 500)

In [2]:
url = "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/raw/titanic.csv"

In [3]:
df = pd.read_csv(url)

In [4]:
display(df.head())
display(df.tail())
display(df.shape)
display(df.dtypes)
print()
print("数値データ 基本統計量")
display(df.describe())
print()
print("カテゴリーデータ 基本統計量")
display(df.describe(exclude="number"))

Unnamed: 0,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


Unnamed: 0,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked
886,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0,,S
887,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0,B42,S
888,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S
889,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C
890,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q


(891, 11)

survived      int64
pclass        int64
name         object
sex          object
age         float64
sibsp         int64
parch         int64
ticket       object
fare        float64
cabin        object
embarked     object
dtype: object


数値データ 基本統計量


Unnamed: 0,survived,pclass,age,sibsp,parch,fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292



カテゴリーデータ 基本統計量


Unnamed: 0,name,sex,ticket,cabin,embarked
count,891,891,891,204,889
unique,891,2,681,147,3
top,"Woolner, Mr. Hugh",male,CA. 2343,G6,S
freq,1,577,7,4,644


In [5]:
feature = ["pclass", "sibsp", "parch", "fare"]
#feature = ["Pclass", "SibSp", "Parch", "Fare"]

### ホールドアウト法

In [6]:
X_train, X_test, y_train, y_test = train_test_split(df[feature], df["survived"], test_size=0.3, random_state=42)

In [7]:
display(X_train.shape)
display(X_test.shape)
display(y_train.shape)
display(y_test.shape)

(623, 4)

(268, 4)

(623,)

(268,)

### Pytorch

In [8]:
# DataFrame型から、numpy.ndarray型に変換して、tensor型に変換
X_train = torch.Tensor(X_train.values) # ここをTensorにしないと RuntimeError: Expected object of scalar type Float but got scalar type Long for argument #2 'mat1' in call to _th_addmm 発生
X_test = torch.Tensor(X_test.values) # 同上

y_train = torch.LongTensor(y_train.values) # ここをLongTensorにしないと RuntimeError: expected scalar type Long but found Float 発生
y_test = torch.LongTensor(y_test.values) # 同上

In [9]:
torch.manual_seed(42)

<torch._C.Generator at 0x1fb9d8f1470>

In [10]:
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(4, 20)  # (X_train.shape[1], X)
        self.fc2 = nn.Linear(20, 10) # (X, Y)
        self.fc3 = nn.Linear(10, 2)   # (Y, Z)
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        return F.log_softmax(x, dim = 1)

In [11]:
model = Net()
print(model)

Net(
  (fc1): Linear(in_features=4, out_features=20, bias=True)
  (fc2): Linear(in_features=20, out_features=10, bias=True)
  (fc3): Linear(in_features=10, out_features=2, bias=True)
)


In [12]:
optimizer = optim.SGD(model.parameters(), lr=0.02)
train_loss = []
train_accu = []
i = 0

In [13]:
model.train() #学習モード

Net(
  (fc1): Linear(in_features=4, out_features=20, bias=True)
  (fc2): Linear(in_features=20, out_features=10, bias=True)
  (fc3): Linear(in_features=10, out_features=2, bias=True)
)

In [14]:
for epoch in range(2000): # 数字は適当
    data, target = Variable(X_train), Variable(y_train)#微分可能な型
    optimizer.zero_grad() #勾配初期化
    output = model(data) #データを流す
        
    loss = F.nll_loss(output, target) #loss計算
    loss.backward()    #バックプロパゲーション
    train_loss.append(loss.data.item())
    optimizer.step()   # 重み更新
        
    prediction = output.data.max(1)[1] #予測結果
    accuracy = prediction.eq(target.data).sum().numpy() / len(X_train) #正解率
    train_accu.append(accuracy)
    
    if i % 10 == 0:
        print('Train Step: {}\tLoss: {:.3f}\tAccuracy: {:.3f}'.format(i, loss.data.item(), accuracy))
    i += 1

Train Step: 0	Loss: 1.062	Accuracy: 0.629
Train Step: 10	Loss: 0.692	Accuracy: 0.385
Train Step: 20	Loss: 0.692	Accuracy: 0.385
Train Step: 30	Loss: 0.692	Accuracy: 0.385
Train Step: 40	Loss: 0.692	Accuracy: 0.385
Train Step: 50	Loss: 0.692	Accuracy: 0.390
Train Step: 60	Loss: 0.692	Accuracy: 0.392
Train Step: 70	Loss: 0.692	Accuracy: 0.390
Train Step: 80	Loss: 0.692	Accuracy: 0.390
Train Step: 90	Loss: 0.692	Accuracy: 0.392
Train Step: 100	Loss: 0.692	Accuracy: 0.393
Train Step: 110	Loss: 0.691	Accuracy: 0.395
Train Step: 120	Loss: 0.691	Accuracy: 0.398
Train Step: 130	Loss: 0.690	Accuracy: 0.604
Train Step: 140	Loss: 0.685	Accuracy: 0.642
Train Step: 150	Loss: 0.673	Accuracy: 0.660
Train Step: 160	Loss: 0.671	Accuracy: 0.642
Train Step: 170	Loss: 0.655	Accuracy: 0.676
Train Step: 180	Loss: 0.645	Accuracy: 0.685
Train Step: 190	Loss: 0.638	Accuracy: 0.666
Train Step: 200	Loss: 0.636	Accuracy: 0.655
Train Step: 210	Loss: 0.633	Accuracy: 0.663
Train Step: 220	Loss: 0.630	Accuracy: 0.668

In [15]:
print('Train Step: {}\tLoss: {:.3f}\tAccuracy: {:.3f}'.format(i, loss.data.item(), accuracy))

Train Step: 2000	Loss: 0.607	Accuracy: 0.684


In [16]:
#精度検証
model.eval() #推論モード
outputs = model(Variable(X_test))
_, predicted = torch.max(outputs.data, 1)

In [17]:
print(f"正解率 :{accuracy_score(y_test, predicted)}")
print(f"適合率 :{precision_score(y_test, predicted)}")
print(f"再現率 :{recall_score(y_test, predicted)}")
print(f"F1スコア :{f1_score(y_test, predicted)}")

正解率 :0.7238805970149254
適合率 :0.6907216494845361
再現率 :0.6036036036036037
F1スコア :0.6442307692307693
