In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import torch.nn.functional as F
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from torch.utils.data import DataLoader, TensorDataset

In [4]:
# Load data from a CSV file or another data source
file_path = '~/Desktop/547943_data.csv'
df = pd.read_csv(file_path)

In [12]:
# Sorting by game_date in descending order
df = df.sort_values(by='game_date', ascending=True)

# Splitting data into train and test based on game_date
train_data = df[df['game_date'].apply(lambda x: int(x[:4])) <= 2018]  # Training data for 2017, 2018
test_data = df[df['game_date'].apply(lambda x: int(x[:4])) == 2019]  # Test data for 2019

# 선택한 열 유지
selected_columns = [
    'pitch_type', 'game_date', 'release_speed', 'release_pos_x', 'release_pos_z', 'player_name', 'batter', 'pitcher',
    'events', 'description', 'spin_dir', 'spin_rate_deprecated', 'break_angle_deprecated', 'break_length_deprecated',
    'zone', 'des', 'game_type', 'stand', 'p_throws', 'home_team', 'away_team', 'type', 'hit_location', 'bb_type',
    'balls', 'strikes', 'game_year', 'pfx_x', 'pfx_z', 'plate_x', 'plate_z', 'on_3b', 'on_2b', 'on_1b',
    'outs_when_up', 'inning', 'inning_topbot', 'hc_x', 'hc_y', 'tfs_deprecated', 'tfs_zulu_deprecated', 'fielder_2',
    'umpire', 'sv_id', 'vx0', 'vy0', 'vz0', 'ax', 'ay', 'az', 'sz_top', 'sz_bot', 'hit_distance_sc', 'launch_speed',
    'launch_angle', 'effective_speed', 'release_spin_rate', 'release_extension', 'game_pk', 'pitcher.1', 'fielder_2.1',
    'fielder_3', 'fielder_4', 'fielder_5', 'fielder_6', 'fielder_7', 'fielder_8', 'fielder_9', 'release_pos_y',
    'estimated_ba_using_speedangle', 'estimated_woba_using_speedangle', 'woba_value', 'woba_denom', 'babip_value',
    'iso_value', 'launch_speed_angle', 'at_bat_number', 'pitch_number', 'pitch_name', 'home_score', 'away_score',
    'bat_score', 'fld_score', 'post_away_score', 'post_home_score', 'post_bat_score', 'post_fld_score',
    'if_fielding_alignment', 'of_fielding_alignment', 'spin_axis', 'delta_home_win_exp', 'delta_run_exp'
]

df_selected = df[selected_columns]



# 범주형 열을 수치형으로 변환 및 인코딩
label_encoder = LabelEncoder()
categorical_cols = ['pitch_type', 'player_name', 'batter', 'pitcher', 'events', 'description', 'game_type', 'stand',
                     'p_throws', 'home_team', 'away_team', 'type', 'hit_location', 'bb_type', 'fielder_2', 'umpire',
                     'pitch_name', 'if_fielding_alignment', 'of_fielding_alignment']

for col in categorical_cols:
    df_selected[col] = label_encoder.fit_transform(df_selected[col].astype(str))



# 필요한 경우 여기에 추가적인 전처리 수행
numeric_cols = df_selected.select_dtypes(include='number').columns
non_numeric_cols = list(set(df_selected.columns) - set(numeric_cols))

df_selected[numeric_cols] = df_selected[numeric_cols].fillna(df_selected[numeric_cols].mean())
df_selected[non_numeric_cols] = df_selected[non_numeric_cols].fillna(df_selected[non_numeric_cols].mode().iloc[0])


# Prepare data for training
target_column = ['pitch_type', 'batter', 'pitcher', 'events', 'release_speed', 'release_pos_x', 'release_pos_z',
     'release_extension', 'hc_x', 'hc_y', 'hit_location', 'estimated_ba_using_speedangle',
    'estimated_woba_using_speedangle', 'babip_value', 'iso_value', 'launch_speed', 'launch_angle',
    'effective_speed', 'hit_distance_sc', 'on_3b', 'on_2b', 'on_1b', 'pitch_number', 'at_bat_number',
    'home_score', 'away_score', 'bat_score', 'fld_score']  # Replace with your target column name
X = df_selected[target_column]
y = df_selected[target_column]


# 결측치 제거
#df_selected = df_selected.fillna(df_selected.mean())  # 평균값으로 대체


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert data to PyTorch tensors
train_data = torch.tensor(X_train.values, dtype=torch.float32)
train_labels = torch.tensor(y_train.values, dtype=torch.float32)
test_data = torch.tensor(X_test.values, dtype=torch.float32)
test_labels = torch.tensor(y_test.values, dtype=torch.float32)


# Create DataLoader for training and testing
train_dataset = TensorDataset(train_data, train_labels)
test_dataset = TensorDataset(test_data, test_labels)

batch_size = 16
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# GPU 사용 가능 여부 확인
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# 모델을 GPU로 이동
model.to(device)


Using device: cpu


MGMContrastiveModel(
  (embedding): Linear(in_features=42, out_features=512, bias=True)
  (transformer): TransformerEncoder(
    (layers): ModuleList(
      (0-7): 8 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
        )
        (linear1): Linear(in_features=512, out_features=2048, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=2048, out_features=512, bias=True)
        (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (fc): Linear(in_features=512, out_features=42, bias=True)
)

In [13]:
class MGMContrastiveModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(MGMContrastiveModel, self).__init__()
        self.embedding = nn.Linear(input_size, hidden_size)
        self.transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model=hidden_size, nhead=8, batch_first=True),
            num_layers=num_layers
        )
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        x = self.embedding(x)
        x = self.transformer(x)
        x = self.fc(x)
        return x

# 모델 초기화
input_size = len(selected_columns) - 1 # game date열은 제외하고 나머지를 사용
hidden_size = 512
num_layers = 8
output_size = len(target_column)

# 확인된 입력 데이터의 열의 수를 input_size로 사용
input_size = X_train.shape[1]

model = MGMContrastiveModel(input_size, hidden_size, num_layers, output_size)

# 손실 함수 및 옵티마이저 설정
criterion = nn.MSELoss()  # 회귀 작업에 맞는 손실 함수 사용
optimizer = torch.optim.Adam(model.parameters(), lr=5e-4, betas=(0.9, 0.999))

# 훈련 파라미터 설정
batch_size = 8
num_iterations = 1000  # 임의로 설정한 반복 횟수

# DataLoader에서 반복하여 모델 학습
for epoch in range(num_iterations):
    total_loss = 0.0

    # 모델을 훈련 모드로 설정
    model.train()

    for batch_data, batch_labels in train_loader:
        # 모델을 통해 forward pass 수행
        outputs = model(train_data)

        # 회귀 작업에 맞는 손실 계산
        loss = criterion(outputs, train_labels)

        # Backward pass 및 최적화
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # 현재 미니배치의 손실을 누적
        total_loss += loss.item()

    # 현재 epoch의 평균 손실 계산
    average_loss = total_loss / len(train_loader)

    # 학습 중 손실 출력 등을 추가로 수행할 수 있음
    if (epoch + 1) % 100 == 0:
        print(f'Epoch [{epoch + 1}/{num_iterations}], Train Loss: {average_loss}')

    # 테스트 데이터를 사용하여 모델 평가
    model.eval()
    test_loss = 0.0
    with torch.no_grad():
        for batch_data, batch_labels in test_loader:
            # 모델을 통해 forward pass 수행
            outputs = model(test_data)

            # 회귀 작업에 맞는 손실 계산
            loss = criterion(outputs, test_labels)

            # 테스트 데이터의 손실을 누적
            test_loss += loss.item()

    # 테스트 데이터에 대한 평균 손실 계산
    average_test_loss = test_loss / len(test_loader)

    print(f'Epoch [{epoch + 1}/{num_iterations}], Train Loss: {average_loss}, Test Loss: {average_test_loss}')

# 학습 종료 후 추가 작업 수행
# 예를 들어, 최종 모델 저장, 시각화 등의 작업을 수행할 수 있음