In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

# Seed the RNGs up front so that weight initialisation and DataLoader
# shuffling are repeatable after Restart Kernel -> Run All.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


## データの読み込みと確認

In [2]:
# Read the train / test splits from the working directory.
train = pd.read_csv("./train.csv")
test = pd.read_csv("./test.csv", header=0)

# Preview the first rows of the training split.
train.head()

Unnamed: 0.1,Unnamed: 0,id,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,...,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction
0,0,70172,Male,Loyal Customer,13,Personal Travel,Eco Plus,460,3,4,...,5,4,3,4,4,5,5,25,18.0,neutral or dissatisfied
1,1,5047,Male,disloyal Customer,25,Business travel,Business,235,3,2,...,1,1,5,3,1,4,1,1,6.0,neutral or dissatisfied
2,2,110028,Female,Loyal Customer,26,Business travel,Business,1142,2,2,...,5,4,3,4,4,4,5,0,0.0,satisfied
3,3,24026,Female,Loyal Customer,25,Business travel,Business,562,2,5,...,2,2,5,3,1,4,2,11,9.0,neutral or dissatisfied
4,4,119299,Male,Loyal Customer,61,Business travel,Business,214,3,3,...,3,3,4,4,3,3,3,0,0.0,satisfied


In [3]:
# Count how many rows fall into each value of the target column.
train["satisfaction"].value_counts()

neutral or dissatisfied    58879
satisfied                  45025
Name: satisfaction, dtype: int64

In [4]:
# Print the column names, non-null counts and dtypes of the training frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103904 entries, 0 to 103903
Data columns (total 25 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   Unnamed: 0                         103904 non-null  int64  
 1   id                                 103904 non-null  int64  
 2   Gender                             103904 non-null  object 
 3   Customer Type                      103904 non-null  object 
 4   Age                                103904 non-null  int64  
 5   Type of Travel                     103904 non-null  object 
 6   Class                              103904 non-null  object 
 7   Flight Distance                    103904 non-null  int64  
 8   Inflight wifi service              103904 non-null  int64  
 9   Departure/Arrival time convenient  103904 non-null  int64  
 10  Ease of Online booking             103904 non-null  int64  
 11  Gate location                      1039

In [5]:
# Number of missing values per column.
train.isna().sum()

Unnamed: 0                             0
id                                     0
Gender                                 0
Customer Type                          0
Age                                    0
Type of Travel                         0
Class                                  0
Flight Distance                        0
Inflight wifi service                  0
Departure/Arrival time convenient      0
Ease of Online booking                 0
Gate location                          0
Food and drink                         0
Online boarding                        0
Seat comfort                           0
Inflight entertainment                 0
On-board service                       0
Leg room service                       0
Baggage handling                       0
Checkin service                        0
Inflight service                       0
Cleanliness                            0
Departure Delay in Minutes             0
Arrival Delay in Minutes             310
satisfaction    

## データの前処理

### 特徴量の作成

#### 不要な情報を除外

In [6]:
# Remove the columns this notebook treats as unnecessary from both splits.
unused_columns = ['Unnamed: 0', 'id']
train = train.drop(columns=unused_columns)
test = test.drop(columns=unused_columns)

#### データ前処理のpipeline

In [7]:
target = ["satisfaction"]
categorical_columns = ["Gender", "Customer Type", "Type of Travel", "Class"]

# Every remaining column (neither target nor categorical) is handled as numeric;
# the original column order of `train` is preserved.
excluded_columns = set(target) | set(categorical_columns)
numeric_columns = [column for column in train.columns if column not in excluded_columns]

In [8]:
# Numeric features: fill missing values with the column median, then standardise.
numeric_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

In [9]:
# Categorical features: one-hot encode, ignoring categories not seen during fit.
onehot_encoder = OneHotEncoder(handle_unknown="ignore")
categorical_transformer = Pipeline(steps=[("onehot", onehot_encoder)])

In [10]:
# Send each column group through its own sub-pipeline.
column_pipelines = [
    ("num", numeric_transformer, numeric_columns),
    ("cat", categorical_transformer, categorical_columns),
]
preprocessor = ColumnTransformer(transformers=column_pipelines)

In [11]:
# Separate the inputs from the target column in both splits.
train_inputs = train.drop(columns=["satisfaction"])
test_inputs = test.drop(columns=["satisfaction"])

# Fit the preprocessing on the training inputs only, then apply it to the test inputs.
X_train = preprocessor.fit_transform(train_inputs)
X_test = preprocessor.transform(test_inputs)

### 目的変数の作成

In [12]:
# Encoder used below to turn the "satisfaction" labels into integer class ids.
label_encoder = LabelEncoder()

In [13]:
# Learn the label -> integer mapping from the training labels only.
Y_train = label_encoder.fit_transform(train["satisfaction"])

# Re-use the mapping fitted on the training labels. Calling fit_transform here
# would re-fit the encoder on the test labels, which overwrites the fitted
# classes and can assign different integers if the test label set differs.
Y_test = label_encoder.transform(test["satisfaction"])

## 教師データの作成

In [14]:
# Hold out 20% of the training rows for validation; the fixed random_state
# keeps the split identical between runs.
feature_train, feature_val, target_train, target_val = train_test_split(
    X_train,
    Y_train,
    test_size=0.20,
    random_state=42,
)

In [15]:
class CustomDataset(Dataset):
    """In-memory dataset pairing a feature matrix with integer class targets.

    Both inputs are converted to tensors once, at construction time, and moved
    to the module-level ``device``; indexing afterwards only reads from those
    tensors.
    """

    def __init__(self, features, targets):
        # float32 features / int64 targets, as required by the loss used later.
        feature_tensor = torch.tensor(features, dtype=torch.float32)
        target_tensor = torch.tensor(targets, dtype=torch.long)
        self.features = feature_tensor.to(device)
        self.targets = target_tensor.to(device)

    def __len__(self):
        # The number of samples is taken from the target tensor.
        return len(self.targets)

    def __getitem__(self, idx):
        sample = self.features[idx]
        label = self.targets[idx]
        return sample, label

In [23]:
# Wrap each split (train / validation / test) in a CustomDataset.
train_dataset = CustomDataset(features=feature_train, targets=target_train)
val_dataset = CustomDataset(features=feature_val, targets=target_val)
test_dataset = CustomDataset(features=X_test, targets=Y_test)

In [24]:
# One batch size for all three loaders; only the training batches are shuffled.
BATCH_SIZE = 32
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)


## モデルの実装

In [18]:
class SimpleNN(nn.Module):
    """Small fully connected classifier: input -> 64 -> 32 -> 2 logits.

    ReLU is applied after each of the two hidden layers; the output layer
    returns raw, un-normalised scores for the two classes.
    """

    def __init__(self, input_size):
        super().__init__()
        # Layers are registered in forward order.
        self.fc1 = nn.Linear(input_size, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, 2)  # two output logits (binary classification)

    def forward(self, x):
        hidden = self.fc1(x).relu()
        hidden = self.fc2(hidden).relu()
        logits = self.fc3(hidden)
        return logits

In [19]:
# The input width of the network is the number of columns in the
# preprocessed feature matrix.
input_size = X_train.shape[1]

# Build the network and place its parameters on the selected device.
model = SimpleNN(input_size)
model = model.to(device)

In [20]:
def train_model(model, train_loader, val_loader, n_epochs, learning_rate):
    """Train ``model`` with Adam and print validation metrics after every epoch.

    Args:
        model: ``nn.Module`` returning raw class logits of shape
            ``(batch, n_classes)``.
        train_loader: Iterable of ``(features, targets)`` batches used for the
            optimisation steps.
        val_loader: Iterable of ``(features, targets)`` batches used only for
            evaluation (no gradient updates).
        n_epochs: Number of full passes over ``train_loader``.
        learning_rate: Learning rate passed to ``optim.Adam``.

    Returns:
        None. One summary line per epoch is printed. Losses are averaged over
        the samples actually seen; when a loader yields no samples the
        corresponding metric is reported as ``nan`` instead of raising
        ``ZeroDivisionError``.
    """
    # CrossEntropyLoss takes raw logits and integer class labels.
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Batches are moved to the model's device, so loaders that yield CPU
    # tensors also work; the move is a no-op for tensors already there.
    model_device = next(model.parameters()).device
    nan = float("nan")

    for epoch in range(n_epochs):
        # ---- optimisation pass ----
        model.train()
        train_loss = 0.0
        train_seen = 0
        for X_batch, y_batch in train_loader:
            X_batch = X_batch.to(model_device)
            y_batch = y_batch.to(model_device)

            optimizer.zero_grad()
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            loss.backward()
            optimizer.step()

            # loss is a per-batch mean; weight it by batch size so the epoch
            # figure is a true per-sample average even with a short last batch.
            train_loss += loss.item() * X_batch.size(0)
            train_seen += X_batch.size(0)

        # ---- validation pass ----
        model.eval()
        val_loss = 0.0
        correct = 0
        total = 0
        with torch.no_grad():
            for X_batch, y_batch in val_loader:
                X_batch = X_batch.to(model_device)
                y_batch = y_batch.to(model_device)

                outputs = model(X_batch)
                loss = criterion(outputs, y_batch)
                val_loss += loss.item() * X_batch.size(0)

                # Predicted class = index of the largest logit.
                predicted = outputs.argmax(dim=1)
                total += y_batch.size(0)
                correct += (predicted == y_batch).sum().item()

        # Per-sample averages, guarded against empty loaders.
        train_loss = train_loss / train_seen if train_seen else nan
        val_loss = val_loss / total if total else nan
        accuracy = 100 * correct / total if total else nan

        print(f"Epoch {epoch+1}/{n_epochs}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, Accuracy: {accuracy:.2f}%")

In [21]:
# Train for 20 epochs with a learning rate of 0.001.
train_model(
    model,
    train_loader,
    val_loader,
    n_epochs=20,
    learning_rate=0.001,
)

Epoch 1/20, Train Loss: 0.1970, Val Loss: 0.1464, Accuracy: 93.98%
Epoch 2/20, Train Loss: 0.1367, Val Loss: 0.1190, Accuracy: 94.99%
Epoch 3/20, Train Loss: 0.1174, Val Loss: 0.1169, Accuracy: 94.62%
Epoch 4/20, Train Loss: 0.1083, Val Loss: 0.1031, Accuracy: 95.48%
Epoch 5/20, Train Loss: 0.1018, Val Loss: 0.1003, Accuracy: 95.54%
Epoch 6/20, Train Loss: 0.0975, Val Loss: 0.0990, Accuracy: 95.62%
Epoch 7/20, Train Loss: 0.0945, Val Loss: 0.0977, Accuracy: 95.79%
Epoch 8/20, Train Loss: 0.0926, Val Loss: 0.0950, Accuracy: 95.81%
Epoch 9/20, Train Loss: 0.0903, Val Loss: 0.0953, Accuracy: 95.91%
Epoch 10/20, Train Loss: 0.0888, Val Loss: 0.0955, Accuracy: 95.86%
Epoch 11/20, Train Loss: 0.0872, Val Loss: 0.0937, Accuracy: 95.92%
Epoch 12/20, Train Loss: 0.0863, Val Loss: 0.0915, Accuracy: 95.93%
Epoch 13/20, Train Loss: 0.0846, Val Loss: 0.0905, Accuracy: 96.18%
Epoch 14/20, Train Loss: 0.0828, Val Loss: 0.0930, Accuracy: 96.03%
Epoch 15/20, Train Loss: 0.0820, Val Loss: 0.0917, Accura

## モデルの検証

In [36]:
from sklearn.metrics import classification_report

def evaluate_model(model, test_loader):
    """Evaluate ``model`` on ``test_loader`` and print loss plus a class report.

    Args:
        model: ``nn.Module`` returning raw class logits of shape
            ``(batch, n_classes)``.
        test_loader: Iterable of ``(features, targets)`` batches.

    Returns:
        None. Prints the per-sample average cross-entropy loss and
        scikit-learn's ``classification_report`` (4 digits).
    """
    model.eval()  # evaluation mode
    criterion = nn.CrossEntropyLoss()  # same loss function as during training

    # Move batches to the model's device so loaders that yield CPU tensors
    # also work; a model without parameters leaves the batches untouched.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else None

    all_predictions = []
    all_targets = []
    test_loss = 0.0
    n_samples = 0

    with torch.no_grad():
        for X_batch, y_batch in test_loader:
            if model_device is not None:
                X_batch = X_batch.to(model_device)
                y_batch = y_batch.to(model_device)

            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            # Weight the per-batch mean by batch size for a per-sample average.
            test_loss += loss.item() * X_batch.size(0)
            n_samples += X_batch.size(0)

            # Predicted class = index of the largest logit; collect plain
            # Python ints for the report.
            predicted = outputs.argmax(dim=1)
            all_predictions.extend(predicted.cpu().tolist())
            all_targets.extend(y_batch.cpu().tolist())

    # Average over the samples actually seen.
    test_loss = test_loss / n_samples if n_samples else float("nan")

    print(f"Test Loss: {test_loss:.4f}")
    print("Classification Report:")
    print(classification_report(all_targets, all_predictions, digits=4))

# Evaluate the trained model on the held-out test split via test_loader.
evaluate_model(model, test_loader)

Test Loss: 0.0907
Classification Report:
              precision    recall  f1-score   support

           0     0.9567    0.9744    0.9655     14573
           1     0.9665    0.9436    0.9549     11403

    accuracy                         0.9609     25976
   macro avg     0.9616    0.9590    0.9602     25976
weighted avg     0.9610    0.9609    0.9608     25976

