# Thư viện

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Tải và chuẩn bị dữ liệu

In [None]:
print("--- Section 1: Loading and Preparing Data ---")

# Location of the source CSV file.
data_path = 'F:/dataML/data/TRX/output/TRX_18_26555370_20200628_061900_223181.csv'

# Dataset name = file name with the directory and extension stripped.
csv_name = os.path.basename(data_path)
dataset_name = os.path.splitext(csv_name)[0]

# Read the raw table and report its size before any reshaping.
df = pd.read_csv(data_path)
print(f"Successfully loaded data: {dataset_name}")
print(f"Data shape: {df.shape}")

# Parse 'OpenTime' into datetimes and promote it to the row index.
df = df.assign(OpenTime=lambda frame: pd.to_datetime(frame['OpenTime'])).set_index('OpenTime')
print("Data prepared with 'OpenTime' as index.")

# Lựa chọn thuộc tính và tách dữ liệu

In [None]:
# Target column and the model's input columns.
Y_TYPE = 'close_vs_maxhigh60_r8'
features = (
    ['sma_macd_diff_5', 'sma_macd_diff_10', 'sma_macd_diff_sub']
    + ['macd_diff', 'macd', 'rsi_14']
    + ['ema_9', 'ema_25', 'ema_50']
    + ['sma_9', 'sma_25', 'sma_50']
    + ['Close']
)

X, y = df[features], df[Y_TYPE]

# Split the data: 70% train, 20% val, 10% test.
# shuffle=False keeps the rows in their original order. The test share is
# carved off first, then 0.2222 of the remaining 90% (about 20% of the
# total) becomes the validation share.
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, shuffle=False, random_state=42, test_size=0.1
)
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, shuffle=False, random_state=42, test_size=0.2222
)

for split_label, split_frame in (("Train", X_train), ("Validation", X_val), ("Test", X_test)):
    print(f"{split_label} set size: {split_frame.shape}")

# Scale dữ liệu và cân bằng lớp

In [None]:
# Learn the scaling statistics from the training split only, then apply the
# same transformation to every split.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# Apply SMOTE only on the training set.
# NOTE(review): the resampled rows are presumably no longer in chronological
# order (synthetic rows are not real time steps) — confirm before using the
# resampled array to build time windows.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)

for caption, label_source in (
    ("Original y_train class distribution:", lambda: y_train),
    ("\ny_train class distribution after SMOTE:", lambda: pd.Series(y_train_resampled)),
):
    print(caption)
    print(label_source().value_counts())

# Tạo chuỗi cho mô hình LSTM

In [None]:
def create_sequences(X, y, time_steps=60):
    """Cut a row-ordered feature table into overlapping fixed-length windows.

    Window ``i`` holds rows ``i .. i + time_steps - 1`` of ``X`` and is paired
    with the label at row ``i + time_steps`` (the row immediately after the
    window).

    Args:
        X: Array-like of shape (n_rows, ...); rows are taken in the given order.
        y: Array-like of labels with at least ``n_rows`` entries. It is always
            indexed by position, so a pandas Series with any index is safe.
        time_steps: Number of consecutive rows per window.

    Returns:
        Tuple ``(Xs, ys)`` where ``Xs`` has shape
        ``(n_rows - time_steps, time_steps, ...)`` and ``ys`` has shape
        ``(n_rows - time_steps,)``. When there are too few rows for a single
        window, both are empty but keep that dimensionality.

    Raises:
        ValueError: If ``y`` has fewer entries than ``X`` has rows.
    """
    # Work on plain arrays so indexing is positional regardless of input type.
    X_arr = np.asarray(X)
    y_arr = np.asarray(y)

    n_windows = len(X_arr) - time_steps
    if n_windows <= 0:
        # Keep the (samples, time_steps, features) rank so callers can still
        # read .shape[2] instead of failing on a flat empty array.
        empty_X = np.empty((0, time_steps) + X_arr.shape[1:], dtype=X_arr.dtype)
        empty_y = np.empty((0,), dtype=y_arr.dtype)
        return empty_X, empty_y

    if len(y_arr) < len(X_arr):
        raise ValueError(
            f"y has {len(y_arr)} entries but X has {len(X_arr)} rows; "
            "every window needs a label"
        )

    Xs = np.stack([X_arr[start:start + time_steps] for start in range(n_windows)])
    ys = y_arr[time_steps:time_steps + n_windows]
    return Xs, ys

TIME_STEPS = 60  # Window length in rows (60 minutes if rows are 1-minute bars — TODO confirm)


def _oversample_windows(X_seq, y_seq, seed=42):
    """Balance classes by randomly duplicating whole minority-class windows.

    Every original window is kept; each class with fewer windows than the
    largest class gets extra copies drawn with replacement until the counts
    match. ``seed`` makes the draw repeatable.
    """
    if len(y_seq) == 0:
        return X_seq, y_seq
    rng = np.random.default_rng(seed)
    classes, counts = np.unique(y_seq, return_counts=True)
    target_count = counts.max()
    selected = [np.arange(len(y_seq))]
    for cls, count in zip(classes, counts):
        if count < target_count:
            members = np.flatnonzero(y_seq == cls)
            selected.append(rng.choice(members, size=target_count - count, replace=True))
    selected = np.concatenate(selected)
    return X_seq[selected], y_seq[selected]


# Training windows are cut from the chronologically ordered scaled rows, not
# from X_train_resampled: SMOTE's synthetic rows are not part of the time
# series, so windows sliding over the resampled array would not be real
# 60-step histories. Class balance is restored afterwards at window level.
X_train_seq, y_train_seq = create_sequences(X_train_scaled, y_train.values, TIME_STEPS)
X_train_seq, y_train_seq = _oversample_windows(X_train_seq, y_train_seq)

# Validation and test windows keep their natural class distribution.
X_val_seq, y_val_seq = create_sequences(X_val_scaled, y_val.values, TIME_STEPS)
X_test_seq, y_test_seq = create_sequences(X_test_scaled, y_test.values, TIME_STEPS)

print(f"X_train_seq shape: {X_train_seq.shape} (samples, time_steps, features)")
print(f"y_train_seq shape: {y_train_seq.shape}")

# Xây dựng DataLoaders với PyTorch

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")


def _features_to_tensor(array):
    """Return `array` as a float32 tensor placed on `device`."""
    return torch.tensor(array, dtype=torch.float32).to(device)


def _labels_to_tensor(array):
    """Return `array` as a float32 column tensor (extra dim at axis 1) on `device`."""
    return torch.tensor(array, dtype=torch.float32).unsqueeze(1).to(device)


# Convert every split to tensors.
X_train_tensor, y_train_tensor = _features_to_tensor(X_train_seq), _labels_to_tensor(y_train_seq)
X_val_tensor, y_val_tensor = _features_to_tensor(X_val_seq), _labels_to_tensor(y_val_seq)
X_test_tensor, y_test_tensor = _features_to_tensor(X_test_seq), _labels_to_tensor(y_test_seq)

# Wrap the tensors in datasets.
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

# Only the training loader shuffles; validation and test keep their order.
BATCH_SIZE = 128
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)
print("DataLoaders created successfully.")

# Định nghĩa mô hình LSTM

In [None]:
class LSTMModel(nn.Module):
    """Stacked LSTM followed by a two-layer head producing one logit per sample.

    Args:
        input_size: Number of features per time step.
        hidden_size: Width of the LSTM hidden state; the first dense layer
            halves it (integer division).
        num_layers: Number of stacked LSTM layers.
        dropout: Dropout probability used both inside the LSTM and in the head.
    """

    def __init__(self, input_size, hidden_size=128, num_layers=2, dropout=0.3):
        super().__init__()
        head_width = hidden_size // 2
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=dropout,
            batch_first=True,
        )
        self.fc1 = nn.Linear(hidden_size, head_width)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout)
        self.fc2 = nn.Linear(head_width, 1)

    def forward(self, x):
        """Map a (batch, time, features) tensor to (batch, 1) raw scores.

        No sigmoid is applied here; the output is an unnormalised logit.
        """
        sequence_out, _ = self.lstm(x)
        # Only the output at the final time step feeds the dense head.
        last_step = sequence_out[:, -1, :]
        hidden = self.dropout(self.relu(self.fc1(last_step)))
        return self.fc2(hidden)

# Huấn luyện mô hình và Tối ưu

In [None]:
input_size = X_train_seq.shape[2]  # Number of features per time step
model = LSTMModel(input_size=input_size).to(device)
criterion = nn.BCEWithLogitsLoss()  # Takes the model's raw logits
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
# Halve the learning rate after 3 epochs without validation-loss improvement.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=3, factor=0.5)

EPOCHS = 50
PATIENCE = 10  # Epochs without validation improvement before early stopping

best_val_loss = float('inf')
patience_counter = 0
train_losses, val_losses = [], []
best_model_state_dict = None

for epoch in range(EPOCHS):
    # ---- training pass ----
    model.train()
    total_train_loss = 0
    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()
        total_train_loss += loss.item()
    avg_train_loss = total_train_loss / len(train_loader)
    train_losses.append(avg_train_loss)

    # ---- validation pass (no gradient tracking) ----
    model.eval()
    total_val_loss = 0
    with torch.no_grad():
        for X_batch, y_batch in val_loader:
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            total_val_loss += loss.item()
    avg_val_loss = total_val_loss / len(val_loader)
    val_losses.append(avg_val_loss)

    print(f'Epoch {epoch+1}/{EPOCHS} | Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f}')
    scheduler.step(avg_val_loss)

    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        # Keep the best model state in memory instead of writing a file.
        # state_dict() returns references to the live parameter tensors, and a
        # dict-level .copy() keeps those references, so later optimizer steps
        # would overwrite the "best" weights. Clone each tensor to freeze a
        # real snapshot.
        best_model_state_dict = {
            name: tensor.detach().clone()
            for name, tensor in model.state_dict().items()
        }
        patience_counter = 0
        print('Validation loss decreased. Saving model state in memory...')
    else:
        patience_counter += 1
        if patience_counter >= PATIENCE:
            print(f'Early stopping at epoch {epoch+1}')
            break

# Đánh giá mô hình trên tập Test

In [None]:
y_pred_test = None
if not best_model_state_dict:
    print("Model was not trained or no best model state was saved.")
else:
    # Restore the best in-memory weights and switch to inference mode.
    model.load_state_dict(best_model_state_dict)
    model.eval()

    batch_predictions = []
    with torch.no_grad():
        for X_batch, _ in test_loader:
            probabilities = torch.sigmoid(model(X_batch))
            # Probability above 0.5 counts as the positive class.
            batch_predictions.extend((probabilities > 0.5).cpu().numpy())
    y_pred_test = np.array(batch_predictions).flatten()

    print("Predictions generated successfully from in-memory model.")


# Báo cáo phân loại và Biểu đồ Precision

In [None]:
if y_pred_test is not None:
    print("\n--- Section 9: Classification Report & Precision Plots ---")

    # Overall accuracy on the test windows.
    accuracy = accuracy_score(y_test_seq, y_pred_test)
    print(f"Accuracy: {accuracy:.4f}")

    # Per-class metrics, requested as a dict so they can be tabulated and plotted.
    print("\nClassification Report:")
    report = classification_report(y_test_seq, y_pred_test, output_dict=True, zero_division=0)
    print(pd.DataFrame(report).transpose())

    # Precision / recall / F1 bar chart for label 1. The report key is the
    # string form of the label, so try '1.0' (float labels) and then '1'.
    metrics_label_1 = next((report[key] for key in ('1.0', '1') if key in report), None)
    if not metrics_label_1:
        print("Could not generate precision plot for Label 1.")
    else:
        metrics_df = pd.DataFrame({
            'Metric': ['Precision', 'Recall', 'F1-Score'],
            'Score': [metrics_label_1[field] for field in ('precision', 'recall', 'f1-score')],
        })
        fig, ax = plt.subplots(figsize=(8, 5))
        sns.barplot(x='Metric', y='Score', data=metrics_df, ax=ax)
        ax.set_title('Precision, Recall, F1-Score for Label 1')
        ax.set_ylabel('Score')
        ax.set_ylim(0, 1)
        fig.savefig('precision_recall_f1_label_1.png')
        print("\nSaved Precision/Recall/F1 chart to 'precision_recall_f1_label_1.png'")


# Ma trận nhầm lẫn (Confusion Matrix)

In [None]:
if y_pred_test is not None:
    print("\n--- Section 10: Confusion Matrix ---")

    # Rows are the actual classes, columns the predicted classes.
    cm = confusion_matrix(y_test_seq, y_pred_test)
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(
        cm,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=[f'Predicted {cls}' for cls in (0, 1)],
        yticklabels=[f'Actual {cls}' for cls in (0, 1)],
        ax=ax,
    )
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
    ax.set_title('Confusion Matrix')
    fig.savefig('confusion_matrix.png')
    print("Saved confusion matrix heatmap to 'confusion_matrix.png'")

# Độ quan trọng của Feature (Feature Importance)

In [None]:
if y_pred_test is not None:
    print("\n--- Section 11: Feature Importance (Permutation) ---")
    try:
        # Make sure dropout is off; otherwise part of any accuracy drop would
        # be dropout noise rather than the effect of the permutation.
        model.eval()
        # Dedicated, seeded generator so the importance scores are repeatable.
        perm_generator = torch.Generator().manual_seed(42)

        importances = []
        baseline_accuracy = accuracy_score(y_test_seq, y_pred_test)

        print("Calculating feature importance...")
        n_samples = X_test_tensor.size(0)
        for i in range(X_test_tensor.shape[2]):  # Iterate over each feature
            temp_tensor = X_test_tensor.clone()

            # Shuffle feature i across samples (whole windows are swapped, the
            # time order inside each window is untouched).
            perm = torch.randperm(n_samples, generator=perm_generator).to(temp_tensor.device)
            temp_tensor[:, :, i] = temp_tensor[perm, :, i]

            permuted_loader = DataLoader(TensorDataset(temp_tensor, y_test_tensor), batch_size=128)

            y_perm_preds = []
            with torch.no_grad():
                for X_batch, _ in permuted_loader:
                    outputs = model(X_batch)
                    preds = torch.sigmoid(outputs) > 0.5
                    y_perm_preds.extend(preds.cpu().numpy())

            # Importance = how much accuracy is lost when the feature is scrambled.
            perm_accuracy = accuracy_score(y_test_seq, np.array(y_perm_preds).flatten())
            importance = baseline_accuracy - perm_accuracy
            importances.append(importance)
            print(f"Importance of '{features[i]}': {importance:.4f}")

        # Sort from most to least important and plot.
        sorted_indices = np.argsort(importances)[::-1]

        plt.figure(figsize=(12, 8))
        sns.barplot(x=np.array(features)[sorted_indices], y=np.array(importances)[sorted_indices])
        plt.xticks(rotation=90)
        plt.title('Feature Importance (Permutation Method)')
        plt.ylabel('Decrease in Accuracy')
        plt.tight_layout()
        plt.savefig('feature_importance.png')
        print("\nSaved feature importance plot to 'feature_importance.png'")
    except Exception as e:
        # Best-effort section: report the problem and let the notebook continue.
        print(f"An error occurred during feature importance calculation: {e}")


# Test trên tập dữ liệu mới

In [None]:
if best_model_state_dict:
    print("\n--- Section 12: Testing on a New Dataset ---")

    # CHANGE THIS PATH TO POINT AT YOUR NEW DATA FILE
    new_data_path = 'path/to/your/new_test_data.csv'

    try:
        print(f"Loading new dataset from: {new_data_path}")
        df_new = pd.read_csv(new_data_path)
        df_new['OpenTime'] = pd.to_datetime(df_new['OpenTime'])
        df_new.set_index('OpenTime', inplace=True)

        # Apply the same preprocessing pipeline as for the training data.
        X_new = df_new[features]
        y_new = df_new[Y_TYPE]

        # Reuse the scaler that was fitted on the training split.
        X_new_scaled = scaler.transform(X_new)

        # Build the windows.
        X_new_seq, y_new_seq = create_sequences(X_new_scaled, y_new.values, TIME_STEPS)
        if len(X_new_seq) == 0:
            # Fail with a clear message instead of an opaque downstream error.
            raise ValueError(
                f"new dataset has {len(X_new_scaled)} rows; more than {TIME_STEPS} are needed to build a window"
            )

        # Convert to tensors.
        X_new_tensor = torch.tensor(X_new_seq, dtype=torch.float32).to(device)
        y_new_tensor = torch.tensor(y_new_seq, dtype=torch.float32).unsqueeze(1).to(device)

        new_test_dataset = TensorDataset(X_new_tensor, y_new_tensor)
        new_test_loader = DataLoader(new_test_dataset, batch_size=128)

        # Load the best snapshot explicitly so this cell does not depend on an
        # earlier evaluation cell having restored it.
        model.load_state_dict(best_model_state_dict)
        model.eval()

        # Collect predictions for the new data.
        y_new_preds_list = []
        with torch.no_grad():
            for X_batch, _ in new_test_loader:
                outputs = model(X_batch)
                preds = torch.sigmoid(outputs) > 0.5
                y_new_preds_list.extend(preds.cpu().numpy())

        y_new_preds = np.array(y_new_preds_list).flatten()

        # Print the evaluation results.
        print("\n--- Evaluation on New Dataset ---")
        new_accuracy = accuracy_score(y_new_seq, y_new_preds)
        print(f"Accuracy on new data: {new_accuracy:.4f}")
        print("\nClassification Report for new data:")
        print(classification_report(y_new_seq, y_new_preds, zero_division=0))

    except FileNotFoundError:
        print(f"File not found: {new_data_path}. Please update the path and run again.")
    except Exception as e:
        # Best-effort section: report the problem and let the notebook finish.
        print(f"An error occurred during testing on new data: {e}")

print("\nScript finished.")