## 1️⃣ Kiểm Tra GPU

In [None]:
import tensorflow as tf

# Announce the check and report which TensorFlow build this runtime provides.
print("üîç Ki·ªÉm tra GPU...")
print(f"TensorFlow version: {tf.__version__}")

# Ask TensorFlow for the physical GPU devices visible to this process.
gpus = tf.config.list_physical_devices('GPU')
if not gpus:
    # Nothing found: tell the user how to switch the runtime to a GPU one.
    print("‚ùå No GPU found. Vui l√≤ng b·∫≠t GPU: Runtime > Change runtime type > GPU")
else:
    print(f"‚úÖ Found {len(gpus)} GPU(s):")
    # One indented bullet per detected device.
    print("\n".join(f"   - {gpu.name}" for gpu in gpus))

## 2️⃣ Upload Dataset

Upload 3 files CSV từ máy tính của bạn:
- `XSS_dataset.csv`
- `Modified_SQL_Dataset.csv`
- `DDOS_dataset.csv`

In [None]:
from google.colab import files
import os

# Prompt the user for the three CSV files named in the message below.
print("üì§ Upload c√°c file dataset...")
print("Ch·ªçn 3 files: XSS_dataset.csv, Modified_SQL_Dataset.csv, DDOS_dataset.csv")

# `uploaded` is indexed by file name; each value's length is reported as bytes.
uploaded = files.upload()

# Echo every received file together with the size of its payload.
print("\n‚úÖ Files uploaded:")
for filename, payload in uploaded.items():
    print(f"   - {filename} ({len(payload)} bytes)")

## 3️⃣ Import Libraries

In [None]:
import os
import time
import datetime
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, f1_score, recall_score, precision_score, roc_curve, auc, confusion_matrix
from sklearn.model_selection import train_test_split

# Request quieter TensorFlow native logging ('2' presumably hides INFO and
# WARNING messages — confirm against the TensorFlow documentation).
# NOTE(review): this assignment runs after `import tensorflow` above; the
# variable is presumably read when TensorFlow first loads, so it may have no
# effect at this point — confirm, and move it ahead of the import if needed.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

# Confirmation banner shown once every import above has succeeded.
print("‚úÖ All libraries imported successfully!")

## 4️⃣ Configuration

In [None]:
# Central settings shared by the data, model, training and export cells.
CONFIG = dict(
    MODEL_NAME="MalwareDetection_Text_LSTM",  # stem of the saved model file
    MAX_TOKENS=10000,       # vocabulary cap for vectorization / embedding input size
    SEQUENCE_LENGTH=200,    # fixed token length produced by the vectorizer
    EMBEDDING_DIM=128,      # width of each learned token embedding
    BATCH_SIZE=128,         # examples per batch in every tf.data pipeline
    EPOCHS=30,              # upper bound on training epochs
    OUTPUT_DIR="output",    # directory that receives plots, model and results
)

# Make sure the artifact directory exists before anything is written to it.
os.makedirs(CONFIG["OUTPUT_DIR"], exist_ok=True)
print("‚úÖ Configuration loaded!")

## 5️⃣ Load and Prepare Data

In [None]:
def _read_available_datasets(datasets):
    """Read every CSV in ``datasets`` that exists on disk.

    Args:
        datasets: Mapping of source tag -> CSV path.

    Returns:
        A list of DataFrames, one per file found. Each frame gets an extra
        ``source`` column holding its mapping key. Missing files are reported
        with a warning and skipped.
    """
    frames = []
    for source, path in datasets.items():
        if not os.path.exists(path):
            print(f"‚ùå Warning: {path} not found")
            continue
        frame = pd.read_csv(path)
        frame['source'] = source
        frames.append(frame)
        print(f"‚úÖ Loaded {len(frame):,} samples from {source}")
    return frames


def _to_batched_dataset(texts, labels, shuffle=False):
    """Build a batched, prefetched ``tf.data`` pipeline over (text, label) pairs."""
    dataset = tf.data.Dataset.from_tensor_slices((texts, labels))
    if shuffle:
        dataset = dataset.shuffle(10000)
    return dataset.batch(CONFIG["BATCH_SIZE"]).prefetch(tf.data.AUTOTUNE)


def load_and_prepare_data():
    """Load the uploaded CSVs and build train/validation/test pipelines.

    The three expected files are read when present and concatenated. Rows
    tagged ``DDOS`` are counted for the log but excluded from training. The
    remaining rows must provide a ``Sentence`` text column and a ``Label``
    column. Rows without a label, or whose sentence has two or fewer
    whitespace-separated tokens, are dropped. The data is then split 70/15/15
    with stratification, and a ``TextVectorization`` layer is adapted on the
    training texts only.

    Returns:
        ``(train_ds, val_ds, test_ds, vectorize_layer)``. The datasets yield
        batches of ``(raw text, label)``; the vectorization layer is returned
        separately. ``(None, None, None, None)`` is returned, after printing
        the reason, when no file could be loaded, when a required column is
        missing, or when no sample survives filtering.
    """
    print("\nüìä Loading data...")

    datasets = {
        'XSS': 'XSS_dataset.csv',
        'SQL': 'Modified_SQL_Dataset.csv',
        'DDOS': 'DDOS_dataset.csv'
    }

    df_list = _read_available_datasets(datasets)
    if not df_list:
        print("‚ùå No datasets loaded")
        return None, None, None, None

    df_all = pd.concat(df_list, ignore_index=True)

    # Separate DDoS
    is_ddos = df_all['source'] == 'DDOS'
    df_ddos = df_all[is_ddos]
    df_non_ddos = df_all[~is_ddos]

    print(f"\nüìà DDoS samples: {len(df_ddos):,}")
    print(f"üìà Non-DDoS samples: {len(df_non_ddos):,}")

    # Fail gracefully (same contract as the "no datasets" path) instead of
    # raising KeyError when the text datasets are absent or use other columns.
    missing_columns = [col for col in ('Sentence', 'Label') if col not in df_non_ddos.columns]
    if missing_columns:
        print(f"‚ùå Missing required column(s): {', '.join(missing_columns)}")
        return None, None, None, None

    # Rows without a label cannot be used for stratified splitting or training.
    df = df_non_ddos.dropna(subset=['Label'])
    # Keep only sentences with more than two whitespace-separated tokens;
    # missing sentences compare False here and are dropped as well.
    df = df[df['Sentence'].str.strip().str.split().str.len() > 2]

    if df.empty:
        print("‚ùå No usable samples after filtering")
        return None, None, None, None

    print(f"\n‚úÖ Total samples after filtering: {len(df):,}")
    print(f"   - Positive (Malware): {int((df['Label'] == 1).sum()):,}")
    print(f"   - Negative (Benign): {int((df['Label'] == 0).sum()):,}")

    texts = df['Sentence'].fillna('').astype(str).values
    # Concatenating with frames that lack a Label column upcasts it to float;
    # restore integer class ids.
    labels = df['Label'].astype(int).values

    # Split 70-15-15
    train_texts, temp_texts, train_labels, temp_labels = train_test_split(
        texts, labels, test_size=0.3, random_state=42, stratify=labels)
    val_texts, test_texts, val_labels, test_labels = train_test_split(
        temp_texts, temp_labels, test_size=0.5, random_state=42, stratify=temp_labels)

    print(f"\nüìä Data Split:")
    print(f"   - Train: {len(train_texts):,} samples (70%)")
    print(f"   - Val: {len(val_texts):,} samples (15%)")
    print(f"   - Test: {len(test_texts):,} samples (15%)")

    # Text Vectorization: the vocabulary is learned from the training split only.
    vectorize_layer = keras.layers.TextVectorization(
        max_tokens=CONFIG["MAX_TOKENS"],
        output_mode='int',
        output_sequence_length=CONFIG["SEQUENCE_LENGTH"])
    vectorize_layer.adapt(train_texts)

    # Create datasets: only the training pipeline is shuffled, so the
    # validation and test pipelines keep their element order.
    train_ds = _to_batched_dataset(train_texts, train_labels, shuffle=True)
    val_ds = _to_batched_dataset(val_texts, val_labels)
    test_ds = _to_batched_dataset(test_texts, test_labels)

    return train_ds, val_ds, test_ds, vectorize_layer

train_ds, val_ds, test_ds, vectorize_layer = load_and_prepare_data()

## 6️⃣ Build BiLSTM Model

In [None]:
def build_text_model(vocab_size, embedding_dim, text_vectorizer=None):
    """Assemble the stacked bidirectional-LSTM binary classifier.

    The model takes raw strings: the vectorization layer is its first layer,
    followed by an embedding, two bidirectional LSTM layers (64 then 32
    units), a 64-unit ReLU layer with 50% dropout, and a single sigmoid
    output.

    Args:
        vocab_size: Input dimension of the embedding layer.
        embedding_dim: Output dimension of the embedding layer.
        text_vectorizer: Text vectorization layer to place first. Defaults to
            the module-level ``vectorize_layer`` so existing two-argument
            calls keep working.

    Returns:
        An uncompiled ``keras.Sequential`` named ``BiLSTM_MalwareDetection``.

    Raises:
        ValueError: If no vectorization layer is available, for example
            because data loading returned ``None``.
    """
    if text_vectorizer is None:
        # Backward-compatible default: the layer produced by the data cell.
        text_vectorizer = vectorize_layer
    if text_vectorizer is None:
        raise ValueError(
            "No text vectorization layer available; load the datasets first."
        )

    return keras.Sequential([
        text_vectorizer,
        keras.layers.Embedding(vocab_size, embedding_dim),
        # First recurrent layer returns the full sequence for the second one.
        keras.layers.Bidirectional(keras.layers.LSTM(64, return_sequences=True)),
        keras.layers.Bidirectional(keras.layers.LSTM(32)),
        keras.layers.Dense(64, activation='relu'),
        keras.layers.Dropout(0.5),
        keras.layers.Dense(1, activation='sigmoid')
    ], name="BiLSTM_MalwareDetection")

model = build_text_model(CONFIG["MAX_TOKENS"], CONFIG["EMBEDDING_DIM"])
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=['accuracy']
)

print("\nüèóÔ∏è Model Architecture:")
model.summary()

## 7️⃣ Train Model

In [None]:
print("\nüöÄ Starting training...")
start_time = time.time()

# Stop when validation accuracy has not improved for 3 epochs, restoring the
# best weights seen so far.
early_stopping = keras.callbacks.EarlyStopping(
    monitor='val_accuracy',
    patience=3,
    restore_best_weights=True,
)
# Halve the learning rate when validation loss has not improved for 2 epochs.
lr_on_plateau = keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=2,
    verbose=1,
)
callbacks = [early_stopping, lr_on_plateau]

history = model.fit(train_ds, validation_data=val_ds, epochs=CONFIG["EPOCHS"],
                    callbacks=callbacks, verbose=1)

# Wall-clock duration measured from just before the callbacks were created.
training_time = time.time() - start_time
print(f"\n‚úÖ Training completed in {training_time/60:.2f} minutes ({training_time:.2f} seconds)")

## 8️⃣ Evaluate Model

In [None]:
print("\nüìä Evaluating model on test set...")

# Walk the test pipeline once, unbatching inputs and ground-truth labels.
test_texts_list, test_labels_list = [], []
for batch_texts, batch_labels in test_ds:
    test_texts_list += list(batch_texts.numpy())
    test_labels_list += list(batch_labels.numpy())

test_texts_array = np.asarray(test_texts_list)
test_labels_array = np.asarray(test_labels_list)

# Predictions: probabilities from the model, thresholded at 0.5 into class ids.
y_pred_probs = model.predict(test_ds)
y_pred = (y_pred_probs.ravel() > 0.5).astype(int)

# Metrics
accuracy = (y_pred == test_labels_array).mean()
f1 = f1_score(y_true=test_labels_array, y_pred=y_pred)
recall = recall_score(y_true=test_labels_array, y_pred=y_pred)
precision = precision_score(y_true=test_labels_array, y_pred=y_pred)

print(f"\nüìà Test Results:")
print(f"   - Accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)")
print(f"   - F1-Score: {f1:.4f}")
print(f"   - Recall: {recall:.4f}")
print(f"   - Precision: {precision:.4f}")

# Per-class breakdown; class 0 is reported as Benign and class 1 as Malware.
print("\nüìã Classification Report:")
class_names = ['Benign', 'Malware']
print(classification_report(test_labels_array, y_pred, target_names=class_names))

## 9️⃣ Visualizations

In [None]:
# Training History: accuracy and loss curves for the train and validation sets.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

# One panel per tracked metric: (axes, key in history.history, display name).
panels = ((ax1, 'accuracy', 'Accuracy'), (ax2, 'loss', 'Loss'))
for ax, metric, title in panels:
    ax.plot(history.history[metric], label=f'Train {title}', linewidth=2)
    ax.plot(history.history[f'val_{metric}'], label=f'Val {title}', linewidth=2)
    ax.set_title(f'Model {title}', fontsize=14, fontweight='bold')
    ax.set_xlabel('Epoch')
    ax.set_ylabel(title)
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
# Write into the configured output directory (the one created alongside
# CONFIG) rather than a hard-coded 'output/' path.
history_plot_path = os.path.join(CONFIG["OUTPUT_DIR"], 'training_history.png')
plt.savefig(history_plot_path, dpi=300, bbox_inches='tight')
plt.show()

print("‚úÖ Training history plot saved!")

In [None]:
# Confusion Matrix
# Pin the label order so the matrix is always 2x2 (row/column 0 = Benign,
# 1 = Malware), even when the test labels and predictions contain one class.
cm = confusion_matrix(test_labels_array, y_pred, labels=[0, 1])
class_names = ['Benign', 'Malware']

plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names,
            yticklabels=class_names,
            cbar_kws={'label': 'Count'})
plt.title('Confusion Matrix - Binary Classification', fontsize=16, fontweight='bold', pad=20)
plt.ylabel('True Label', fontsize=12)
plt.xlabel('Predicted Label', fontsize=12)

# Add percentages: each cell's share of its true-label row, drawn slightly
# below the cell centre so it does not overlap the count annotation.
row_totals = cm.sum(axis=1)
for (i, j), count in np.ndenumerate(cm):
    # A class with no true samples has an empty row; report 0% instead of
    # dividing by zero.
    percentage = count / row_totals[i] * 100 if row_totals[i] else 0.0
    plt.text(j + 0.5, i + 0.7, f'({percentage:.1f}%)',
             ha='center', va='center', fontsize=10, color='red')

plt.tight_layout()
# Write into the configured output directory rather than a hard-coded path.
cm_plot_path = os.path.join(CONFIG["OUTPUT_DIR"], 'confusion_matrix.png')
plt.savefig(cm_plot_path, dpi=300, bbox_inches='tight')
plt.show()

print("‚úÖ Confusion matrix saved!")

In [None]:
# ROC Curve
# The scores are flattened to 1-D so they line up element-wise with the
# label vector.
fpr, tpr, thresholds = roc_curve(test_labels_array, np.ravel(y_pred_probs))
roc_auc = auc(fpr, tpr)

plt.figure(figsize=(10, 8))
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.4f})')
# Diagonal reference line, labelled as a random classifier.
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--', label='Random Classifier')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate', fontsize=12)
plt.ylabel('True Positive Rate', fontsize=12)
plt.title('ROC Curve - Binary Classification', fontsize=16, fontweight='bold')
plt.legend(loc="lower right", fontsize=11)
plt.grid(True, alpha=0.3)
plt.tight_layout()
# Write into the configured output directory rather than a hard-coded path.
roc_plot_path = os.path.join(CONFIG["OUTPUT_DIR"], 'roc_curve.png')
plt.savefig(roc_plot_path, dpi=300, bbox_inches='tight')
plt.show()

print(f"‚úÖ ROC curve saved! AUC = {roc_auc:.4f}")

## 🔟 Save Model

In [None]:
# Save model
# Both artifacts go to the configured output directory so that changing
# CONFIG["OUTPUT_DIR"] relocates them together with the plots.
model_path = os.path.join(CONFIG["OUTPUT_DIR"], f"{CONFIG['MODEL_NAME']}.keras")
model.save(model_path)
print(f"‚úÖ Model saved to: {model_path}")

# Save results: one row of pre-formatted metrics plus a timestamp.
results = {
    'Model': CONFIG['MODEL_NAME'],
    'Training Time (min)': f"{training_time/60:.2f}",
    'Accuracy': f"{accuracy:.4f}",
    'F1-Score': f"{f1:.4f}",
    'Recall': f"{recall:.4f}",
    'Precision': f"{precision:.4f}",
    'AUC': f"{roc_auc:.4f}",
    'Date': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
}

results_path = os.path.join(CONFIG["OUTPUT_DIR"], 'evaluation_results.csv')
results_df = pd.DataFrame([results])
results_df.to_csv(results_path, index=False)
# Report the path actually written (identical text under the default config).
print(f"‚úÖ Results saved to: {results_path}")

print("\n" + "="*60)
print("üéâ Training completed successfully!")
print("="*60)

## 📥 Download Results

In [None]:
from google.colab import files
import zipfile

archive_name = 'malware_detection_results.zip'

# Create zip file: bundle everything under the configured output directory
# (not a hard-coded 'output'), compressing entries to shrink the download.
print("üì¶ Creating zip file...")
with zipfile.ZipFile(archive_name, 'w', compression=zipfile.ZIP_DEFLATED) as zipf:
    for root, _dirs, filenames in os.walk(CONFIG["OUTPUT_DIR"]):
        for filename in filenames:
            zipf.write(os.path.join(root, filename))

print("‚úÖ Zip file created!")
print("üì• Downloading...")
# Hand the archive to the browser through Colab's download helper.
files.download(archive_name)
print("‚úÖ Download complete!")