In [3]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt

# Load the featurized and categorized data
train_csv_path = '/kaggle/input/training-dataset/train_malicious_phish.csv'
val_csv_path = '/kaggle/input/validation-dataset/val_malicious_phish.csv'
train_df = pd.read_csv(train_csv_path)
val_df = pd.read_csv(val_csv_path)

# Flag columns that have to become 0/1 integers before modelling
bool_columns = ['at_symbol_0', 'at_symbol_1', 'redirection_0', 'redirection_1',
                'has_ip_0', 'has_ip_1', 'has_https_0', 'has_https_1',
                'is_tinyurl_0', 'is_tinyurl_1']


def _flags_to_int(flags):
    """Convert a column of true/false flags into 0/1 integers.

    pandas.read_csv normally parses a column that holds only TRUE/FALSE into
    bool dtype, so comparing each cell against the string 'TRUE' would never
    match and every flag would silently become 0. Normalising to upper-case
    text first makes the conversion work for real booleans as well as for
    'TRUE' / 'True' / 'true' strings.

    Parameters
    ----------
    flags : pandas.Series
        Raw flag column as loaded from the CSV.

    Returns
    -------
    pandas.Series
        Integer series: 1 where the value reads as true, 0 for anything else
        (including missing values).
    """
    return flags.astype(str).str.strip().str.upper().eq('TRUE').astype(int)


for col in bool_columns:
    train_df[col] = _flags_to_int(train_df[col])
    val_df[col] = _flags_to_int(val_df[col])

# Separate features and labels ('url' is dropped together with the target)
X_train = train_df.drop(columns=['type', 'url'])
y_train = train_df['type']

X_val = val_df.drop(columns=['type', 'url'])
y_val = val_df['type']

# Encode labels: fit on the training labels only, reuse the mapping for validation
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)
y_val = label_encoder.transform(y_val)

# Track accuracy for different numbers of estimators (10, 20, ..., 200)
n_estimators_list = list(range(10, 210, 10))
train_accuracies = []
val_accuracies = []

for n_estimators in n_estimators_list:
    # A fresh forest is trained for every size; the fixed seed keeps runs comparable
    rf_model = RandomForestClassifier(n_estimators=n_estimators, random_state=42)
    rf_model.fit(X_train, y_train)

    y_pred_train = rf_model.predict(X_train)
    y_pred_val = rf_model.predict(X_val)

    train_accuracy = accuracy_score(y_train, y_pred_train)
    val_accuracy = accuracy_score(y_val, y_pred_val)

    train_accuracies.append(train_accuracy)
    val_accuracies.append(val_accuracy)

    print(f"n_estimators = {n_estimators}")
    print(f"Training Accuracy: {train_accuracy}")
    print(f"Validation Accuracy: {val_accuracy}")
    print("\nClassification Report on Validation Data:")
    print(classification_report(y_val, y_pred_val, target_names=label_encoder.classes_))

# Plotting the training accuracy
plt.figure(figsize=(10, 6))
plt.title("Training Curve (Accuracy)")
plt.plot(n_estimators_list, train_accuracies, label='Train', marker='o')
plt.xlabel("Number of Estimators")
plt.ylabel("Accuracy")
plt.legend(loc='best')
plt.show()

# Plotting the validation accuracy
plt.figure(figsize=(10, 6))
plt.title("Validation Curve (Accuracy)")
plt.plot(n_estimators_list, val_accuracies, label='Validation', marker='o')
plt.xlabel("Number of Estimators")
plt.ylabel("Accuracy")
plt.legend(loc='best')
plt.show()

# Accuracies of the largest forest in the sweep (last entry of each list)
print("Final Training Accuracy: {}".format(train_accuracies[-1]))
print("Final Validation Accuracy: {}".format(val_accuracies[-1]))


FileNotFoundError: [Errno 2] No such file or directory: '/kaggle/input/training-dataset/train_malicious_phish.csv'

In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV

# Load the featurized and categorized data
train_csv_path = '/kaggle/input/training-dataset/train_malicious_phish.csv'
val_csv_path = '/kaggle/input/validation-dataset/val_malicious_phish.csv'
train_df = pd.read_csv(train_csv_path)
val_df = pd.read_csv(val_csv_path)

# Flag columns that have to become 0/1 integers before modelling
bool_columns = ['at_symbol_0', 'at_symbol_1', 'redirection_0', 'redirection_1',
                'has_ip_0', 'has_ip_1', 'has_https_0', 'has_https_1',
                'is_tinyurl_0', 'is_tinyurl_1']

for col in bool_columns:
    # pandas.read_csv normally parses TRUE/FALSE-only columns into bool dtype,
    # so a plain `x == 'TRUE'` string comparison would never match and would
    # zero out every flag. Normalising to upper-case text first handles real
    # booleans as well as 'TRUE' / 'True' / 'true' strings; anything else
    # (including missing values) maps to 0.
    train_df[col] = train_df[col].astype(str).str.strip().str.upper().eq('TRUE').astype(int)
    val_df[col] = val_df[col].astype(str).str.strip().str.upper().eq('TRUE').astype(int)

# Separate features and labels ('url' is dropped together with the target)
X_train = train_df.drop(columns=['type', 'url'])
y_train = train_df['type']

X_val = val_df.drop(columns=['type', 'url'])
y_val = val_df['type']

# Encode labels: fit on the training labels only, reuse the mapping for validation
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)
y_val = label_encoder.transform(y_val)

# Hyperparameter tuning using Grid Search
# (4 * 4 * 3 * 3 * 1 = 144 parameter combinations, each evaluated with 3-fold CV)
param_grid = {
    'n_estimators': [50, 100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt']  # Changed to 'sqrt' to avoid the deprecation warning
}

rf_model = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid,
                           cv=3, n_jobs=-1, verbose=2, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Estimator refitted by the grid search with the best-scoring parameters
best_model = grid_search.best_estimator_

# Evaluate the best model
y_pred_train = best_model.predict(X_train)
y_pred_val = best_model.predict(X_val)

print("Best Parameters Found: ", grid_search.best_params_)
print("Training Accuracy:", accuracy_score(y_train, y_pred_train))
print("Validation Accuracy:", accuracy_score(y_val, y_pred_val))
print("\nClassification Report on Validation Data:")
print(classification_report(y_val, y_pred_val, target_names=label_encoder.classes_))


FileNotFoundError: [Errno 2] No such file or directory: '/kaggle/input/training-dataset/train_malicious_phish.csv'