In [1]:
import pandas as pd
import os
import re
import numpy as np
import pickle
import Feature_Extraction

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from gensim.models import FastText, KeyedVectors
from sklearn.metrics import accuracy_score, confusion_matrix, recall_score, precision_score, f1_score

# File Path

In [2]:
file_df = pd.read_csv('Dataset/File Path/full_data.csv')

In [3]:
def build_word_vectors(train_data, path=None):
    # Extracting sentences from the 'process_path' column in the provided DataFrame
    sentences = train_data['process_path'].str.lower().str.split('\\').tolist()
    
    # Creating a FastText model with specific parameters
    model = FastText(
        vector_size=32,      # Dimensionality of the word vectors
        window=10,           # Maximum distance between the current and predicted word within a sentence
        min_count=1,         # Ignores words with total frequency lower than this
        workers=4,           # Number of CPU cores to use
        sg=0,                # Skip-gram model (sg=0) or CBOW model (sg=1)
        sentences=sentences  # Training sentences
    )
    
    # Training the FastText model
    model.train(sentences, total_examples=len(sentences), epochs=10)
    
    # Saving the trained word vectors to a specified path if provided
    if path is not None:
        model.wv.save(path)
    
    # Returning the word vectors
    return model.wv


In [4]:
def load_word_vectors(path):
    return KeyedVectors.load(path)

In [5]:
import os

# Assuming 'load_word_vectors' and 'build_word_vectors' functions are defined

WORD_VECTORS_PATH = 'Model/File Path/Word Vector/fasttext_v1.model'

# Check if the file containing word vectors exists
if os.path.isfile(WORD_VECTORS_PATH):
    # Load word vectors if the file exists
    word_vectors = load_word_vectors(WORD_VECTORS_PATH)
    print(1)
else:
    # Build word vectors if the file does not exist
    word_vectors = build_word_vectors(file_df, WORD_VECTORS_PATH)


In [8]:
import numpy as np

# Assuming 'word_vectors' and 'file_df' are defined

# Tokenize the paths in the 'process_path' column and convert to lowercase
file_df['text_tokenized'] = file_df['process_path'].str.lower().str.split('\\')

# Calculate the mean vector for each tokenized path using pre-trained word vectors
file_df['text_vect_mean'] = file_df['text_tokenized'].apply(
    lambda x: np.array([word_vectors[token] for token in x]).mean(axis=0)
)


In [11]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def score(y_true, y_pred):
    # Calculate accuracy
    accuracy = accuracy_score(y_true, y_pred)
    
    # Calculate precision
    precision = precision_score(y_true, y_pred)
    
    # Calculate recall
    recall = recall_score(y_true, y_pred)
    
    # Calculate F1 score
    f1 = f1_score(y_true, y_pred)
    
    # Return a dictionary containing the calculated metrics
    return {'accuracy': accuracy, 'precision': precision,
            'recall': recall, 'f1_score': f1}


In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

def get_model(cfg, name='xgb'):
    """
    Get a machine learning model based on the specified name and configuration.

    Parameters:
    - cfg (dict): Configuration parameters for the model.
    - name (str): Name of the model (default is 'xgb').

    Returns:
    - model: An instance of a machine learning model.
    """
    if name == 'rf':
        # Random Forest Classifier
        model = RandomForestClassifier(
            n_estimators=cfg['n_estimators'], 
            max_depth=cfg['max_depth'], 
            min_samples_split=cfg['min_samples_split'], 
            min_samples_leaf=cfg['min_samples_leaf'], 
            max_features=cfg['max_features'], 
            n_jobs=-1, 
            random_state=42,
            class_weight='balanced'
        )
    elif name == 'dt':
        # Decision Tree Classifier
        model = DecisionTreeClassifier(
            max_depth=cfg['max_depth'], 
            min_samples_split=cfg['min_samples_split'], 
            min_samples_leaf=cfg['min_samples_leaf'], 
            max_features=cfg['max_features'],
            random_state=42
        )
    else:
        # Default: XGBoost or another model
        # Add additional model implementations as needed
        raise ValueError(f"Unsupported model name: {name}")

    return model


In [11]:
# Configuration parameters for the Random Forest model
RF_CONFIG = dict(
    n_estimators=60,
    max_depth=34,
    min_samples_split=2,
    min_samples_leaf=10,
    max_features=0.65
)

# Split the data into training and testing sets
train_data, test_data = train_test_split(file_df, random_state=42, stratify=file_df['label'], test_size=0.2)

# Get a Random Forest model using the specified configuration
rf_model = get_model(RF_CONFIG, name='rf')

# Train the Random Forest model on the training data
rf_model.fit(np.stack(train_data['text_vect_mean']), train_data['label'])

# Make predictions on the test data
rf_pred = rf_model.predict(np.stack(test_data['text_vect_mean']))

# Calculate performance scores using the 'score' function
rf_scores = score(test_data['label'], rf_pred)

# Print or use the calculated scores as needed
print(rf_scores)


{'accuracy': 0.9882014823778551,
 'precision': 0.9337620578778135,
 'recall': 0.9647840531561461,
 'f1_score': 0.9490196078431372}

In [12]:
# Obtain a Random Forest model with the specified configuration
rf_model = get_model(RF_CONFIG, name='rf')

# Train the Random Forest model on the entire dataset
rf_model.fit(np.stack(file_df['text_vect_mean']), file_df['label'])

# Specify the filename to save the trained model
filename = 'Model/File Path/rf_model.sav'

# Save the trained Random Forest model to the specified file using pickle
pickle.dump(rf_model, open(filename, 'wb'))


In [13]:
# Configuration parameters for the Decision Tree model
DT_CONFIG = dict(
    max_depth=6,
    min_samples_split=3,
    min_samples_leaf=2,
    max_features='log2'
)

# Split the data into training and testing sets
train_data, test_data = train_test_split(file_df, random_state=42, stratify=file_df['label'], test_size=0.2)

# Get a Decision Tree model using the specified configuration
dt_model = get_model(DT_CONFIG, name='dt')

# Train the Decision Tree model on the training data
dt_model.fit(np.stack(train_data['text_vect_mean']), train_data['label'])

# Make predictions on the test data
dt_pred = dt_model.predict(np.stack(test_data['text_vect_mean']))

# Calculate performance scores using the 'score' function
dt_scores = score(test_data['label'], dt_pred)

# Print or use the calculated scores as needed
print(dt_scores)


{'accuracy': 0.9847980638330056,
 'precision': 0.9617563739376771,
 'recall': 0.9023255813953488,
 'f1_score': 0.9310935893040795}

In [14]:
# Get a Decision Tree model using the specified configuration
dt_model = get_model(DT_CONFIG, name='dt')

# Train the Decision Tree model on the entire dataset
dt_model.fit(np.stack(file_df['text_vect_mean']), file_df['label'])

# Specify the filename to save the trained model
filename = 'Model/File Path/dt_model.sav'

# Save the trained Decision Tree model to the specified file using pickle
pickle.dump(dt_model, open(filename, 'wb'))


In [46]:
# Tokenize the paths in the 'process_path' column and create a new column 'text_tokenized'
file_phish['text_tokenized'] = file_phish['process_path'].str.lower().str.split('\\')

# Calculate the mean vector representation for each path and create a new column 'text_vect_mean'
file_phish['text_vect_mean'] = file_phish['text_tokenized'].apply(
    lambda x: np.array([word_vectors[token] for token in x]).mean(axis=0)
)

# Use the pre-trained Random Forest model to predict labels for the new dataset
tmp_pred = rf_model.predict(np.stack(file_phish['text_vect_mean']))


In [18]:
# Create a new DataFrame 'x' by stacking the 'text_vect_mean' column
x = pd.DataFrame(np.stack(file_df['text_vect_mean']))

# Generate column names 'd_0', 'd_1', ..., 'd_299'
cols = [f'd_{i}' for i in range(300)]

# Assign the generated column names to the columns of DataFrame 'x'
x.columns = cols

# Add a 'Label' column to 'x' by resetting the index of the 'label' column from 'file_df'
x['Label'] = file_df['label'].reset_index(drop=True)

# Save the DataFrame 'x' to a CSV file named 'training_final.csv' without including the index column
x.to_csv('Dataset/File Path/training_final.csv', index=False)