In [1]:
from scipy.io import arff
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report

In [15]:
# Function to load and preprocess multiple datasets
def load_and_preprocess_datasets(file_paths, target_column='class1'):
    """Load several ARFF files, stack them, and return scaled features with encoded labels.

    Parameters
    ----------
    file_paths : str or iterable of str
        Path(s) of the ARFF files to read with ``scipy.io.arff.loadarff``.
        A single string is treated as a one-element list.
    target_column : str, default 'class1'
        Name of the attribute holding the class label. It is read as
        byte strings and decoded as UTF-8.

    Returns
    -------
    X_scaled : numpy.ndarray
        All non-target columns of the concatenated data, transformed by a
        ``StandardScaler`` fitted on those same rows.
    y_encoded : numpy.ndarray
        Integer codes produced by ``LabelEncoder`` for the target column.
    label_encoder : LabelEncoder
        The fitted encoder; ``label_encoder.classes_`` maps codes back to
        the original label strings.

    Raises
    ------
    ValueError
        If ``file_paths`` is empty.
    KeyError
        If a file does not contain ``target_column``.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(file_paths, str):
        file_paths = [file_paths]
    file_paths = list(file_paths)
    if not file_paths:
        raise ValueError("file_paths must contain at least one ARFF file path")

    # Load each ARFF file and convert it to a DataFrame
    dataframes = []
    for file_path in file_paths:
        data, _meta = arff.loadarff(file_path)
        df = pd.DataFrame(data)
        if target_column not in df.columns:
            raise KeyError(
                f"column {target_column!r} not found in {file_path!r}"
            )
        # The label values arrive as bytes; turn them into str
        df[target_column] = df[target_column].str.decode('utf-8')
        dataframes.append(df)

    # Stack all files into one frame with a fresh 0..n-1 index
    df_all = pd.concat(dataframes, ignore_index=True)

    # Separate features and target variable
    X = df_all.drop(target_column, axis=1)
    y = df_all[target_column]

    # NOTE(review): the scaler is fitted on every row, so a train/test split
    # performed on the returned matrix has seen test-set statistics. Fixing
    # that needs the split to happen before scaling (e.g. in a Pipeline).
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Encode the string labels as integers
    label_encoder = LabelEncoder()
    y_encoded = label_encoder.fit_transform(y)

    return X_scaled, y_encoded, label_encoder

# Directory holding the ARFF exports. This is a machine-specific absolute
# path; edit this single constant when running the notebook elsewhere.
DATA_DIR = 'D:/4th semester/CN_EL/ELFinal/Scenario B-ARFF'

# Tags that distinguish the dataset files by name (presumably the
# time-window length used for feature extraction; confirm against the data).
WINDOW_TAGS = ('15s', '30s', '120s')

# List of ARFF file paths, one per tag
file_paths = [
    f'{DATA_DIR}/TimeBasedFeatures-Dataset-{tag}-AllinOne.arff'
    for tag in WINDOW_TAGS
]

In [8]:
# Intentionally empty: this cell used to load the data, split it, and build
# the four models, but the next three cells repeat exactly those steps (one
# per cell) and overwrite every name. Running both read and scaled all ARFF
# files twice for no effect, so the duplicate work was removed here.

In [9]:
# Read every ARFF file and obtain the scaled feature matrix, the encoded
# labels, and the fitted label encoder
X, y, label_encoder = load_and_preprocess_datasets(file_paths=file_paths)

In [10]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [11]:
# Create the four classifiers to compare, each constructed with random_state=42
gradient_boost = GradientBoostingClassifier(random_state=42)
svm = SVC(random_state=42)
random_forest = RandomForestClassifier(random_state=42)
log_reg = LogisticRegression(random_state=42, max_iter=1000)

In [12]:
# Display name -> estimator; insertion order is the order models are trained
# and reported in. `results` will collect one text report per model name.
models = {
    'Logistic Regression': log_reg,
    'Random Forest': random_forest,
    'SVM': svm,
    'Gradient Boosting': gradient_boost,
}
results = dict()

In [13]:
for name, model in models.items():
    # Fit on the training split, then predict the held-out rows
    # (scikit-learn's fit() returns the estimator itself, so the calls chain)
    y_pred = model.fit(X_train, y_train).predict(X_test)
    # Per-class precision/recall/F1 as text, labelled with the original class names
    report = classification_report(
        y_test,
        y_pred,
        target_names=label_encoder.classes_,
    )
    results[name] = report

In [14]:
# Show each model's report under a header line, followed by blank lines
for model_name, metrics in results.items():
    print(f"Model: {model_name}", metrics, "\n", sep="\n")

Model: Logistic Regression
              precision    recall  f1-score   support

    BROWSING       0.49      0.90      0.64      4499
        CHAT       0.58      0.05      0.10      1243
          FT       0.64      0.13      0.21      1957
        MAIL       0.04      0.00      0.00       761
         P2P       0.43      0.45      0.44      1665
   STREAMING       0.49      0.07      0.12       573
        VOIP       0.81      0.83      0.82      2560

    accuracy                           0.55     13258
   macro avg       0.50      0.35      0.33     13258
weighted avg       0.55      0.55      0.48     13258



Model: Random Forest
              precision    recall  f1-score   support

    BROWSING       0.93      0.97      0.95      4499
        CHAT       0.88      0.82      0.85      1243
          FT       0.92      0.79      0.85      1957
        MAIL       0.96      0.93      0.94       761
         P2P       0.84      0.97      0.90      1665
   STREAMING       0.92     