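## Logistic Regression model

Trains a logistic-regression baseline that predicts `Attack Type` from the categorical and IP-octet features. In the recorded run below it reaches about 0.33 accuracy across three classes of nearly equal size, which is no better than chance.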

In [25]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report



# Train and evaluate a logistic-regression baseline that predicts "Attack Type".
# The names bound here (pipeline, X_test, y_test, ...) are reused by later cells.
file_path = "cyberdata_clean.csv"
cyberdata = pd.read_csv(file_path)

if "Attack Type" in cyberdata.columns:

    # Unordered categorical features. "Attack Type" is deliberately absent: it is the target.
    nominal_cols = [
        "Protocol", "Packet Type", "Traffic Type", "Attack Signature",
        "Action Taken", "Network Segment", "Log Source", "City", "Region", "Browser", "Operating System"
    ]

    # Categorical features that are treated as ordered.
    # NOTE(review): OrdinalEncoder() without `categories=` ranks labels in sorted
    # order, so e.g. "High" is encoded below "Low". Pass explicit category lists
    # once the full label set of each column has been confirmed.
    ordinal_cols = ["Severity Level", "Anomaly Score Category", "Packet Length Category"]

    numeric_cols = ["Source IP FirstOctet", "Destination IP FirstOctet"]

    # Complete list of feature columns used for training.
    feature_cols = nominal_cols + ordinal_cols + numeric_cols

    features = cyberdata[feature_cols]
    target = cyberdata["Attack Type"]

    # 80/20 split, stratified on the target, with a fixed seed for repeatability.
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.2, random_state=42, stratify=target
    )

    # One encoder/scaler per feature group; categories unseen during fit are
    # ignored by the one-hot encoder instead of raising.
    preprocessor = ColumnTransformer(
        transformers=[
            ('nom', OneHotEncoder(handle_unknown='ignore'), nominal_cols),
            ('ord', OrdinalEncoder(), ordinal_cols),
            ('num', StandardScaler(), numeric_cols)
        ]
    )

    # Preprocessing and classifier are chained so the fitted pipeline accepts raw rows.
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('lr', LogisticRegression(max_iter=1000))
    ])

    pipeline.fit(X_train, y_train)

    y_pred = pipeline.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    classification_rep = classification_report(y_test, y_pred)

    model_results = {
        "Accuracy Score": accuracy,
        "Classification Report": classification_rep
    }
    model_results_df = pd.DataFrame({"Metric": ["Accuracy Score"], "Value": [accuracy]})
    classification_report_df = pd.DataFrame(classification_report(y_test, y_pred, output_dict=True)).transpose()

    print(classification_report_df)

else:
    # Keep model_results a dict in both branches (the previous fallback was a
    # set literal) and surface the problem instead of failing silently.
    model_results = {"Error": "'Attack Type' does not exist"}
    print("Error:", model_results["Error"])

              precision    recall  f1-score    support
DDoS           0.329235  0.319062  0.324069  2686.0000
Intrusion      0.339161  0.329061  0.334035  2653.0000
Malware        0.329437  0.349493  0.339168  2661.0000
accuracy       0.332500  0.332500  0.332500     0.3325
macro avg      0.332611  0.332539  0.332424  8000.0000
weighted avg   0.332594  0.332500  0.332396  8000.0000


In [47]:
# Take the held-out row at position 10 of X_test, kept as a one-row DataFrame
# so the pipeline receives the same column layout it was fitted on.
sample = X_test.iloc[[10]]
print("Test sample:", sample)

# Classify that row with the fitted logistic-regression pipeline.
predict_target = pipeline.predict(sample)
print("Predicted target:", predict_target)


Test sample:       Protocol Packet Type Traffic Type Attack Signature Action Taken  \
24413      TCP        Data          FTP  Known Pattern B       Logged   

      Network Segment Log Source      City     Region  Browser  \
24413       Segment C   Firewall  Fatehpur  Telangana  Mozilla   

      Operating System Severity Level Anomaly Score Category  \
24413          Windows           High                  80-50   

      Packet Length Category  Source IP FirstOctet  Destination IP FirstOctet  
24413            little long                    95                        141  
Predicted target: ['Intrusion']


In [39]:
# Show the feature column names of the held-out split, in order.
X_test.columns

Index(['Protocol', 'Packet Type', 'Traffic Type', 'Attack Signature',
       'Action Taken', 'Network Segment', 'Log Source', 'City', 'Region',
       'Browser', 'Operating System', 'Severity Level',
       'Anomaly Score Category', 'Packet Length Category',
       'Source IP FirstOctet', 'Destination IP FirstOctet'],
      dtype='object')

## Export the model

In [27]:
import joblib

# Persist the fitted pipeline (preprocessing + classifier) to disk.
# joblib.dump returns the list of file names it wrote, not the estimator, so
# the result is kept as `saved_files`; use joblib.load to get the model back.
saved_files = joblib.dump(pipeline, "cyber_attack_lr_model.joblib")

## Load the model

In [36]:
# Restore the pipeline that was saved to "cyber_attack_lr_model.joblib".
# NOTE: joblib.load unpickles the file, which can execute arbitrary code;
# only load model files that come from a trusted source.
model = joblib.load("cyber_attack_lr_model.joblib")

In [None]:
import joblib
import pandas as pd

# Restore the persisted model (assumed to be the full preprocessing + classifier pipeline).
model = joblib.load("cyber_attack_lr_model.joblib")

# One hand-written observation, keyed by feature name. Keeping each value next
# to its column name prevents the two from drifting out of alignment.
sample_fields = {
    "Protocol": "TCP",
    "Packet Type": "Data",
    "Traffic Type": "DNS",
    "Attack Signature": "Known Pattern A",
    "Action Taken": "Logged",
    "Network Segment": "Segment A",
    "Log Source": "Firewall",
    "City": "Bhilai",
    "Region": "Assam",
    "Browser": "Opera",
    "Operating System": "Windows",
    "Severity Level": "Low",
    "Anomaly Score Category": "100-80",
    "Packet Length Category": "long",
    # NOTE(review): 334 is above 255, the largest value an IPv4 octet can
    # take — confirm the intended value.
    "Source IP FirstOctet": 334,
    "Destination IP FirstOctet": 34,
}

# Column names and the matching 16 values, in the order used during training
# (a dict preserves insertion order).
columns = list(sample_fields)
test = list(sample_fields.values())

# Wrap the single observation in a one-row DataFrame.
test_df = pd.DataFrame([test], columns=columns)

# Predict the attack type for that row and show the label.
prediction = model.predict(test_df)
print("Predicted Attack Type:", prediction[0])



Predicted Attack Type: Malware
