<a href="https://colab.research.google.com/github/MudassirABBASSi/Linux-Log-Classifications-in-NLP-/blob/main/Linux_Log_Event_type_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Step 1: Import libraries

In [None]:

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Step 2: Load dataset

In [None]:
# Read the structured Linux log CSV into a DataFrame. The location is an
# absolute path under /kaggle/input, so the file has to exist there for this
# cell to run.
data = pd.read_csv(
    filepath_or_buffer="/kaggle/input/linux-logs/Linux_2k.log_structured.csv",
)
# Last expression of the cell: displays (number of rows, number of columns).
data.shape

In [None]:
# Display (number of rows, number of columns) of `data`; read-only, nothing is
# modified. The cell that loads `data` ends with this same expression.
data.shape

In [None]:
# Peek at a handful of random rows. The fixed seed keeps the displayed rows
# identical on every Restart & Run All (an unseeded sample() picks a different
# row each run); min() keeps the call valid for frames shorter than 5 rows.
data.sample(n=min(5, len(data)), random_state=42)

# Step 3: Prepare features and labels

In [None]:
# The message text is the only input feature; the label is the EventId column.
# Missing messages are replaced with an empty string *before* the cast to str:
# on an object column astype(str) alone would turn a missing value into the
# literal text "nan", which the vectorizer would then treat as a real word.
data['Content'] = data['Content'].fillna('').astype(str)
X = data['Content']
y = data['EventId']  # The event type we want to predict

# Step 4: Split into train/test sets

In [None]:

# Hold out 20% of the rows for evaluation. The fixed random_state makes the
# shuffle, and therefore the split, identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Step 5: Convert text to numerical features (TF-IDF)

In [None]:

# TF-IDF bag-of-words features: English stop words are dropped and the
# vocabulary is capped at 5000 terms.
vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=5000,
)
# The vocabulary and IDF weights are learned from the training messages only...
X_train_vec = vectorizer.fit_transform(X_train)
# ...and then re-used unchanged to encode the held-out messages.
X_test_vec = vectorizer.transform(X_test)

# Step 6: Train a classifier

In [None]:

# Logistic-regression classifier over the TF-IDF features. max_iter is raised
# to 1000 so the solver has more iterations available to converge.
model = LogisticRegression(max_iter=1000)
# Kept as the cell's last expression so the fitted estimator is displayed.
model.fit(X=X_train_vec, y=y_train)

# Step 7: Evaluate model


In [None]:
# Predict an EventId for every held-out message, then print per-class
# precision / recall / F1 and the averaged scores.
y_pred = model.predict(X_test_vec)
# zero_division=0 reports 0.0 for labels that end up with no predicted (or no
# true) samples. These are the same numbers the default "warn" setting
# produces, but without an UndefinedMetricWarning cluttering the output.
print(classification_report(y_test, y_pred, zero_division=0))

# Step 8: Predict new logs

In [None]:
# Hand-written messages used to try the trained model on unseen input.
new_logs = [
    # authentication failure naming a user
    "authentication failure; user=root",
    # kernel-related message
    "kernel module loaded successfully",
    # check-pass message with an unknown user
    "check pass; user unknown",
    # NOTE(review): this entry ends with a literal "..." and looks truncated;
    # confirm whether the full message was intended.
    "connection from 82.83.227.67 (dsl-082-083-227-...",
]

In [None]:
# Encode the new messages with the already-fitted vectorizer (transform only,
# no re-fitting) and predict one EventId per message.
new_vec = vectorizer.transform(new_logs)
preds = model.predict(new_vec)
# Print one "predicted label <TAB> message" line per log. Formatting each
# label through an f-string shows its plain text; printing a list of tuples
# instead shows the NumPy scalar repr (np.str_('...') under NumPy 2) on one
# long line.
for log_line, event_id in zip(new_logs, preds):
    print(f"{event_id}\t{log_line}")
