<a href="https://colab.research.google.com/github/MudassirABBASSi/Linux-Log-Classifications-in-NLP-/blob/main/Log_Classification_of_linux.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import Libraries

In [None]:
import pandas as pd

# Load the Dataset

In [None]:
# !unzip "/content/archive.zip"

In [None]:
# Read the structured Linux log CSV into a DataFrame.
csv_path = "/kaggle/input/linux-logs/Linux_2k.log_structured.csv"
data = pd.read_csv(csv_path)

# Display the loaded table.
data

In [None]:
# Distinct values found in the EventId column (the classification target).
data["EventId"].unique()

In [None]:
# Preview the first rows of the raw log messages.
# (Swap in data["Content"].unique() to list every distinct message instead.)
data["Content"].head()


In [None]:
# Number of log messages available in the Content column.
data["Content"].shape

In [None]:
# Distinct EventId values (same check as the earlier cell).
data["EventId"].unique()

In [None]:
# Per-class sample counts, to see how the EventId labels are distributed.
event_counts = data["EventId"].value_counts()
print(event_counts)

# Text Cleaning & Preprocessing

In [None]:
import re

def clean_text(text):
    """Normalize a log message for vectorization.

    The input is converted with ``str()`` (so non-string values are
    accepted), lower-cased, stripped of every character that is not an ASCII
    letter, digit or whitespace, and finally has runs of whitespace collapsed
    to a single space with leading/trailing whitespace removed.

    Note that removed characters are deleted, not replaced by a space, so
    tokens joined only by punctuation are merged (``"a=b"`` -> ``"ab"``).

    Args:
        text: The raw log message (any object; it is passed through ``str``).

    Returns:
        The cleaned, lower-case string.
    """
    lowered = str(text).lower()
    # Drop punctuation and other special characters.
    alnum_only = re.sub(r'[^a-zA-Z0-9\s]', '', lowered)
    # Collapse whitespace runs and trim the ends.
    return re.sub(r'\s+', ' ', alnum_only).strip()

# Clean every raw message and keep the result alongside the original column.
cleaned_messages = data["Content"].apply(clean_text)
data["clean_content"] = cleaned_messages

In [None]:
# Display the cleaned messages produced by clean_text.
data["clean_content"]

In [None]:
# Features are the cleaned messages -- the same text the TF-IDF vectorizer is
# fitted on and that predict_event_id produces at inference time -- rather
# than the raw Content column. The target is the EventId column.
X = data["clean_content"]
y = data["EventId"]

# Text Vectorization
1. Option A: TF-IDF (simple and effective)
2. Option B: Use pre-trained models like FastText, Word2Vec, or BERT for deeper semantics.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over unigrams and bigrams, with the vocabulary capped at 5000 terms.
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)

# NOTE(review): the vectorizer is fitted on the full dataset before the
# train/test split, so vocabulary/IDF statistics from the test rows leak into
# the training features. Consider fitting on the training rows only.
clean_corpus = data["clean_content"]
X_vec = vectorizer.fit_transform(clean_corpus)


In [None]:
# Inspect the first two rows of the vectorized feature matrix.
X_vec[0:2]

# Encode Target Labels

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encoder that maps the EventId labels to integer class indices; it is kept
# around so predictions can be mapped back with inverse_transform later.
le = LabelEncoder()

# Learn the label set from y and encode y in one step.
y_enc = le.fit_transform(y)
# Display the encoded targets.
y_enc

# Train Test Split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for evaluation; the seed is fixed so the split
# is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X_vec,
    y_enc,
    test_size=0.2,
    random_state=42,
)


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Logistic-regression baseline trained on the training split; the solver is
# allowed up to 200 iterations.
model = LogisticRegression(max_iter=200).fit(X_train, y_train)

# Evaluate on the held-out split and print per-class precision/recall/F1.
y_pred = model.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)


In [None]:
from sklearn.ensemble import RandomForestClassifier
# Seed the forest so its randomized training is reproducible from run to run,
# using the same seed value as the train/test split.
model1 = RandomForestClassifier(random_state=42)

model1.fit(X_train, y_train)

# NOTE: this reuses the name y_pred, replacing the logistic-regression
# predictions computed in the previous cell.
y_pred = model1.predict(X_test)
print(classification_report(y_test, y_pred))

In [None]:
# Assuming 'model' is your trained model (e.g., LogisticRegression or RandomForestClassifier)
# Assuming 'vectorizer' is your trained TfidfVectorizer
# Assuming 'le' is your trained LabelEncoder
# Assuming 'clean_text' is your text cleaning function

def predict_event_id(text, model, vectorizer, label_encoder, clean_text_func):
    """Predict the original EventId label for a single raw log message.

    Args:
        text: The raw log message to classify.
        model: A fitted classifier exposing ``predict``.
        vectorizer: A fitted vectorizer exposing ``transform``; it is only
            applied, never re-fitted, so the feature space matches training.
        label_encoder: A fitted encoder exposing ``inverse_transform`` to map
            encoded predictions back to the original labels.
        clean_text_func: Callable used to normalize ``text`` before
            vectorization (should be the one used when training).

    Returns:
        The predicted label for ``text`` (first element of the decoded
        prediction).
    """
    # Wrap the single cleaned message in a list: transform expects a batch.
    features = vectorizer.transform([clean_text_func(text)])
    encoded_prediction = model.predict(features)
    # Decode back to the original label and unwrap the one-element batch.
    return label_encoder.inverse_transform(encoded_prediction)[0]

# Example usage: classify one log line with the random-forest model (model1).
new_log_text = "ACPI: Subsystem revision <*>"
predicted_id = predict_event_id(
    text=new_log_text,
    model=model1,
    vectorizer=vectorizer,
    label_encoder=le,
    clean_text_func=clean_text,
)

print(f"The predicted EventId for the log '{new_log_text}' is: {predicted_id}")