<a href="https://colab.research.google.com/github/MudassirABBASSi/Linux-Log-Classifications-in-NLP-/blob/main/Linux_log_anomalies_Detections.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import Libraries

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import IsolationForest
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
# Read the structured log CSV into a DataFrame and show it as the cell output.
log_csv_path = "/content/Linux_2k.log_structured.csv"
data = pd.read_csv(log_csv_path)
data

# Step 3: Inspect data


In [None]:
# Quick look at the first rows, followed by the available column names.
first_rows = data.head()
column_index = data.columns
print(first_rows)
print("\nColumns:", column_index)

# Step 4: Choose the text column for analysis (Content or EventTemplate)
**We'll use 'Content' as it contains the actual log message**

In [None]:

# Fill missing messages *before* casting to str: astype(str) turns NaN into the
# literal string "nan", after which fillna("") has nothing left to replace and
# "nan" would end up as a token in the text features.
data['Content'] = data['Content'].fillna("").astype(str)

# Step 5: Convert text into numerical features using TF-IDF

In [None]:

# TF-IDF over the log messages: English stop words are dropped and the
# vocabulary is limited to 5000 features.
vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=5000,
)
log_messages = data['Content']
X = vectorizer.fit_transform(log_messages)

# Optional Improvements

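**You can also combine `Content` and `EventTemplate` into a single text field to give the vectorizer richer features. Note that the cell below re-fits the vectorizer and overwrites `X`, so the model in the following steps is trained on the combined text:**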

In [None]:


# Build one text field per row from the raw message and its event template.
# Missing values are blanked before the str cast so they do not turn into the
# literal token "nan" in the TF-IDF vocabulary.
content_text = data['Content'].fillna("").astype(str)
template_text = data['EventTemplate'].fillna("").astype(str)
data['combined_text'] = content_text + " " + template_text

# Re-fit the existing vectorizer on the combined text; this replaces the
# feature matrix X that was built from 'Content' alone.
X = vectorizer.fit_transform(data['combined_text'])




# Step 6: Train Isolation Forest model


**Possible extensions: visualize anomalies by frequency or by time trends, or replace IsolationForest with OneClassSVM or a deep-learning autoencoder if you want more advanced detection.**


In [None]:

# Isolation Forest configured with contamination=0.05 (expected share of
# outliers) and a fixed seed so repeated runs give the same result.
model = IsolationForest(
    random_state=42,
    contamination=0.05,
)
model.fit(X)


# Step 7: Predict anomalies

In [None]:

# Score every row with the fitted model and store the label next to the log line.
predicted_labels = model.predict(X)
data['anomaly'] = predicted_labels

# Step 8: Interpret results
**In Isolation Forest: -1 = anomaly, 1 = normal**

In [None]:

# Split the rows by predicted label (-1 marks an anomaly, 1 a normal row).
is_anomaly = data['anomaly'].eq(-1)
is_normal = data['anomaly'].eq(1)
anomalies = data.loc[is_anomaly]
normal = data.loc[is_normal]

In [None]:
# Report how many rows were scored and how many of them were flagged.
total_rows = len(data)
anomaly_rows = len(anomalies)
print(f" Total rows: {total_rows}")
print(f"| Anomalies detected: {anomaly_rows}")

# Show some examples of detected anomalies


In [None]:
# Print the first ten flagged rows, restricted to the most informative columns.
sample_columns = ['Date', 'Time', 'Level', 'Component', 'Content']
sample_anomalies = anomalies[sample_columns].head(10)
print("\n Sample anomalies:")
print(sample_anomalies)

In [None]:
# How often each distinct message appears among the flagged rows.
anomaly_content_counts = anomalies['Content'].value_counts()
anomaly_content_counts

In [None]:
# Shape of the flagged messages column, i.e. the number of flagged rows.
anomaly_messages = anomalies['Content']
anomaly_messages.shape

In [None]:
# Plot how many log lines were labelled normal vs anomalous.
#
# The previous version passed the anomalies' 'Content' Series as x (together
# with an unrelated data=data), which drew one bar per distinct anomalous
# message rather than the normal/anomalous split announced by the title and
# axis labels, and required an 80x20 inch canvas to fit the tick labels.
label_names = {1: "Normal", -1: "Anomaly"}
label_counts = (
    data['anomaly']
    .map(label_names)
    .value_counts()
    # Fixed order, and a zero-height bar if one class is absent.
    .reindex(list(label_names.values()), fill_value=0)
)

fig, ax = plt.subplots(figsize=(8, 5))
# Colours are taken from the palette explicitly; passing palette= without hue=
# to seaborn's categorical plots is deprecated in recent seaborn releases.
bar_colors = sns.color_palette('coolwarm', n_colors=len(label_counts))
ax.bar(label_counts.index, label_counts.values, color=bar_colors)
ax.set_title("Distribution of Normal vs Anomalous Logs")
ax.set_xlabel("Log Type")
ax.set_ylabel("Count")
plt.show()