In [2]:
import pandas as pd
from collections import Counter

In [None]:
# Read the cleaned Enron e-mail export into the working DataFrame.
# NOTE(review): the preview that follows shows an "Unnamed: 0" column, which suggests
# the CSV was written together with its index — confirm whether index_col=0 is wanted.
DATA_PATH = "/content/enron_recleaned.csv"
df = pd.read_csv(DATA_PATH)

In [None]:
# Display the first rows of the loaded frame.
df.head()

Unnamed: 0,file,message,from,to,cc,bcc,date,subject,cleaned_message
0,allen-p/_sent_mail/1.,Message-ID: <18782981.1075855378110.JavaMail.e...,phillip.allen@enron.com,tim.belden@enron.com,,,"Mon, 14 May 2001 16:39:00 -0700 (PDT)",Mime-Version: 1.0,mimevers contenttyp textplain charsetusascii c...
1,allen-p/_sent_mail/10.,Message-ID: <15464986.1075855378456.JavaMail.e...,phillip.allen@enron.com,john.lavorato@enron.com,,,"Fri, 4 May 2001 13:51:00 -0700 (PDT)",Re:,mimevers contenttyp textplain charsetusascii c...
2,allen-p/_sent_mail/100.,Message-ID: <24216240.1075855687451.JavaMail.e...,phillip.allen@enron.com,leah.arsdall@enron.com,,,"Wed, 18 Oct 2000 03:00:00 -0700 (PDT)",Re: test,mimevers contenttyp textplain charsetusascii c...
3,allen-p/_sent_mail/1000.,Message-ID: <13505866.1075863688222.JavaMail.e...,phillip.allen@enron.com,randall.gay@enron.com,,,"Mon, 23 Oct 2000 06:13:00 -0700 (PDT)",Mime-Version: 1.0,mimevers contenttyp textplain charsetusascii c...
4,allen-p/_sent_mail/1001.,Message-ID: <30922949.1075863688243.JavaMail.e...,phillip.allen@enron.com,greg.piper@enron.com,,,"Thu, 31 Aug 2000 05:07:00 -0700 (PDT)",Re: Hello,mimevers contenttyp textplain charsetusascii c...


#  PHASE 1: Behavioral Feature Engineering
## Goals:
Capture communication patterns that deviate from the norm

Measure metadata-based anomalies

## Features to Extract

In [None]:
# 1. Number of recipients
def _count_addresses(field):
    """Return the number of non-empty, comma-separated addresses in `field`.

    Args:
        field: Raw value of a recipient column (`to`, `cc` or `bcc`).

    Returns:
        int: Count of non-blank entries. Missing values (NaN/None) and blank
        strings give 0 — splitting an empty string yields [""], which would
        otherwise be counted as one recipient.
    """
    if not isinstance(field, str):
        return 0
    return sum(1 for address in field.split(",") if address.strip())


df["num_to"] = df["to"].apply(_count_addresses)
df["num_cc"] = df["cc"].apply(_count_addresses)
df["num_bcc"] = df["bcc"].apply(_count_addresses)

In [None]:
# 2. Time features
from dateutil import parser


def safe_parse_date(x):
    """Parse a raw date value, returning ``pd.NaT`` when it cannot be parsed.

    Args:
        x: Raw value from the ``date`` column; normally a string, but may be
            NaN/None for rows without a date.

    Returns:
        The ``datetime`` produced by ``dateutil.parser.parse``, or ``pd.NaT``
        if the value is not a parseable date.
    """
    try:
        return parser.parse(x)
    except (ValueError, TypeError, OverflowError):
        # ValueError covers dateutil's ParserError (unparseable text), TypeError covers
        # non-string input such as NaN/None, OverflowError covers out-of-range fields.
        # Everything else (e.g. KeyboardInterrupt) is deliberately left to propagate.
        return pd.NaT

# Run every raw date value through safe_parse_date; unparseable entries become NaT.
df["date"] = df["date"].map(safe_parse_date)

In [None]:
# Coerce the column to pandas datetimes; values that cannot be converted become NaT.
# NOTE(review): if the parsed values carry different UTC offsets (the sample rows show
# "-0700"; other offsets may exist), pd.to_datetime may not return a datetime64 column
# and the `.dt` accessor would then fail — confirm, and consider utc=True.
df["date"] = pd.to_datetime(df["date"], errors="coerce")

# Hour of day for each message; NaN where the date is missing.
hour_of_day = df["date"].dt.hour
df["hour"] = hour_of_day

# Off hours = hour below 6 or above 20. Comparisons against NaN evaluate to False,
# so rows without a timestamp are flagged as not off-hours.
df["is_off_hours"] = (hour_of_day < 6) | (hour_of_day > 20)


In [None]:
# 3. Email length
cleaned_text = df["cleaned_message"]

# Characters per message.
df["char_length"] = cleaned_text.str.len()

# Whitespace-separated tokens per message.
df["word_count"] = cleaned_text.str.split().str.len()

In [None]:
 #4. Unique recipients (per sender) — useful for user profiling later
def _recipient_set(to_values):
    """Return the distinct, non-empty recipient addresses in one sender's `to` values.

    Args:
        to_values: Series of raw `to` strings belonging to a single sender.

    Returns:
        set[str]: Addresses with surrounding whitespace stripped. Blank entries are
        dropped, so a sender with no recipients gets an empty set rather than {""}.
    """
    return {
        address.strip()
        for field in to_values.dropna()
        for address in field.split(",")
        if address.strip()
    }


sender_recipient_map = df.groupby("from")["to"].apply(_recipient_set)
# Senders absent from the map (e.g. a missing `from`) get a count of 0.
df["unique_recipient_count"] = df["from"].map(lambda sender: len(sender_recipient_map.get(sender, ())))

# PHASE 2: NLP Feature Engineering
## Goals:
Convert messages into vector format (TF-IDF / Embeddings)

Detect keywords, sentiment, named entities

## A. TF-IDF Vectorization

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Limit the vocabulary to the 1,000 terms with the highest frequency across the corpus.
tfidf = TfidfVectorizer(max_features=1000)

# Treat missing messages as empty documents: fit_transform raises on NaN documents.
tfidf_matrix = tfidf.fit_transform(df["cleaned_message"].fillna(""))

## B. Sentiment Analysis

In [None]:
from textblob import TextBlob

def _polarity(text):
    """Return the TextBlob sentiment polarity of `text`."""
    return TextBlob(text).sentiment.polarity


df["sentiment_polarity"] = df["cleaned_message"].apply(_polarity)

Range: -1 = negative, 0 = neutral, +1 = positive

Sudden shifts in tone can be an insider signal

## C. Insider Keyword Match (Tagging)


In [None]:
keywords = [
    "confidential", "internal", "secret", "leak", "hr", "access", "credentials",
    "breach", "login", "download", "report", "copy", "exfiltrate", "unauthorized"
]

# Set lookup keeps the per-token membership test constant-time instead of scanning the list.
_keyword_set = frozenset(keywords)

# Count, per message, the whitespace-separated tokens that exactly equal a keyword.
# Missing messages are treated as empty text so they score 0 instead of raising.
# NOTE(review): the sample rows of cleaned_message contain tokens such as "mimevers" and
# "contenttyp", which look stemmed; if so, unstemmed keywords like "confidential" or
# "unauthorized" may never match — confirm against the cleaning step.
df["threat_keyword_count"] = df["cleaned_message"].fillna("").apply(
    lambda text: sum(1 for token in text.split() if token in _keyword_set)
)

# PHASE 3: Modeling (Supervised + Anomaly-Based)

## Option A: Anomaly Detection (no labels)

In [None]:
# Behavioural and text statistics computed in the earlier cells.
behavior_columns = [
    "num_to", "num_cc", "num_bcc", "hour", "is_off_hours",
    "char_length", "word_count", "sentiment_polarity", "threat_keyword_count",
]
behavior_block = df[behavior_columns]

# Dense copy of the TF-IDF matrix; its columns receive default integer labels.
tfidf_block = pd.DataFrame(tfidf_matrix.toarray())

# Put both blocks side by side; concat aligns the rows on the index.
features = pd.concat([behavior_block, tfidf_block], axis=1)

In [None]:
# The TF-IDF block contributes integer column labels; convert every label to str so
# the frame carries uniformly typed feature names.
features.columns = [str(label) for label in features.columns]

In [None]:
# 1a. Compute median hour (ignoring NaN)
median_hour = df["hour"].median()

# 1b. Fill missing hours with that median. fillna returns a new Series, so the result
#     has to be assigned back for the fill to take effect.
df["hour"] = df["hour"].fillna(median_hour)

# 1c. Recompute is_off_hours from the filled hours (hour below 6 or above 20).
df["is_off_hours"] = (df["hour"] < 6) | (df["hour"] > 20)

# NOTE: the `features` frame assembled in the earlier cell (and fed to the imputer and
# IsolationForest) does not pick up these filled values unless it is rebuilt.

In [None]:
import joblib

In [None]:
# SimpleImputer is not imported by any earlier cell; import it here so this cell runs
# on a fresh kernel instead of raising NameError.
from sklearn.impute import SimpleImputer

# — 2a. Impute missing values: each NaN is replaced by the mean of its column.
imputer = SimpleImputer(strategy="mean")
features_imputed = pd.DataFrame(
    imputer.fit_transform(features),
    columns=features.columns,
    index=features.index,  # keep the row labels of the input frame
)

# Save the imputer for future inference
joblib.dump(imputer, "simple_imputer.pkl")

['simple_imputer.pkl']

In [None]:
from sklearn.ensemble import IsolationForest

In [None]:
# — 2b. Fit an IsolationForest on the imputed features, expecting 5% of rows to be outliers.
if_model = IsolationForest(contamination=0.05, random_state=42)
if_model.fit(features_imputed)

# Despite the column name, these values are class labels rather than continuous scores:
# 1 marks an inlier and -1 marks a row flagged as anomalous.
outlier_flags = if_model.predict(features_imputed)
df["anomaly_score"] = outlier_flags

# Persist the fitted detector.
joblib.dump(if_model, "isolation_forest_insider.pkl")

# Quick sanity check: how many rows fall into each class.
print(df["anomaly_score"].value_counts())

anomaly_score
 1    131969
-1      6946
Name: count, dtype: int64


## Option B: Supervised Model

In [None]:
# Rule-based label: a message is marked 1 when it has more than five BCC recipients,
# or when it was sent during off hours and contains at least one threat keyword.
many_bcc = df["num_bcc"] > 5
off_hours_with_keyword = df["is_off_hours"] & (df["threat_keyword_count"] > 0)

df["label"] = (many_bcc | off_hours_with_keyword).astype(int)


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Rebuild the feature matrix from the current state of df (this repeats the construction
# used for the anomaly detector).
features = pd.concat([
    df[["num_to", "num_cc", "num_bcc", "hour", "is_off_hours", "char_length", "word_count", "sentiment_polarity", "threat_keyword_count"]],
    pd.DataFrame(tfidf_matrix.toarray())
], axis=1)

# Ensure all feature names are strings
features.columns = features.columns.astype(str)

# WARNING (label leakage): `label` is computed directly from num_bcc, is_off_hours and
# threat_keyword_count, all of which are also input features here. The classifier can
# therefore reproduce the labelling rule itself, so near-perfect scores measure that
# rule rather than the detection of real insider activity.

# Stratify so the rare positive class keeps the same share in both splits, and seed both
# the split and the forest so the reported metrics are reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    features, df["label"], test_size=0.2, random_state=42, stratify=df["label"]
)
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

print(classification_report(y_test, clf.predict(X_test)))


              precision    recall  f1-score   support

           0       1.00      1.00      1.00     26885
           1       1.00      0.99      0.99       898

    accuracy                           1.00     27783
   macro avg       1.00      0.99      1.00     27783
weighted avg       1.00      1.00      1.00     27783



In [None]:
# Serialize the trained classifier to random_forest_insider.pkl.
# NOTE(review): joblib files are pickle-based — only load them from trusted sources.
joblib.dump(clf, "random_forest_insider.pkl")

['random_forest_insider.pkl']

In [None]:
# Keep only the first 100 rows as a small sample.
compressed_df = df.head(100)


In [None]:
# Write the sample to compressed_file.csv without the index column.
compressed_df.to_csv("compressed_file.csv", index=False)

In [None]:
# Uses the google.colab helper, so this cell only works inside a Colab runtime.
from google.colab import files
files.download("compressed_file.csv")  # download the exported sample

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# List the frame's column labels.
# NOTE(review): the recorded output lacks the engineered columns (num_to, hour, label, ...),
# so it appears to come from a different kernel state — re-run to refresh it.
df.columns

Index(['file', 'message', 'from', 'to', 'cc', 'bcc', 'date', 'subject',
       'cleaned_message'],
      dtype='object')

In [3]:
# Load final_predictions2.csv into a separate frame.
# NOTE(review): no cell shown in this notebook writes this file — presumably it is
# produced elsewhere; record its provenance.
df2 = pd.read_csv("final_predictions2.csv")

In [None]:
# List the column labels of the loaded predictions file.
df2.columns

Index(['from', 'to', 'date', 'cleaned_message', 'threat_keyword_count',
       'label', 'predicted_label', 'anomaly_score'],
      dtype='object')

In [4]:
# Print the column labels as a plain Python list.
column_names = list(df2.columns)
print(column_names)

['from', 'to', 'date', 'cleaned_message', 'threat_keyword_count', 'label', 'predicted_label', 'anomaly_score']
