In [None]:
# --- Step 0: Install & Import Dependencies ---
!pip install -q textblob
import pandas as pd
import pickle
from sklearn.ensemble import RandomForestClassifier
from google.colab import files

In [None]:
import pandas as pd

df = pd.read_csv("/content/enron_recleaned.csv",
                  engine='python',
                  on_bad_lines='skip')
print(df.shape)

(164191, 9)


In [None]:
df.head()

Unnamed: 0,file,message,from,to,cc,bcc,date,subject,cleaned_message
0,allen-p/_sent_mail/1.,Message-ID: <18782981.1075855378110.JavaMail.e...,phillip.allen@enron.com,tim.belden@enron.com,,,"Mon, 14 May 2001 16:39:00 -0700 (PDT)",Mime-Version: 1.0,mimevers contenttyp textplain charsetusascii c...
1,allen-p/_sent_mail/10.,Message-ID: <15464986.1075855378456.JavaMail.e...,phillip.allen@enron.com,john.lavorato@enron.com,,,"Fri, 4 May 2001 13:51:00 -0700 (PDT)",Re:,mimevers contenttyp textplain charsetusascii c...
2,allen-p/_sent_mail/100.,Message-ID: <24216240.1075855687451.JavaMail.e...,phillip.allen@enron.com,leah.arsdall@enron.com,,,"Wed, 18 Oct 2000 03:00:00 -0700 (PDT)",Re: test,mimevers contenttyp textplain charsetusascii c...
3,allen-p/_sent_mail/1000.,Message-ID: <13505866.1075863688222.JavaMail.e...,phillip.allen@enron.com,randall.gay@enron.com,,,"Mon, 23 Oct 2000 06:13:00 -0700 (PDT)",Mime-Version: 1.0,mimevers contenttyp textplain charsetusascii c...
4,allen-p/_sent_mail/1001.,Message-ID: <30922949.1075863688243.JavaMail.e...,phillip.allen@enron.com,greg.piper@enron.com,,,"Thu, 31 Aug 2000 05:07:00 -0700 (PDT)",Re: Hello,mimevers contenttyp textplain charsetusascii c...


In [None]:
print(df.columns)

Index(['file', 'message', 'from', 'to', 'cc', 'bcc', 'date', 'subject',
       'cleaned_message'],
      dtype='object')


In [None]:
df2 = pd.read_csv("/content/emails.csv",
                  engine='python',
                  on_bad_lines='skip')
print(df2.shape)

(61935, 2)


In [None]:
print(df2.columns)

Index(['file', 'message'], dtype='object')


In [None]:
# --- Step 1: Copy your preprocessing code here ---
# (or import from src/preprocessing.py if uploaded)
# src/preprocessing.py (Colab-safe)
import pandas as pd
import re
from textblob import TextBlob
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.impute import SimpleImputer

def clean_text(text: str) -> str:
    text = str(text).lower()
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    return text.strip()

def extract_features(df: pd.DataFrame, fit_vectorizer=True, vectorizer=None, imputer=None):
    # --- Communication patterns ---
    df["num_to"] = df["to"].fillna("").apply(lambda x: len(x.split(",")) if x else 0)
    df["num_cc"] = df["cc"].fillna("").apply(lambda x: len(x.split(",")) if x else 0) if "cc" in df.columns else 0
    df["num_bcc"] = df["bcc"].fillna("").apply(lambda x: len(x.split(",")) if x else 0) if "bcc" in df.columns else 0

    # --- Temporal patterns ---
    df["date"] = pd.to_datetime(df["date"], errors="coerce", utc=True)
    df["hour"] = df["date"].apply(lambda x: x.hour if pd.notnull(x) else 12)
    df["is_off_hours"] = df["hour"].apply(lambda x: x < 6 or x > 20)

    # --- Message-level features ---
    df["char_length"] = df["cleaned_message"].astype(str).str.len()
    df["word_count"] = df["cleaned_message"].astype(str).str.split().str.len()

    # Unique recipients per sender
    if "to" in df.columns:
        sender_recipient_map = df.groupby("from")["to"].apply(
            lambda x: set(",".join(x.dropna()).split(",")))
        df["unique_recipient_count"] = df["from"].map(
            lambda s: len(sender_recipient_map.get(s, [])))
    else:
        df["unique_recipient_count"] = 0

    # --- Sentiment analysis ---
    df["sentiment_polarity"] = df["cleaned_message"].astype(str).apply(
        lambda x: TextBlob(x).sentiment.polarity)

    # --- Threat keyword features ---
    keywords = {"confidential","internal","secret","leak","hr","access",
                "credentials","breach","login","download","report",
                "copy","exfiltrate","unauthorized"}
    df["threat_keyword_count"] = df["cleaned_message"].astype(str).apply(
        lambda x: sum(1 for word in x.split() if word in keywords))

    # --- Text vectorization ---
    if fit_vectorizer:
        vectorizer = TfidfVectorizer(max_features=1000)
        X_text = vectorizer.fit_transform(df["cleaned_message"].astype(str))
    else:
        X_text = vectorizer.transform(df["cleaned_message"].astype(str))

    # --- Structured features ---
    structured_cols = ["num_to","num_cc","num_bcc","hour","is_off_hours",
                       "char_length","word_count","unique_recipient_count",
                       "sentiment_polarity","threat_keyword_count"]
    X_structured = df[structured_cols]

    # --- Combine structured + vectorized ---
    features = pd.concat([X_structured.reset_index(drop=True),
                          pd.DataFrame(X_text.toarray())], axis=1)

    # --- Convert all columns to string to avoid SimpleImputer errors ---
    features.columns = features.columns.astype(str)

    # --- Impute missing values ---
    if fit_vectorizer:
        imputer = SimpleImputer(strategy="mean")
        features = pd.DataFrame(imputer.fit_transform(features), columns=features.columns)
    else:
        features = pd.DataFrame(imputer.transform(features), columns=features.columns)

    return df, features, vectorizer, imputer
# --- Paste the Colab-safe extract_features() here ---
# (Use the version I provided in the previous message)
# Make sure the function 'extract_features' is defined in the notebook


In [None]:
# --- Step 2: Load the cleaned Enron dataset ---
df = pd.read_csv("/content/enron_recleaned.csv")  # adjust path if needed
# Check columns and types if needed:
print(df.info())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64147 entries, 0 to 64146
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   file             64147 non-null  object
 1   message          64147 non-null  object
 2   from             64147 non-null  object
 3   to               64147 non-null  object
 4   cc               15858 non-null  object
 5   bcc              13402 non-null  object
 6   date             64147 non-null  object
 7   subject          64147 non-null  object
 8   cleaned_message  64147 non-null  object
dtypes: object(9)
memory usage: 4.4+ MB
None


In [None]:
# Show column names and types
df.dtypes
# Get summary info
df.info()
# Show first few rows
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64147 entries, 0 to 64146
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   file             64147 non-null  object
 1   message          64147 non-null  object
 2   from             64147 non-null  object
 3   to               64147 non-null  object
 4   cc               15858 non-null  object
 5   bcc              13402 non-null  object
 6   date             64147 non-null  object
 7   subject          64147 non-null  object
 8   cleaned_message  64147 non-null  object
dtypes: object(9)
memory usage: 4.4+ MB


Unnamed: 0,file,message,from,to,cc,bcc,date,subject,cleaned_message
0,allen-p/_sent_mail/1.,Message-ID: <18782981.1075855378110.JavaMail.e...,phillip.allen@enron.com,tim.belden@enron.com,,,"Mon, 14 May 2001 16:39:00 -0700 (PDT)",Mime-Version: 1.0,mimevers contenttyp textplain charsetusascii c...
1,allen-p/_sent_mail/10.,Message-ID: <15464986.1075855378456.JavaMail.e...,phillip.allen@enron.com,john.lavorato@enron.com,,,"Fri, 4 May 2001 13:51:00 -0700 (PDT)",Re:,mimevers contenttyp textplain charsetusascii c...
2,allen-p/_sent_mail/100.,Message-ID: <24216240.1075855687451.JavaMail.e...,phillip.allen@enron.com,leah.arsdall@enron.com,,,"Wed, 18 Oct 2000 03:00:00 -0700 (PDT)",Re: test,mimevers contenttyp textplain charsetusascii c...
3,allen-p/_sent_mail/1000.,Message-ID: <13505866.1075863688222.JavaMail.e...,phillip.allen@enron.com,randall.gay@enron.com,,,"Mon, 23 Oct 2000 06:13:00 -0700 (PDT)",Mime-Version: 1.0,mimevers contenttyp textplain charsetusascii c...
4,allen-p/_sent_mail/1001.,Message-ID: <30922949.1075863688243.JavaMail.e...,phillip.allen@enron.com,greg.piper@enron.com,,,"Thu, 31 Aug 2000 05:07:00 -0700 (PDT)",Re: Hello,mimevers contenttyp textplain charsetusascii c...


In [None]:
# --- Step 3: Extract features ---
# This will create numeric features from structured columns + TF-IDF
df, features, vectorizer, imputer = extract_features(df, fit_vectorizer=True)

  df["date"] = pd.to_datetime(df["date"], errors="coerce", utc=True)


In [None]:
from sklearn.ensemble import IsolationForest

# Step 4a. :Train Isolation Forest on features
iso = IsolationForest(
    n_estimators=100,
    contamination=0.05,  # assume ~5% anomalies
    random_state=42
)
iso.fit(features)

# Predict anomalies
df["anomaly_score"] = iso.predict(features)  # -1 = anomaly, 1 = normal
df["anomaly_score"].value_counts()


Unnamed: 0_level_0,count
anomaly_score,Unnamed: 1_level_1
1,60939
-1,3208


In [None]:
#Step 4.b - One class SVM
from sklearn.svm import OneClassSVM
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_scaled = scaler.fit_transform(features)

ocsvm = OneClassSVM(kernel="rbf", gamma="auto", nu=0.05)  # nu ≈ anomaly fraction
ocsvm.fit(X_scaled)

df["svm_pred"] = ocsvm.predict(X_scaled)  # -1 = anomaly, 1 = normal


In [None]:
#Step - 4.b : AutoEncoder
import tensorflow as tf
from tensorflow.keras import layers, models

input_dim = features.shape[1]

# Build Autoencoder
autoencoder = models.Sequential([
    layers.Input(shape=(input_dim,)),
    layers.Dense(128, activation="relu"),
    layers.Dense(64, activation="relu"),
    layers.Dense(128, activation="relu"),
    layers.Dense(input_dim, activation="linear")
])

autoencoder.compile(optimizer="adam", loss="mse")

# Train on "normal" data (unsupervised)
history = autoencoder.fit(
    features, features,
    epochs=10,
    batch_size=256,
    shuffle=True,
    validation_split=0.1
)

# Compute reconstruction error
recon = autoencoder.predict(features)
mse = ((features - recon) ** 2).mean(axis=1)

df["autoencoder_score"] = mse

Epoch 1/10
[1m226/226[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 19ms/step - loss: 3695.3020 - val_loss: 41.0301
Epoch 2/10
[1m226/226[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 15ms/step - loss: 82.8913 - val_loss: 175.1095
Epoch 3/10
[1m226/226[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 17ms/step - loss: 139.8847 - val_loss: 16.4535
Epoch 4/10
[1m226/226[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 23ms/step - loss: 20.3090 - val_loss: 2080.3774
Epoch 5/10
[1m226/226[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 15ms/step - loss: 145.6516 - val_loss: 20.0635
Epoch 6/10
[1m226/226[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 15ms/step - loss: 89.2668 - val_loss: 12.6188
Epoch 7/10
[1m226/226[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 21ms/step - loss: 21.2055 - val_loss: 17.1512
Epoch 8/10
[1m226/226[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 20ms/step - loss: 18.0987 - val_loss: 101.4515
Epoch 9/

In [None]:
import pickle
from google.colab import files

# --- Save Isolation Forest ---
with open("isolation_forest.pkl", "wb") as f:
    pickle.dump(iso, f)

# --- Save One-Class SVM (with scaler) ---
with open("ocsvm.pkl", "wb") as f:
    pickle.dump(ocsvm, f)

with open("scaler.pkl", "wb") as f:
    pickle.dump(scaler, f)

# --- Save Autoencoder ---
autoencoder.save("autoencoder_model.keras")  # Keras format

# --- Save preprocessing objects ---
with open("tfidf_vectorizer.pkl", "wb") as f:
    pickle.dump(vectorizer, f)

with open("simple_imputer.pkl", "wb") as f:
    pickle.dump(imputer, f)

with open("feature_columns.pkl", "wb") as f:
    pickle.dump(features.columns.tolist(), f)

In [None]:
# Models
files.download("isolation_forest.pkl")
files.download("ocsvm.pkl")
files.download("scaler.pkl")
files.download("autoencoder_model.keras")

# Preprocessing
files.download("tfidf_vectorizer.pkl")
files.download("simple_imputer.pkl")
files.download("feature_columns.pkl")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
print(df.columns)
print(df.dtypes)
df.head()

Index(['file', 'message', 'from', 'to', 'cc', 'bcc', 'date', 'subject',
       'cleaned_message', 'num_to', 'num_cc', 'num_bcc', 'hour',
       'is_off_hours', 'char_length', 'word_count', 'unique_recipient_count',
       'sentiment_polarity', 'threat_keyword_count', 'anomaly_score',
       'svm_pred', 'autoencoder_score'],
      dtype='object')
file                                   object
message                                object
from                                   object
to                                     object
cc                                     object
bcc                                    object
date                      datetime64[ns, UTC]
subject                                object
cleaned_message                        object
num_to                                  int64
num_cc                                  int64
num_bcc                                 int64
hour                                    int64
is_off_hours                             bool
char_len

Unnamed: 0,file,message,from,to,cc,bcc,date,subject,cleaned_message,num_to,...,hour,is_off_hours,char_length,word_count,unique_recipient_count,sentiment_polarity,threat_keyword_count,anomaly_score,svm_pred,autoencoder_score
0,allen-p/_sent_mail/1.,Message-ID: <18782981.1075855378110.JavaMail.e...,phillip.allen@enron.com,tim.belden@enron.com,,,2001-05-14 23:39:00+00:00,Mime-Version: 1.0,mimevers contenttyp textplain charsetusascii c...,1,...,23,True,185,21,208,0.0,0,1,1,0.252895
1,allen-p/_sent_mail/10.,Message-ID: <15464986.1075855378456.JavaMail.e...,phillip.allen@enron.com,john.lavorato@enron.com,,,2001-05-04 20:51:00+00:00,Re:,mimevers contenttyp textplain charsetusascii c...,1,...,20,False,613,95,208,0.1375,0,1,1,0.173619
2,allen-p/_sent_mail/100.,Message-ID: <24216240.1075855687451.JavaMail.e...,phillip.allen@enron.com,leah.arsdall@enron.com,,,2000-10-18 10:00:00+00:00,Re: test,mimevers contenttyp textplain charsetusascii c...,1,...,10,False,174,22,208,0.3,0,1,1,0.024553
3,allen-p/_sent_mail/1000.,Message-ID: <13505866.1075863688222.JavaMail.e...,phillip.allen@enron.com,randall.gay@enron.com,,,2000-10-23 13:13:00+00:00,Mime-Version: 1.0,mimevers contenttyp textplain charsetusascii c...,1,...,13,False,257,34,208,0.0,0,1,1,0.03717
4,allen-p/_sent_mail/1001.,Message-ID: <30922949.1075863688243.JavaMail.e...,phillip.allen@enron.com,greg.piper@enron.com,,,2000-08-31 12:07:00+00:00,Re: Hello,mimevers contenttyp textplain charsetusascii c...,1,...,12,False,172,21,208,0.0,0,1,1,0.038956


In [None]:
df2 = pd.read_csv("/content/emails.csv",
                  engine='python',
                  on_bad_lines='skip')

In [None]:
print(df2.columns)
print(df2.dtypes)
df2.head()

Index(['file', 'message'], dtype='object')
file       object
message    object
dtype: object


Unnamed: 0,file,message
0,allen-p/_sent_mail/1.,Message-ID: <18782981.1075855378110.JavaMail.e...
1,allen-p/_sent_mail/10.,Message-ID: <15464986.1075855378456.JavaMail.e...
2,allen-p/_sent_mail/100.,Message-ID: <24216240.1075855687451.JavaMail.e...
3,allen-p/_sent_mail/1000.,Message-ID: <13505866.1075863688222.JavaMail.e...
4,allen-p/_sent_mail/1001.,Message-ID: <30922949.1075863688243.JavaMail.e...


In [35]:
# --- Step 0: Setup ---
import pandas as pd
import pickle
import numpy as np
import re
from google.colab import files
from tensorflow.keras.models import load_model
#from src.preprocessing import extract_features, clean_text

# --- Step 1: Upload raw email dataset ---
#uploaded = files.upload()
#for fname in uploaded.keys():
    #print("Uploaded file:", fname)
#raw_df = pd.read_csv('/content/emails.csv')
raw_df = pd.read_csv("/content/emails.csv",
                     engine='python',
                     on_bad_lines='skip')

# --- Step 1a: Parse metadata from raw message using regex ---
def extract_email_metadata(msg):
    if pd.isna(msg):
        msg = ""
    data = {"from": "", "to": "", "cc": "", "bcc": "", "date": pd.NaT, "subject": ""}

    m = re.search(r"From:\s*(.*)", msg, re.IGNORECASE)
    if m: data["from"] = m.group(1).strip()

    m = re.search(r"To:\s*(.*)", msg, re.IGNORECASE)
    if m: data["to"] = m.group(1).strip()

    m = re.search(r"Cc:\s*(.*)", msg, re.IGNORECASE)
    if m: data["cc"] = m.group(1).strip()

    m = re.search(r"Bcc:\s*(.*)", msg, re.IGNORECASE)
    if m: data["bcc"] = m.group(1).strip()

    m = re.search(r"Date:\s*(.*)", msg, re.IGNORECASE)
    if m:
        try:
            data["date"] = pd.to_datetime(m.group(1).strip(), errors="coerce")
        except:
            data["date"] = pd.NaT

    m = re.search(r"Subject:\s*(.*)", msg, re.IGNORECASE)
    if m: data["subject"] = m.group(1).strip()

    return pd.Series(data)

metadata_df = raw_df['message'].apply(extract_email_metadata)
raw_df = pd.concat([raw_df, metadata_df], axis=1)

# --- Step 2: Upload saved models & preprocessing objects ---
# uploaded = files.upload()  # upload .pkl files + autoencoder_model.keras

# --- Step 3: Load models & preprocessing objects ---
iso_model = pickle.load(open("isolation_forest.pkl", "rb"))
svm_model = pickle.load(open("ocsvm.pkl", "rb"))
auto_model = load_model("autoencoder_model.keras")   # Keras model
vectorizer = pickle.load(open("tfidf_vectorizer.pkl", "rb"))
imputer = pickle.load(open("simple_imputer.pkl", "rb"))
feature_cols = pickle.load(open("feature_columns.pkl", "rb"))

# --- Step 4: Preprocess & extract features ---
# Minimal cleaning (or use your clean_text function)
raw_df["cleaned_message"] = raw_df["message"].astype(str).apply(clean_text)

# Extract features without refitting vectorizer/imputer
df_features, features, _, _ = extract_features(raw_df, fit_vectorizer=False, vectorizer=vectorizer, imputer=imputer)

# Keep only columns used during training
features = features[feature_cols]

# --- Step 5: Run inference on Isolation Forest & One-Class SVM ---
iso_pred = iso_model.predict(features)        # -1 = anomaly
svm_pred = svm_model.predict(features)        # -1 = anomaly

# --- Step 6: Run inference on Autoencoder ---
features_array = features.to_numpy()          # Keras expects numpy array
reconstruction = auto_model.predict(features_array)
mse = np.mean((features_array - reconstruction) ** 2, axis=1)

# Decide threshold for anomaly (tune based on training data)
threshold = np.percentile(mse, 95)           # top 5% reconstruction errors as anomalies
auto_pred = (mse > threshold).astype(int)    # 1 = anomaly, 0 = normal

# --- Step 7: Majority vote ensemble ---
final_pred = []
for i in range(len(features)):
    votes = [iso_pred[i] == -1, svm_pred[i] == -1, auto_pred[i] == 1]
    final_pred.append(sum(votes) >= 2)  # anomaly if 2+ models agree

df_features["final_anomaly"] = final_pred

# --- Step 8: Save predictions ---
df_features.to_csv("enron_anomaly_predictions.csv", index=False)
files.download("enron_anomaly_predictions.csv")

print("✅ Anomaly predictions complete and saved!")



[1m3054/3054[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 2ms/step


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

✅ Anomaly predictions complete and saved!


In [36]:
files.download('/content/enron_anomaly_predictions.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [37]:
pred_df = pd.read_csv('/content/enron_anomaly_predictions.csv')

In [38]:
pred_df.head()

Unnamed: 0,file,message,from,to,cc,bcc,date,subject,cleaned_message,num_to,num_cc,num_bcc,hour,is_off_hours,char_length,word_count,unique_recipient_count,sentiment_polarity,threat_keyword_count,final_anomaly
0,allen-p/_sent_mail/1.,Message-ID: <18782981.1075855378110.JavaMail.e...,phillip.allen@enron.com,tim.belden@enron.com,X-bcc:,"X-Folder: \Phillip_Allen_Jan2002_1\Allen, Phil...",2001-05-14 23:39:00+00:00,Mime-Version: 1.0,messageid javamailevansthyme\ndate mon may ...,1,1,2,23,True,379,42,208,-0.166667,0,False
1,allen-p/_sent_mail/10.,Message-ID: <15464986.1075855378456.JavaMail.e...,phillip.allen@enron.com,john.lavorato@enron.com,X-bcc:,"X-Folder: \Phillip_Allen_Jan2002_1\Allen, Phil...",2001-05-04 20:51:00+00:00,Re:,messageid javamailevansthyme\ndate fri may ...,1,1,2,20,False,1152,180,208,0.152778,0,False
2,allen-p/_sent_mail/100.,Message-ID: <24216240.1075855687451.JavaMail.e...,phillip.allen@enron.com,leah.arsdall@enron.com,X-bcc:,X-Folder: \Phillip_Allen_Dec2000\Notes Folders...,2000-10-18 10:00:00+00:00,Re: test,messageid javamailevansthyme\ndate wed oct ...,1,1,1,10,False,357,42,208,0.291667,0,False
3,allen-p/_sent_mail/1000.,Message-ID: <13505866.1075863688222.JavaMail.e...,phillip.allen@enron.com,randall.gay@enron.com,X-bcc:,X-Folder: \Phillip_Allen_Dec2000\Notes Folders...,2000-10-23 13:13:00+00:00,Mime-Version: 1.0,messageid javamailevansthyme\ndate mon oct ...,1,1,1,13,False,502,69,208,0.125,0,False
4,allen-p/_sent_mail/1001.,Message-ID: <30922949.1075863688243.JavaMail.e...,phillip.allen@enron.com,greg.piper@enron.com,X-bcc:,X-Folder: \Phillip_Allen_Dec2000\Notes Folders...,2000-08-31 12:07:00+00:00,Re: Hello,messageid javamailevansthyme\ndate thu aug ...,1,1,1,12,False,349,41,208,-0.166667,0,False


In [39]:
pred_df.columns

Index(['file', 'message', 'from', 'to', 'cc', 'bcc', 'date', 'subject',
       'cleaned_message', 'num_to', 'num_cc', 'num_bcc', 'hour',
       'is_off_hours', 'char_length', 'word_count', 'unique_recipient_count',
       'sentiment_polarity', 'threat_keyword_count', 'final_anomaly'],
      dtype='object')