In [1]:
import pandas as pd

In [2]:
!pwd

/cpfs01/user/lvhongxia/ML-NN-final/problem5_email_classify


In [3]:
file_path = "/cpfs01/user/lvhongxia/ML-NN-final/problem5_email_classify/enron_spam_data/enron_spam_data.csv"

In [4]:
data = pd.read_csv(file_path, encoding='utf-8')

In [10]:
data['Message'][1]

'gary , production from the high island larger block a - 1 # 2 commenced on\nsaturday at 2 : 00 p . m . at about 6 , 500 gross . carlos expects between 9 , 500 and\n10 , 000 gross for tomorrow . vastar owns 68 % of the gross production .\ngeorge x 3 - 6992\n- - - - - - - - - - - - - - - - - - - - - - forwarded by george weissman / hou / ect on 12 / 13 / 99 10 : 16\nam - - - - - - - - - - - - - - - - - - - - - - - - - - -\ndaren j farmer\n12 / 10 / 99 10 : 38 am\nto : carlos j rodriguez / hou / ect @ ect\ncc : george weissman / hou / ect @ ect , melissa graves / hou / ect @ ect\nsubject : vastar resources , inc .\ncarlos ,\nplease call linda and get everything set up .\ni \' m going to estimate 4 , 500 coming up tomorrow , with a 2 , 000 increase each\nfollowing day based on my conversations with bill fischer at bmar .\nd .\n- - - - - - - - - - - - - - - - - - - - - - forwarded by daren j farmer / hou / ect on 12 / 10 / 99 10 : 34\nam - - - - - - - - - - - - - - - - - - - - - - - - - - 

In [9]:
data

Unnamed: 0,Message ID,Subject,Message,Spam/Ham,Date
0,0,christmas tree farm pictures,,ham,1999-12-10
1,1,"vastar resources , inc .","gary , production from the high island larger ...",ham,1999-12-13
2,2,calpine daily gas nomination,- calpine daily gas nomination 1 . doc,ham,1999-12-14
3,3,re : issue,fyi - see note below - already done .\nstella\...,ham,1999-12-14
4,4,meter 7268 nov allocation,fyi .\n- - - - - - - - - - - - - - - - - - - -...,ham,1999-12-14
...,...,...,...,...,...
33711,33711,= ? iso - 8859 - 1 ? q ? good _ news _ c = eda...,"hello , welcome to gigapharm onlinne shop .\np...",spam,2005-07-29
33712,33712,all prescript medicines are on special . to be...,i got it earlier than expected and it was wrap...,spam,2005-07-29
33713,33713,the next generation online pharmacy .,are you ready to rock on ? let the man in you ...,spam,2005-07-30
33714,33714,bloow in 5 - 10 times the time,learn how to last 5 - 10 times longer in\nbed ...,spam,2005-07-30


In [15]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Step 1: Load and Prepare the Data
data = pd.read_csv(file_path)

# Combine 'Subject' and 'Message' columns into a single 'Content' column
data['Content'] = data['Subject'].fillna('') + ' ' + data['Message'].fillna('')

# Encode the labels ('Spam/Ham') into binary format (0 for Ham, 1 for Spam)
label_encoder = LabelEncoder()
data['Spam/Ham'] = label_encoder.fit_transform(data['Spam/Ham'])

# Extract features (X) and labels (y)
X = data['Content']
y = data['Spam/Ham']

# Step 2: Split the Data into Training and Test Sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Step 3: Text Vectorization using TF-IDF
# Convert the text data into numerical features
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english', ngram_range=(1, 2), sublinear_tf=True, use_idf=True)

X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Step 4: Grid Search to Optimize LDA Hyperparameters
# Define the LDA model
lda = LatentDirichletAllocation(random_state=42)

# Define the parameter grid for LDA
param_grid = {
    'n_components': [10, 15, 20, 25, 30],  # Number of topics
    'learning_method': ['batch', 'online'],  # Learning method
    'max_iter': [10, 50, 100],  # Maximum iterations
    'learning_decay': [0.5, 0.7, 0.9],  # Learning decay
    'learning_offset': [10, 20, 30],  # Learning offset
}

# Create a GridSearchCV object with LDA and the parameter grid
grid_search_lda = GridSearchCV(lda, param_grid, cv=3, n_jobs=-1, verbose=1)

# Fit GridSearchCV on the training data
grid_search_lda.fit(X_train_tfidf)

# Get the best LDA model from grid search
best_lda = grid_search_lda.best_estimator_

print("Best LDA Parameters: ", grid_search_lda.best_params_)

# Step 5: Apply the Best LDA Model to Extract Features
X_train_lda = best_lda.transform(X_train_tfidf)
X_test_lda = best_lda.transform(X_test_tfidf)

# Step 6: Train a Logistic Regression Model for Classification
classifier = LogisticRegression(random_state=42, max_iter=1000)
classifier.fit(X_train_lda, y_train)

# Step 7: Evaluate the Model
# Predict on the test set
y_pred = classifier.predict(X_test_lda)

# Generate a classification report
report = classification_report(y_test, y_pred, target_names=label_encoder.classes_)
print("Classification Report:")
print(report)

Fitting 3 folds for each of 270 candidates, totalling 810 fits
Best LDA Parameters:  {'learning_decay': 0.7, 'learning_method': 'online', 'learning_offset': 30, 'max_iter': 100, 'n_components': 10}
Classification Report:
              precision    recall  f1-score   support

         ham       0.99      0.83      0.90      3309
        spam       0.86      0.99      0.92      3435

    accuracy                           0.91      6744
   macro avg       0.92      0.91      0.91      6744
weighted avg       0.92      0.91      0.91      6744

