In [None]:
# --- CELL 1: Imports and Setup ---
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud, STOPWORDS
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer # Changed from TfidfVectorizer based on later screenshots
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB



In [None]:
# --- CELL 2: Load Data ---
# Expects 'resume.csv' in the notebook's working directory; the file is
# decoded as latin-1.
resume_df = pd.read_csv('resume.csv', encoding='latin-1')
# Heading and preview go through a single print call, one item per line.
print("Initial Data Head:", resume_df.head(), sep="\n")


In [None]:
# --- CELL 3: Initial Exploration and Conversion ---
# Report missing values per column and the label distribution, then turn the
# 'class' column into a numeric target.
print("\nNull Values:")
print(resume_df.isnull().sum())
print("\nClass Value Counts Before Conversion:")
print(resume_df['class'].value_counts())  # 'not_flagged' / 'flagged' on the first run

# Convert 'class' column to numerical: 1 for flagged, 0 for anything else.
# An already-converted 1 is also accepted so that re-running this cell leaves
# the labels unchanged; comparing against 'flagged' alone would turn every 1
# into 0 on the second run and erase the positive class.
resume_df['class'] = resume_df['class'].apply(lambda x: 1 if x in ('flagged', 1) else 0)
print("\nClass Value Counts After Conversion:")
print(resume_df['class'].value_counts())

In [None]:
# --- CELL 4: Stopwords Preparation ---
nltk.download("stopwords")
# NLTK's English stop words, followed by extra non-content words that are
# common in this kind of text data.
stop_words = stopwords.words('english') + [
    'from', 'subject', 're', 'edu', 'use', 'email', 'com',
]

In [None]:
# --- CELL 5: Define Preprocessing Function ---
def preprocess(text):
    """Tokenize one document and drop stop words and very short tokens.

    Tokenization is delegated to ``gensim.utils.simple_preprocess``. A token
    is kept only if it is longer than two characters and appears neither in
    gensim's built-in ``STOPWORDS`` nor in the notebook-level ``stop_words``
    list.

    Parameters
    ----------
    text : str or None
        Raw document text. ``None`` and float NaN (the value pandas uses for
        an empty CSV field) are treated as an empty document.

    Returns
    -------
    str
        The surviving tokens joined by single spaces; ``''`` when no token
        survives or the input is missing.
    """
    # Short-circuit missing values instead of handing a non-text object to
    # the tokenizer. NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    # Merge both stop-word sources into one set per call so each token costs
    # a single hash lookup rather than a linear scan of the stop_words list.
    excluded = gensim.parsing.preprocessing.STOPWORDS.union(stop_words)

    return ' '.join(
        token
        for token in gensim.utils.simple_preprocess(text)
        if len(token) > 2 and token not in excluded
    )

In [None]:
# --- CELL 6: Apply Preprocessing ---
# Store the cleaned version of every resume in a new 'cleaned' column.
resume_df['cleaned'] = resume_df['resume_text'].map(preprocess)

# Show row 0 after and before cleaning as a quick sanity check.
for heading, column in (
    ("Cleaned Text (Sample 0):\n", 'cleaned'),
    ("\nOriginal Text (Sample 0):\n", 'resume_text'),
):
    print(heading, resume_df[column][0])

In [None]:
# --- CELL 7: Word Cloud for Class 0 (Not Flagged) ---
plt.figure(figsize=(10, 10))
# Cleaned text of every resume whose label is 0.
subset = resume_df.loc[resume_df['class'] == 0]
text = subset['cleaned'].values

# Rendering options gathered in one place, then unpacked into WordCloud.
cloud_options = dict(
    stopwords=stop_words,
    background_color='black',
    collocations=False,
    max_words=2000,
    width=1600,
    height=800,
)
# All documents are merged into one space-separated corpus for the cloud.
cloud_0 = WordCloud(**cloud_options).generate(" ".join(text))

plt.axis('off')
plt.title('Wordcloud for Not Flagged Resumes', fontsize=20)
plt.imshow(cloud_0)
plt.show()

In [None]:
# --- CELL 8: Word Cloud for Class 1 (Flagged) ---
plt.figure(figsize=(10, 10))
# Cleaned text of every resume whose label is 1.
subset = resume_df.loc[resume_df['class'] == 1]
text = subset['cleaned'].values

# Rendering options gathered in one place, then unpacked into WordCloud.
cloud_options = dict(
    stopwords=stop_words,
    background_color='black',
    collocations=False,
    max_words=2000,
    width=1600,
    height=800,
)
# All documents are merged into one space-separated corpus for the cloud.
cloud_1 = WordCloud(**cloud_options).generate(" ".join(text))

plt.axis('off')
plt.title('Wordcloud for Flagged Resumes', fontsize=20)
plt.imshow(cloud_1)
plt.show()

In [None]:
# --- CELL 9: Vectorization (Count Vectorizer) ---
# Target labels come straight from the numeric 'class' column.
y = resume_df['class']
# Bag-of-words features: one column per vocabulary term, raw term counts.
# NOTE(review): the vocabulary is learned from every row here, before the
# train/test split in the following cell, so terms that occur only in test
# documents still get a feature column. Confirm this is intended.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(resume_df['cleaned'])

In [None]:
# --- CELL 10: Train-Test Split ---
# Hold out 20% of the rows for testing; random_state is fixed at 42.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Row counts of the two feature matrices, one per line.
print(
    f"Training Samples: {X_train.shape[0]}",
    f"Testing Samples: {X_test.shape[0]}",
    sep="\n",
)

In [None]:
# --- CELL 12: Model Training, Prediction and Evaluation ---
# No earlier cell in this notebook creates `nb`, so predicting straight away
# raises NameError after Restart & Run All. Build and fit the Multinomial
# Naive Bayes classifier on the training split here.
nb = MultinomialNB()
nb.fit(X_train, y_train)

# Predict labels for the held-out split.
y_pred_nb = nb.predict(X_test)

print("--- Classification Report (Naive Bayes) ---")
print(classification_report(y_test, y_pred_nb))

# Plot Confusion Matrix
# labels=[0, 1] pins the matrix to 2x2 in a fixed order so it always lines up
# with the two hard-coded tick labels, even if one class is absent from the
# test split or from the predictions.
plt.figure(figsize=(8, 6))
cm = confusion_matrix(y_test, y_pred_nb, labels=[0, 1])
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False,
            xticklabels=['Not Flagged (0)', 'Flagged (1)'],
            yticklabels=['Not Flagged (0)', 'Flagged (1)'])
plt.title('Confusion Matrix for Naive Bayes')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()