Natural Language
 Processing (NLP) - Text
 Classification:

Description: Classify text data into categories (e.g.,
 spam vs. non-spam, sentiment analysis)

In [9]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

Using the Initial Sentiment Analysis Data

In [14]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import nltk

# Download NLTK data files (will only run if they are not already downloaded)
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('punkt_tab')

# 1. Data Loading and Preprocessing
# Load the dataset from your file
try:
    df = pd.read_csv('/content/3) Sentiment dataset.csv')
except FileNotFoundError:
    print("Error: 'sentiment_data.csv' not found. Please check the file name and path.")
    exit()

# Define the columns to use based on your image
text_column = 'Text'
label_column = 'Sentiment'

# Drop the 'Unnamed: 0' column if it exists, as it's an index
if 'Unnamed: 0' in df.columns:
    df = df.drop('Unnamed: 0', axis=1)

# Preprocessing functions
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    tokens = word_tokenize(text)
    processed_tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(processed_tokens)

# Apply preprocessing to the text column
df['processed_text'] = df[text_column].apply(preprocess_text)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    df['processed_text'],
    df[label_column],
    test_size=0.2,
    random_state=42
)

# 2. Feature Extraction (TF-IDF)
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# 3. Model Training
model = MultinomialNB()
model.fit(X_train_tfidf, y_train)

# 4. Evaluation
y_pred = model.predict(X_test_tfidf)
print("Accuracy:", accuracy_score(y_test, y_pred))

# --- FIXED: Added the zero_division parameter to remove the warning ---
print("\nClassification Report:\n", classification_report(y_test, y_pred, zero_division=1))

Accuracy: 0.10884353741496598

Classification Report:
                         precision    recall  f1-score   support

         Acceptance          1.00      0.00      0.00         2
           Admiration        1.00      0.00      0.00         1
        Admiration           1.00      0.00      0.00         1
         Affection           1.00      0.00      0.00         1
      Ambivalence            1.00      0.00      0.00         1
         Anger               1.00      0.00      0.00         1
        Anticipation         1.00      0.00      0.00         1
        Arousal              1.00      0.00      0.00         3
                  Awe        1.00      0.00      0.00         1
         Awe                 1.00      0.00      0.00         1
                  Bad        1.00      0.00      0.00         1
             Betrayal        1.00      0.00      0.00         2
        Betrayal             1.00      0.00      0.00         1
         Bitter              1.00      0.00     

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Using the Email Phishing Dataset

In [7]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier # A good choice for this type of data
from sklearn.metrics import classification_report, accuracy_score

# 1. Data Loading
# Load the dataset
df = pd.read_csv('/content/email_phishing_data.csv')

# 2. Data Preparation
# Based on your image, the 'label' column is the target (what we want to predict).
# All other columns are the features (what we will use to predict).
X = df.drop('label', axis=1) # Features are all columns EXCEPT 'label'
y = df['label']              # Target is the 'label' column

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42
)

# 3. Model Training
# We'll use a RandomForestClassifier, which is effective for this type of data.
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# 4. Evaluation
# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model's performance
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.9897685052872249

Classification Report:
               precision    recall  f1-score   support

           0       0.99      1.00      0.99    103573
           1       0.85      0.28      0.42      1397

    accuracy                           0.99    104970
   macro avg       0.92      0.64      0.71    104970
weighted avg       0.99      0.99      0.99    104970

