In [35]:
import pandas as pd

# 10 sample long emails across different sectors
Emails = [
    """Subject: Cybersecurity Update - Phishing Attempt Blocked
Dear User,
Our systems have successfully blocked a phishing attempt targeting your account.
We recommend staying vigilant against suspicious links and updating your password regularly.
For more details, please review the security guidelines in your dashboard.
Stay safe,
Cybersecurity Operations Team""",

    """Subject: Automotive & Transportation - Autonomous Vehicle Trials
Dear Customer,
We are thrilled to announce the commencement of our autonomous vehicle road trials.
The project focuses on advanced safety, eco-friendly design, and AI-driven navigation systems.
Your feedback will play an important role in shaping the future of mobility.
Warm regards,
Automotive Innovation Team""",

    """Subject: Food & Beverages - Health Drink Launch
Dear Valued Customer,
Introducing our new line of fortified energy drinks with natural vitamins and minerals.
Perfect for busy lifestyles, these beverages combine great taste with proven health benefits.
Join us at our stores for free samples and introductory offers this weekend.
Best wishes,
Healthy Beverages Team""",

    """Subject: BFSI - Credit Card Upgrade Offer
Dear Client,
You are eligible for a complimentary upgrade to our Platinum Credit Card with exclusive travel rewards.
Enjoy benefits like lounge access, higher cashback, and complimentary insurance coverage.
Apply through our website or mobile app before the end of this billing cycle.
Sincerely,
Your Banking Partner""",

    """Subject: Education & EdTech - Scholarship Opportunity
Dear Student,
We are excited to inform you about our new merit-based scholarship program.
This initiative covers up to 80% of tuition fees for students with outstanding academic performance.
Visit our portal to apply before the scholarship deadline.
Warm regards,
Admissions Office""",

    """Subject: Healthcare - Free Annual Health Checkup
Dear Member,
As part of our wellness program, you are entitled to a free annual health checkup.
This includes diagnostic tests, doctor consultations, and personalized health reports.
Please schedule your appointment via our online health portal.
Stay healthy,
Healthcare Services""",

    """Subject: Retail & E-Commerce - Exclusive Festive Sale
Dear Shopper,
Celebrate this season with our biggest sale of the year!
Avail up to 70% discount on electronics, apparel, and home essentials.
Hurry, offers valid only till stocks last.
Happy Shopping,
Retail Deals Team""",

    """Subject: Travel & Hospitality - Holiday Package Deals
Dear Traveler,
Plan your perfect getaway with our exclusive holiday packages to Europe and Southeast Asia.
Enjoy discounted rates, guided tours, and luxury stays with our trusted partners.
Book now and make memories that last a lifetime.
Bon Voyage,
Travel Experts""",

    """Subject: Real Estate - New Housing Project Launch
Dear Investor,
We are delighted to introduce our latest premium housing project in the heart of the city.
The development includes modern apartments, smart home features, and eco-friendly designs.
Contact our sales team to schedule a site visit today.
Sincerely,
Real Estate Developers""",

    """Subject: IT & Software - Cloud Migration Webinar
Dear Professional,
Join our upcoming webinar on best practices for cloud migration and data security.
Industry experts will share insights on cost optimization, scalability, and compliance.
Register now to reserve your spot in this interactive session.
Best regards,
IT Solutions Team"""
]

# Create DataFrame
df = pd.DataFrame({"Emails": Emails})

# Save to CSV
df.to_csv("/content/sample_emails.csv", index=False)
print("✅ Sample CSV created at /content/sample_emails_10.csv")
print(df.head())


✅ Sample CSV created at /content/sample_emails_10.csv
                                              Emails
0  Subject: Cybersecurity Update - Phishing Attem...
1  Subject: Automotive & Transportation - Autonom...
2  Subject: Food & Beverages - Health Drink Launc...
3  Subject: BFSI - Credit Card Upgrade Offer\nDea...
4  Subject: Education & EdTech - Scholarship Oppo...


In [39]:
ad = pd.read_csv("/content/predicted_sectors.csv")
ad.head(10)

Unnamed: 0,Emails,Cleaned_Emails,Predicted_Sector
0,Subject: Cybersecurity Update - Phishing Attem...,subject cybersecurity update phishing attempt ...,Cybersecurity
1,Subject: Automotive & Transportation - Autonom...,subject automotive transportation autonomous v...,Automotive & Transportation
2,Subject: Food & Beverages - Health Drink Launc...,subject food beverages health drink launch dea...,Food & Beverages
3,Subject: BFSI - Credit Card Upgrade Offer\nDea...,subject bfsi credit card upgrade offer dear cl...,"Banking, Financial Services & Insurance (BFSI)"
4,Subject: Education & EdTech - Scholarship Oppo...,subject education edtech scholarship opportuni...,Education & EdTech
5,Subject: Healthcare - Free Annual Health Check...,subject healthcare free annual health checkup ...,Healthcare & Pharmaceuticals
6,Subject: Retail & E-Commerce - Exclusive Festi...,subject retail e commerce exclusive festive sa...,E-Commerce & Retail
7,Subject: Travel & Hospitality - Holiday Packag...,subject travel hospitality holiday package dea...,Travel & Hospitality
8,Subject: Real Estate - New Housing Project Lau...,subject real estate new housing project launch...,Construction & Real Estate
9,Subject: IT & Software - Cloud Migration Webin...,subject it software cloud migration webinar de...,Cloud Computing & Data Centers


In [None]:
# Install the correct versions in Colab first
!pip install scikit-learn==1.6.1 numpy==1.26.4 joblib==1.3.2

In [1]:


import pandas as pd
import re
import joblib
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, f1_score, classification_report
import numpy as np

# Print versions for verification
import sklearn
print(f"Scikit-learn version: {sklearn.__version__}")
print(f"NumPy version: {np.__version__}")
print(f"Joblib version: {joblib.__version__}")

# -----------------------------
# Step 1: Load dataset
# -----------------------------
df = pd.read_csv("/content/email_dataset.csv")
print(f"Dataset loaded: {len(df)} rows")

# -----------------------------
# Step 2: Clean text
# -----------------------------
def clean_text(text: str) -> str:
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = re.sub(r"http\S+|www\S+", " ", text)       # remove links
    text = re.sub(r"<.*?>", " ", text)                # remove html tags
    text = re.sub(r"[^a-z\s]", " ", text)             # keep only letters
    text = re.sub(r"\s+", " ", text).strip()
    return text

df["Cleaned_Emails"] = df["Emails"].apply(clean_text)
df = df.dropna(subset=["Sector"]).reset_index(drop=True)
print(f"After cleaning: {len(df)} rows")

# -----------------------------
# Step 3: Encode labels
# -----------------------------
le = LabelEncoder()
df["label"] = le.fit_transform(df["Sector"])
classes = list(le.classes_)
print("Classes:", classes)
print(f"Number of classes: {len(classes)}")

# -----------------------------
# Step 4: Train/test split
# -----------------------------
X_text = df["Cleaned_Emails"]
y = df["label"]

X_train_text, X_test_text, y_train, y_test = train_test_split(
    X_text, y, test_size=0.2, random_state=42, stratify=y
)
print(f"Training set: {len(X_train_text)} samples")
print(f"Test set: {len(X_test_text)} samples")

# -----------------------------
# Step 5: TF-IDF
# -----------------------------
vectorizer = TfidfVectorizer(
    max_features=20000,
    stop_words="english",
    ngram_range=(1,2),
    min_df=2,  # Add minimum document frequency
    max_df=0.95  # Add maximum document frequency
)
X_train = vectorizer.fit_transform(X_train_text)
X_test = vectorizer.transform(X_test_text)
print(f"TF-IDF features: {X_train.shape[1]}")

# -----------------------------
# Step 6: Train model (SVM)
# -----------------------------
clf = LinearSVC(random_state=42, max_iter=2000)  # Increase max_iter
clf.fit(X_train, y_train)
print("✅ Model training completed!")

# -----------------------------
# Step 7: Evaluate
# -----------------------------
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average="weighted")

print(f"Accuracy: {accuracy:.4f}")
print(f"F1-weighted: {f1:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=classes))

# -----------------------------
# Step 8: Save the trained model and components
# -----------------------------
# Create a directory to save model files
os.makedirs('/content/email_sector_model', exist_ok=True)

# Save with protocol 4 for better compatibility
joblib.dump(clf, '/content/email_sector_model/model.pkl', protocol=4)
joblib.dump(vectorizer, '/content/email_sector_model/vectorizer.pkl', protocol=4)
joblib.dump(le, '/content/email_sector_model/label_encoder.pkl', protocol=4)

# Save metadata
metadata = {
    'sklearn_version': sklearn.__version__,
    'numpy_version': np.__version__,
    'joblib_version': joblib.__version__,
    'accuracy': accuracy,
    'f1_score': f1,
    'num_classes': len(classes),
    'classes': classes,
    'num_features': X_train.shape[1],
    'training_samples': len(X_train_text),
    'test_samples': len(X_test_text)
}

import json
with open('/content/email_sector_model/metadata.json', 'w') as f:
    json.dump(metadata, f, indent=2)

# Save the classes list for reference
with open('/content/email_sector_model/classes.txt', 'w') as f:
    for class_name in classes:
        f.write(f"{class_name}\n")

print("\n✅ Model saved successfully!")
print("Files saved:")
print("- /content/email_sector_model/model.pkl")
print("- /content/email_sector_model/vectorizer.pkl")
print("- /content/email_sector_model/label_encoder.pkl")
print("- /content/email_sector_model/metadata.json")
print("- /content/email_sector_model/classes.txt")

# Zip the model folder for easy download
import shutil
shutil.make_archive('/content/email_sector_model', 'zip', '/content/email_sector_model')
print("\n📦 Model files zipped: /content/email_sector_model.zip")

# Test prediction function
def predict_sector(email_text):
    cleaned_email = clean_text(email_text)
    email_vector = vectorizer.transform([cleaned_email])
    prediction = clf.predict(email_vector)[0]
    predicted_sector = le.inverse_transform([prediction])[0]
    return predicted_sector

# Test with a few sample emails
test_emails = [
    "Hello, we are looking for software developers with Python experience.",
    "Our hospital is seeking qualified nurses for the emergency department.",
    "Join our sales team and earn great commissions selling cars!",
    "We provide investment advisory services and financial planning."
]

print("\n🧪 Testing predictions:")
for email in test_emails:
    predicted = predict_sector(email)
    print(f"'{email[:50]}...' -> {predicted}")

# Download instructions
print("\n📥 To download the model files:")
print("from google.colab import files")
print("files.download('/content/email_sector_model.zip')")

Scikit-learn version: 1.6.1
NumPy version: 1.26.4
Joblib version: 1.3.2
Dataset loaded: 17460 rows
After cleaning: 17460 rows
Classes: ['Artificial Intelligence & Machine Learning', 'Automotive & Transportation', 'Banking, Financial Services & Insurance (BFSI)', 'Cloud Computing & Data Centers', 'Construction & Real Estate', 'Consulting & Professional Services', 'Cybersecurity', 'E-Commerce & Retail', 'Education & EdTech', 'Energy (Oil, Gas, Renewable, Power)', 'Fashion & Apparel', 'FinTech', 'Food & Beverages', 'Healthcare & Pharmaceuticals', 'Human Resources & Recruitment', 'Information Technology (IT) & Software', 'Legal & Law Firms', 'Logistics & Supply Chain', 'Manufacturing & Industrial Goods', 'Marketing, Advertising & PR', 'Media, Entertainment & Publishing', 'Non-Profit Organizations (NGOs)', 'Telecommunications', 'Travel & Hospitality']
Number of classes: 24
Training set: 13968 samples
Test set: 3492 samples
TF-IDF features: 3423
✅ Model training completed!
Accuracy: 1.0000
F

In [42]:
import gradio as gr
import pandas as pd
import joblib
import re
import os
from io import StringIO

# Load the trained model and components
model = joblib.load('/content/email_sector_model/model.pkl')
vectorizer = joblib.load('/content/email_sector_model/vectorizer.pkl')
label_encoder = joblib.load('/content/email_sector_model/label_encoder.pkl')

# Text cleaning function (same as training)
def clean_text(text: str) -> str:
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = re.sub(r"http\S+|www\S+", " ", text)       # remove links
    text = re.sub(r"<.*?>", " ", text)                # remove html tags
    text = re.sub(r"[^a-z\s]", " ", text)             # keep only letters
    text = re.sub(r"\s+", " ", text).strip()
    return text

# Prediction function for single email
def predict_single_email(email_text):
    if not email_text.strip():
        return "Please enter an email text."

    try:
        # Clean the email text
        cleaned_email = clean_text(email_text)

        # Vectorize the email
        email_vector = vectorizer.transform([cleaned_email])

        # Make prediction
        prediction = model.predict(email_vector)[0]
        predicted_sector = label_encoder.inverse_transform([prediction])[0]

        # Get prediction probabilities (if supported by the model)
        try:
            probabilities = model.decision_function(email_vector)[0]
            # Get top 3 predictions
            top_indices = probabilities.argsort()[-3:][::-1]
            top_sectors = label_encoder.inverse_transform(top_indices)
            top_scores = probabilities[top_indices]

            result = f"**Predicted Sector: {predicted_sector}**\n\n"
            result += "**Top 3 Predictions:**\n"
            for i, (sector, score) in enumerate(zip(top_sectors, top_scores)):
                result += f"{i+1}. {sector}: {score:.3f}\n"

        except:
            result = f"**Predicted Sector: {predicted_sector}**"

        return result

    except Exception as e:
        return f"Error making prediction: {str(e)}"

# Prediction function for CSV file
def predict_csv_file(file):
    if file is None:
        return None, "Please upload a CSV file."

    try:
        # Read the uploaded CSV file
        df = pd.read_csv(file.name)

        # Check if 'Emails' column exists
        if 'Emails' not in df.columns:
            return None, "Error: CSV file must contain an 'Emails' column."

        # Clean the email texts
        df['Cleaned_Emails'] = df['Emails'].apply(clean_text)

        # Make predictions
        email_vectors = vectorizer.transform(df['Cleaned_Emails'])
        predictions = model.predict(email_vectors)
        predicted_sectors = label_encoder.inverse_transform(predictions)

        # Add predictions to dataframe
        df['Predicted_Sector'] = predicted_sectors

        # Create output CSV
        output_csv = StringIO()
        df[['Emails', 'Predicted_Sector']].to_csv(output_csv, index=False)
        output_csv.seek(0)

        # Save to temporary file for download
        output_filename = "predictions.csv"
        df[['Emails', 'Predicted_Sector']].to_csv(output_filename, index=False)

        success_msg = f"✅ Successfully processed {len(df)} emails. Download the results below."

        return output_filename, success_msg

    except Exception as e:
        return None, f"Error processing CSV file: {str(e)}"

# Get available sectors for display
available_sectors = list(label_encoder.classes_)
sectors_text = ", ".join(available_sectors)

# Create Gradio interface
with gr.Blocks(title="Email Sector Classification", theme=gr.themes.Soft()) as demo:

    gr.Markdown("# 📧 Email Sector Classification")
    gr.Markdown("Classify emails into business sectors using machine learning.")

    gr.Markdown(f"**Available Sectors:** {sectors_text}")

    with gr.Tabs():
        # Tab 1: Single Email Prediction
        with gr.Tab("Single Email Prediction"):
            gr.Markdown("### Enter an email to classify its sector")

            with gr.Row():
                with gr.Column(scale=2):
                    email_input = gr.Textbox(
                        label="Email Text",
                        placeholder="Enter your email content here...",
                        lines=8,
                        max_lines=15
                    )
                    predict_btn = gr.Button("Predict Sector", variant="primary", size="lg")

                with gr.Column(scale=1):
                    prediction_output = gr.Markdown(label="Prediction Result")

            predict_btn.click(
                fn=predict_single_email,
                inputs=email_input,
                outputs=prediction_output
            )

            # Example emails
            gr.Markdown("### Example Emails")
            examples = [
                "We are looking for experienced software developers to join our tech team. Requirements include Python, JavaScript, and cloud technologies.",
                "Our hospital is seeking qualified nurses for the emergency department. Must have current RN license and BLS certification.",
                "Join our sales team! We offer competitive commission rates and comprehensive training for motivated individuals.",
                "We provide comprehensive financial planning services including investment management and retirement planning."
            ]

            gr.Examples(
                examples=examples,
                inputs=email_input,
                outputs=prediction_output,
                fn=predict_single_email,
                cache_examples=True
            )

        # Tab 2: CSV File Upload
        with gr.Tab("Batch CSV Processing"):
            gr.Markdown("### Upload a CSV file with emails to classify")
            gr.Markdown("**CSV Format:** Your file should have an 'Emails' column containing the email texts.")

            with gr.Row():
                with gr.Column():
                    file_input = gr.File(
                        label="Upload CSV File",
                        file_types=[".csv"],
                        file_count="single"
                    )
                    process_btn = gr.Button("Process CSV", variant="primary", size="lg")

                with gr.Column():
                    file_status = gr.Markdown()
                    download_file = gr.File(label="Download Results", visible=False)

            def process_and_update(file):
                result_file, status = predict_csv_file(file)
                if result_file:
                    return status, gr.File(value=result_file, visible=True)
                else:
                    return status, gr.File(visible=False)

            process_btn.click(
                fn=process_and_update,
                inputs=file_input,
                outputs=[file_status, download_file]
            )

            # CSV format example
            gr.Markdown("### CSV Format Example")
            gr.Markdown("""
            ```
            Emails
            "We are hiring software engineers with Python experience"
            "Our clinic needs registered nurses for patient care"
            "Looking for sales representatives in the automotive industry"
            ```
            """)

    gr.Markdown("---")
    gr.Markdown("*Powered by scikit-learn and Gradio*")

# Launch the app
if __name__ == "__main__":
    demo.launch()

Caching examples at: '/content/.gradio/cached_examples/15'
It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://fc5afd4b2f36fdf7ad.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
