In [1]:
import pandas as pd
import numpy as np
import sklearn
data = {
    "subject": [
        "Win a free vacation",
        "Your receipt from Bookstore",
        "Limited time offer",
        "Meeting agenda",
        "Update your account info",
        "Family photos inside",
        "Exclusive promo just for you",
        "Weekly project status"
    ],
    "label": [
        "spam",
        "not_spam",
        "spam",
        "not_spam",
        "spam",
        "not_spam",
        "spam",
        "not_spam"
    ],
    "sender_domain": [
        "promos.biz",
        "gmail.com",
        "offers.mail",
        "company.com",
        "alerts.bank",
        "yahoo.com",
        "promos.biz",
        "company.com"
    ],
    "num_links": [5, 0, 7, 1, np.nan, 0, 4, 1],
    "num_words": [40, 120, 35, 80, 60, 100, 50, 90],
    "received_date": [
        "2024-01-15",
        "2024-01-18",
        "2024-02-02",
        "2024-02-10",
        "2024-03-05",
        "2024-03-08",
        "2024-03-20",
        "2024-04-01"
    ]
}


In [2]:
df = pd.DataFrame(data)
df

Unnamed: 0,subject,label,sender_domain,num_links,num_words,received_date
0,Win a free vacation,spam,promos.biz,5.0,40,2024-01-15
1,Your receipt from Bookstore,not_spam,gmail.com,0.0,120,2024-01-18
2,Limited time offer,spam,offers.mail,7.0,35,2024-02-02
3,Meeting agenda,not_spam,company.com,1.0,80,2024-02-10
4,Update your account info,spam,alerts.bank,,60,2024-03-05
5,Family photos inside,not_spam,yahoo.com,0.0,100,2024-03-08
6,Exclusive promo just for you,spam,promos.biz,4.0,50,2024-03-20
7,Weekly project status,not_spam,company.com,1.0,90,2024-04-01


In [3]:
df["is_spam"] = df["label"].map({"spam":1, "not_spam":0})
df[["label", "is_spam"]]

Unnamed: 0,label,is_spam
0,spam,1
1,not_spam,0
2,spam,1
3,not_spam,0
4,spam,1
5,not_spam,0
6,spam,1
7,not_spam,0


In [4]:
# Converting Dummy Data
df = pd.get_dummies(df, columns=["sender_domain"], prefix="domain", dtype="int")
df

Unnamed: 0,subject,label,num_links,num_words,received_date,is_spam,domain_alerts.bank,domain_company.com,domain_gmail.com,domain_offers.mail,domain_promos.biz,domain_yahoo.com
0,Win a free vacation,spam,5.0,40,2024-01-15,1,0,0,0,0,1,0
1,Your receipt from Bookstore,not_spam,0.0,120,2024-01-18,0,0,0,1,0,0,0
2,Limited time offer,spam,7.0,35,2024-02-02,1,0,0,0,1,0,0
3,Meeting agenda,not_spam,1.0,80,2024-02-10,0,0,1,0,0,0,0
4,Update your account info,spam,,60,2024-03-05,1,1,0,0,0,0,0
5,Family photos inside,not_spam,0.0,100,2024-03-08,0,0,0,0,0,0,1
6,Exclusive promo just for you,spam,4.0,50,2024-03-20,1,0,0,0,0,1,0
7,Weekly project status,not_spam,1.0,90,2024-04-01,0,0,1,0,0,0,0


In [5]:
# Handling Null Values
numlinks_median = df["num_links"].mean()
df["num_links"] = df["num_links"].fillna(numlinks_median)
df

Unnamed: 0,subject,label,num_links,num_words,received_date,is_spam,domain_alerts.bank,domain_company.com,domain_gmail.com,domain_offers.mail,domain_promos.biz,domain_yahoo.com
0,Win a free vacation,spam,5.0,40,2024-01-15,1,0,0,0,0,1,0
1,Your receipt from Bookstore,not_spam,0.0,120,2024-01-18,0,0,0,1,0,0,0
2,Limited time offer,spam,7.0,35,2024-02-02,1,0,0,0,1,0,0
3,Meeting agenda,not_spam,1.0,80,2024-02-10,0,0,1,0,0,0,0
4,Update your account info,spam,2.571429,60,2024-03-05,1,1,0,0,0,0,0
5,Family photos inside,not_spam,0.0,100,2024-03-08,0,0,0,0,0,0,1
6,Exclusive promo just for you,spam,4.0,50,2024-03-20,1,0,0,0,0,1,0
7,Weekly project status,not_spam,1.0,90,2024-04-01,0,0,1,0,0,0,0
