In [None]:
import pandas as pd

data_path = "./data/SMSSpamCollection.csv"

df = pd.read_csv(
    data_path,
    sep="\t",          # tab-separated values
    header=None,       # no header row
    names=["label", "text"]
)

print(df.head())
print(df.label.unique())

lbl = {
    "ham":0,
    "spam":1
}

df['label'] = df["label"].map(lbl)

print(df.head())

print(f"\nDataset shape: {df.shape}")
print(f"Spam percentage: {(df['label'].sum() / len(df) * 100):.2f}%")

print(f"\nMissing values:\n{df.isnull().sum()}")
# print(f"\nMissing values:\n{df.isnull()[2:25]}")

X = df['text']
y= df['label']

print(X.head())
print(y.head())

  label                                               text
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...
<StringArray>
['ham', 'spam']
Length: 2, dtype: str
   label                                               text
0      0  Go until jurong point, crazy.. Available only ...
1      0                      Ok lar... Joking wif u oni...
2      1  Free entry in 2 a wkly comp to win FA Cup fina...
3      0  U dun say so early hor... U c already then say...
4      0  Nah I don't think he goes to usf, he lives aro...

Dataset shape: (5572, 2)
Spam percentage: 13.41%

Missing values:
label    0
text     0
dtype: int64
0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

texts = ["hello world", "machine learning is fun", "hello python"]

# Convert to numbers
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(texts)  # Now a numerical matrix
print(X.toarray())

[[0 1 0 0 0 0 1]
 [1 0 1 1 1 0 0]
 [0 1 0 0 0 1 0]]


In [56]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(
    lowercase=True,
    stop_words="english"
)
print(df['text'].head())
X_vec = vectorizer.fit_transform(df['text'])

print(X_vec)

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
Name: text, dtype: str
<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 43578 stored elements and shape (5572, 8444)>
  Coords	Values
  (0, 4248)	0.35097479377088364
  (0, 5779)	0.27451666535585145
  (0, 2282)	0.27180581777101714
  (0, 1284)	0.26251769953088055
  (0, 1719)	0.29650492406235857
  (0, 3555)	0.19387866945820545
  (0, 8285)	0.23740715800944148
  (0, 4374)	0.29650492406235857
  (0, 1717)	0.3350433781715565
  (0, 2007)	0.29650492406235857
  (0, 3515)	0.16453831818791093
  (0, 1064)	0.35097479377088364
  (0, 8083)	0.1961033223643189
  (1, 5377)	0.2718944069420321
  (1, 4410)	0.4083258549263009
  (1, 4216)	0.5236804332035243
  (1, 8191)	0.43162957585464123
  (1, 5403)	0.5466243141314314
  (2, 32

In [None]:
#REGESSION TRAINING
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline


# Create pipeline: Text → Numbers → Model
pipeline = make_pipeline(
    TfidfVectorizer(max_features=1000),  # Text to numbers
    LogisticRegression()                  # Model that uses numbers
)

# Train
pipeline.fit(df['text'],df['label'])

# Predict
new_text = ["Free entry in 2 a wkly comp to win FA Cup fina..."]
prediction = pipeline.predict(new_text)  # 1 (spam)

pred_label = [k for k, v in lbl.items() if v == prediction[0]]

print(pred_label[0])
print(prediction[0])

ham
0
