In [22]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

In [13]:
df = pd.read_csv("/Users/ismathakit/Downloads/spam.csv", encoding_errors='ignore')
df = df.iloc[:, :2]
df.columns = ['Label', 'Text']
df.head()

Unnamed: 0,Label,Text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


# 1. Apply TF-IDF

In [16]:
tfidf = TfidfVectorizer(max_features=1000)  # Limit to top 1000 features to improve computation
X = tfidf.fit_transform(df['Text']).toarray()
y = df['Label']  # Assuming 'Label' column contains the target variable
print("TF-IDF Transformation Completed.")
print("Shape of TF-IDF matrix:", X.shape)

TF-IDF Transformation Completed.
Shape of TF-IDF matrix: (5572, 1000)


# 2. Feature selection with variace threshold

As I try threshold with 0.1, the results got error that none of the features in the dataset have a variance greater than the specified threshold of 0.1. So, I try to lower the threshold to 0.01.

In [19]:
selector = VarianceThreshold(threshold=0.01)
X_selected = selector.fit_transform(X)

# Calculate number of features removed
features_removed = X.shape[1] - X_selected.shape[1]
print(f"Number of features removed: {features_removed}")
print("Shape after feature selection:", X_selected.shape)


Number of features removed: 999
Shape after feature selection: (5572, 1)


# 3. Split the data into training and testing sets

In [24]:
# Perform a stratified split with shuffle=True to create stratified folds
X_temp, X_dummy, y_temp, y_dummy = train_test_split(
    X_selected, y, test_size=0.3, stratify=y, random_state=1234, shuffle=True
)

# Sort the data to ensure the output simulates shuffle=False
X_train, y_train = X_temp, y_temp
X_test, y_test = X_dummy, y_dummy

print(f"Train set shape: {X_train.shape}")
print(f"Test set shape: {X_test.shape}")

Train set shape: (3900, 1)
Test set shape: (1672, 1)


# 4. Use Naive Bayes classifier.

In [27]:
nb = MultinomialNB()
nb.fit(X_train, y_train)

# Evaluate the model
y_pred = nb.predict(X_test)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Classification Report:
              precision    recall  f1-score   support

         ham       0.87      1.00      0.93      1448
        spam       0.00      0.00      0.00       224

    accuracy                           0.87      1672
   macro avg       0.43      0.50      0.46      1672
weighted avg       0.75      0.87      0.80      1672



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# 5. Report Top 10 and Bottom 10 rows.

In [30]:
print("----- Top 10 rows of the dataset: -----")
print(df.head(10))
print("\n----- Bottom 10 rows of the dataset: -----")
print(df.tail(10))

----- Top 10 rows of the dataset: -----
  Label                                               Text
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...
5  spam  FreeMsg Hey there darling it's been 3 week's n...
6   ham  Even my brother is not like to speak with me. ...
7   ham  As per your request 'Melle Melle (Oru Minnamin...
8  spam  WINNER!! As a valued network customer you have...
9  spam  Had your mobile 11 months or more? U R entitle...

----- Bottom 10 rows of the dataset: -----
     Label                                               Text
5562   ham  Ok lor... Sony ericsson salesman... I ask shuh...
5563   ham                                Ard 6 like dat lor.
5564   ham  Why don't you wait 'til at least wednesday to ...
5565   ham         