# Spam classification 


In [1]:
import numpy as np
import pandas as pd

In [7]:
# Read a CSV file named "words_list.csv" into a pandas DataFrame and 
# squeeze it into a pandas Series
words = pd.read_csv("words_list.csv").squeeze()

In [8]:
words

0       the
1        be
2       and
3        of
4         a
       ... 
5043      🚀
5044      🌟
5045      💥
5046      🎊
5047      💪
Name: 0, Length: 5048, dtype: object

In [12]:
# Read data from the "comments.csv" file into a pandas DataFrame
comments = pd.read_csv("comments.csv")

In [21]:
# Load preprocessed data from the file "comments_data.csv", ready for further analysis
comments_processed = pd.read_csv("comments_data.csv")

In [22]:
comments_processed = comments_processed.loc[:, ~comments_processed.columns.duplicated()]
comments_processed

Unnamed: 0,the,be,and,of,a,in,to,have,to.1,it,...,😎,🙌,👏,✨,🚀,🌟,💥,🎊,💪,target_
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,1,0,1,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,1
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2088,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2089,1,0,1,0,1,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,1
2090,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2091,0,0,1,0,1,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,1


In [24]:
def message_to_words(message):
    """
    The message_to_words(message) function accepts a string as input. 
    It splits the input string into individual words, 
    converts each word to lowercase, and returns a list of these lowercase words.
    """
    return message.lower().split()

In [27]:
def count_uppercase(strings):
    uppercase_counts = []
    for string in strings:
        uppercase_count = sum(1 for char in string if char.isupper())
        uppercase_counts.append(uppercase_count)
    return sum(uppercase_counts)

In [28]:
def make_rows(messages, words=words):
    """
    The make_rows(messages, words) function accepts a list of messages and a list of words.
    It generates a list of dictionaries where each dictionary represents a message and 
    indicates the presence of each word from the specified word list.
    """
    result = []
    symbols_to_check = set(".,!@#$%^&*()-+_=:;?/\\1234567890><😊😂😍👍🔥💯🎉❤️😎🙌👏✨🚀🌟💥🎊💪")
    for message in messages:
        words_dict = {word: 1 if word in message else 0 for word in words}
        words_dict["words_count"] = len(message)
        words_dict["big_letter_count"] = count_uppercase(message)
        for word in message:
            for letter in word:
                if letter in symbols_to_check:
                    words_dict[letter] = 1
            new_word = ''.join(char for char in word if char not in symbols_to_check)
            if new_word in words:
                words_dict[new_word] = 1
        result.append(words_dict)
    return result

In [32]:
messages = [message_to_words(message) for message in list(comments.CONTENT)]
rows = make_rows(messages)
df_final = pd.DataFrame(rows, columns=words)

In [35]:
df_final["target_"] = comments.CLASS

In [36]:
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import precision_score, recall_score, f1_score


X = df_final.drop("target_", axis=1)
y = df_final["target_"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [37]:
models = {"Logistic Regression": LogisticRegression(),
          "KNN": KNeighborsClassifier(),
          "Random Forest": RandomForestClassifier(),
          "SVC": SVC(),
         "LinearSVC": LinearSVC(dual="auto")}

def fit_and_score(models, X_train, X_test, y_train, y_test):
    np.random.seed(42)
    model_scores = {}
    for name, model in models.items():
        model.fit(X_train, y_train)
        model_scores[name] = model.score(X_test, y_test)
    return model_scores

In [38]:
fit_and_score(models, X_train, X_test, y_train, y_test)

{'Logistic Regression': 0.9236276849642004,
 'KNN': 0.8663484486873508,
 'Random Forest': 0.9284009546539379,
 'SVC': 0.9331742243436754,
 'LinearSVC': 0.9069212410501193}

In [141]:
np.random.seed(42)
svc_grid = {"C": np.logspace(-4, 4, 20),
           "kernel": ["sigmoid", "linear", "rbf", "poly"],
           'gamma': [1, 0.7, 0.1, 0.0001]}

rs_svc = RandomizedSearchCV(SVC(), svc_grid, cv=3, verbose=True, n_iter=200, n_jobs=-1)
rs_svc.fit(X_train, y_train)



Fitting 3 folds for each of 36 candidates, totalling 108 fits


In [221]:
rs_svc.best_params_

{'kernel': 'rbf', 'gamma': 0.1, 'C': 1.0}

In [67]:
model = SVC(kernel="rbf", gamma=0.1, C=1.0, random_state=42)
model.fit(X_train, y_train)
model.score(X_test, y_test)

0.9355608591885441

In [68]:
preds = model.predict(X_test)

In [69]:
(recall_score(y_test, preds) + precision_score(y_test, preds) + f1_score(y_test, preds) + model.score(X_test, y_test))/4

0.9374004575615889

In [70]:
confusion_matrix(y_test, preds)

array([[188,  11],
       [ 16, 204]], dtype=int64)

In [71]:
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       0.92      0.94      0.93       199
           1       0.95      0.93      0.94       220

    accuracy                           0.94       419
   macro avg       0.94      0.94      0.94       419
weighted avg       0.94      0.94      0.94       419


In [46]:
def is_spam(message, model=model):
    """
    Predicts whether a given message or list of messages is spam or not.

    Parameters:
    - message (str or list of str): The message or list of messages to be classified. If a single message is provided,
      it should be a string. If multiple messages are provided, they should be contained within a list of strings.

    Returns:
    - pred (array-like): An array-like object containing the predicted labels for each input message. A value of 1
      indicates that the message is classified as spam, while a value of 0 indicates that the message is not spam.
    """
    if isinstance(message, list):
        words_ = [message_to_words(message_) for message_ in message]
    elif isinstance(message, str):
        words_ = [message_to_words(message)]
    else:
        print("The provided data is in the wrong format. Please provide the data in the format of a list or a string.")
    rows = make_rows(words_)
    df_test = pd.DataFrame(rows, columns=words)
    df_test = df_test.dropna()
    pred = model.predict(df_test)
    return pred

In [47]:
comments = [
    "Beautiful photo! 😍",
    "Great content as always!",
    "Love your style! 💖",
    "Amazing shot! 📸",
    "Nice feed! Keep it up!",
    "This is so cool! 😎",
    "You're so talented!",
    "Awesome post!",
    "I admire your work!",
    "Fantastic picture! 👌",
    "Check out my profile for amazing deals!",
    "Get free followers now! Click the link in bio!",
    "Want to be Instagram famous? Follow me!",
    "Buy followers and likes! Best prices here!",
    "I make $1000 a day on Instagram! Ask me how!",
    "Get rich quick! Follow my strategy!",
    "Want to grow your account? DM me!",
    "Limited time offer! Follow for discounts!",
    "Boost your engagement! Follow for tips!",
    "Get verified on Instagram! DM me for details!",
]
results = is_spam(comments)

In [48]:
results

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1],
      dtype=int64)

In [49]:
is_spam("🚨🚨🚨 ATTENTION ALL INSTAGRAM USERS 🚨🚨🚨 Want to make $1000 a day from home? 💰💰💰 Click the link in my bio NOW to join our exclusive money-making club! 💵💵💵 Don't miss out on this amazing opportunity! 🔥🔥🔥 #makemoneyonline #financialfreedom #getrichquick")

array([1], dtype=int64)

In [50]:
is_spam("Absolutely stunning capture! 😍 The colors in this photo are so vibrant, it's like a work of art. Makes me miss our adventures together!")

array([0], dtype=int64)

In [55]:
import pickle

with open('final_model_test.pkl', 'wb') as f:
    pickle.dump(model, f)


In [53]:
with open('final_model_test.pkl', 'rb') as f:
    loaded_model = pickle.load(f)

In [54]:
is_spam("This photo is beautiful. Where was it taken? I'm curious to know!", model=loaded_model)

array([0], dtype=int64)