In [14]:
# Define the SMS classification function
def sms_classification(df):
    """Fit a TF-IDF + LinearSVC text-classification pipeline on an SMS DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'label' column (the target) and the message text in
        either a 'text' column or a 'text_message' column. When both are
        present, 'text' is used.

    Returns
    -------
    The pipeline object returned by ``Pipeline.fit`` after fitting on the
    training portion of ``df`` (33% of the rows are held out by the split
    and are not used for fitting).

    Raises
    ------
    KeyError
        If neither text column is present, or if 'label' is missing.
    """
    # This function originally hard-coded 'text', but the CSV loaded in this
    # notebook names the column 'text_message'. Accept either so existing
    # callers keep working and the notebook's own DataFrame does not fail.
    for candidate in ('text', 'text_message'):
        if candidate in df.columns:
            text_column = candidate
            break
    else:
        raise KeyError(
            "expected a 'text' or 'text_message' column in the DataFrame"
        )

    features = df[text_column]
    target = df['label']

    # Fixed random_state keeps the split reproducible. The held-out part is
    # discarded here because this function only trains the model.
    X_train, _, y_train, _ = train_test_split(
        features, target, test_size=0.33, random_state=42
    )

    # Vectorise raw text with TF-IDF, then classify with a linear SVM.
    pipeline = Pipeline([
        ('tfidf', TfidfVectorizer()),
        ('clf', LinearSVC())
    ])

    return pipeline.fit(X_train, y_train)



In [15]:
def sms_classification(sms_text_df):
    """
    Train a text classifier on labelled SMS messages.

    Parameters:
    - sms_text_df (pd.DataFrame): DataFrame with a 'text_message' column (the
      raw message text) and a 'label' column (the class of each message).

    Returns:
    - text_clf (Pipeline): The pipeline returned by fitting on the training split.

    The data is split with test_size=0.33 and random_state=42. Only the
    training part is used here; the held-out part is discarded. The pipeline
    applies TF-IDF vectorization followed by Linear Support Vector
    Classification, so the returned object can be given raw text to predict on.
    """
    messages = sms_text_df['text_message']
    labels = sms_text_df['label']

    # Hold out 33% of the rows; the fixed seed makes the split repeatable.
    train_messages, _, train_labels, _ = train_test_split(
        messages, labels, test_size=0.33, random_state=42
    )

    # Step 1 turns text into TF-IDF features, step 2 classifies them.
    steps = [
        ('tfidf', TfidfVectorizer()),
        ('clf', LinearSVC()),
    ]

    text_clf = Pipeline(steps).fit(train_messages, train_labels)
    return text_clf






In [16]:
# Read the SMS spam collection CSV into a DataFrame and preview the first rows.
csv_path = 'Resources/SMSSpamCollection.csv'
sms_text_df = pd.read_csv(csv_path)
sms_text_df.head()





Unnamed: 0,label,text_message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [17]:
# Check for missing values: info() lists the non-null count of every column,
# which can be compared against the total number of entries it reports.
sms_text_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   label         5572 non-null   object
 1   text_message  5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [18]:
# Count how many messages carry each value of the "label" column
# ("ham" vs. "spam") to see how balanced the two classes are.
sms_text_df['label'].value_counts()


label
ham     4825
spam     747
Name: count, dtype: int64

In [19]:
# Features are the raw message text; the target is the "label" column.
X, y = sms_text_df['text_message'], sms_text_df['label']

# Hold out 33% of the rows for testing. The fixed random_state makes the
# split reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [21]:
from sklearn.metrics import accuracy_score


# Chain TF-IDF vectorization with a linear SVM so the same object can be
# fitted on raw training text and later asked to predict on raw test text.
pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('clf', LinearSVC()),
])

# Fit on the training split only; the test split stays unseen until scoring.
text_clf = pipeline.fit(X_train, y_train)

# Predict labels for the held-out messages.
predictions = text_clf.predict(X_test)

# Fraction of held-out messages whose predicted label matches the true label.
# NOTE(review): the label counts shown earlier are imbalanced (far more "ham"
# than "spam"), so accuracy alone may overstate spam-detection quality.
accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
print(f"Model Accuracy: {accuracy}")


Model Accuracy: 0.989668297988037




In [27]:
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

# Define the SMS classification function
def sms_classification(df):
    """Fit and return a TF-IDF + LinearSVC pipeline trained on ``df``.

    ``df`` must provide a 'text_message' column (input text) and a 'label'
    column (target). The rows are split with test_size=0.33 and
    random_state=42, and only the training part is used for fitting.
    """
    # train_test_split yields (X_train, X_test, y_train, y_test); the test
    # pieces are not needed for training, so only positions 0 and 2 are kept.
    parts = train_test_split(
        df['text_message'], df['label'], test_size=0.33, random_state=42
    )
    train_text, train_labels = parts[0], parts[2]

    classifier = Pipeline([
        ('tfidf', TfidfVectorizer()),
        ('clf', LinearSVC()),
    ])
    return classifier.fit(train_text, train_labels)

# Read the labelled SMS data.
sms_text_df = pd.read_csv('Resources/SMSSpamCollection.csv')

# Train the classifier on it.
model = sms_classification(sms_text_df)

# Persist the fitted model, then read it back to confirm the saved file is
# usable on its own.
model_path = 'sms_spam_model.joblib'
joblib.dump(model, model_path)
# NOTE(review): joblib.load unpickles the file, so only load model files from
# a trusted source. Here the file was written just above.
model = joblib.load(model_path)

# Smoke-test the reloaded model on one message.
# NOTE(review): this text looks like one of the rows shown in the dataset
# preview, so it may have been part of the training data. Treat the result as
# a sanity check, not an independent evaluation.
sample_text = [
    "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"
]

prediction = model.predict(sample_text)
print(prediction)




['spam']


