In [None]:
# Import required libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Spam/ham classification script: load the labelled messages, turn the text
# into TF-IDF features, fit a logistic regression model, report train/test
# accuracy, then classify one message typed in by the user.

# Load the dataset. The code below reads a 'Category' column (the label text)
# and a 'Message' column (the raw message text) from this file.
raw_mail_data = pd.read_csv('mail_data.csv')

# Display the first 5 rows of the dataset
print(raw_mail_data.head())

# Replace any null values in the dataset with empty strings.
# `where` returns a new frame, so raw_mail_data itself is left untouched.
mail_data = raw_mail_data.where(raw_mail_data.notna(), '')

# Display the shape (rows, columns) of the dataset
print(mail_data.shape)

# Label encoding: 'spam' -> 0 and 'ham' -> 1 in the 'Category' column.
# The codes are built as a fresh integer column with `map` instead of being
# written into the existing text column element by element: putting ints
# into a text-typed column only works while that column is `object` dtype
# and is rejected when pandas stores text with its dedicated string dtype.
label_codes = {'spam': 0, 'ham': 1}
encoded_category = mail_data['Category'].map(label_codes)

# Any label outside the mapping (including the '' that replaced a missing
# label above) comes back as NaN. Fail here with a clear message rather than
# later, during the integer cast or model fitting.
unmapped_rows = encoded_category.isna()
if unmapped_rows.any():
    unexpected_labels = sorted(
        {str(label) for label in mail_data.loc[unmapped_rows, 'Category'].unique()}
    )
    raise ValueError(
        "Unexpected values in 'Category' column: {}".format(unexpected_labels)
    )
mail_data['Category'] = encoded_category.astype('int')

# Separate the data into features (messages) and labels (categories)
x = mail_data['Message']
y = mail_data['Category']

# Split the dataset into training data (80%) and testing data (20%).
# random_state is fixed so the split is reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=3)

# Display the shapes of total, training, and testing data
print(x.shape, x_train.shape, x_test.shape)

# Convert text data to numerical feature vectors using TF-IDF.
# The vectorizer is fitted on the training messages only; the test messages
# are transformed with that already-fitted vocabulary.
feature_extraction = TfidfVectorizer(min_df=1, stop_words='english', lowercase=True)
x_train_features = feature_extraction.fit_transform(x_train)
x_test_features = feature_extraction.transform(x_test)

# y_train and y_test are already integer typed (see the label encoding step),
# so no further conversion is needed before fitting.

# Initialize the Logistic Regression model
model = LogisticRegression()

# Train the model on the training data
model.fit(x_train_features, y_train)

# Make predictions on the training data
x_train_prediction = model.predict(x_train_features)

# Calculate and display the accuracy on the training data
training_data_accuracy = accuracy_score(y_train, x_train_prediction)
print('Training accuracy:', training_data_accuracy)

# Make predictions on the test data
x_test_prediction = model.predict(x_test_features)

# Calculate and display the accuracy on the testing data
testing_data_accuracy = accuracy_score(y_test, x_test_prediction)
print('Testing accuracy:', testing_data_accuracy)

# Prediction system for classifying new messages
# Take message input from the user
user_input = input("Enter a message to classify (spam/ham): ")

# Transform the input message into a numerical feature vector using the
# already-fitted TF-IDF vectorizer (wrapped in a list: one document)
user_input_features = feature_extraction.transform([user_input])

# Predict the category (spam or ham) using the trained model
prediction = model.predict(user_input_features)

# Display the prediction result (1 is the code assigned to 'ham' above)
if prediction[0] == 1:
    print('The message is: Ham (Not Spam)')
else:
    print('The message is: Spam')




  Category                                            Message
0      ham  Go until jurong point, crazy.. Available only ...
1      ham                      Ok lar... Joking wif u oni...
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...
3      ham  U dun say so early hor... U c already then say...
4      ham  Nah I don't think he goes to usf, he lives aro...
(5572, 2)
(5572,) (4457,) (1115,)
Training accuracy: 0.9676912721561588
Testing accuracy: 0.9668161434977578
