In [1]:
#Random Forest model identifying scam messages (primarily targeting payID scams)
#using combined sample datasets from ChatGPT

In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split

# Load the labelled sample messages; the 'Message Text' and 'Label' columns are used below.
data = pd.read_csv('scam_sample.csv')

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split stable across reruns.
X_train, X_test, y_train, y_test = train_test_split(
    data['Message Text'], data['Label'], test_size=0.2, random_state=42
)

# Bag-of-words features: the vocabulary is learned from the training split only
# and then reused unchanged for the test split and for the new messages.
vectorizer = CountVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Train Random Forest model. The forest is randomised, so the seed is fixed to make
# the reported accuracy and predictions reproducible under Restart & Run All.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train_vectorized, y_train)

# Make predictions on test set
y_pred = rf.predict(X_test_vectorized)

# Accuracy and confusion matrix (rows = true labels, columns = predicted labels)
accuracy = accuracy_score(y_test, y_pred)
confusion = confusion_matrix(y_test, y_pred)

print('Accuracy:', accuracy)
print('Confusion matrix:')
print(confusion)

# Load new, unlabelled messages from a CSV file (column 'message')
new_data = pd.read_csv('new_messages.csv')

# Vectorize new messages with the vocabulary fitted on the training data
new_messages_vectorized = vectorizer.transform(new_data['message'])

# Predict labels for new messages
new_predictions = rf.predict(new_messages_vectorized)

# Print new messages with their predicted labels. Messages and predictions are paired
# by position, so this does not depend on the DataFrame's index labels.
print('\nNew Messages:\n')
for message, predicted_label in zip(new_data['message'], new_predictions):
    print('Message:', message)
    print('Predicted label:', predicted_label)
    print('----------------------')

Accuracy: 0.9
Confusion matrix: [[12  0]
 [ 4 24]]

New Messages:

Message: Hello, I'm wondering if your item is still available for sale?
Predicted label: Legitimate
----------------------
Message: Is the product still available for purchase?
Predicted label: Legitimate
----------------------
Message: I am interested in buying your product.
Predicted label: Legitimate
----------------------
Message: Is the item still up for sale? I am interested in purchasing it.
Predicted label: Legitimate
----------------------
Message: Could you please provide more information on the condition of the item?
Predicted label: Legitimate
----------------------
Message: Would it be possible to see the item in person?
Predicted label: Legitimate
----------------------
Message: I'm satisfied with the price, let's proceed with the purchase.
Predicted label: Legitimate
----------------------
Message: I can offer you additional funds for the item.
Predicted label: Legitimate
----------------------
Message: I

In [3]:
#Naive Bayes model identifying scam messages (primarily targeting payID scams)
#using combined sample datasets from ChatGPT

In [4]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# NOTE: train_test_split and the metrics are imported here so this cell runs on a
# fresh kernel instead of relying on names left behind by an earlier cell.

# Load the labelled sample messages; the 'Message Text' and 'Label' columns are used below.
data = pd.read_csv('scam_sample.csv')

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split stable across reruns.
X_train, X_test, y_train, y_test = train_test_split(
    data['Message Text'], data['Label'], test_size=0.2, random_state=42
)

# Bag-of-words features: the vocabulary is learned from the training split only
# and then reused unchanged for the test split and for the new messages.
vectorizer = CountVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Train Naive Bayes model on the word counts
nb = MultinomialNB()
nb.fit(X_train_vectorized, y_train)

# Make predictions on test set
y_pred = nb.predict(X_test_vectorized)

# Accuracy and confusion matrix (rows = true labels, columns = predicted labels)
accuracy = accuracy_score(y_test, y_pred)
confusion = confusion_matrix(y_test, y_pred)

print('Accuracy:', accuracy)
print('Confusion matrix:')
print(confusion)

# Load new, unlabelled messages from a CSV file (column 'message')
new_data = pd.read_csv('new_messages.csv')

# Vectorize new messages with the vocabulary fitted on the training data
new_messages_vectorized = vectorizer.transform(new_data['message'])

# Predict labels for new messages
new_predictions = nb.predict(new_messages_vectorized)

# Print new messages with their predicted labels. Messages and predictions are paired
# by position, so this does not depend on the DataFrame's index labels.
print('\nNew Messages:\n')
for message, predicted_label in zip(new_data['message'], new_predictions):
    print('Message:', message)
    print('Predicted label:', predicted_label)
    print('----------------------')

Accuracy: 0.95
Confusion matrix:
[[12  0]
 [ 2 26]]

New Messages:

Message: Hello, I'm wondering if your item is still available for sale?
Predicted label: Legitimate
----------------------
Message: Is the product still available for purchase?
Predicted label: Legitimate
----------------------
Message: I am interested in buying your product.
Predicted label: Suspicious
----------------------
Message: Is the item still up for sale? I am interested in purchasing it.
Predicted label: Legitimate
----------------------
Message: Could you please provide more information on the condition of the item?
Predicted label: Legitimate
----------------------
Message: Would it be possible to see the item in person?
Predicted label: Legitimate
----------------------
Message: I'm satisfied with the price, let's proceed with the purchase.
Predicted label: Legitimate
----------------------
Message: I can offer you additional funds for the item.
Predicted label: Legitimate
----------------------
Message: 

In [5]:
#logistic regression model identifying scam messages (primarily targeting payID scams)
#using combined sample datasets from ChatGPT

In [6]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split

# NOTE: train_test_split and the metrics are imported here so this cell runs on a
# fresh kernel instead of relying on names left behind by an earlier cell.

# Load the labelled sample messages; the 'Message Text' and 'Label' columns are used below.
data = pd.read_csv('scam_sample.csv')

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split stable across reruns.
X_train, X_test, y_train, y_test = train_test_split(
    data['Message Text'], data['Label'], test_size=0.2, random_state=42
)

# Bag-of-words features: the vocabulary is learned from the training split only
# and then reused unchanged for the test split and for the new messages.
vectorizer = CountVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Train logistic regression model with scikit-learn's default settings
lr = LogisticRegression()
lr.fit(X_train_vectorized, y_train)

# Make predictions on test set
y_pred = lr.predict(X_test_vectorized)

# Accuracy and confusion matrix (rows = true labels, columns = predicted labels)
accuracy = accuracy_score(y_test, y_pred)
confusion = confusion_matrix(y_test, y_pred)

print('Accuracy:', accuracy)
print('Confusion matrix:')
print(confusion)

# Load new, unlabelled messages from a CSV file (column 'message')
new_data = pd.read_csv('new_messages.csv')

# Vectorize new messages with the vocabulary fitted on the training data
new_messages_vectorized = vectorizer.transform(new_data['message'])

# Predict labels for new messages
new_predictions = lr.predict(new_messages_vectorized)

# Print new messages with their predicted labels. Messages and predictions are paired
# by position, so this does not depend on the DataFrame's index labels.
print('\nNew Messages:\n')
for message, predicted_label in zip(new_data['message'], new_predictions):
    print('Message:', message)
    print('Predicted label:', predicted_label)
    print('----------------------')


Accuracy: 0.925
Confusion matrix:
[[12  0]
 [ 3 25]]

New Messages:

Message: Hello, I'm wondering if your item is still available for sale?
Predicted label: Legitimate
----------------------
Message: Is the product still available for purchase?
Predicted label: Legitimate
----------------------
Message: I am interested in buying your product.
Predicted label: Legitimate
----------------------
Message: Is the item still up for sale? I am interested in purchasing it.
Predicted label: Legitimate
----------------------
Message: Could you please provide more information on the condition of the item?
Predicted label: Legitimate
----------------------
Message: Would it be possible to see the item in person?
Predicted label: Legitimate
----------------------
Message: I'm satisfied with the price, let's proceed with the purchase.
Predicted label: Legitimate
----------------------
Message: I can offer you additional funds for the item.
Predicted label: Legitimate
----------------------
Message:

In [7]:
#SVM model identifying scam messages (primarily targeting payID scams)
#using combined sample datasets from ChatGPT

In [8]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

# Read the labelled sample messages and pick out the text and label columns.
data = pd.read_csv('scam_sample.csv')
message_texts = data['Message Text']
message_labels = data['Label']

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    message_texts,
    message_labels,
    test_size=0.2,
    random_state=42,
)

# Word-count features; the vocabulary comes from the training messages only.
vectorizer = CountVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Fit a linear-kernel support vector classifier (fit() returns the estimator itself).
svm = SVC(kernel='linear').fit(X_train_vectorized, y_train)

# Evaluate on the held-out messages.
y_pred = svm.predict(X_test_vectorized)
accuracy = accuracy_score(y_test, y_pred)
confusion = confusion_matrix(y_test, y_pred)

print('Accuracy:', accuracy)
print('Confusion matrix:')
print(confusion)

# Read the unlabelled messages (CSV with a 'message' column) and classify them
# using the vocabulary fitted above.
new_data = pd.read_csv('new_messages.csv')
new_messages_vectorized = vectorizer.transform(new_data['message'])
new_predictions = svm.predict(new_messages_vectorized)

# Show each new message next to the label predicted for it.
print('\nNew Messages:\n')
for message, predicted_label in zip(new_data['message'], new_predictions):
    print('Message:', message)
    print('Predicted label:', predicted_label)
    print('----------------------')


Accuracy: 0.85
Confusion matrix:
[[11  1]
 [ 5 23]]

New Messages:

Message: Hello, I'm wondering if your item is still available for sale?
Predicted label: Legitimate
----------------------
Message: Is the product still available for purchase?
Predicted label: Legitimate
----------------------
Message: I am interested in buying your product.
Predicted label: Suspicious
----------------------
Message: Is the item still up for sale? I am interested in purchasing it.
Predicted label: Legitimate
----------------------
Message: Could you please provide more information on the condition of the item?
Predicted label: Legitimate
----------------------
Message: Would it be possible to see the item in person?
Predicted label: Legitimate
----------------------
Message: I'm satisfied with the price, let's proceed with the purchase.
Predicted label: Legitimate
----------------------
Message: I can offer you additional funds for the item.
Predicted label: Legitimate
----------------------
Message: 

In [9]:
#Decision Tree model identifying scam messages (primarily targeting payID scams)
#using combined sample datasets from ChatGPT

In [20]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

# Load the labelled sample messages; the 'Message Text' and 'Label' columns are used below.
data = pd.read_csv('scam_sample.csv')

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split stable across reruns.
X_train, X_test, y_train, y_test = train_test_split(
    data['Message Text'], data['Label'], test_size=0.2, random_state=42
)

# Bag-of-words features: the vocabulary is learned from the training split only
# and then reused unchanged for the test split and for the new messages.
vectorizer = CountVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Train Decision Tree model. Tree construction is randomised (feature order is permuted
# at each split), so the seed is fixed to make accuracy and predictions reproducible.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train_vectorized, y_train)

# Make predictions on test set
y_pred = dt.predict(X_test_vectorized)

# Accuracy and confusion matrix (rows = true labels, columns = predicted labels)
accuracy = accuracy_score(y_test, y_pred)
confusion = confusion_matrix(y_test, y_pred)

print('Accuracy:', accuracy)
print('Confusion matrix:')
print(confusion)

# Load new, unlabelled messages from a CSV file (column 'message')
new_data = pd.read_csv('new_messages.csv')

# Vectorize new messages with the vocabulary fitted on the training data
new_messages_vectorized = vectorizer.transform(new_data['message'])

# Predict labels for new messages
new_predictions = dt.predict(new_messages_vectorized)

# Print new messages with their predicted labels. Messages and predictions are paired
# by position, so this does not depend on the DataFrame's index labels.
print('\nNew Messages:\n')
for message, predicted_label in zip(new_data['message'], new_predictions):
    print('Message:', message)
    print('Predicted label:', predicted_label)
    print('----------------------')


Accuracy: 0.8
Confusion matrix:
[[11  1]
 [ 7 21]]

New Messages:

Message: Hello, I'm wondering if your item is still available for sale?
Predicted label: Legitimate
----------------------
Message: Is the product still available for purchase?
Predicted label: Legitimate
----------------------
Message: I am interested in buying your product.
Predicted label: Legitimate
----------------------
Message: Is the item still up for sale? I am interested in purchasing it.
Predicted label: Legitimate
----------------------
Message: Could you please provide more information on the condition of the item?
Predicted label: Legitimate
----------------------
Message: Would it be possible to see the item in person?
Predicted label: Legitimate
----------------------
Message: I'm satisfied with the price, let's proceed with the purchase.
Predicted label: Legitimate
----------------------
Message: I can offer you additional funds for the item.
Predicted label: Legitimate
----------------------
Message: I

In [11]:
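#From the above results, the SVM and Decision Tree models make the fewest mistakes, with
#consistently the highest accuracy of correctly predicted labels.
#However, reruns can cause variation in accuracy and predicted output.
#NOTE(review): the recorded test-set accuracies above are Naive Bayes 0.95, Logistic Regression 0.925,
#Random Forest 0.9, SVM 0.85 and Decision Tree 0.8 - confirm which results this conclusion refers to.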

In [12]:
#Decision Tree model identifying SPAM using the SMS SPAM master dataset from Kaggle (not necessarily scam messages)
#https://www.kaggle.com/datasets/uciml/sms-spam-collection-dataset
#Works well with HashingVectorizer, which gives slightly higher accuracy

In [19]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

# Load dataset. Try UTF-8 first; if the file cannot be decoded as UTF-8,
# fall back to latin1.
try:
    data = pd.read_csv('spam_master1.csv', encoding='utf-8')
except UnicodeDecodeError:
    data = pd.read_csv('spam_master1.csv', encoding='latin1')

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split stable across reruns.
X_train, X_test, y_train, y_test = train_test_split(
    data['Message Text'], data['Label'], test_size=0.3, random_state=42
)

# Vectorize text using HashingVectorizer (tokens are hashed to feature columns,
# so no vocabulary is learned from the training data).
vectorizer = HashingVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Train Decision Tree model. Tree construction is randomised (feature order is permuted
# at each split), so the seed is fixed to make accuracy and predictions reproducible.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train_vectorized, y_train)

# Make predictions on test set
y_pred = dt.predict(X_test_vectorized)

# Accuracy and confusion matrix (rows = true labels, columns = predicted labels)
accuracy = accuracy_score(y_test, y_pred)
confusion = confusion_matrix(y_test, y_pred)

print('Accuracy:', accuracy)
print('Confusion matrix:')
print(confusion)

# Hand-written example messages to classify (defined inline, not loaded from a file)
new_messages = [
    "Congratulations! You have been selected as the winner of our Facebook Marketplace raffle! To claim your prize, please click on this link and provide your personal information.",
    "Hello, I am interested in your item for sale. Can you please ship it to me and I will pay you via PayPal?",
    "URGENT: Your Facebook account has been compromised. Please click on this link to reset your password and secure your account.",
    "Hey, I saw your post on Facebook Marketplace and I'm interested in buying your item. Can you please provide me with your PayID so I can make the payment?",
    "Congratulations! You have won a free gift card. Please provide your personal information to claim your prize.",
    "YOU WON A FREE IPAD, PROVIDE YOUR CREDIT CARD NUMBER TO REDEEM NOW",
    "Click here to download your free software",
    "Overdue payments! urgent click here: www.malware.com",
    "URGENT - YOU MUST PAY THE OVERDUE TAX CALL THIS NUMBER: 0223494293",
    "Hi I'm interested in purchasing",
    "Is this free?"
]

# Vectorize new messages with the same vectorizer used for training
new_messages_vectorized = vectorizer.transform(new_messages)

# Predict labels for new messages
new_predictions = dt.predict(new_messages_vectorized)

# Print new messages with their predicted labels
print('\nNew Messages:\n')
for message, predicted_label in zip(new_messages, new_predictions):
    print('Message:', message)
    print('Predicted label:', predicted_label)
    print('----------------------')


Accuracy: 0.9665071770334929
Confusion matrix:
[[1429   24]
 [  32  187]]

New Messages:

Message: Congratulations! You have been selected as the winner of our Facebook Marketplace raffle! To claim your prize, please click on this link and provide your personal information.
Predicted label: spam
----------------------
Message: Hello, I am interested in your item for sale. Can you please ship it to me and I will pay you via PayPal?
Predicted label: ham
----------------------
Message: URGENT: Your Facebook account has been compromised. Please click on this link to reset your password and secure your account.
Predicted label: ham
----------------------
Message: Hey, I saw your post on Facebook Marketplace and I'm interested in buying your item. Can you please provide me with your PayID so I can make the payment?
Predicted label: ham
----------------------
Message: Congratulations! You have won a free gift card. Please provide your personal information to claim your prize.
Predicted label:

In [18]:
#Random Forest model identifying SPAM using SMS SPAM master dataset from Kaggle (not necessarily scam messages)

In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

# Load dataset. Try UTF-8 first; if the file cannot be decoded as UTF-8,
# fall back to latin1.
try:
    data = pd.read_csv('spam_master1.csv', encoding='utf-8')
except UnicodeDecodeError:
    data = pd.read_csv('spam_master1.csv', encoding='latin1')

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split stable across reruns.
X_train, X_test, y_train, y_test = train_test_split(
    data['Message Text'], data['Label'], test_size=0.2, random_state=42
)

# Bag-of-words features: the vocabulary is learned from the training split only
# and then reused unchanged for the test split and for the new messages.
vectorizer = CountVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Train Random Forest model. The forest is randomised, so the seed is fixed to make
# the reported accuracy and predictions reproducible under Restart & Run All.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train_vectorized, y_train)

# Make predictions on test set
y_pred = rf.predict(X_test_vectorized)

# Accuracy and confusion matrix (rows = true labels, columns = predicted labels)
accuracy = accuracy_score(y_test, y_pred)
confusion = confusion_matrix(y_test, y_pred)

print('Accuracy:', accuracy)
print('Confusion matrix:')
print(confusion)

# Hand-written example messages to classify (defined inline, not loaded from a file)
new_messages = [
    "Congratulations! You have been selected as the winner of our Facebook Marketplace raffle! To claim your prize, please click on this link and provide your personal information.",
    "Hello, I am interested in your item for sale. Can you please ship it to me and I will pay you via PayPal?",
    "URGENT: Your Facebook account has been compromised. Please click on this link to reset your password and secure your account.",
    "Hey, I saw your post on Facebook Marketplace and I'm interested in buying your item. Can you please provide me with your PayID so I can make the payment?",
    "Congratulations! You have won a free gift card. Please provide your personal information to claim your prize.",
    "YOU WON A FREE IPAD, PROVIDE YOUR CREDIT CARD NUMBER TO REDEEM NOW",
    "Click here to download your free software",
    "Overdue payments! urgent click here: www.malware.com",
    "URGENT - YOU MUST PAY THE OVERDUE TAX CALL THIS NUMBER: 0223494293",
    "Hi I'm interested in purchasing",
    "Is this free?"
]

# Vectorize new messages with the vocabulary fitted on the training data
new_messages_vectorized = vectorizer.transform(new_messages)

# Predict labels for new messages
new_predictions = rf.predict(new_messages_vectorized)

# Print new messages with their predicted labels
print('\nNew Messages:\n')
for message, predicted_label in zip(new_messages, new_predictions):
    print('Message:', message)
    print('Predicted label:', predicted_label)
    print('----------------------')

Accuracy: 0.9721973094170404
Confusion matrix:
[[965   0]
 [ 31 119]]

New Messages:

Message: Congratulations! You have been selected as the winner of our Facebook Marketplace raffle! To claim your prize, please click on this link and provide your personal information.
Predicted label: spam
----------------------
Message: Hello, I am interested in your item for sale. Can you please ship it to me and I will pay you via PayPal?
Predicted label: ham
----------------------
Message: URGENT: Your Facebook account has been compromised. Please click on this link to reset your password and secure your account.
Predicted label: ham
----------------------
Message: Hey, I saw your post on Facebook Marketplace and I'm interested in buying your item. Can you please provide me with your PayID so I can make the payment?
Predicted label: ham
----------------------
Message: Congratulations! You have won a free gift card. Please provide your personal information to claim your prize.
Predicted label: spa

In [16]:
#From the above results, the Decision Tree model is the most effective at
#identifying SPAM using the SMS SPAM master dataset from Kaggle (not necessarily scam messages)