# Simulation of the dataset

In [4]:
import pandas as pd
import numpy as np

# Define the number of samples for each type
n_samples_phishing = 3000
n_samples_legitimate = 3000

# Sample phishing SMS messages
phishing_sms_messages = [
    "Your account has been compromised. Click here to verify your details.",
    "Urgent: Your account will be suspended unless you confirm your personal information.",
    "Unusual login attempt detected. Verify your account now to secure it.",
    "Congratulations! You've won a prize. Claim now by providing your bank details.",
    "Action required: Confirm your account information to avoid service interruption.",
    "Security alert: Please verify your account by clicking this link.",
    "Immediate action needed: Update your billing information.",
    "You have a pending transaction. Confirm your details to complete it.",
    "Your bank account has been flagged for suspicious activity. Log in to secure.",
    "Update your account information to avoid access restrictions.",
    "New login detected from an unknown device. Verify your identity now.",
    "We couldn't verify your last payment. Please update your information."
]

# Sample phishing email messages
phishing_email_messages = [
    "Dear customer, We detected suspicious activity in your account. Log in to verify your recent transactions.",
    "Your account has been flagged for a security review. Click the link to verify your identity.",
    "Warning: Your account may be at risk. Please confirm your banking details immediately.",
    "To secure your account, please confirm your details by clicking here.",
    "Your password needs to be updated due to a security issue. Log in to update it.",
    "Verify your account now to avoid unexpected charges.",
    "Your account has been locked due to unusual activity. Click here to unlock it.",
    "Our records indicate a discrepancy in your account information. Update now to prevent account suspension.",
    "Dear user, we noticed a new login from an unfamiliar location. Confirm it was you.",
    "Thank you for using our service. To continue, confirm your identity by logging in.",
    "Your recent transaction could not be completed. Please provide updated billing information.",
    "Security Notice: Your account was accessed from a foreign device. Verify if this was you."
]

# Sample legitimate SMS messages
legitimate_sms_messages = [
    "Your recent transaction was successful. Thank you for banking with us.",
    "Monthly statement: Your account balance is now available.",
    "Reminder: Your payment is due in 5 days. Thank you for your attention.",
    "Thank you for choosing our services. Your account details are secure.",
    "This is your account summary for the month. Please review for your records.",
    "Your deposit of $500 has been received. Your new balance is now updated.",
    "Thank you for setting up direct deposit. Your funds are now available.",
    "We hope you are enjoying our services. Contact us if you need support.",
    "Your e-statement for this month is now available in your account.",
    "Thank you for updating your profile. Your changes have been saved.",
    "Your scheduled transfer has been processed successfully.",
    "We are here to support you. Reach out to us with any questions."
]

# Sample legitimate email messages
legitimate_email_messages = [
    "Dear customer, thank you for your recent transaction. Your new balance is now updated.",
    "Your monthly statement is now available. Please review it at your convenience.",
    "Reminder: Your loan payment is due in 5 days. Contact us if you have any questions.",
    "Thank you for your recent payment. We appreciate your prompt attention.",
    "Your account balance summary for this month is now available for review.",
    "You successfully updated your profile. If this was not you, please contact us.",
    "Thank you for choosing our banking services. Your account is secure with us.",
    "Your recent deposit has been processed. Thank you for using our bank.",
    "Your e-statement for the month is now ready for viewing in your account.",
    "Your transaction was completed successfully. Your new balance is updated.",
    "Thank you for updating your contact information with us.",
    "We are here to assist you with any questions about your recent transaction."
]

# Combine phishing and legitimate data for SMS and email
phishing_sms = np.random.choice(phishing_sms_messages, n_samples_phishing)
legitimate_sms = np.random.choice(legitimate_sms_messages, n_samples_legitimate)
phishing_email = np.random.choice(phishing_email_messages, n_samples_phishing)
legitimate_email = np.random.choice(legitimate_email_messages, n_samples_legitimate)

# Create labels and types
sms_labels = ["phishing"] * n_samples_phishing + ["not phishing"] * n_samples_legitimate
email_labels = ["phishing"] * n_samples_phishing + ["not phishing"] * n_samples_legitimate
sms_types = ["SMS"] * (n_samples_phishing + n_samples_legitimate)
email_types = ["Email"] * (n_samples_phishing + n_samples_legitimate)

# Combine all messages, labels, and types into a single DataFrame
messages = np.concatenate([phishing_sms, legitimate_sms, phishing_email, legitimate_email])
labels = sms_labels + email_labels
types = sms_types + email_types

# Create the DataFrame
df = pd.DataFrame({
    "Message": messages,
    "Label": labels,
    "Type": types
})

# Shuffle the dataset
df = df.sample(frac=1).reset_index(drop=True)

# Display a sample of the dataset
print("Combined SMS and Email Messages Dataset Sample:")
print(df.head())

# Optional: Save the dataset to a CSV file
df.to_csv('phishing_detection_dataset.csv', index=False)


Combined SMS and Email Messages Dataset Sample:
                                             Message         Label   Type
0  Thank you for updating your profile. Your chan...  not phishing    SMS
1  Congratulations! You've won a prize. Claim now...      phishing    SMS
2  Your recent transaction was successful. Thank ...  not phishing    SMS
3  Unusual login attempt detected. Verify your ac...      phishing    SMS
4  Your e-statement for the month is now ready fo...  not phishing  Email


# Preprocessing the dataset

In [5]:
df = pd.read_csv("/content/phishing_detection_dataset.csv")

In [7]:
#drop the type column
df.drop('Type', axis=1, inplace=True)

In [8]:
df.head()

Unnamed: 0,Message,Label
0,Thank you for updating your profile. Your chan...,not phishing
1,Congratulations! You've won a prize. Claim now...,phishing
2,Your recent transaction was successful. Thank ...,not phishing
3,Unusual login attempt detected. Verify your ac...,phishing
4,Your e-statement for the month is now ready fo...,not phishing


# Training the Logistic Regression Model

In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix


# Step 1: Split the dataset into training and testing sets
X = df['Message']  # Input feature
y = df['Label']    # Output label

# Encode labels as 0 (not phishing) and 1 (phishing)
y = y.map({'not phishing': 0, 'phishing': 1})

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 2: Convert text data to TF-IDF features
tfidf = TfidfVectorizer(max_features=5000)  # Adjust max_features as needed
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Step 3: Train a Logistic Regression model
model = LogisticRegression()
model.fit(X_train_tfidf, y_train)

# Step 4: Evaluate the model
y_pred = model.predict(X_test_tfidf)

# Print evaluation metrics
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))




Accuracy: 1.0
Confusion Matrix:
 [[1222    0]
 [   0 1178]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      1222
           1       1.00      1.00      1.00      1178

    accuracy                           1.00      2400
   macro avg       1.00      1.00      1.00      2400
weighted avg       1.00      1.00      1.00      2400



# Save the TF-IDF model

In [12]:
#save the TF-IDF model
import pickle
with open('tfidf_vectorizer.pkl', 'wb') as f:
    pickle.dump(tfidf, f)

In [13]:
#Save the trained model
import pickle
with open('logistic_regression_model.pkl', 'wb') as f:
    pickle.dump(model, f)