In [None]:
import pickle

import numpy as npcm
import pandas as pd
import requests
from bs4 import BeautifulSoup
from requests.exceptions import Timeout
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight
from transformers import AutoTokenizer

# Load the full dataset; a later cell slices it and reads its 'URL' column.
# NOTE(review): the name `all` shadows Python's built-in all(). It is kept
# because a later cell reads `all.iloc[...]`; rename both together if changed.
all = pd.read_csv("all.csv")

In [None]:

def get_content(link):
    """Fetch a page and return the text of its headings and paragraphs.

    This is a best-effort fetch for links that may be dead or malicious. The
    only safeguards are a 2-second request timeout and swallowing every
    error; the response body is parsed with 'html.parser' and never executed.

    Args:
        link: URL to request.

    Returns:
        The text of every h1/h2/h3/p element joined by single spaces, or
        None when the request fails or times out, the status code is not
        200, or the page yields no non-blank text.
    """
    try:
        response = requests.get(link, timeout=2)
        if response.status_code != 200:
            return None
        soup = BeautifulSoup(response.content, 'html.parser')
        page_text = ' '.join(
            tag.get_text() for tag in soup.find_all(['h1', 'h2', 'h3', 'p'])
        )
    except Exception:
        # Deliberate best-effort: timeouts, connection errors, malformed URLs
        # and parser failures all mean "no usable content for this link".
        return None

    # Several empty elements join into a run of spaces, so test for blank
    # text rather than comparing against "".
    if not page_text.strip():
        return None
    print ("found")
    return page_text


# Load the pretrained 'bert-base-uncased' tokenizer used by tokenize_text.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
# The tokenizer's own sequence-length limit; tokenize_text passes it as the
# max_length for truncation and padding.
max_length = tokenizer.model_max_length



# Tokenizes the text using a pretrained BERT tokenizer
def tokenize_text(text):
    """Encode ``text`` with the module-level BERT tokenizer.

    The text is passed to the tokenizer whole. Truncation to ``max_length``
    is left to the tokenizer (``truncation=True``), because that limit is a
    sequence length for the tokenizer, not a character count; slicing the
    string to that many characters first would discard most of the content
    that still fits.

    Args:
        text: Page text to encode.

    Returns:
        Whatever the tokenizer returns for a single text with
        ``padding='max_length'`` and ``return_tensors='pt'``.
    """
    return tokenizer(text, truncation=True,
                     max_length=max_length, padding='max_length',
                     return_tensors='pt')

# Work on the first 300 rows of the already-loaded dataset.
# data = all.iloc[47000:50000]
# .copy() makes `data` an independent frame, so adding the 'content' column
# below does not write into a slice of `all` (pandas SettingWithCopyWarning).
data = all.iloc[0:300].copy()

print("Done reading")

# Get the content for each URL and store it in a new column
data['content'] = data['URL'].apply(get_content)
print("Done applying links")

# Remove rows where content is None and their corresponding spam values
data = data.dropna(subset=['content'])

In [None]:
# Save the scraped rows.
# NOTE(review): to_csv also writes the row index as an extra unnamed column
# by default; confirm readers of this file expect that.
# b = pd.read_csv("df_full of 1.csv")
# b = pd.concat([b, data], axis=0)
data.to_csv("df_full of 1.csv")

# Preview the saved rows. This must be the last expression of the cell to be
# rendered; a bare `data` placed before other statements displays nothing.
data.head()

In [None]:
# Load the rows stored in "df_full of AAAA.csv" (presumably the spam
# examples - confirm) and stack the freshly scraped rows underneath them.
spams = pd.read_csv("df_full of AAAA.csv")
# ignore_index=True renumbers the rows; without it both frames keep their own
# labels and the combined frame ends up with duplicate index values.
db = pd.concat([spams, data], axis=0, ignore_index=True)

In [None]:
# Load urlCont.csv into `data`.
# NOTE(review): the following cell reassigns `data` from `db`, so when the
# notebook runs top to bottom this load is discarded - confirm whether this
# cell is still needed.
data = pd.read_csv("urlCont.csv")

In [None]:
print("Done applying links")

# Remove rows where content is None and their corresponding spam values
data = db.dropna(subset=['content'])

# Tokenize the content and store it in a new column
data['tokens'] = data['content'].apply(tokenize_text)

print("Done tokenizing")

# Convert tokens to a string.
# tokenize_text returns the tokenizer's encoding (a mapping holding tensors),
# so joining it directly only joins its key names. Instead, map the first
# (only) row of input_ids back to token strings, skipping special tokens such
# as padding.
data['tokens_str'] = data['tokens'].apply(
    lambda tokens: ' '.join(
        tokenizer.convert_ids_to_tokens(
            tokens['input_ids'][0].tolist(), skip_special_tokens=True
        )
    )
)

# Create a TF-IDF vectorizer
# vectorizer = TfidfVectorizer()
# X = vectorizer.fit_transform(data['tokens_str'])
# NOTE(review): a later cell reassigns X to a TF-IDF matrix built from
# data['content'], so this Series is only used if that cell is skipped.
X = data['tokens_str']
labels = data['spam']

# Splitter for 5-fold cross-validation (shuffled, fixed seed)
nSplit = 5
kfold = KFold(n_splits=nSplit, shuffle=True, random_state=42)

In [None]:
# Raw page text and labels for every remaining row
texts = data['content'].values
Y = data['spam'].values

# Split the raw text first so the vectorizer never sees the held-out rows.
# The split depends only on the number of rows, Y and random_state, so the
# same rows land in train/test as when splitting an already-vectorized matrix.
texts_train, texts_test, Y_train, Y_test = train_test_split(
    texts, Y, test_size=0.2, stratify=Y, random_state=7
)

# Fit TF-IDF (vocabulary and IDF weights) on the training text only, then
# apply the fitted transform to both parts.
vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7, max_features=1000)
X_train = vectorizer.fit_transform(texts_train)
X_test = vectorizer.transform(texts_test)

# Full matrix kept under the name X because the cross-validation cell
# indexes into it by fold.
X = vectorizer.transform(texts)

# Labels as plain ints for the classifier and the metric functions
Y_train = [int(label) for label in Y_train]
Y_test = [int(label) for label in Y_test]

In [None]:
# Balanced class weights.
# compute_sample_weight returns one weight per training sample, and every
# sample of a class gets the same weight, so pairing each label with its
# weight gives the {class label: weight} mapping LogisticRegression expects.
# (Enumerating the array instead would key the weights by sample position.)
sample_weights = class_weight.compute_sample_weight("balanced", Y_train)
class_weights = dict(zip(Y_train, sample_weights))
clf = LogisticRegression(max_iter=5000, class_weight=class_weights)
clf.fit(X_train, Y_train)

#testing
prediction2 = clf.predict(X_test)
score = metrics.accuracy_score(Y_test, prediction2)

cm1 = metrics.confusion_matrix(Y_test, prediction2)

# Held-out metrics; precision/recall/F1 treat label 1 as the positive class
print ("accuracy_scores" , score)
print ("precision_scores", metrics.precision_score(Y_test, prediction2, pos_label=1))
print ("recall_scores", metrics.recall_score(Y_test, prediction2, pos_label=1))
print ("f1_scores", metrics.f1_score(Y_test, prediction2, pos_label=1))

In [None]:
# Save the trained classifier; the with-block closes the file handle even if
# pickling fails.
# NOTE(review): the fitted `vectorizer` is not saved here, yet `clf` was
# trained on its output - confirm it is persisted elsewhere for inference.
with open('linkContentClassifier.sav', 'wb') as model_file:
    pickle.dump(clf, model_file)

In [None]:


# Per-fold metric values, one entry per cross-validation fold
accuracyScores = []
f1Scores = []
precisionScores = []
recallScores = []

# NOTE(review): relies on `kfold`, `labels` and `X` from earlier cells, and X
# must be the TF-IDF matrix (positional row indexing), not the Series of
# token strings - run the vectorizing cell first.
for i, (train, test) in enumerate(kfold.split(X)):
    print(f"testing fold {i}")
    trainX, testX = X[train], X[test]
    trainY, testY = labels.iloc[train], labels.iloc[test]

    model = LogisticRegression(max_iter=5000, random_state=42)
    model.fit(trainX, trainY)

    yPredictions = model.predict(testX)
    curAccuracy = metrics.accuracy_score(testY, yPredictions)
    curF1 = metrics.f1_score(testY, yPredictions)
    curPrecision = metrics.precision_score(testY, yPredictions)
    curRecall = metrics.recall_score(testY, yPredictions)

    accuracyScores.append(curAccuracy)
    f1Scores.append(curF1)
    precisionScores.append(curPrecision)
    recallScores.append(curRecall)

# Uses metrics module to print the average accuracy of the model
print('Average accuracy: ', sum(accuracyScores) / len(accuracyScores))
print('Average F1 score: ', sum(f1Scores) / len(f1Scores))
print('Average precision: ', sum(precisionScores) / len(precisionScores))
print('Average recall: ', sum(recallScores) / len(recallScores))

# save the model locally
# NOTE(review): `model` is the classifier from the last fold only, and this
# writes to the same file as the earlier cell that saves `clf`, replacing
# it - confirm which model is meant to be kept.
with open('linkContentClassifier.sav', 'wb') as model_file:
    pickle.dump(model, model_file)