In [1]:
import pandas as pd
import numpy as np

In [2]:
websites=["akakce","amazon","arabam","donanimhaber","haberturk","mgm","nefisyemektarifleri","pazarama","trendyol"]

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Function to read text from a file
def read_text(file_path,text):
    try:
        with open("data/"+file_path+"/"+text+".txt", 'r',encoding='latin1') as file:
            return file.read()
    except Exception as e:
        print(f"Error reading {file_path}: {e}")
        return ""




datas=[]
for website in websites:
    data=pd.read_csv(website+"_paired.csv",encoding='latin1')
    data = data.groupby("pair").apply(lambda x: x.sample(min(184,len(x)))).reset_index(drop=True)
    print(website,data.shape)
    for text in ["htmltags","javascript","meta","performance","response_headers","url"]:
        print(text ,"similarity is computing")
        # Read texts from url1_dir and url2_dir
        texts1 = data['url1_dir'].apply(lambda x:read_text(x,text))
        texts2 = data['url2_dir'].apply(lambda x:read_text(x,text))

        # Compute similarity scores
        vectorizer = TfidfVectorizer()
        try:
            tfidf_matrix = vectorizer.fit_transform(pd.concat([texts1, texts2]))
            similarity_scores = cosine_similarity(tfidf_matrix[:len(data)], tfidf_matrix[len(data):])
        except Exception as e:
            print(f"Error computing similarity for {text}: {e}")
            similarity_scores = np.zeros((len(data), len(data)))

        # Add similarity scores to the dataframe
        data[text+'_cs'] = [similarity_scores[i, i] for i in range(len(data))]

    datas.append(data)
data=pd.concat(datas,ignore_index=True)
#drop the rows with pair value count is 1
valuecounts=data["pair"].value_counts()
data["pairC"] = data["pair"].map(lambda x: valuecounts[x])
data = data[data["pairC"] > 1].reset_index(drop=True)
print(data.shape)
data.head()

akakce (2217, 6)
htmltags similarity is computing
javascript similarity is computing
meta similarity is computing
performance similarity is computing
response_headers similarity is computing
Error reading www.akakce.com/42: [Errno 2] No such file or directory: 'data/www.akakce.com/42/response_headers.txt'
Error reading www.akakce.com/7: [Errno 2] No such file or directory: 'data/www.akakce.com/7/response_headers.txt'
Error reading www.akakce.com/43: [Errno 2] No such file or directory: 'data/www.akakce.com/43/response_headers.txt'
Error reading www.akakce.com/43: [Errno 2] No such file or directory: 'data/www.akakce.com/43/response_headers.txt'
Error reading www.akakce.com/7: [Errno 2] No such file or directory: 'data/www.akakce.com/7/response_headers.txt'
Error reading www.akakce.com/42: [Errno 2] No such file or directory: 'data/www.akakce.com/42/response_headers.txt'
Error reading www.akakce.com/43: [Errno 2] No such file or directory: 'data/www.akakce.com/43/response_headers.txt'
E

Unnamed: 0,url1,url2,url1_dir,url2_dir,pair,identical,htmltags_cs,javascript_cs,meta_cs,performance_cs,response_headers_cs,url_cs,pairC
0,https://www.akakce.com/brosurler/a101,https://www.akakce.com/yorum/?p=436908675#Yoru...,www.akakce.com/78,www.akakce.com/81,aktcom,False,0.739922,0.999582,0.0,0.063893,0.901833,0.075132,3
1,https://www.akakce.com/brosurler/a101,https://www.akakce.com/yorum/?p=1958594929,www.akakce.com/78,www.akakce.com/80,aktcom,False,0.780092,0.999647,0.0,0.059108,0.902206,0.090899,3
2,https://www.akakce.com/brosurler/a101,https://www.akakce.com/yorum/?p=131983292#Yoru...,www.akakce.com/78,www.akakce.com/79,aktcom,False,0.739922,0.999591,0.0,0.0607,0.891392,0.073561,3
3,https://www.akakce.com/brosurler/a101,https://www.akakce.com/#/9876/?z=144&v=2108&s=...,www.akakce.com/78,www.akakce.com/89,akthome,False,0.578461,0.999037,0.0,0.059108,0.901465,0.034744,2
4,https://www.akakce.com/brosurler/a101,https://www.akakce.com/,www.akakce.com/78,www.akakce.com/90,akthome,False,0.578461,0.999045,0.0,0.061807,0.90518,0.278008,2


In [3]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(
    data,stratify=data["pair"],
    test_size=0.2,
    random_state=42,
    shuffle=True,
)
train, val = train_test_split(
    train, stratify=train["pair"],
    test_size=0.25,
    random_state=42,
    shuffle=True,
)

In [18]:
from pycaret.classification import *

columns=["identical","htmltags_cs","javascript_cs","meta_cs","performance_cs","response_headers_cs","url_cs"]
# Initialize the setup
clf = setup(data=train[columns], target='identical', session_id=42,test_data=val[columns])

bests = compare_models(n_select=3)

Unnamed: 0,Description,Value
0,Session id,42
1,Target,identical
2,Target type,Binary
3,Original data shape,"(36754, 7)"
4,Transformed data shape,"(36754, 7)"
5,Transformed train set shape,"(27565, 7)"
6,Transformed test set shape,"(9189, 7)"
7,Numeric features,6
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
et,Extra Trees Classifier,0.9843,0.9962,0.867,0.9546,0.9084,0.8998,0.9012,0.121
xgboost,Extreme Gradient Boosting,0.9842,0.995,0.8834,0.9379,0.9097,0.9011,0.9016,0.099
rf,Random Forest Classifier,0.9821,0.9944,0.8565,0.9402,0.8962,0.8865,0.8877,0.516
lightgbm,Light Gradient Boosting Machine,0.982,0.9942,0.8678,0.9285,0.8969,0.887,0.8878,0.076
dt,Decision Tree Classifier,0.9704,0.9059,0.8272,0.8429,0.8347,0.8185,0.8187,0.032
knn,K Neighbors Classifier,0.9683,0.9636,0.7621,0.8705,0.8124,0.7952,0.7974,0.223
gbc,Gradient Boosting Classifier,0.9672,0.9825,0.7347,0.8819,0.8014,0.7837,0.7877,0.547
ada,Ada Boost Classifier,0.9535,0.9714,0.6463,0.801,0.7151,0.6901,0.695,0.144
lr,Logistic Regression,0.9299,0.9379,0.3412,0.7444,0.4673,0.4353,0.4739,0.503
svm,SVM - Linear Kernel,0.9286,0.9373,0.285,0.7947,0.4167,0.3879,0.4476,0.009


In [None]:
import xgboost as xgb
from hyperopt import fmin, tpe, hp
from hyperopt import STATUS_OK, Trials
from hyperopt.pyll import scope



from sklearn.metrics import accuracy_score,f1_score,precision_score,recall_score



columns=["identical","htmltags_cs","javascript_cs","meta_cs","performance_cs","response_headers_cs","url_cs"]
x_train = train[columns].drop(columns=["identical"])
y_train = train["identical"]
x_val= val[columns].drop(columns=["identical"])
y_val = val["identical"]
x_test = test[columns].drop(columns=["identical"])
y_test = test["identical"]


# Define the hyperparameter space
space = {
    'max_depth': scope.int(hp.quniform('max_depth', 2, 15, 1)),
    'learning_rate': hp.loguniform('learning_rate', -5, -1),
    'subsample': hp.uniform('subsample', 0.5, 1),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1),
    'n_estimators': scope.int(hp.quniform('n_estimators', 100, 1500, 50)),
    'min_child_weight': scope.int(hp.quniform('min_child_weight', 1, 10, 1)),
    'gamma': hp.uniform('gamma', 0, 5),
}

# Define the objective function to minimize
def objective(params):
    xgb_model = xgb.XGBClassifier(**params)
    xgb_model.fit(x_train, y_train)
    y_pred = xgb_model.predict(x_val)
    score = accuracy_score(y_val, y_pred)
    return {'loss': -score, 'status': STATUS_OK}

# Perform the optimization
best_params = fmin(objective, space, algo=tpe.suggest, max_evals=100)
print("Best set of hyperparameters: ", best_params)

xgb_model = xgb.XGBClassifier(max_depth=int(best_params['max_depth']),
                            subsample=best_params['subsample'],
                            colsample_bytree=best_params['colsample_bytree'],
                            n_estimators=int(best_params['n_estimators']),
                            min_child_weight=int(best_params['min_child_weight']),
                            gamma=best_params['gamma'],
                            learning_rate=best_params['learning_rate'])
xgb_model.fit(x_train, y_train)
print("Train accuracy: ", accuracy_score(y_train, xgb_model.predict(x_train)))
print("Validation accuracy: ", accuracy_score(y_val, xgb_model.predict(x_val)))
print("Test accuracy: ", accuracy_score(y_test, xgb_model.predict(x_test)))
print("Test f1: ", f1_score(y_test, xgb_model.predict(x_test)))
print("Test precision: ", precision_score(y_test, xgb_model.predict(x_test)))
print("Test recall: ", recall_score(y_test, xgb_model.predict(x_test)))

Train accuracy:  0.9997823326682387
Validation accuracy:  0.9862879529872673
Test accuracy:  0.9846555664381326
Test f1:  0.9120399251403618
Test precision:  0.9530638852672751
Test recall:  0.8744019138755981


In [15]:
import tensorflow as tf 
metric=tf.metrics.SensitivityAtSpecificity(0.99)
metric.update_state(y_test,xgb_model.predict(x_test))
print("Test sensitivity: ", metric.result().numpy())

Test sensitivity:  0.8744019
