In [4]:
import math
import os
import pickle
import socket
import urllib.parse
import urllib.request
import warnings
from collections import Counter

import mysql.connector
import numpy as np
import pandas as pd
import pycountry
import requests
import tldextract
from bs4 import BeautifulSoup
from flask import Flask, request, jsonify
from flask_cors import CORS

# Suppress every warning of category UserWarning from this point on.
# NOTE(review): presumably aimed at warnings emitted by the ML libraries
# during prediction — confirm which warnings this is meant to hide.
warnings.filterwarnings("ignore", category=UserWarning)

In [5]:
# Shared MySQL connection used by the request handlers below.
# Connection settings can be overridden through environment variables; the
# defaults reproduce the values that used to be hard-coded here.
# NOTE(review): the default password is still committed in source — rotate it
# and drop the fallback once every deployment sets ML_DB_PASSWORD.
connection = mysql.connector.connect(
    host=os.environ.get("ML_DB_HOST", "localhost"),
    user=os.environ.get("ML_DB_USER", "root"),
    password=os.environ.get("ML_DB_PASSWORD", "Nad.Safwat123"),
    database=os.environ.get("ML_DB_NAME", "ml"),
)

app = Flask(__name__)
# Register CORS once, restricted to the front-end origin. The previous extra
# bare `CORS(app)` call registered an allow-all policy alongside this one.
CORS(app, origins=["http://127.0.0.1:5500"])

# One-hot encoder fitted at training time; ohe_data() uses it to encode the
# 'geo_loc' and 'tld' columns.
# NOTE(review): pickle.load executes arbitrary code from the file — only load
# pickles that were produced by a trusted training run.
with open(r"..\PickleFiles\trained_ohe.pkl", "rb") as ohe_file:
    trained_ohe = pickle.load(ohe_file)

def count_digits_in_string(s):
    return sum(c.isdigit() for c in s)

def compute_entropy(url):
    """Return the character-level Shannon entropy of *url*, scaled to [0, 1].

    The entropy (in bits) of the character frequency distribution is divided
    by ``log2(len(url))``, the value it would take if every character were
    distinct, and the result is rounded to 5 decimal places.

    Strings shorter than two characters carry no entropy and would make the
    normalising denominator zero (``log2(1) == 0``; ``len("") == 0``), so
    they return ``0.0`` instead of raising ``ZeroDivisionError``.
    """
    char_count = len(url)
    if char_count < 2:
        return 0.0

    # Counter tallies every character in a single pass over the string.
    entropy = -sum(
        (occurrences / char_count) * math.log2(occurrences / char_count)
        for occurrences in Counter(url).values()
    )

    # Normalize the entropy to [0, 1]
    normalized_entropy = entropy / math.log2(char_count)

    return round(normalized_entropy, 5)

def transformIP(ip_add):
    """Pack a dotted IPv4 string into a single integer.

    The first four dot-separated fields are read as integers and combined as
    ``a << 24 | b << 16 | c << 8 | d`` (by addition). The sentinel values
    ``0`` and ``'0'`` are passed through as ``0``.
    """
    if ip_add in (0, '0'):
        return 0

    fields = ip_add.split('.')
    packed = 0
    # (shift, field index) pairs, most significant octet first.
    for shift, position in ((24, 0), (16, 1), (8, 2), (0, 3)):
        packed += int(fields[position]) << shift
    return packed

def get_ip_address(url, timeout=10):
    """Return the IPv4 address of the host that *url* resolves to.

    The URL is fetched first so that redirects are followed; the hostname of
    the final response URL is then resolved with ``socket.gethostbyname``.

    Args:
        url: URL to fetch.
        timeout: Seconds to wait for the HTTP request before giving up.
            Without a timeout, ``requests.get`` can block indefinitely.

    Returns:
        The dotted-quad address as a string, or ``0`` if the request or the
        DNS lookup fails for any reason (best-effort feature extraction).
    """
    try:
        response = requests.get(url, timeout=timeout)
        hostname = urllib.parse.urlparse(response.url).hostname
        return socket.gethostbyname(hostname)
    except Exception:
        # Deliberately broad: network, DNS and parsing errors all map to the
        # same fallback, but KeyboardInterrupt/SystemExit still propagate.
        return 0

def get_js_len(url, timeout=10):
    """Return the total text length of all ``<script>`` tags on the page.

    Args:
        url: URL of the page to download and parse.
        timeout: Seconds to wait for the HTTP request before giving up.
            Without a timeout, ``requests.get`` can block indefinitely.

    Returns:
        The summed length of the text inside every ``<script>`` element, or
        ``0`` if the page cannot be fetched or parsed.
    """
    try:
        response = requests.get(url, timeout=timeout)
        soup = BeautifulSoup(response.content, "html.parser")
        return sum(len(tag.text) for tag in soup.find_all("script"))
    except Exception:
        # Best-effort feature: any failure yields 0, but KeyboardInterrupt
        # and SystemExit are no longer swallowed.
        return 0

def get_geo_loc(url, timeout=10):
    """Return the country name of the server hosting *url*.

    The URL is fetched (following redirects), the final hostname is resolved
    to an IP address, and that address is looked up through the ipinfo.io
    JSON API. The returned two-letter country code is converted to a country
    name with ``pycountry``.

    Args:
        url: URL whose hosting country is wanted.
        timeout: Seconds to wait for each HTTP request before giving up.
            Without a timeout, ``requests.get`` can block indefinitely.

    Returns:
        The country name, or ``None`` if any step fails (request error, DNS
        failure, missing ``'country'`` field, unknown country code).
    """
    try:
        response = requests.get(url, timeout=timeout)
        hostname = urllib.parse.urlparse(response.url).hostname
        ip_address = socket.gethostbyname(hostname)

        # Get geolocation data using IPinfo API (replace with your API key)
        # NOTE(review): the token is committed in source; move it to
        # configuration and rotate it.
        api_key = '89e9d2d437b634'
        lookup_url = f"https://ipinfo.io/{ip_address}/json?token={api_key}"
        geolocation_data = requests.get(lookup_url, timeout=timeout).json()

        country = pycountry.countries.get(alpha_2=geolocation_data['country'])
        return country.name
    except Exception:
        # Best-effort feature: every failure maps to None, but
        # KeyboardInterrupt and SystemExit are no longer swallowed.
        return None

def get_tld(url):
    """Return the public suffix of *url* as reported by ``tldextract``.

    Returns:
        The ``suffix`` attribute of ``tldextract.extract(url)``, or ``None``
        if extraction raises.
    """
    try:
        return tldextract.extract(url).suffix
    except Exception:
        # Best-effort feature; KeyboardInterrupt/SystemExit still propagate.
        return None

def process_data(url):
    """Build the raw feature dictionary for *url*.

    Keys are inserted in the order url_numOf_digits, url_entropy, url_len,
    ip_add, who_is, https, js_len, geo_loc, tld. 'who_is' is always 0.
    """
    features = {}
    features['url_numOf_digits'] = count_digits_in_string(url)
    features['url_entropy'] = compute_entropy(url)
    features['url_len'] = len(url)
    # The resolved address is stored in its packed integer form.
    features['ip_add'] = transformIP(get_ip_address(url))
    features['who_is'] = 0
    features['https'] = int(url.startswith('https'))
    features['js_len'] = get_js_len(url)
    features['geo_loc'] = get_geo_loc(url)
    features['tld'] = get_tld(url)
    return features

def ohe_data(UserInput):
    """Turn a feature dict into a one-row DataFrame with encoded categoricals.

    The 'geo_loc' and 'tld' columns are passed through the module-level
    ``trained_ohe`` transformer, the result is cast to int8 and appended
    column-wise, and the two original columns are dropped.
    """
    categorical_cols = ['geo_loc', 'tld']
    frame = pd.DataFrame([UserInput])

    encoded = trained_ohe.transform(frame[categorical_cols]).astype('int8')
    combined = pd.concat([frame, encoded], axis=1)
    return combined.drop(columns=categorical_cols)

@app.route('/predict', methods=['POST'])
def predict():
    """Return the label for the URL supplied in the JSON request body.

    Expects a JSON object with a ``'url'`` string. If the URL already has a
    row in ``data_to_view``, its stored label is returned as a one-element
    JSON list. Otherwise the features are extracted, one-hot encoded, and fed
    to the pickled model, whose prediction is returned as a JSON list.

    Responds with HTTP 400 when the body has no usable ``'url'``.
    """
    payload = request.get_json()
    url = payload.get('url') if isinstance(payload, dict) else None
    if not isinstance(url, str) or not url:
        return jsonify({'error': "JSON body must contain a non-empty 'url' string"}), 400

    cursor = connection.cursor(buffered=True)
    try:
        # The URL comes from the client: bind it as a parameter instead of
        # interpolating it into the SQL text, which allowed SQL injection.
        cursor.execute("SELECT label FROM data_to_view WHERE url = %s", (url,))
        stored = cursor.fetchone()
    finally:
        # Release the cursor on every path, including errors.
        cursor.close()

    if stored is not None:
        prediction = stored[0]
        print(f'{url} is predicted {prediction}')
        return jsonify([prediction])

    # Only load the model when a fresh prediction is actually required.
    with open(r'..\PickleFiles\Model.pkl', 'rb') as m:
        model = pickle.load(m)

    data = ohe_data(process_data(url))
    prediction = model.predict(data)
    print(f'{url} is predicted {prediction}')
    return jsonify(prediction.tolist())

@app.route('/save_url', methods=['POST'])
def save_url():
    """Store a URL, its extracted features and its (corrected) label.

    Expects a JSON object with ``'url'``, ``'prediction'`` and
    ``'isCorrect'``. When ``isCorrect`` is 0 the prediction is flipped
    (1 becomes 0, anything else becomes 1) before being saved. A row is then
    inserted into ``data_to_view`` and committed.

    Responds with HTTP 400 when a required field is missing.
    """
    data = request.get_json()
    required_fields = ('url', 'prediction', 'isCorrect')
    if not isinstance(data, dict) or any(field not in data for field in required_fields):
        return jsonify({'error': "JSON body must contain 'url', 'prediction' and 'isCorrect'"}), 400

    url = data['url']
    prediction = data['prediction']
    feedback = data['isCorrect']

    # The user marked the prediction as wrong: store the opposite label.
    if feedback == 0:
        prediction = 0 if prediction == 1 else 1

    features = process_data(url)
    row = (
        url,
        features['url_numOf_digits'],
        features['url_entropy'],
        features['url_len'],
        # The table receives the dotted address, not the packed integer
        # held in features['ip_add'].
        get_ip_address(url),
        features['geo_loc'],
        features['tld'],
        features['who_is'],
        features['https'],
        features['js_len'],
        prediction,
    )

    cursor = connection.cursor(buffered=True)
    try:
        # Bind values as parameters; building the statement with string
        # formatting allowed SQL injection through the client-supplied URL.
        # Values are stringified so the stored representation matches what
        # the previous quoted '%s' formatting produced.
        cursor.execute(
            "INSERT INTO data_to_view VALUES "
            "(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)",
            tuple(str(value) for value in row),
        )
        connection.commit()
    finally:
        # Release the cursor on every path, including errors.
        cursor.close()
    return {'message': 'URL saved'}

@app.after_request
def add_cors_headers(response):
    """Set the three CORS response headers on every outgoing response."""
    cors_headers = (
        ("Access-Control-Allow-Origin", "http://127.0.0.1:5500"),
        ("Access-Control-Allow-Methods", "POST"),
        ("Access-Control-Allow-Headers", "Content-Type"),
    )
    for header_name, header_value in cors_headers:
        response.headers[header_name] = header_value
    return response

# Start the development server on port 5000 when executed directly; any
# exception raised by the server is reported on stdout instead of propagating.
if __name__ == '__main__':
    try:
        app.run(port=5000, debug=False)
    except Exception as err:
        print('Error:', err)


 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
Press CTRL+C to quit
127.0.0.1 - - [12/May/2024 15:37:18] "OPTIONS /predict HTTP/1.1" 200 -
127.0.0.1 - - [12/May/2024 15:37:35] "POST /predict HTTP/1.1" 200 -


http://short.io is predicted [0]


127.0.0.1 - - [12/May/2024 15:37:43] "OPTIONS /save_url HTTP/1.1" 200 -
127.0.0.1 - - [12/May/2024 15:37:57] "POST /save_url HTTP/1.1" 200 -
127.0.0.1 - - [12/May/2024 15:38:00] "OPTIONS /predict HTTP/1.1" 200 -
127.0.0.1 - - [12/May/2024 15:38:16] "POST /predict HTTP/1.1" 200 -


http://github.com is predicted [0]


127.0.0.1 - - [12/May/2024 15:38:36] "OPTIONS /predict HTTP/1.1" 200 -


http://www.google.com is predicted [0]


127.0.0.1 - - [12/May/2024 15:38:50] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [12/May/2024 15:39:23] "OPTIONS /predict HTTP/1.1" 200 -
127.0.0.1 - - [12/May/2024 15:40:03] "POST /predict HTTP/1.1" 200 -


http://www.haverford.edu/athletics/fieldhockey/ is predicted [0]


127.0.0.1 - - [12/May/2024 15:40:12] "OPTIONS /predict HTTP/1.1" 200 -


http://www.cnn.com is predicted [0]


127.0.0.1 - - [12/May/2024 15:41:19] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [12/May/2024 15:41:21] "OPTIONS /predict HTTP/1.1" 200 -


http://www.aucegypt.edu is predicted [0]


127.0.0.1 - - [12/May/2024 15:41:41] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [12/May/2024 15:42:05] "OPTIONS /predict HTTP/1.1" 200 -
127.0.0.1 - - [12/May/2024 15:42:22] "POST /predict HTTP/1.1" 200 -


http://viatalia.com is predicted [0]


127.0.0.1 - - [12/May/2024 15:42:34] "OPTIONS /save_url HTTP/1.1" 200 -
127.0.0.1 - - [12/May/2024 15:42:49] "POST /save_url HTTP/1.1" 200 -
127.0.0.1 - - [12/May/2024 15:42:54] "OPTIONS /predict HTTP/1.1" 200 -


http://www.bucklin.org/ is predicted [0]


127.0.0.1 - - [12/May/2024 15:43:31] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [12/May/2024 15:43:55] "OPTIONS /predict HTTP/1.1" 200 -


http://viatalia.com is predicted 1


127.0.0.1 - - [12/May/2024 15:44:14] "POST /predict HTTP/1.1" 200 -
