In [2]:
import math
import os
import pickle
import socket
import urllib.parse
import urllib.request
import warnings

import numpy as np
import pandas as pd
import pycountry
import requests
import tldextract
import whois
from bs4 import BeautifulSoup

warnings.filterwarnings("ignore", category=UserWarning)

In [12]:
# Template record: every feature the notebook fills in, initialised to None.
# The key order here determines the column order of DataFrames built from it.
UserInput = dict.fromkeys([
    'url',
    'url_numOf_digits',
    'url_entropy',
    'url_len',
    'ip_add',
    'geo_loc',
    'tld',
    'who_is',
    'https',
    'js_len',
    'label',
])

# Functions

In [13]:
def count_digits_in_string(s):
    """Return how many characters of ``s`` satisfy ``str.isdigit``."""
    digit_total = 0
    for ch in s:
        if ch.isdigit():
            digit_total += 1
    return digit_total

def compute_entropy(url):
    """Return the normalised Shannon entropy of the characters in ``url``.

    The entropy (in bits) of the character distribution is divided by
    ``log2(len(url))`` so the result lies in [0, 1], then rounded to 5 decimals.

    Args:
        url: The string to measure.

    Returns:
        float: Normalised entropy; ``0.0`` for strings shorter than two
        characters, where the normalisation factor is undefined or zero.
    """
    char_count = len(url)

    # log2(0) raises ValueError and log2(1) == 0 would divide by zero, so a
    # string with fewer than two characters is defined to have zero entropy.
    if char_count < 2:
        return 0.0

    # Single pass over the string instead of one str.count() per distinct char.
    occurrences = {}
    for char in url:
        occurrences[char] = occurrences.get(char, 0) + 1

    entropy = -sum(
        (n / char_count) * math.log2(n / char_count) for n in occurrences.values()
    )

    # Normalize the entropy to [0, 1]
    normalized_entropy = entropy / math.log2(char_count)

    return round(normalized_entropy, 5)

def transformIP(ip_add):
    """Pack the first four dot-separated integer fields of ``ip_add`` into one integer.

    The first field becomes the most significant byte, the fourth the least.
    """
    octets = ip_add.split('.')
    first, second, third, fourth = (int(octets[i]) for i in range(4))
    return ((first * 256 + second) * 256 + third) * 256 + fourth

def get_ip_address(url, timeout=10):
    """Resolve the IP address of the host that ``url`` finally lands on.

    The URL is fetched first so that redirects are followed; the hostname of
    the final response URL is then resolved with ``socket.gethostbyname``.

    Args:
        url: URL to fetch.
        timeout: Seconds to wait for the HTTP request before giving up, so an
            unresponsive host cannot stall the notebook indefinitely.

    Returns:
        str | None: The resolved IP address, or ``None`` if the request or the
        DNS lookup fails.
    """
    try:
        response = requests.get(url, timeout=timeout)
        hostname = urllib.parse.urlparse(response.url).hostname
        return socket.gethostbyname(hostname)
    except Exception:
        # Best-effort lookup: any failure means "unknown". Catching Exception
        # rather than using a bare except lets KeyboardInterrupt stop the cell.
        return None

def get_js_len(url, timeout=10):
    """Return the total text length of all ``<script>`` tags on the page at ``url``.

    Args:
        url: URL of the page to download and parse.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        int: Sum of ``len(tag.text)`` over every ``<script>`` tag, or ``0`` if
        the page cannot be fetched or parsed.
    """
    try:
        response = requests.get(url, timeout=timeout)
        soup = BeautifulSoup(response.content, "html.parser")
        return sum(len(tag.text) for tag in soup.find_all("script"))
    except Exception:
        # Best-effort: an unreachable page is reported as having no script text.
        # Catching Exception (not a bare except) keeps KeyboardInterrupt working.
        return 0

def get_geo_loc(url, timeout=10):
    """Return the country name of the server that ``url`` finally lands on.

    The URL is fetched (following redirects), the final hostname is resolved to
    an IP address, and that address is looked up through the ipinfo.io JSON
    endpoint. The two-letter country code in the reply is mapped to a country
    name with ``pycountry``.

    The ipinfo.io token is read from the ``IPINFO_TOKEN`` environment variable
    instead of being hard-coded in the notebook. When the variable is unset the
    request is sent without a token.
    NOTE(review): untokenised requests may be rate-limited more strictly by
    ipinfo.io -- confirm the limits and set ``IPINFO_TOKEN`` for bulk runs.

    Args:
        url: URL to geolocate.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        str | None: Country name, or ``None`` if any step fails or the country
        code is missing or unknown to ``pycountry``.
    """
    try:
        response = requests.get(url, timeout=timeout)
        hostname = urllib.parse.urlparse(response.url).hostname
        ip_address = socket.gethostbyname(hostname)

        token = os.environ.get("IPINFO_TOKEN")
        geo_response = requests.get(
            f"https://ipinfo.io/{ip_address}/json",
            params={"token": token} if token else None,
            timeout=timeout,
        )
        geolocation_data = geo_response.json()

        country = pycountry.countries.get(alpha_2=geolocation_data['country'])
        return country.name if country is not None else None
    except Exception:
        # Best-effort lookup: network errors, a missing 'country' field, etc.
        # all mean "unknown". Exception (not bare except) lets Ctrl-C through.
        return None

def get_tld(url):
    """Return the public suffix of ``url`` as reported by ``tldextract``.

    Args:
        url: URL to analyse.

    Returns:
        str | None: The ``suffix`` field of ``tldextract.extract(url)``, or
        ``None`` if extraction raises.
    """
    try:
        return tldextract.extract(url).suffix
    except Exception:
        # Exception rather than a bare except, so KeyboardInterrupt still
        # interrupts a long-running loop that calls this helper.
        return None

# Compute the feature values for a single URL

In [6]:
# URL whose features are computed in the cells that follow.
url = "https://docs.google.com/"

In [9]:
# Fill the UserInput template with the features of the single `url` above.

# The recorded output of this cell shows `ip_add` as an integer, so the dotted
# address is packed with transformIP; an unresolved host stays None.
resolved_ip = get_ip_address(url)

UserInput.update({
    'url_numOf_digits': count_digits_in_string(url),
    'url_entropy': compute_entropy(url),
    'url_len': len(url),
    'ip_add': transformIP(resolved_ip) if resolved_ip is not None else None,
    # NOTE(review): who_is is hard-coded here; no WHOIS lookup is performed in
    # the visible code even though the `whois` package is imported.
    'who_is': 1,
    'https': 1 if url.startswith('https') else 0,
    'js_len': get_js_len(url),
    'geo_loc': get_geo_loc(url),
    'tld': get_tld(url),
})

In [10]:
UserInput

{'url_numOf_digits': 0,
 'url_entropy': 0.79321,
 'url_len': 24,
 'ip_add': 2398776916,
 'who_is': 1,
 'https': 1,
 'js_len': 110986,
 'geo_loc': 'Belgium',
 'tld': 'com'}

In [11]:
# One-row frame of model features. The UserInput template also carries 'url'
# and 'label' (used when building the test set); they are dropped here so they
# do not end up in the model input. errors='ignore' keeps this working if the
# template does not contain them.
InptData = pd.DataFrame([UserInput]).drop(columns=['url', 'label'], errors='ignore')

In [12]:
InptData

Unnamed: 0,url_numOf_digits,url_entropy,url_len,ip_add,who_is,https,js_len,geo_loc,tld
0,0,0.79321,24,2398776916,1,1,110986,Belgium,com


# One-hot encode geo_loc and tld

In [193]:
# Load the fitted one-hot encoder.
# SECURITY: pickle.load can execute arbitrary code -- only load artefacts that
# were produced by this project.
# Forward slashes keep the relative path valid on both Windows and POSIX.
with open("../PickleFiles/trained_ohe.pkl", "rb") as ohe_file:
    trained_ohe = pickle.load(ohe_file)

# Encode the two categorical columns and swap them for their indicator columns.
# NOTE(review): pd.concat needs `ohetransform` to be a DataFrame, so this
# presumably relies on the encoder being configured for pandas output -- confirm.
ohetransform = trained_ohe.transform(InptData[['geo_loc', 'tld']]).astype('int32')
InptData = pd.concat([InptData, ohetransform], axis=1).drop(columns=['geo_loc', 'tld'])

# Final input to be sent to model

In [194]:
InptData

Unnamed: 0,url_numOf_digits,url_entropy,url_len,ip_add,who_is,https,js_len,geo_loc_Afghanistan,geo_loc_Albania,geo_loc_Algeria,...,tld_yokohama.jp,tld_yokosuka.kanagawa.jp,tld_z.se,tld_za.com,tld_za.net,tld_za.org,tld_zagan.pl,tld_zgora.pl,tld_zm,tld_zp.ua
0,-0.33909,0.74338,1.60206,1.51787,1,0,4.19883,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
def get_pred(user_input, endpoint="http://localhost:5000/predict", timeout=10):
    """POST ``user_input`` as JSON to the prediction service and print the reply.

    Args:
        user_input: JSON-serialisable payload to send.
        endpoint: URL of the prediction endpoint; defaults to the local service.
        timeout: Seconds to wait for the service before ``requests`` raises a
            timeout error, so a dead service cannot hang the notebook.

    Prints the decoded JSON body on HTTP 200, otherwise the raw response text.
    """
    response = requests.post(endpoint, json=user_input, timeout=timeout)
    if response.status_code == 200:
        data = response.json()
        print("Prediction:", data)
    else:
        print("Error:", response.text)

# NOTE(review): this sends the raw URL string, not the engineered features in
# InptData -- confirm that this is what the /predict endpoint expects.
get_pred(url)

Predicted house price: [1]


# Test

In [3]:
# Labelled URL dump used to draw test samples. Forward slashes keep the
# relative path valid on both Windows and POSIX.
df = pd.read_csv("../test_dump_v2.csv")

In [9]:
# Draw 20 URLs per label. A fixed random_state makes the draw reproducible, so
# Restart & Run All selects the same URLs every time.
URLS_1 = df[df['label'] == 1].sample(20, random_state=42)['url'].to_list()
URLS_0 = df[df['label'] == 0].sample(20, random_state=42)['url'].to_list()

In [14]:
# Build one feature record per label-0 URL.
# Each record is a fresh dict in the same key order as the UserInput template,
# so the shared template is not mutated and the notebook-level `url` variable
# is not overwritten by the loop variable.
testData = []

for test_url in URLS_0:
    testData.append({
        'url': test_url,
        'url_numOf_digits': count_digits_in_string(test_url),
        'url_entropy': compute_entropy(test_url),
        'url_len': len(test_url),
        'ip_add': get_ip_address(test_url),
        'geo_loc': get_geo_loc(test_url),
        'tld': get_tld(test_url),
        'who_is': 0,
        'https': 1 if test_url.startswith('https') else 0,
        'js_len': get_js_len(test_url),
        'label': 0,
    })
    

In [16]:
# Add one feature record per label-1 URL to testData.
# Records with label 1 left over from an earlier run of this cell are removed
# first, so re-running the cell does not append duplicates.
testData = [record for record in testData if record['label'] != 1]

# Fresh dict per URL (same key order as the UserInput template): the shared
# template is not mutated and the notebook-level `url` is not overwritten.
for test_url in URLS_1:
    testData.append({
        'url': test_url,
        'url_numOf_digits': count_digits_in_string(test_url),
        'url_entropy': compute_entropy(test_url),
        'url_len': len(test_url),
        'ip_add': get_ip_address(test_url),
        'geo_loc': get_geo_loc(test_url),
        'tld': get_tld(test_url),
        'who_is': 0,
        'https': 1 if test_url.startswith('https') else 0,
        'js_len': get_js_len(test_url),
        'label': 1,
    })

In [18]:
# Turn the collected records into a frame (one row per URL) and display it.
# Note that this rebinds `df`, which previously held the loaded CSV dump.
df = pd.DataFrame(testData)
df

Unnamed: 0,url,url_numOf_digits,url_entropy,url_len,ip_add,geo_loc,tld,who_is,https,js_len,label
0,http://www.edgewaterwest.com/,0,0.75607,29,,,com,0,0,0,0
1,http://peca-original.blogspot.com/,0,0.80191,34,142.251.37.201,France,com,0,0,95919,0
2,http://www.voksenfilmer.net,0,0.84215,27,183.181.82.150,Japan,net,0,0,2674,0
3,http://www.yngmen.com/,0,0.82152,22,,,com,0,0,0,0
4,http://groups.yahoo.com/group/ratboat,0,0.72101,37,87.248.119.252,Germany,com,0,0,262350,0
5,http://www.okinawa315.com/,3,0.83017,26,183.181.96.15,Japan,com,0,0,86,0
6,http://www.xxx-free-blowjob-porn-videos.com/,0,0.78405,44,,,com,0,0,0,0
7,http://www.ewa-tantra.ch/,0,0.74582,25,116.202.48.150,Germany,ch,0,0,0,0
8,http://www.brasilporno.tv,0,0.83195,25,104.21.18.81,United States,tv,0,0,7895,0
9,http://www.sweethour.com/shemale-hardcore/,0,0.73563,42,,,com,0,0,0,0


In [19]:
# Persist the engineered test set. With a backslash the file would be created
# as a literal "..\newTest.csv" in the working directory on POSIX systems;
# forward slashes resolve to the parent directory on every platform.
df.to_csv("../newTest.csv", index=False)