In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:
# Load the full feature dataset; the path is relative to the working directory.
df = pd.read_csv('All.csv')

In [None]:
# Preview the first rows to sanity-check the load.
print(df.head())

In [None]:
# (rows, columns) of the raw dataset.
print(df.shape)

In [None]:
# Print how many raw rows carry each of the two labels kept for modelling.
for _class_name in ('benign', 'phishing'):
  count = (df['URL_Type_obf_Type'] == _class_name).sum()
  print(count)

In [None]:
# Keep only the benign / phishing rows; work on a copy so `df` stays untouched.
desired_classes = ['benign', 'phishing']
_keep_mask = df['URL_Type_obf_Type'].isin(desired_classes)
df_flitered = df.loc[_keep_mask].copy()
# Shapes before and after filtering.
for _frame in (df, df_flitered):
  print(_frame.shape)

In [None]:
# Per-class row counts after filtering.
for _class_name in ('benign', 'phishing'):
  count = (df_flitered['URL_Type_obf_Type'] == _class_name).sum()
  print(count)

In [None]:
# Encode the string label as a binary target: benign -> 0, phishing -> 1.
label_mapping = {'benign':0 , 'phishing':1}
df_flitered['target']= df_flitered['URL_Type_obf_Type'].map(label_mapping)
# Drop the original label column so it cannot end up among the features.
# NOTE(review): this rebinds df_flitered without the label column, so running
# this cell a second time raises KeyError on 'URL_Type_obf_Type'.
df_flitered = df_flitered.drop(columns=['URL_Type_obf_Type'])
print(df_flitered['target'].value_counts())

In [None]:
# Preview the filtered frame, now carrying the numeric 'target' column.
df_flitered.head()

In [None]:
# Count missing cells, zero-fill them, then count again to confirm the fill.
_missing_before = df_flitered.isnull().sum().sum()
print(_missing_before)
df_flitered = df_flitered.fillna(0)
_missing_after = df_flitered.isnull().sum().sum()
print(_missing_after)

In [None]:
# Remove duplicate rows, then replace +/- infinity with 0.
_INFINITE_VALUES = [np.inf, -np.inf]
df_final = df_flitered.drop_duplicates().replace(_INFINITE_VALUES, 0)

In [None]:
# Feature matrix: every column except the target. Label vector: the target column.
X = df_final.drop(columns=['target'])
y = df_final['target']

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
# Sizes of the training and test partitions.
for _partition in (X_train, X_test):
  print(len(_partition))

In [None]:
# Standardise the features. The scaler is fitted on the training split only and
# then applied unchanged to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Baseline classifier on the scaled features; max_iter is raised to 1000.
model = LogisticRegression(max_iter=1000)
model.fit(X_train_scaled,y_train)

In [None]:
# Score the baseline model on the held-out (scaled) test split.
y_pred = model.predict(X_test_scaled)
_accuracy_pct = accuracy_score(y_test,y_pred)*100
print(f"Accuracy: {_accuracy_pct: .2f}%")
for _heading, _metric in (
    ("\nConfusion Matrix:", confusion_matrix),
    ("\nDetailed Report:", classification_report),
):
  print(_heading)
  print(_metric(y_test, y_pred))

In [None]:
# Pair every feature name with the absolute value of its fitted coefficient.
feature_names = X.columns
coefficients = model.coef_[0]
feature_importance = pd.DataFrame(
    {
        'Feature': feature_names,
        'Coefficient': np.abs(coefficients),
    }
)
print(feature_importance)

In [None]:
# The 20 features with the largest absolute coefficients, largest first.
top_features = feature_importance.sort_values(by='Coefficient', ascending=False).head(20)
print(top_features)

In [None]:
import urllib.parse
import re
import math
from collections import Counter

In [None]:
def get_entropy(s):
  """Return the Shannon entropy (bits per character) of the string `s`.

  An empty (falsy) input yields 0.
  """
  if not s:
    return 0
  counts = Counter(s)
  length = len(s)

  entropy = -sum((count/length) * math.log2(count/length) for count in counts.values())

  return entropy


def extractor(url):
  """Compute lexical features of `url` and return them as a dict.

  The keys are inserted in this order: urlLen, domainlength, pathLength,
  pathurlRatio, pathDomainRatio, host_letter_count, URL_DigitCount,
  NumberofDotsinURL, delimeter_path, URLQueries_variable, avgdomaintokenlen,
  NumberRate_Extension, subDirLen, Directory_DigitCount,
  Directory_LetterCount, Entropy_Filename, Query_LetterCount,
  NumberRate_AfterPath, charcompvowels, SymbolCount_URL.

  Parameters:
    url: URL string. If it has no ``scheme://`` prefix, ``http://`` is
      prepended before parsing, and all whole-URL features are computed on
      the prefixed string.

  Returns:
    dict mapping feature name to a number.
  """
  features={}

  # Test for a real "scheme://" prefix. A plain startswith('http') check wrongly
  # treats hosts such as "httpbin.org" as already having a scheme, which leaves
  # the parsed domain empty.
  if not re.match(r'^[a-zA-Z][a-zA-Z0-9+.-]*://', url):
    url = 'http://' + url

  parsed = urllib.parse.urlparse(url)

  domain = parsed.netloc
  path = parsed.path
  query = parsed.query

  # Split the path once into its directory part (with trailing '/') and file name.
  if '/' in path:
    directory_part, filename_part = path.rsplit('/', 1)
    directory_part += '/'
  else:
    directory_part = ""
    filename_part = path

  # Take the extension from the file name only, so a dot inside a directory
  # name (e.g. "/v1.2/download") is not mistaken for an extension separator.
  extension = filename_part.rsplit('.', 1)[1] if '.' in filename_part else ""

  features['urlLen']=len(url)

  features['domainlength']=len(domain)

  features['pathLength']=len(path)

  features['pathurlRatio']=len(path)/len(url) if len(url)>0 else 0

  features['pathDomainRatio']=len(path)/len(domain) if len(domain)>0 else 0

  features['host_letter_count']=len(re.findall(r'[a-zA-Z]', domain))

  features['URL_DigitCount']=len(re.findall(r'[0-9]', url))

  features['NumberofDotsinURL']= url.count('.')

  # Every non-alphanumeric character in the path counts as a delimiter.
  features['delimeter_path']=len(re.findall(r'[^a-zA-Z0-9]', path))

  # Number of '&'-separated query parameters (0 when there is no query).
  if len(query)>0:
    features['URLQueries_variable']=query.count('&')+1
  else:
    features['URLQueries_variable']=0

  # Mean length of the non-empty dot-separated labels of the host.
  token_lengths= [len(t) for t in domain.split('.') if len(t)>0]

  features['avgdomaintokenlen']=np.mean(token_lengths) if len(token_lengths)>0 else 0

  # Share of digits in the file extension.
  if len(extension)>0:
    digits_in_ext = len(re.findall(r'[0-9]',extension))
    features['NumberRate_Extension'] = digits_in_ext / len(extension)
  else:
    features['NumberRate_Extension'] = 0

  features['subDirLen']= len(directory_part)

  features['Directory_DigitCount'] = len(re.findall(r'[0-9]', directory_part))

  features['Directory_LetterCount'] = len(re.findall(r'[a-zA-Z]', directory_part))

  features['Entropy_Filename'] = get_entropy(filename_part)

  features['Query_LetterCount'] = len(re.findall(r'[a-zA-Z]', query))

  # Share of digits in the query string.
  if len(query) > 0:
    digits_in_query = len(re.findall(r'[0-9]', query))
    features['NumberRate_AfterPath'] = digits_in_query / len(query)
  else:
    features['NumberRate_AfterPath'] = 0

  features['charcompvowels'] = len(re.findall(r'[aeiouAEIOU]', url))

  features['SymbolCount_URL'] = len(re.findall(r'[^a-zA-Z0-9]', url))

  return features

In [None]:

# Smoke test: run the lexical extractor on one hand-written URL and print the feature dict.
test_url = "http://google.com/login/secure?user=123"
print(extractor(test_url))

In [None]:
# The 20 feature names returned by extractor(), in a fixed column order.
feature_order = [
    'urlLen', 'domainlength', 'pathLength', 'pathurlRatio', 'pathDomainRatio',
    'host_letter_count', 'URL_DigitCount', 'NumberofDotsinURL', 'delimeter_path',
    'URLQueries_variable', 'avgdomaintokenlen', 'NumberRate_Extension',
    'subDirLen', 'Directory_DigitCount', 'Directory_LetterCount', 'Entropy_Filename',
    'Query_LetterCount', 'NumberRate_AfterPath', 'charcompvowels', 'SymbolCount_URL'
]

In [None]:
# X_selected = X[feature_order].copy()

# X_train_new, X_test_new, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42)

In [None]:
# from sklearn.metrics import f1_score, roc_auc_score
# scaler_new = StandardScaler()
# X_train_new_scaled = scaler_new.fit_transform(X_train_new)
# X_test_new_scaled = scaler_new.transform(X_test_new)

# model_new = LogisticRegression(max_iter=1000,class_weight='balanced')
# model_new.fit(X_train_new_scaled, y_train)

# y_pred_new = model_new.predict(X_test_new_scaled)

# print(f"New Model Accuracy: {model_new.score(X_test_new_scaled, y_test) * 100:.2f}%")
# print("\nNewConfusion Matrix:")
# print(confusion_matrix(y_test, y_pred_new))
# print("\nNewDetailed Report:")
# print(classification_report(y_test, y_pred_new))

# print("Train Accuracy:", model_new.score(X_train_new_scaled, y_train))
# print("Test Accuracy :", model_new.score(X_test_new_scaled, y_test))
# print("F1:", f1_score(y_test, y_pred_new))
# print("AUC:", roc_auc_score(y_test, model_new.predict_proba(X_test_new_scaled)[:,1]))

In [None]:

# def predict_url(url,model,scaler):
#   features_dict = extractor(url)

#   df_features = pd.DataFrame([features_dict])

#   df_features = df_features.reindex(columns=feature_order, fill_value=0)

#   scaled_features= scaler.transform(df_features)

#   prediction = model.predict(scaled_features)[0]

#   probability = model.predict_proba(scaled_features)[0][1]

#   return prediction, probability

In [None]:
# # --- TEST TIME ---
# # Test 1: A legitimate site
# safe_url = "x.com/home"
# pred, prob = predict_url(safe_url, model_new, scaler_new) # Use your new model/scaler names

# print(f"URL: {safe_url}")
# print(f"Prediction: {'PHISHING' if pred == 1 else 'SAFE'}")
# print(f"Probability: {prob*100:.2f}% Phishing\n")


In [None]:
# Load the phishing URL list; later cells read its 'url' column.
dfp = pd.read_csv('2.online-valid.csv')
dfp.head()

In [None]:
# Fixed-seed sample of 5000 phishing rows, re-indexed from 0.
phish = (
    dfp.sample(n=5000,random_state=12)
    .copy()
    .reset_index(drop=True)
)
phish.head()

In [None]:
# Load the benign URL list and name its single column 'URLs'.
# NOTE(review): read_csv is called without header=None, so the first line of the
# file is consumed as the header before the column is renamed. If the file has
# no header row, that first URL is lost — confirm against the CSV.
dfb = pd.read_csv('1.Benign_list_big_final.csv')
dfb.columns = ['URLs']
dfb.head()

In [None]:
# Fixed-seed sample of 5000 benign rows, re-indexed from 0.
ben = (
    dfb.sample(n=5000,random_state=12)
    .copy()
    .reset_index(drop=True)
)
ben.head()

In [None]:
# Reduce both frames to a single column named 'url' so they share one schema.
phish = phish[['url']]

ben = ben[['URLs']].rename(columns={'URLs':'url'})


In [None]:
from urllib.parse import urlparse,urlencode
import ipaddress
import re

def getDomain(url):
  """Return the network location of `url` without a leading ``www.``.

  Parameters:
    url: URL string; it needs a scheme for urlparse to populate the netloc.

  Returns:
    The netloc string, which is empty when urlparse finds none.
  """
  domain = urlparse(url).netloc
  # Strip only the leading prefix: str.replace("www.", "") would also delete
  # "www." occurring later in the host (e.g. "www.awww.com" -> "acom").
  if domain.startswith("www."):
    domain = domain[len("www."):]
  return domain

def havingIP(url):
  """Return 1 if `url` is, or has as its host, a literal IP address; else 0.

  The whole string is tried first (so a bare "1.2.3.4" still yields 1), then
  the host parsed out of the URL. Passing the full URL alone to
  ipaddress.ip_address never succeeds for inputs like "http://1.2.3.4/x".
  """
  candidates = [url]
  try:
    host = urlparse(url).hostname
    if host is None:
      # No scheme: urlparse leaves the host in .path, so retry as "//host/...".
      host = urlparse("//" + url).hostname
    if host:
      candidates.append(host)
  except (ValueError, TypeError, AttributeError):
    # Unparseable input: fall back to testing the raw value only.
    pass

  for candidate in candidates:
    try:
      ipaddress.ip_address(candidate)
      return 1
    except ValueError:
      continue
  return 0

def haveAtSign(url):
  """Return 1 when `url` contains an '@' character, otherwise 0."""
  return 1 if "@" in url else 0

def getLength(url):
  """Return 0 when `url` is shorter than 54 characters, 1 otherwise."""
  return int(len(url) >= 54)

def getDepth(url):
  """Count the non-empty '/'-separated segments in the path of `url`."""
  segments = urlparse(url).path.split('/')
  return sum(1 for segment in segments if segment)

def redirection(url):
  """Return 1 when the last '//' in `url` starts after index 7, otherwise 0."""
  last_double_slash = url.rfind('//')
  return 1 if last_double_slash > 7 else 0
def httpDomain(url):
  """Return 1 if the text 'https' appears in the network location of `url`, else 0."""
  netloc = urlparse(url).netloc
  return int('https' in netloc)
# Alternation of known URL-shortener host names (regex source, unchanged).
shortening_services = r"bit\.ly|goo\.gl|shorte\.st|go2l\.ink|x\.co|ow\.ly|t\.co|tinyurl|tr\.im|is\.gd|cli\.gs|" \
                      r"yfrog\.com|migre\.me|ff\.im|tiny\.cc|url4\.eu|twit\.ac|su\.pr|twurl\.nl|snipurl\.com|" \
                      r"short\.to|BudURL\.com|ping\.fm|post\.ly|Just\.as|bkite\.com|snipr\.com|fic\.kr|loopt\.us|" \
                      r"doiop\.com|short\.ie|kl\.am|wp\.me|rubyurl\.com|om\.ly|to\.ly|bit\.do|t\.co|lnkd\.in|db\.tt|" \
                      r"qr\.ae|adf\.ly|goo\.gl|bitly\.com|cur\.lv|tinyurl\.com|ow\.ly|bit\.ly|ity\.im|q\.gs|is\.gd|" \
                      r"po\.st|bc\.vc|twitthis\.com|u\.to|j\.mp|buzurl\.com|cutt\.us|u\.bb|yourls\.org|x\.co|" \
                      r"prettylinkpro\.com|scrnch\.me|filoops\.info|vzturl\.com|qr\.net|1url\.com|tweez\.me|v\.gd|" \
                      r"tr\.im|link\.zip\.net"

# The same list anchored to a whole host name (optionally preceded by a subdomain).
# An unanchored search over the full URL gives false positives such as "t.co"
# inside "microsoft.com".
_SHORTENER_HOST_RE = re.compile(r"(?:^|\.)(?:" + shortening_services + r")$", re.IGNORECASE)

def tinyURL(url):
    """Return 1 if the host of `url` is a known URL-shortening service, else 0.

    Only the host name is examined, so shortener names appearing in the path or
    query string no longer trigger a match. URLs without a scheme are supported.
    """
    try:
        # Without a scheme urlparse yields no host; retry as "//host/...".
        host = urlparse(url).hostname or urlparse("//" + url).hostname
    except ValueError:
        return 0
    if not host:
        return 0
    return 1 if _SHORTENER_HOST_RE.search(host.rstrip('.')) else 0

def prefixSuffix(url):
    """Return 1 (phishing) if the network location of `url` contains '-', else 0 (legitimate)."""
    netloc = urlparse(url).netloc
    return int('-' in netloc)




In [None]:
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install python-whois

In [None]:
import re
from bs4 import BeautifulSoup
import whois
import urllib
import urllib.request
from datetime import datetime

In [None]:
def web_traffic(url, timeout=10):
  """Look up the Alexa rank of `url` and turn it into a binary feature.

  Parameters:
    url: URL string; it is percent-encoded before being put in the query.
    timeout: seconds to wait for the HTTP request (new, defaults to 10).

  Returns:
    1 when the rank is below 100000 or when the lookup fails for any reason
    (network error, missing REACH/RANK element, non-numeric rank); 0 otherwise.

  NOTE(review): the fallback and the "rank < 100000" case both return 1, so the
  two cannot be told apart downstream — confirm this is intended.
  NOTE(review): the data.alexa.com endpoint appears to have been retired;
  confirm it still responds before relying on this feature.
  """
  try:
    # Percent-encode the URL (this also covers embedded whitespace).
    url = urllib.parse.quote(url)
    # Close the connection deterministically and never wait indefinitely.
    with urllib.request.urlopen("http://data.alexa.com/data?cli=10&dat=s&url=" + url,
                                timeout=timeout) as reply:
      payload = reply.read()
    rank = BeautifulSoup(payload, "xml").find(
        "REACH")['RANK']
    rank = int(rank)
  except Exception:
    # Best-effort feature: a failed lookup must not abort a long extraction run.
    # `Exception` (not a bare except) still lets KeyboardInterrupt through.
    return 1
  if rank <100000:
    return 1
  else:
    return 0

def domainAge(domain_name):
  """Binary feature from the span between a domain's creation and expiration.

  Parameters:
    domain_name: object exposing `creation_date` and `expiration_date`
      attributes, each a datetime, a 'YYYY-MM-DD' string, a list or None.

  Returns:
    1 when the span is under 6 months (30-day months), or when either date is
    missing, is a list, or cannot be parsed; 0 otherwise.
  """
  from datetime import timezone  # local import: only `datetime` is imported at notebook level

  def _normalise(value):
    # Parse each date on its own. Parsing both whenever either is a string
    # fails on the non-string one and throws away valid data.
    if isinstance(value, str):
      value = datetime.strptime(value, '%Y-%m-%d')
    # Make aware datetimes naive UTC so aware and naive values can be subtracted.
    if isinstance(value, datetime) and value.tzinfo is not None:
      value = value.astimezone(timezone.utc).replace(tzinfo=None)
    return value

  try:
    creation_date = _normalise(domain_name.creation_date)
    expiration_date = _normalise(domain_name.expiration_date)
  except ValueError:
    # A string that is not in 'YYYY-MM-DD' form.
    return 1

  if expiration_date is None or creation_date is None:
    return 1
  if isinstance(expiration_date, list) or isinstance(creation_date, list):
    # Multiple candidate dates: treated as unknown, as before.
    return 1

  try:
    ageofdomain = abs((expiration_date - creation_date).days)
  except TypeError:
    # Values that cannot be subtracted (unexpected types).
    return 1
  return 1 if (ageofdomain/30) < 6 else 0

def domainEnd(domain_name):
  """Binary feature from the time between now and a domain's expiration date.

  Parameters:
    domain_name: object exposing an `expiration_date` attribute that is a
      datetime, a 'YYYY-MM-DD' string, a list or None.

  Returns:
    0 when the absolute distance to the expiration date is under 6 months
    (30-day months); 1 otherwise, including when the date is missing, is a
    list, or cannot be parsed.
  """
  expiration_date = domain_name.expiration_date
  if isinstance(expiration_date,str):
    try:
      expiration_date = datetime.strptime(expiration_date,"%Y-%m-%d")
    except ValueError:
      return 1
  if expiration_date is None or isinstance(expiration_date, list):
    return 1

  # Take "now" in the timezone of the expiration date (naive when it has none).
  # Subtracting a naive now() from an aware datetime raises TypeError.
  today = datetime.now(getattr(expiration_date, 'tzinfo', None))
  try:
    remaining_days = abs((expiration_date - today).days)
  except TypeError:
    # Value that cannot be subtracted from a datetime (unexpected type).
    return 1
  return 0 if (remaining_days/30) < 6 else 1



In [None]:
import requests

def iframe(response):
  """Binary feature: does the fetched page contain iframe markup?

  Parameters:
    response: an object with a `.text` attribute, or "" when the fetch failed.

  Returns:
    1 when `response` is "" or no iframe markup is found; 0 when the text
    contains an ``<iframe`` tag or a ``frameborder`` attribute
    (case-insensitive).

  NOTE(review): the return polarity (0 when markup is found) is kept from the
  previous implementation — confirm it is the intended encoding.
  """
  if response == "":
    return 1
  # The previous pattern "[|]" only matched a literal pipe character and so
  # never looked at iframe markup at all.
  if re.search(r"<iframe\b|frameborder", response.text, re.IGNORECASE):
    return 0
  return 1

def mouseOver(response):
  """Binary feature: does the fetched page use an ``onmouseover`` handler?

  Parameters:
    response: an object with a `.text` attribute, or "" when the fetch failed.

  Returns:
    1 when `response` is "" or the text contains "onmouseover"
    (case-insensitive); 0 otherwise.
  """
  if response == "" :
    return 1
  # The previous pattern was the empty string, which matches any text, so the
  # function returned 1 for every page.
  if re.search(r"onmouseover", response.text, re.IGNORECASE):
    return 1
  return 0

def rightClick(response):
  """Return 0 when the response text matches ``event.button ?== ?2``, else 1.

  An empty-string `response` (no page fetched) yields 1.
  """
  if response == "":
    return 1
  matches = re.findall(r"event.button ?== ?2", response.text)
  return 0 if matches else 1

def forwarding(response):
  """Return 1 when `response.history` has more than 2 entries, else 0.

  An empty-string `response` (no page fetched) yields 1.
  """
  if response == "":
    return 1
  return int(len(response.history) > 2)

In [None]:
def Extraction(url,label,timeout=10):
  """Build the full feature row for one URL.

  Parameters:
    url: URL string to analyse.
    label: value appended as the last element of the row.
    timeout: seconds to wait for the page fetch (new, defaults to 10).

  Returns:
    A list of 18 values in this order: domain, havingIP, haveAtSign,
    getLength, getDepth, redirection, httpDomain, tinyURL, prefixSuffix,
    dns, web_traffic, domainAge, domainEnd, iframe, mouseOver, rightClick,
    forwarding, label.

  Performs a WHOIS lookup and HTTP requests, so a call can be slow.
  """
  features = []
  #Address bar based features (9)
  features.append(getDomain(url))
  features.append(havingIP(url))
  features.append(haveAtSign(url))
  features.append(getLength(url))
  features.append(getDepth(url))
  features.append(redirection(url))
  features.append(httpDomain(url))
  features.append(tinyURL(url))
  features.append(prefixSuffix(url))

  #Domain based features (4)
  dns = 0
  domain_name = None
  try:
    domain_name = whois.whois(urlparse(url).netloc)
  except Exception:
    # Any WHOIS failure is recorded as dns = 1. `Exception` rather than a bare
    # except keeps Ctrl-C / kernel interrupt working during long loops.
    dns = 1

  features.append(dns)
  features.append(web_traffic(url))
  # Without a WHOIS record the two date features default to 1.
  features.append(1 if dns == 1 else domainAge(domain_name))
  features.append(1 if dns == 1 else domainEnd(domain_name))

  # HTML & Javascript based features (4)
  try:
    # A timeout stops one unresponsive host from stalling the whole run.
    response = requests.get(url, timeout=timeout)
  except Exception:
    # "" is the sentinel the HTML feature helpers test for.
    response = ""
  features.append(iframe(response))
  features.append(mouseOver(response))
  features.append(rightClick(response))
  features.append(forwarding(response))
  features.append(label)

  return features

In [None]:
# Feature rows for the benign sample (label 0).
# Each Extraction call does WHOIS and HTTP lookups, so this loop is slow.
legi_features = []
label = 0

# `ben` holds a single column that an earlier cell renamed to 'url', so
# ben['URLs'] raises KeyError. Iterating the column also avoids hard-coding the
# sample size.
for url in ben['url']:
  legi_features.append(Extraction(url,label))


In [None]:
# NOTE(review): `rows` was never defined in the notebook, so this cell raised
# NameError on a fresh kernel. It is rebuilt here from the lexical extractor()
# features plus a 'target' column (0 = benign, 1 = phishing), which is what the
# training cell below expects — confirm this matches the original intent.
rows = []
for _urls, _target in ((ben['url'], 0), (phish['url'], 1)):
  for _url in _urls:
    _row = extractor(str(_url))
    _row['target'] = _target
    rows.append(_row)

final_df = pd.DataFrame(rows)
final_df.head()

In [None]:
# Features are every column except 'target'; the label is the 'target' column.
X_f = final_df.drop(columns=['target'])
y_f = final_df['target']

# 80/20 split with a fixed seed.
X_f_train, X_f_test, y_f_train, y_f_test = train_test_split(X_f,y_f,test_size=0.2,random_state=42)

# Fit the scaler on the training split only, then apply it to the test split.
scaler_f = StandardScaler()
X_f_train_scaled = scaler_f.fit_transform(X_f_train)
X_f_test_scaled = scaler_f.transform(X_f_test)

# The model is trained on the scaled training features.
model_f = LogisticRegression(max_iter=1000)
model_f.fit(X_f_train_scaled,y_f_train)

In [None]:
# model_f was fitted on standardised features, so it must also predict on the
# standardised test split. Feeding it the raw X_f_test gives meaningless scores.
y_f_pred = model_f.predict(X_f_test_scaled)
print(f"Accuracy: {accuracy_score(y_f_test,y_f_pred)*100: .2f}%")
print("\nConfusion Matrix:")
print(confusion_matrix(y_f_test, y_f_pred))
print("\nDetailed Report:")
print(classification_report(y_f_test, y_f_pred))