In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.feature_selection import SelectKBest,chi2
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

# Problem Statement:  
Phishing is a type of fraud in which an attacker impersonates a reputable company or
person in order to get sensitive information such as login credentials or account
information via email or other communication channels. Phishing is popular among
attackers because it is easier to persuade someone to click a malicious link that appears
to be authentic than it is to break through a computer's protection measures.  
The mail goal is to predict whether the domains are real or malicious.  
Approach: The classical machine learning tasks like Data Exploration, Data Cleaning,
Feature Engineering, Model Building and Model Testing. Try out different machine
learning algorithms that’s best fit for the above case.  
For Feature Engineering show:-  
1. URL-Based Features  
2. Domain-Based Features  
3. Page-Based Features  
4. Content-Based Features  
  
Results: You have to build a solution that should able to predict whether the domain is
real or fake.  

In [2]:
df1=pd.read_csv('/content/drive/MyDrive/Data/temp/dataset_full.csv')
df=df1.copy()

## Understanding the Data

In [3]:
pd.set_option('display.max_columns', None)

In [4]:
df.sample(11)

Unnamed: 0,qty_dot_url,qty_hyphen_url,qty_underline_url,qty_slash_url,qty_questionmark_url,qty_equal_url,qty_at_url,qty_and_url,qty_exclamation_url,qty_space_url,qty_tilde_url,qty_comma_url,qty_plus_url,qty_asterisk_url,qty_hashtag_url,qty_dollar_url,qty_percent_url,qty_tld_url,length_url,qty_dot_domain,qty_hyphen_domain,qty_underline_domain,qty_slash_domain,qty_questionmark_domain,qty_equal_domain,qty_at_domain,qty_and_domain,qty_exclamation_domain,qty_space_domain,qty_tilde_domain,qty_comma_domain,qty_plus_domain,qty_asterisk_domain,qty_hashtag_domain,qty_dollar_domain,qty_percent_domain,qty_vowels_domain,domain_length,domain_in_ip,server_client_domain,qty_dot_directory,qty_hyphen_directory,qty_underline_directory,qty_slash_directory,qty_questionmark_directory,qty_equal_directory,qty_at_directory,qty_and_directory,qty_exclamation_directory,qty_space_directory,qty_tilde_directory,qty_comma_directory,qty_plus_directory,qty_asterisk_directory,qty_hashtag_directory,qty_dollar_directory,qty_percent_directory,directory_length,qty_dot_file,qty_hyphen_file,qty_underline_file,qty_slash_file,qty_questionmark_file,qty_equal_file,qty_at_file,qty_and_file,qty_exclamation_file,qty_space_file,qty_tilde_file,qty_comma_file,qty_plus_file,qty_asterisk_file,qty_hashtag_file,qty_dollar_file,qty_percent_file,file_length,qty_dot_params,qty_hyphen_params,qty_underline_params,qty_slash_params,qty_questionmark_params,qty_equal_params,qty_at_params,qty_and_params,qty_exclamation_params,qty_space_params,qty_tilde_params,qty_comma_params,qty_plus_params,qty_asterisk_params,qty_hashtag_params,qty_dollar_params,qty_percent_params,params_length,tld_present_params,qty_params,email_in_url,time_response,domain_spf,asn_ip,time_domain_activation,time_domain_expiration,qty_ip_resolved,qty_nameservers,qty_mx_servers,ttl_hostname,tls_ssl_certificate,qty_redirects,url_google_index,domain_google_index,url_shortened,phishing
42599,2,4,3,3,0.0,3.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,130.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,23.0,0.0,0.0,0.0,2.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,19.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,3.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,88.0,0.0,3.0,0.0,0.2999,1.0,-1.0,466.0,263.0,4.0,4.0,1.0,56.0,1.0,0.0,0.0,0.0,0.0,1.0
20657,2,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,18.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,18.0,0.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,1.147545,1.0,26496.0,8027.0,371.0,1.0,2.0,1.0,10790.0,1.0,1.0,0.0,0.0,0.0,0.0
31826,3,2,0,8,1.0,3.0,1.0,2.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,2.0,2.0,273.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,1.0,1.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,76.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,0.0,1.0,1.0,3.0,1.0,2.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,2.0,191.0,1.0,3.0,1.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,4.0,0.0,-1.0,0.0,-1.0,0.0,0.0,0.0,1.0
7739,2,0,0,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,43.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,19.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,24.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,1.127305,0.0,46606.0,5162.0,681.0,1.0,2.0,2.0,3599.0,1.0,0.0,0.0,0.0,0.0,1.0
23010,1,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,17.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,17.0,0.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,1.522732,0.0,9371.0,398.0,331.0,1.0,5.0,1.0,19332.0,1.0,1.0,0.0,0.0,0.0,0.0
37442,1,0,0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,11.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,0.148323,-1.0,13335.0,-1.0,-1.0,2.0,2.0,1.0,299.0,1.0,0.0,0.0,0.0,1.0,1.0
46457,2,0,0,5,0.0,2.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,68.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,8.0,0.0,0.0,1.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,34.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,14.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,26.0,0.0,1.0,0.0,0.449462,0.0,53824.0,1819.0,6.0,1.0,2.0,2.0,3597.0,0.0,0.0,0.0,0.0,0.0,1.0
8067,3,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,15.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,15.0,0.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,1.250758,0.0,38244.0,-1.0,-1.0,1.0,2.0,1.0,3955.0,0.0,0.0,0.0,0.0,0.0,0.0
34049,2,0,0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,40.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,15.0,39.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,0.313202,0.0,15133.0,-1.0,-1.0,1.0,4.0,1.0,3590.0,0.0,0.0,0.0,0.0,0.0,1.0
3991,2,0,0,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,29.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,16.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,13.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,5.3567,0.0,36024.0,5000.0,112.0,1.0,2.0,1.0,3566.0,0.0,0.0,0.0,0.0,0.0,1.0


In [5]:
df.shape

(48218, 112)

In [6]:
df.isnull().sum()


qty_dot_url             0
qty_hyphen_url          0
qty_underline_url       0
qty_slash_url           0
qty_questionmark_url    1
                       ..
qty_redirects           1
url_google_index        1
domain_google_index     1
url_shortened           1
phishing                1
Length: 112, dtype: int64

In [7]:
df.dropna(axis=0,inplace=True)

In [8]:
df.isnull().sum()


qty_dot_url             0
qty_hyphen_url          0
qty_underline_url       0
qty_slash_url           0
qty_questionmark_url    0
                       ..
qty_redirects           0
url_google_index        0
domain_google_index     0
url_shortened           0
phishing                0
Length: 112, dtype: int64

In [9]:
for column in df.columns:
    unique_values = df[column].nunique()
    print(f"Feature: {column}, Unique values: {unique_values}")

Feature: qty_dot_url, Unique values: 21
Feature: qty_hyphen_url, Unique values: 26
Feature: qty_underline_url, Unique values: 20
Feature: qty_slash_url, Unique values: 22
Feature: qty_questionmark_url, Unique values: 5
Feature: qty_equal_url, Unique values: 18
Feature: qty_at_url, Unique values: 11
Feature: qty_and_url, Unique values: 20
Feature: qty_exclamation_url, Unique values: 7
Feature: qty_space_url, Unique values: 7
Feature: qty_tilde_url, Unique values: 6
Feature: qty_comma_url, Unique values: 5
Feature: qty_plus_url, Unique values: 9
Feature: qty_asterisk_url, Unique values: 15
Feature: qty_hashtag_url, Unique values: 5
Feature: qty_dollar_url, Unique values: 10
Feature: qty_percent_url, Unique values: 48
Feature: qty_tld_url, Unique values: 8
Feature: length_url, Unique values: 395
Feature: qty_dot_domain, Unique values: 17
Feature: qty_hyphen_domain, Unique values: 10
Feature: qty_underline_domain, Unique values: 4
Feature: qty_slash_domain, Unique values: 1
Feature: qty_qu

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 48217 entries, 0 to 48216
Columns: 112 entries, qty_dot_url to phishing
dtypes: float64(108), int64(4)
memory usage: 41.6 MB


In [11]:
df.describe()

Unnamed: 0,qty_dot_url,qty_hyphen_url,qty_underline_url,qty_slash_url,qty_questionmark_url,qty_equal_url,qty_at_url,qty_and_url,qty_exclamation_url,qty_space_url,qty_tilde_url,qty_comma_url,qty_plus_url,qty_asterisk_url,qty_hashtag_url,qty_dollar_url,qty_percent_url,qty_tld_url,length_url,qty_dot_domain,qty_hyphen_domain,qty_underline_domain,qty_slash_domain,qty_questionmark_domain,qty_equal_domain,qty_at_domain,qty_and_domain,qty_exclamation_domain,qty_space_domain,qty_tilde_domain,qty_comma_domain,qty_plus_domain,qty_asterisk_domain,qty_hashtag_domain,qty_dollar_domain,qty_percent_domain,qty_vowels_domain,domain_length,domain_in_ip,server_client_domain,qty_dot_directory,qty_hyphen_directory,qty_underline_directory,qty_slash_directory,qty_questionmark_directory,qty_equal_directory,qty_at_directory,qty_and_directory,qty_exclamation_directory,qty_space_directory,qty_tilde_directory,qty_comma_directory,qty_plus_directory,qty_asterisk_directory,qty_hashtag_directory,qty_dollar_directory,qty_percent_directory,directory_length,qty_dot_file,qty_hyphen_file,qty_underline_file,qty_slash_file,qty_questionmark_file,qty_equal_file,qty_at_file,qty_and_file,qty_exclamation_file,qty_space_file,qty_tilde_file,qty_comma_file,qty_plus_file,qty_asterisk_file,qty_hashtag_file,qty_dollar_file,qty_percent_file,file_length,qty_dot_params,qty_hyphen_params,qty_underline_params,qty_slash_params,qty_questionmark_params,qty_equal_params,qty_at_params,qty_and_params,qty_exclamation_params,qty_space_params,qty_tilde_params,qty_comma_params,qty_plus_params,qty_asterisk_params,qty_hashtag_params,qty_dollar_params,qty_percent_params,params_length,tld_present_params,qty_params,email_in_url,time_response,domain_spf,asn_ip,time_domain_activation,time_domain_expiration,qty_ip_resolved,qty_nameservers,qty_mx_servers,ttl_hostname,tls_ssl_certificate,qty_redirects,url_google_index,domain_google_index,url_shortened,phishing
count,48217.0,48217.0,48217.0,48217.0,48217.0,48217.0,48217.0,48217.0,48217.0,48217.0,48217.0,48217.0,48217.0,48217.0,48217.0,48217.0,48217.0,48217.0,48217.0,48217.0,48217.0,48217.0,48217.0,48217.0,48217.0,48217.0,48217.0,48217.0,48217.0,48217.0,48217.0,48217.0,48217.0,48217.0,48217.0,48217.0,48217.0,48217.0,48217.0,48217.0,48217.0,48217.0,48217.0,48217.0,48217.0,48217.0,48217.0,48217.0,48217.0,48217.0,48217.0,48217.0,48217.0,48217.0,48217.0,48217.0,48217.0,48217.0,48217.0,48217.0,48217.0,48217.0,48217.0,48217.0,48217.0,48217.0,48217.0,48217.0,48217.0,48217.0,48217.0,48217.0,48217.0,48217.0,48217.0,48217.0,48217.0,48217.0,48217.0,48217.0,48217.0,48217.0,48217.0,48217.0,48217.0,48217.0,48217.0,48217.0,48217.0,48217.0,48217.0,48217.0,48217.0,48217.0,48217.0,48217.0,48217.0,48217.0,48217.0,48217.0,48217.0,48217.0,48217.0,48217.0,48217.0,48217.0,48217.0,48217.0,48217.0,48217.0,48217.0,48217.0
mean,2.197876,0.330734,0.116681,1.286704,0.008897,0.2092,0.023373,0.14246,0.002717,0.001307,0.003152,0.002178,0.003173,0.005413,0.000373,0.002095,0.11413,1.0488,36.526184,1.876537,0.114897,0.000788,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.452268,18.591596,0.002219,0.004335,-0.320095,-0.357571,-0.47504,0.720866,-0.533608,-0.525499,-0.529212,-0.528299,-0.532012,-0.532654,-0.530498,-0.532883,-0.53226,-0.528942,-0.533608,-0.532012,-0.476741,10.882282,-0.364788,-0.477259,-0.508306,-0.533608,-0.533608,-0.531908,-0.533297,-0.532157,-0.532592,-0.533173,-0.533318,-0.53309,-0.532634,-0.532323,-0.533608,-0.533608,-0.494141,2.746438,-0.810523,-0.876973,-0.859054,-0.886534,-0.907232,-0.723956,-0.896779,-0.787876,-0.914781,-0.915113,-0.915196,-0.913786,-0.913557,-0.915175,-0.915237,-0.914864,-0.858411,5.409026,-0.890371,-0.756621,0.018666,0.794425,-0.019371,31021.74312,3393.465686,349.892465,1.139971,2.776407,1.739158,6179.296804,0.508576,0.345832,0.001161,0.001804,0.005247,0.346351
std,1.234049,1.11971,0.649622,1.899404,0.108076,0.961435,0.336656,0.933255,0.073805,0.086278,0.078817,0.066117,0.129973,0.36704,0.046442,0.115282,1.825838,0.258751,47.955626,0.711347,0.421385,0.033457,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.555078,6.650227,0.047056,0.065695,0.895526,1.106005,0.675365,2.216402,0.498874,0.519228,0.584865,0.552714,0.503572,0.505791,0.508298,0.501245,0.512497,0.614584,0.498874,0.512431,1.613972,23.785088,0.768672,0.789392,0.597318,0.498874,0.498874,0.503249,0.49956,0.501954,0.501223,0.50164,0.499932,0.500818,0.510283,0.570045,0.498874,0.498874,1.483054,13.243955,0.975527,0.626158,0.658919,0.566182,0.319959,1.116027,0.362654,1.009869,0.283046,0.279608,0.278742,0.288122,0.290955,0.278847,0.278531,0.281528,1.104901,37.5156,0.383858,0.946062,0.135342,1.473786,0.560322,44774.148259,3047.637722,590.612979,0.901834,1.323036,1.702122,11812.768169,0.499932,0.784159,0.058664,0.062583,0.072247,0.475812
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,0.0,-1.0,0.0,-1.0,-1.0,-1.0,0.0,0.0
25%,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,17.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,14.0,0.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,0.241202,0.0,13335.0,-1.0,-1.0,1.0,2.0,1.0,292.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,22.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,18.0,0.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,0.466634,0.0,20013.0,3047.0,168.0,1.0,2.0,1.0,2075.0,1.0,0.0,0.0,0.0,0.0,0.0
75%,2.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,39.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,22.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,16.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,0.873259,0.0,34922.0,6430.0,353.0,1.0,4.0,2.0,10799.0,1.0,1.0,0.0,0.0,0.0,1.0
max,24.0,35.0,21.0,44.0,9.0,17.0,43.0,26.0,8.0,9.0,5.0,4.0,19.0,60.0,9.0,10.0,174.0,12.0,4165.0,17.0,11.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,53.0,231.0,1.0,1.0,19.0,23.0,16.0,22.0,0.0,5.0,43.0,26.0,5.0,9.0,5.0,3.0,19.0,60.0,0.0,10.0,174.0,1083.0,12.0,17.0,16.0,0.0,0.0,3.0,2.0,2.0,4.0,9.0,4.0,3.0,19.0,60.0,0.0,0.0,174.0,1044.0,23.0,35.0,21.0,43.0,9.0,17.0,10.0,19.0,8.0,4.0,1.0,4.0,6.0,1.0,0.0,4.0,65.0,4094.0,1.0,15.0,1.0,38.402411,1.0,395754.0,12210.0,3718.0,24.0,16.0,20.0,604800.0,1.0,12.0,1.0,1.0,1.0,1.0


In [12]:
df.columns.duplicated().sum()

0

In [13]:
df.corr()

Unnamed: 0,qty_dot_url,qty_hyphen_url,qty_underline_url,qty_slash_url,qty_questionmark_url,qty_equal_url,qty_at_url,qty_and_url,qty_exclamation_url,qty_space_url,qty_tilde_url,qty_comma_url,qty_plus_url,qty_asterisk_url,qty_hashtag_url,qty_dollar_url,qty_percent_url,qty_tld_url,length_url,qty_dot_domain,qty_hyphen_domain,qty_underline_domain,qty_slash_domain,qty_questionmark_domain,qty_equal_domain,qty_at_domain,qty_and_domain,qty_exclamation_domain,qty_space_domain,qty_tilde_domain,qty_comma_domain,qty_plus_domain,qty_asterisk_domain,qty_hashtag_domain,qty_dollar_domain,qty_percent_domain,qty_vowels_domain,domain_length,domain_in_ip,server_client_domain,qty_dot_directory,qty_hyphen_directory,qty_underline_directory,qty_slash_directory,qty_questionmark_directory,qty_equal_directory,qty_at_directory,qty_and_directory,qty_exclamation_directory,qty_space_directory,qty_tilde_directory,qty_comma_directory,qty_plus_directory,qty_asterisk_directory,qty_hashtag_directory,qty_dollar_directory,qty_percent_directory,directory_length,qty_dot_file,qty_hyphen_file,qty_underline_file,qty_slash_file,qty_questionmark_file,qty_equal_file,qty_at_file,qty_and_file,qty_exclamation_file,qty_space_file,qty_tilde_file,qty_comma_file,qty_plus_file,qty_asterisk_file,qty_hashtag_file,qty_dollar_file,qty_percent_file,file_length,qty_dot_params,qty_hyphen_params,qty_underline_params,qty_slash_params,qty_questionmark_params,qty_equal_params,qty_at_params,qty_and_params,qty_exclamation_params,qty_space_params,qty_tilde_params,qty_comma_params,qty_plus_params,qty_asterisk_params,qty_hashtag_params,qty_dollar_params,qty_percent_params,params_length,tld_present_params,qty_params,email_in_url,time_response,domain_spf,asn_ip,time_domain_activation,time_domain_expiration,qty_ip_resolved,qty_nameservers,qty_mx_servers,ttl_hostname,tls_ssl_certificate,qty_redirects,url_google_index,domain_google_index,url_shortened,phishing
qty_dot_url,1.000000,0.116572,0.135041,0.241802,0.409465,0.494544,0.145472,0.537386,0.002523,-0.001649,0.024932,0.035390,0.024791,-0.003510,0.002330,0.008166,0.062078,0.419941,0.439771,0.484219,0.042228,-0.002270,,,,,,,,,,,,,,,0.147977,0.268122,0.058870,0.015514,0.414641,0.121924,0.147353,0.210530,0.159387,0.157982,0.136242,0.149223,0.156824,0.156274,0.160212,0.159574,0.156565,0.127477,0.159387,0.154113,0.056414,0.211499,0.321262,0.115533,0.145543,0.159387,0.159387,0.159096,0.159506,0.163235,0.157644,0.157833,0.159158,0.159341,0.157164,0.137386,0.159387,0.159387,0.054051,0.117429,0.708680,0.249033,0.254940,0.296109,0.448050,0.486355,0.412738,0.567701,0.360414,0.361551,0.362782,0.357303,0.355778,0.362814,0.362899,0.363838,0.182729,0.373616,0.426754,0.414377,0.344953,0.003290,-0.009244,0.011122,-0.085161,-0.076532,-0.053102,-0.012801,-0.048237,0.012001,-0.068086,-0.054451,-0.003461,-0.000326,-0.060264,0.173234
qty_hyphen_url,0.116572,1.000000,0.210178,0.307855,0.073030,0.180671,0.033631,0.131215,0.026772,0.022577,0.003461,0.033695,0.013453,-0.000773,-0.000380,0.004595,0.020989,0.147521,0.386476,-0.020444,0.398192,-0.006958,,,,,,,,,,,,,,,0.095943,0.118056,-0.007632,0.025622,0.229537,0.752211,0.221204,0.294256,0.231813,0.233809,0.199373,0.211958,0.230479,0.230869,0.228039,0.231508,0.227006,0.187433,0.231813,0.224759,0.070587,0.397674,0.229723,0.522550,0.214679,0.231813,0.231813,0.232959,0.231644,0.234407,0.231717,0.231608,0.231299,0.231051,0.227772,0.202204,0.231813,0.231813,0.077074,0.264618,0.118033,0.447067,0.230123,0.169155,0.178629,0.188540,0.164153,0.160659,0.178915,0.177491,0.176070,0.176671,0.172428,0.177311,0.176181,0.177203,0.079966,0.220543,0.173750,0.201462,0.073402,-0.009098,0.000195,0.004855,-0.112126,-0.030458,0.030210,0.031369,-0.031301,-0.032612,0.010550,-0.006094,-0.003322,0.011018,-0.009659,0.201757
qty_underline_url,0.135041,0.210178,1.000000,0.254516,0.093627,0.343394,0.057991,0.260866,0.102829,-0.000130,0.041829,0.036094,0.027302,0.012399,0.018492,0.011414,0.089613,0.153424,0.413851,-0.041084,0.051035,0.047299,,,,,,,,,,,,,,,0.032844,0.009562,0.000350,0.013420,0.220174,0.140079,0.493364,0.229081,0.190139,0.192301,0.164491,0.176708,0.190585,0.187263,0.193115,0.189426,0.186544,0.158328,0.190139,0.186979,0.111846,0.281858,0.227081,0.154337,0.376590,0.190139,0.190139,0.190861,0.190086,0.191443,0.189840,0.188999,0.189760,0.189215,0.185545,0.165995,0.190139,0.190139,0.117713,0.238888,0.184876,0.386427,0.751694,0.276792,0.311130,0.364768,0.295558,0.316983,0.342171,0.323984,0.324357,0.321783,0.319606,0.325022,0.324629,0.321842,0.152116,0.347252,0.292036,0.396985,0.092939,-0.011195,0.010426,0.027342,-0.096994,-0.050458,0.004727,0.028739,-0.012108,-0.029212,0.013523,-0.017086,0.000254,0.004004,-0.013045,0.195219
qty_slash_url,0.241802,0.307855,0.254516,1.000000,0.193579,0.293879,0.088185,0.204093,0.032318,0.016698,0.071268,0.085531,0.024711,0.006699,0.001608,0.022073,0.045883,0.251948,0.558234,-0.204850,0.042695,-0.012368,,,,,,,,,,,,,,,0.063897,-0.032959,0.040683,0.003005,0.703601,0.568528,0.668330,0.962006,0.724606,0.727976,0.624925,0.664860,0.719406,0.716895,0.722214,0.724511,0.708570,0.592223,0.724606,0.707437,0.246786,0.740951,0.712266,0.516553,0.635058,0.724606,0.724606,0.722418,0.723933,0.723857,0.722168,0.721634,0.724579,0.724211,0.710899,0.633816,0.724606,0.724606,0.254748,0.262582,0.275717,0.252374,0.265914,0.387420,0.372788,0.310710,0.349781,0.263777,0.361970,0.362070,0.363084,0.365077,0.353260,0.363131,0.363342,0.363888,0.133727,0.251522,0.359201,0.312453,0.190721,0.030790,-0.001583,0.056348,-0.290460,-0.093562,-0.049859,-0.032947,-0.047979,-0.023122,-0.019014,-0.045893,-0.004664,0.008036,-0.001895,0.696032
qty_questionmark_url,0.409465,0.073030,0.093627,0.193579,1.000000,0.475696,0.120260,0.477852,0.035971,0.003202,-0.000858,0.104680,0.015708,0.000877,-0.000662,0.003498,0.018397,0.302640,0.308541,-0.058820,0.006243,-0.001939,,,,,,,,,,,,,,,-0.028016,-0.037017,-0.003882,0.006253,0.120928,0.063400,0.064442,0.115480,0.088058,0.090343,0.075477,0.079731,0.086975,0.086698,0.086299,0.087522,0.085501,0.072103,0.088058,0.085472,0.030976,0.105374,0.107225,0.052934,0.070700,0.088058,0.088058,0.087014,0.087886,0.088427,0.087478,0.087501,0.087824,0.087631,0.085932,0.076878,0.088058,0.088058,0.027430,0.024441,0.535995,0.191922,0.185383,0.325831,0.505129,0.403152,0.297768,0.442952,0.245053,0.241987,0.242764,0.259101,0.240025,0.242666,0.242959,0.242308,0.083934,0.305394,0.280445,0.294445,0.180063,-0.011770,-0.000236,0.005499,-0.036325,-0.015459,-0.027673,-0.003202,-0.033496,-0.008299,-0.049970,-0.021869,0.008184,0.006826,0.007302,0.109869
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
qty_redirects,-0.054451,-0.006094,-0.017086,-0.045893,-0.021869,-0.044439,-0.016950,-0.032408,0.002041,-0.008212,0.012562,-0.056529,-0.004866,-0.003838,0.003858,-0.005719,-0.002377,-0.026141,-0.043709,-0.008264,-0.036901,-0.011180,,,,,,,,,,,,,,,-0.043347,-0.047104,-0.023047,-0.021047,-0.067026,-0.007497,-0.039660,-0.048657,-0.056891,-0.056761,-0.050938,-0.052523,-0.056551,-0.057154,-0.053853,-0.059108,-0.054784,-0.049012,-0.056891,-0.056761,-0.006987,-0.010002,-0.071850,-0.010142,-0.042065,-0.056891,-0.056891,-0.055207,-0.056770,-0.056448,-0.056516,-0.057277,-0.056762,-0.058447,-0.054907,-0.050690,-0.056891,-0.056891,-0.006620,0.029578,-0.049467,-0.046146,-0.046412,-0.047370,-0.063808,-0.054888,-0.060985,-0.048666,-0.062422,-0.062761,-0.062730,-0.069272,-0.063126,-0.062739,-0.062712,-0.061971,-0.035045,-0.043510,-0.063464,-0.055224,-0.032684,0.043460,0.034600,0.016615,0.130552,0.084384,0.271809,0.048287,0.129243,0.007492,0.305075,1.000000,0.042215,0.037576,0.049606,-0.063391
url_google_index,-0.003461,-0.003322,0.000254,-0.004664,0.008184,-0.000998,-0.001375,-0.000749,-0.000729,-0.000300,-0.000792,0.015389,-0.000483,-0.000292,-0.000159,-0.000360,0.014253,-0.002368,-0.000682,-0.001037,-0.002881,-0.000466,,,,,,,,,,,,,,,-0.007517,-0.006120,-0.000934,-0.001306,-0.002793,-0.002550,0.001886,-0.004206,-0.000084,-0.000389,-0.000220,-0.000266,-0.000146,-0.000120,-0.000203,-0.000112,-0.000133,-0.000218,-0.000084,-0.000143,0.015486,-0.001953,-0.003023,-0.000570,0.003235,-0.000084,-0.000084,-0.000150,-0.000096,-0.000140,-0.000123,-0.000100,-0.000095,-0.000104,-0.000120,-0.000118,-0.000084,-0.000084,0.017085,0.002408,-0.001671,-0.000502,-0.002625,-0.000221,0.000889,-0.001096,-0.002710,-0.001008,-0.002214,-0.002217,-0.002218,0.001438,-0.002237,-0.002219,-0.002217,-0.002220,0.000343,0.001537,-0.002891,-0.001730,-0.002730,0.020272,-0.004363,0.010208,0.037488,0.016228,0.048282,0.044764,0.022350,0.007233,0.025119,0.042215,1.000000,0.626481,-0.001438,-0.006981
domain_google_index,-0.000326,0.011018,0.004004,0.008036,0.006826,0.004757,-0.001017,0.001636,-0.001061,-0.000437,-0.001153,0.014088,-0.000704,-0.000425,-0.000232,-0.000524,0.014170,0.004808,0.010126,-0.005711,-0.000783,-0.000679,,,,,,,,,,,,,,,-0.007049,-0.006950,-0.001360,-0.001902,0.005865,0.009321,0.008012,0.006472,0.008253,0.008118,0.006823,0.007172,0.008085,0.008086,0.007924,0.008172,0.007958,0.006480,0.008253,0.007945,0.018372,0.008294,0.004629,0.003997,0.009001,0.008253,0.008253,0.008743,0.008224,0.008119,0.008156,0.008183,0.008219,0.008191,0.008014,0.007158,0.008253,0.008253,0.020333,0.004756,0.005611,0.014976,0.006406,0.014123,0.011320,0.006825,0.007329,0.004773,0.010053,0.010211,0.010251,0.013227,0.009658,0.010245,0.010263,0.010115,0.002904,0.009253,0.009032,0.007996,-0.001528,0.022271,-0.004918,0.007642,0.032108,0.017713,0.046971,0.043948,0.027782,0.008776,0.028341,0.037576,0.626481,1.000000,0.002493,0.001301
url_shortened,-0.060264,-0.009659,-0.013045,-0.001895,0.007302,-0.011623,-0.005042,-0.010164,-0.002674,-0.001100,-0.002905,-0.002392,0.000436,-0.001071,-0.000584,-0.001320,-0.004540,0.001835,-0.026699,-0.083845,-0.019803,-0.001711,,,,,,,,,,,,,,,-0.113748,-0.133112,-0.003425,-0.004792,0.025960,0.027115,0.050235,0.010831,0.076534,0.072400,0.064736,0.068382,0.075590,0.075351,0.074671,0.076067,0.074309,0.061574,0.076534,0.074283,0.021098,-0.006930,0.034467,0.048274,0.060844,0.076534,0.076534,0.075624,0.076384,0.075855,0.076029,0.076049,0.076330,0.076162,0.074685,0.066815,0.076534,0.076534,0.023812,0.024041,0.001195,0.010487,-0.005080,0.020430,0.004961,-0.008447,-0.001674,-0.007581,0.002474,0.002591,0.002621,0.002180,0.002102,0.002614,0.002633,0.002509,-0.003072,-0.005429,0.007676,-0.007760,-0.010016,-0.008817,-0.013884,0.026468,-0.001939,0.012073,0.048889,0.083660,0.011130,-0.031123,0.036365,0.049606,-0.001438,0.002493,1.000000,0.096757


In [14]:
print(df.columns)

Index(['qty_dot_url', 'qty_hyphen_url', 'qty_underline_url', 'qty_slash_url',
       'qty_questionmark_url', 'qty_equal_url', 'qty_at_url', 'qty_and_url',
       'qty_exclamation_url', 'qty_space_url',
       ...
       'qty_ip_resolved', 'qty_nameservers', 'qty_mx_servers', 'ttl_hostname',
       'tls_ssl_certificate', 'qty_redirects', 'url_google_index',
       'domain_google_index', 'url_shortened', 'phishing'],
      dtype='object', length=112)


# Classificaton of Features

In [15]:
#I have explored data and foung that 'email_in_url' coulmn is in wrong place so below code put it on right place.
column_name = 'email_in_url'
current_index = df.columns.get_loc(column_name)

target_index = 19
column = df.pop(column_name)
df.insert(target_index, column_name, column)
df.sample(5)

  df.insert(target_index, column_name, column)


Unnamed: 0,qty_dot_url,qty_hyphen_url,qty_underline_url,qty_slash_url,qty_questionmark_url,qty_equal_url,qty_at_url,qty_and_url,qty_exclamation_url,qty_space_url,qty_tilde_url,qty_comma_url,qty_plus_url,qty_asterisk_url,qty_hashtag_url,qty_dollar_url,qty_percent_url,qty_tld_url,length_url,email_in_url,qty_dot_domain,qty_hyphen_domain,qty_underline_domain,qty_slash_domain,qty_questionmark_domain,qty_equal_domain,qty_at_domain,qty_and_domain,qty_exclamation_domain,qty_space_domain,qty_tilde_domain,qty_comma_domain,qty_plus_domain,qty_asterisk_domain,qty_hashtag_domain,qty_dollar_domain,qty_percent_domain,qty_vowels_domain,domain_length,domain_in_ip,server_client_domain,qty_dot_directory,qty_hyphen_directory,qty_underline_directory,qty_slash_directory,qty_questionmark_directory,qty_equal_directory,qty_at_directory,qty_and_directory,qty_exclamation_directory,qty_space_directory,qty_tilde_directory,qty_comma_directory,qty_plus_directory,qty_asterisk_directory,qty_hashtag_directory,qty_dollar_directory,qty_percent_directory,directory_length,qty_dot_file,qty_hyphen_file,qty_underline_file,qty_slash_file,qty_questionmark_file,qty_equal_file,qty_at_file,qty_and_file,qty_exclamation_file,qty_space_file,qty_tilde_file,qty_comma_file,qty_plus_file,qty_asterisk_file,qty_hashtag_file,qty_dollar_file,qty_percent_file,file_length,qty_dot_params,qty_hyphen_params,qty_underline_params,qty_slash_params,qty_questionmark_params,qty_equal_params,qty_at_params,qty_and_params,qty_exclamation_params,qty_space_params,qty_tilde_params,qty_comma_params,qty_plus_params,qty_asterisk_params,qty_hashtag_params,qty_dollar_params,qty_percent_params,params_length,tld_present_params,qty_params,time_response,domain_spf,asn_ip,time_domain_activation,time_domain_expiration,qty_ip_resolved,qty_nameservers,qty_mx_servers,ttl_hostname,tls_ssl_certificate,qty_redirects,url_google_index,domain_google_index,url_shortened,phishing
13312,1,2,2,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,36.0,0.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,19.0,0.0,0.0,0.0,0.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,17.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.475876,0.0,19066.0,6422.0,151.0,1.0,2.0,1.0,14390.0,1.0,0.0,0.0,0.0,0.0,1.0
23245,3,0,0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,39.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,26.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,13.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.297799,0.0,47583.0,-1.0,-1.0,1.0,1.0,1.0,51.0,1.0,0.0,0.0,0.0,0.0,1.0
31340,2,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,18.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,18.0,0.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.143601,0.0,26496.0,6904.0,1131.0,1.0,2.0,2.0,596.0,0.0,0.0,0.0,0.0,0.0,0.0
33321,3,1,1,4,0.0,3.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,178.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,22.0,0.0,0.0,1.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,37.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,15.0,0.0,1.0,1.0,0.0,0.0,3.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,119.0,0.0,3.0,0.786669,1.0,24446.0,-1.0,-1.0,1.0,2.0,1.0,8589.0,0.0,0.0,0.0,0.0,0.0,1.0
28612,2,8,0,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,79.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,17.0,0.0,0.0,0.0,8.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,62.0,0.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,48.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.028968,0.0,34164.0,-1.0,-1.0,1.0,4.0,1.0,13.0,1.0,1.0,0.0,0.0,0.0,0.0


Below i have created five datasets based on their features


> The first group is based on the values of the attributes on the whole URL string.

> while the values of the following four groups are based on the particular sub-strings squence wise domain,directory,file,parameters.

> The last group attributes are based on the URL resolve metrics as well as on the external services such as Google search index.





1. Dataset attributes based on URL

In [16]:
url_data=df.iloc[:,list(range(20))+[-1]]

In [17]:
url_data.sample(5)

Unnamed: 0,qty_dot_url,qty_hyphen_url,qty_underline_url,qty_slash_url,qty_questionmark_url,qty_equal_url,qty_at_url,qty_and_url,qty_exclamation_url,qty_space_url,qty_tilde_url,qty_comma_url,qty_plus_url,qty_asterisk_url,qty_hashtag_url,qty_dollar_url,qty_percent_url,qty_tld_url,length_url,email_in_url,phishing
42626,4,0,0,2,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,86.0,1.0,1.0
28464,2,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,21.0,0.0,0.0
14373,3,4,0,6,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,56.0,0.0,1.0
40554,4,5,11,12,0.0,3.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,259.0,0.0,1.0
48155,2,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,21.0,0.0,0.0


2. Dataset attributes based on domain URL

In [18]:
domain_data=df.iloc[:,list(range(20,41)) +[-1]]

In [19]:
domain_data.sample(5)

Unnamed: 0,qty_dot_domain,qty_hyphen_domain,qty_underline_domain,qty_slash_domain,qty_questionmark_domain,qty_equal_domain,qty_at_domain,qty_and_domain,qty_exclamation_domain,qty_space_domain,qty_tilde_domain,qty_comma_domain,qty_plus_domain,qty_asterisk_domain,qty_hashtag_domain,qty_dollar_domain,qty_percent_domain,qty_vowels_domain,domain_length,domain_in_ip,server_client_domain,phishing
12936,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,17.0,0.0,0.0,0.0
37283,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,9.0,0.0,0.0,0.0
8509,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,17.0,0.0,0.0,1.0
12681,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,13.0,0.0,0.0,0.0
29037,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,28.0,0.0,0.0,0.0


3. Dataset attributes based on URL directory

In [20]:
directory_data=df.iloc[:,list(range(41,59))+[-1]]
directory_data.sample(5)

Unnamed: 0,qty_dot_directory,qty_hyphen_directory,qty_underline_directory,qty_slash_directory,qty_questionmark_directory,qty_equal_directory,qty_at_directory,qty_and_directory,qty_exclamation_directory,qty_space_directory,qty_tilde_directory,qty_comma_directory,qty_plus_directory,qty_asterisk_directory,qty_hashtag_directory,qty_dollar_directory,qty_percent_directory,directory_length,phishing
6589,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0
41148,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0
24060,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
19394,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0
17149,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,37.0,0.0


4. Dataset attributes based on URL file name

In [21]:
file_data=df.iloc[:,list(range(59,77))+[-1]]
file_data.sample(5)

Unnamed: 0,qty_dot_file,qty_hyphen_file,qty_underline_file,qty_slash_file,qty_questionmark_file,qty_equal_file,qty_at_file,qty_and_file,qty_exclamation_file,qty_space_file,qty_tilde_file,qty_comma_file,qty_plus_file,qty_asterisk_file,qty_hashtag_file,qty_dollar_file,qty_percent_file,file_length,phishing
207,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0
42450,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0
4755,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0
30077,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,14.0,0.0
35826,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,46.0,1.0


5. Dataset attributes based on URL parameters

In [22]:
params_data=df.iloc[:,list(range(77,97))+[-1]]
params_data.sample(5)

Unnamed: 0,qty_dot_params,qty_hyphen_params,qty_underline_params,qty_slash_params,qty_questionmark_params,qty_equal_params,qty_at_params,qty_and_params,qty_exclamation_params,qty_space_params,qty_tilde_params,qty_comma_params,qty_plus_params,qty_asterisk_params,qty_hashtag_params,qty_dollar_params,qty_percent_params,params_length,tld_present_params,qty_params,phishing
46060,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1.0
1703,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0
35333,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1.0
811,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1.0
34684,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0


6. Dataset attributes based on resolving URL and external services

In [23]:
es_data=df.iloc[:,97:]
es_data.sample(5)

Unnamed: 0,time_response,domain_spf,asn_ip,time_domain_activation,time_domain_expiration,qty_ip_resolved,qty_nameservers,qty_mx_servers,ttl_hostname,tls_ssl_certificate,qty_redirects,url_google_index,domain_google_index,url_shortened,phishing
35719,0.352374,1.0,26496.0,6755.0,2740.0,1.0,2.0,1.0,10418.0,0.0,1.0,0.0,0.0,0.0,1.0
15457,0.333146,1.0,53755.0,6334.0,239.0,1.0,6.0,5.0,1767.0,0.0,0.0,0.0,0.0,0.0,0.0
16200,0.255233,0.0,-1.0,-1.0,-1.0,1.0,4.0,0.0,279.0,1.0,1.0,0.0,0.0,0.0,1.0
17923,1.035823,0.0,50608.0,3452.0,166.0,1.0,4.0,1.0,3590.0,1.0,1.0,0.0,0.0,0.0,0.0
20695,0.870162,1.0,28299.0,4888.0,224.0,1.0,4.0,2.0,3586.0,0.0,0.0,0.0,0.0,0.0,1.0


# Univariate Analysis

In [24]:
import plotly.express as px
import plotly.graph_objects as go

In [25]:
url_data.corr()

Unnamed: 0,qty_dot_url,qty_hyphen_url,qty_underline_url,qty_slash_url,qty_questionmark_url,qty_equal_url,qty_at_url,qty_and_url,qty_exclamation_url,qty_space_url,qty_tilde_url,qty_comma_url,qty_plus_url,qty_asterisk_url,qty_hashtag_url,qty_dollar_url,qty_percent_url,qty_tld_url,length_url,email_in_url,phishing
qty_dot_url,1.0,0.116572,0.135041,0.241802,0.409465,0.494544,0.145472,0.537386,0.002523,-0.001649,0.024932,0.03539,0.024791,-0.00351,0.00233,0.008166,0.062078,0.419941,0.439771,0.344953,0.173234
qty_hyphen_url,0.116572,1.0,0.210178,0.307855,0.07303,0.180671,0.033631,0.131215,0.026772,0.022577,0.003461,0.033695,0.013453,-0.000773,-0.00038,0.004595,0.020989,0.147521,0.386476,0.073402,0.201757
qty_underline_url,0.135041,0.210178,1.0,0.254516,0.093627,0.343394,0.057991,0.260866,0.102829,-0.00013,0.041829,0.036094,0.027302,0.012399,0.018492,0.011414,0.089613,0.153424,0.413851,0.092939,0.195219
qty_slash_url,0.241802,0.307855,0.254516,1.0,0.193579,0.293879,0.088185,0.204093,0.032318,0.016698,0.071268,0.085531,0.024711,0.006699,0.001608,0.022073,0.045883,0.251948,0.558234,0.190721,0.696032
qty_questionmark_url,0.409465,0.07303,0.093627,0.193579,1.0,0.475696,0.12026,0.477852,0.035971,0.003202,-0.000858,0.10468,0.015708,0.000877,-0.000662,0.003498,0.018397,0.30264,0.308541,0.180063,0.109869
qty_equal_url,0.494544,0.180671,0.343394,0.293879,0.475696,1.0,0.14406,0.884647,0.066522,0.010956,-0.000492,0.047973,0.063732,0.003432,0.002896,0.020746,0.121454,0.375975,0.63819,0.327975,0.262337
qty_at_url,0.145472,0.033631,0.057991,0.088185,0.12026,0.14406,1.0,0.306787,0.078411,0.004661,0.027707,0.012622,0.016791,0.442591,0.198421,0.614894,0.271899,0.246423,0.165742,0.411016,0.09525
qty_and_url,0.537386,0.131215,0.260866,0.204093,0.477852,0.884647,0.306787,1.0,0.077185,0.026794,0.004609,0.025559,0.059537,0.150752,0.060024,0.215253,0.166118,0.308573,0.567328,0.271552,0.19051
qty_exclamation_url,0.002523,0.026772,0.102829,0.032318,0.035971,0.066522,0.078411,0.077185,1.0,-0.000557,0.101923,0.007288,0.061801,0.100517,0.072314,0.204088,0.050181,0.02238,0.058686,-0.000924,0.034035
qty_space_url,-0.001649,0.022577,-0.00013,0.016698,0.003202,0.010956,0.004661,0.026794,-0.000557,1.0,-0.000606,0.032223,-0.00037,-0.000223,-0.000122,-0.000275,-0.000815,0.005505,0.034346,0.01212,0.020804


In [26]:
px.imshow(url_data.corr(),color_continuous_scale="Blues")

In [27]:
px.imshow(domain_data.corr(),color_continuous_scale="Blues")


In [28]:
px.imshow(directory_data.corr(),color_continuous_scale="Blues")


In [29]:
px.imshow(file_data.corr(),color_continuous_scale="Blues")


In [30]:
X_train,X_test,y_train,y_test = train_test_split(df.drop(columns=['phishing']),
                                                 df['phishing'],
                                                 test_size=0.3,
                                                random_state=42)

In [31]:
y_test

8206     0.0
21489    0.0
47547    0.0
24823    0.0
36560    0.0
        ... 
37689    1.0
9029     1.0
47186    1.0
25201    0.0
37246    1.0
Name: phishing, Length: 14466, dtype: float64

In [32]:
lr=LinearRegression()

In [33]:
lr.fit(X_train,y_train)

In [34]:
X_test

Unnamed: 0,qty_dot_url,qty_hyphen_url,qty_underline_url,qty_slash_url,qty_questionmark_url,qty_equal_url,qty_at_url,qty_and_url,qty_exclamation_url,qty_space_url,qty_tilde_url,qty_comma_url,qty_plus_url,qty_asterisk_url,qty_hashtag_url,qty_dollar_url,qty_percent_url,qty_tld_url,length_url,email_in_url,qty_dot_domain,qty_hyphen_domain,qty_underline_domain,qty_slash_domain,qty_questionmark_domain,qty_equal_domain,qty_at_domain,qty_and_domain,qty_exclamation_domain,qty_space_domain,qty_tilde_domain,qty_comma_domain,qty_plus_domain,qty_asterisk_domain,qty_hashtag_domain,qty_dollar_domain,qty_percent_domain,qty_vowels_domain,domain_length,domain_in_ip,server_client_domain,qty_dot_directory,qty_hyphen_directory,qty_underline_directory,qty_slash_directory,qty_questionmark_directory,qty_equal_directory,qty_at_directory,qty_and_directory,qty_exclamation_directory,qty_space_directory,qty_tilde_directory,qty_comma_directory,qty_plus_directory,qty_asterisk_directory,qty_hashtag_directory,qty_dollar_directory,qty_percent_directory,directory_length,qty_dot_file,qty_hyphen_file,qty_underline_file,qty_slash_file,qty_questionmark_file,qty_equal_file,qty_at_file,qty_and_file,qty_exclamation_file,qty_space_file,qty_tilde_file,qty_comma_file,qty_plus_file,qty_asterisk_file,qty_hashtag_file,qty_dollar_file,qty_percent_file,file_length,qty_dot_params,qty_hyphen_params,qty_underline_params,qty_slash_params,qty_questionmark_params,qty_equal_params,qty_at_params,qty_and_params,qty_exclamation_params,qty_space_params,qty_tilde_params,qty_comma_params,qty_plus_params,qty_asterisk_params,qty_hashtag_params,qty_dollar_params,qty_percent_params,params_length,tld_present_params,qty_params,time_response,domain_spf,asn_ip,time_domain_activation,time_domain_expiration,qty_ip_resolved,qty_nameservers,qty_mx_servers,ttl_hostname,tls_ssl_certificate,qty_redirects,url_google_index,domain_google_index,url_shortened
8206,2,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,25.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,25.0,0.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.000000,1.0,46606.0,8040.0,1089.0,1.0,2.0,5.0,14376.0,0.0,2.0,0.0,0.0,0.0
21489,2,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,16.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,16.0,0.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.186705,0.0,1836.0,-1.0,-1.0,-1.0,2.0,1.0,-1.0,0.0,0.0,0.0,0.0,0.0
47547,2,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,22.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,22.0,0.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.743887,0.0,852.0,6734.0,1300.0,1.0,2.0,2.0,21597.0,0.0,0.0,0.0,0.0,0.0
24823,3,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,29.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,29.0,0.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.636720,0.0,13446.0,-1.0,-1.0,1.0,2.0,2.0,299.0,0.0,0.0,0.0,0.0,0.0
36560,2,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,19.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,19.0,0.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.000000,1.0,31815.0,3057.0,229.0,1.0,2.0,5.0,43190.0,0.0,-1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37689,2,1,0,4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,93.0,0.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,27.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,58.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,46.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0,1.0,0.675018,0.0,8075.0,-1.0,-1.0,1.0,4.0,0.0,84.0,1.0,1.0,0.0,0.0,0.0
9029,2,1,0,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,48.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,21.0,0.0,0.0,0.0,1.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,27.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1.276138,1.0,46606.0,2919.0,367.0,1.0,2.0,2.0,14390.0,1.0,0.0,0.0,0.0,0.0
47186,3,0,0,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,49.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,22.0,0.0,0.0,1.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,27.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,21.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,3.222893,0.0,26496.0,2775.0,146.0,1.0,2.0,2.0,599.0,0.0,0.0,0.0,0.0,0.0
25201,2,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,17.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,17.0,0.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,8.328733,0.0,13335.0,4405.0,708.0,2.0,2.0,1.0,299.0,0.0,-1.0,0.0,0.0,0.0


In [36]:
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score

In [37]:
y_pred = lr.predict(X_test)

In [38]:
y_test.values

array([0., 0., 0., ..., 1., 0., 1.])

In [39]:
print("MAE",mean_absolute_error(y_test,y_pred))

MAE 0.19755075897489346


In [40]:
print("MSE",mean_squared_error(y_test,y_pred))

MSE 0.6446127276945145


In [41]:
print("RMSE",np.sqrt(mean_squared_error(y_test,y_pred)))

RMSE 0.8028777788022997


In [43]:
print("R2 score",r2_score(y_test,y_pred))
r2 = r2_score(y_test,y_pred)

R2 score -1.8623712247484883


In [44]:
y_pred

array([-0.00491599,  0.12350848, -0.01933138, ...,  0.7474696 ,
        0.04426863,  0.94591773])

In [45]:
import numpy as np
from sklearn.linear_model import LogisticRegression

# Assuming you have your training data `X_train` and target labels `y_train`
# and your test data `X_test` for which you want to obtain binary predictions

# Train your model
model = LogisticRegression()
model.fit(X_train, y_train)

# Get predicted probabilities for the test data
probs = model.predict_proba(X_test)[:, 1]

# Apply a threshold of 0.5 to obtain binary predictions
binary_predictions = np.where(probs >= 0.5, 1, 0)

# `binary_predictions` now contains the binary outputs (0 or 1) from your model



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression

