# Session SetUp

In [1]:
import findspark
findspark.init()

from pyspark.sql import SparkSession;

spark = SparkSession.builder.master("local[4]").appName("ISM6562 Spark App01").getOrCreate();

sc = spark.sparkContext  

spark_session_port = spark.sparkContext.uiWebUrl.split(":")[-1]
print("Spark Session WebUI Port: " + spark_session_port)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/05/05 19:45:06 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Spark Session WebUI Port: 4040


In [2]:
spark

# Importing Data

In [3]:
df=spark.read.csv("/home/admin/Desktop/Final-Project/rawdata/phiusiilphishingurldataset/PhiUSIIL_Phishing_URL_Dataset.csv",header=True,inferSchema=True)

                                                                                

# Data Cleaning and Transformation

In [4]:
df.show(5)

24/05/05 19:45:43 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


+----------+--------------------+---------+--------------------+------------+----------+---+------------------+--------------------+-----------------+-----------+---------+-------------+--------------+------------------+----------------+----------------+----------------+---------------+---------------+---------------+--------------+------------------+--------------------------+---------------------+-------+----------+-----------------+--------+--------------------+---------------------+------------------+----------+------+------------+---------------+----------------+--------------+---------+----------+---------------------+------------+---------------+---------------+----------------+----+---+------+----------------+---------+-------+------+-----------+------------+---------------+-----+
|  FILENAME|                 URL|URLLength|              Domain|DomainLength|IsDomainIP|TLD|URLSimilarityIndex|CharContinuationRate|TLDLegitimateProb|URLCharProb|TLDLength|NoOfSubDomain|HasObfuscation

In [5]:
df.printSchema() 

root
 |-- FILENAME: string (nullable = true)
 |-- URL: string (nullable = true)
 |-- URLLength: integer (nullable = true)
 |-- Domain: string (nullable = true)
 |-- DomainLength: integer (nullable = true)
 |-- IsDomainIP: integer (nullable = true)
 |-- TLD: string (nullable = true)
 |-- URLSimilarityIndex: double (nullable = true)
 |-- CharContinuationRate: double (nullable = true)
 |-- TLDLegitimateProb: double (nullable = true)
 |-- URLCharProb: double (nullable = true)
 |-- TLDLength: integer (nullable = true)
 |-- NoOfSubDomain: integer (nullable = true)
 |-- HasObfuscation: integer (nullable = true)
 |-- NoOfObfuscatedChar: integer (nullable = true)
 |-- ObfuscationRatio: double (nullable = true)
 |-- NoOfLettersInURL: integer (nullable = true)
 |-- LetterRatioInURL: double (nullable = true)
 |-- NoOfDegitsInURL: integer (nullable = true)
 |-- DegitRatioInURL: double (nullable = true)
 |-- NoOfEqualsInURL: integer (nullable = true)
 |-- NoOfQMarkInURL: integer (nullable = true)
 |

## Check for Null Values

In [6]:
from pyspark.sql.functions import col, sum as sql_sum

null_counts = df.agg(*(sql_sum(col(c).isNull().cast("int")).alias(c) for c in df.columns)).collect()[0].asDict()
print(null_counts)



{'FILENAME': 0, 'URL': 0, 'URLLength': 0, 'Domain': 0, 'DomainLength': 0, 'IsDomainIP': 0, 'TLD': 0, 'URLSimilarityIndex': 0, 'CharContinuationRate': 0, 'TLDLegitimateProb': 0, 'URLCharProb': 0, 'TLDLength': 0, 'NoOfSubDomain': 0, 'HasObfuscation': 0, 'NoOfObfuscatedChar': 0, 'ObfuscationRatio': 0, 'NoOfLettersInURL': 0, 'LetterRatioInURL': 0, 'NoOfDegitsInURL': 0, 'DegitRatioInURL': 0, 'NoOfEqualsInURL': 0, 'NoOfQMarkInURL': 0, 'NoOfAmpersandInURL': 0, 'NoOfOtherSpecialCharsInURL': 0, 'SpacialCharRatioInURL': 0, 'IsHTTPS': 0, 'LineOfCode': 0, 'LargestLineLength': 0, 'HasTitle': 0, 'Title': 0, 'DomainTitleMatchScore': 0, 'URLTitleMatchScore': 0, 'HasFavicon': 0, 'Robots': 0, 'IsResponsive': 0, 'NoOfURLRedirect': 0, 'NoOfSelfRedirect': 0, 'HasDescription': 0, 'NoOfPopup': 0, 'NoOfiFrame': 0, 'HasExternalFormSubmit': 0, 'HasSocialNet': 0, 'HasSubmitButton': 0, 'HasHiddenFields': 0, 'HasPasswordField': 0, 'Bank': 0, 'Pay': 0, 'Crypto': 0, 'HasCopyrightInfo': 0, 'NoOfImage': 0, 'NoOfCSS': 

                                                                                

In [7]:
sum(null_counts.values())

0

>There are no null values in the dataset so need to apply fillna.

## Drop irrelevant columns

There are some columns columns that we will not required in out model building or analysis as they are of no use like file name so we need to drop them.

>> FILENAME has no significane in further analysis it is only name of file in which a perticular record is stored we can not use it for predicting if Website is phising or not.

>> We can drop *FILENAME*

In [8]:
df = df.drop("FILENAME")

## Rename columns

There are few column which are binary their are not started by IS which make them little inconvient for analysis we can rename those columns

In [9]:
df.select('Robots','Bank','Pay','Crypto').show(5)

+------+----+---+------+
|Robots|Bank|Pay|Crypto|
+------+----+---+------+
|     1|   1|  0|     0|
|     1|   0|  0|     0|
|     1|   0|  0|     0|
|     1|   0|  1|     1|
|     1|   1|  1|     0|
+------+----+---+------+
only showing top 5 rows



In [10]:
df=df.withColumnRenamed("Robots","IsRobots").withColumnRenamed("Bank","IsBank").withColumnRenamed("Pay","IsPay").withColumnRenamed("Crypto","IsCrypto")

## Data Type Conversion

>> All the columns have accurate datatypes due InferSchema Paramater while loading data into pyspark df.
>> We don't need to change any datatype. 

# Storing data into datawarehouse

>>Next step in our ETL pipeline is to Load Transformed data into datawarehouse so that we can take transformed i.e cleaned data according to our Business Need.

In [11]:
spark.sql("show databases").show()

+---------+
|namespace|
+---------+
|  default|
+---------+



In [12]:
spark.sql("show tables").show()

+---------+---------+-----------+
|namespace|tableName|isTemporary|
+---------+---------+-----------+
+---------+---------+-----------+



In [13]:
df.write.saveAsTable("mytb1", mode='overwrite')

                                                                                

In [14]:
spark.sql("show tables").show()

+---------+---------+-----------+
|namespace|tableName|isTemporary|
+---------+---------+-----------+
|  default|    mytb1|      false|
+---------+---------+-----------+



>>Now we have sucessfully Extracted data from source and did transformation on it and we loaded this transformed data as a Table in our Spark DataWarehouse.

>> For Further Model Building and Analysis we can directly extract data/columns from warehouse and continue our modelling from that which will reduce need to initial transformation everytime.

# Machine Learning Modelling using PySpark

>> Lets Load Back our transformed data for Modelling. We will pick columns depend on our Modelling requirement.

>> We will not work on TEXT column intially for the following reason:
1. Text Mining Included concept like TF-IDF Tokenlization which will increase number of dimension and complexity of Model Significantly.
2. If we are getting better result without them then we might not need to include them or else we can add them.

In [15]:
df = spark.sql("SELECT * FROM mytb1").drop("URL", "Domain","Title")

In [16]:
df.show(5)

+---------+------------+----------+---+------------------+--------------------+-----------------+-----------+---------+-------------+--------------+------------------+----------------+----------------+----------------+---------------+---------------+---------------+--------------+------------------+--------------------------+---------------------+-------+----------+-----------------+--------+---------------------+------------------+----------+--------+------------+---------------+----------------+--------------+---------+----------+---------------------+------------+---------------+---------------+----------------+------+-----+--------+----------------+---------+-------+------+-----------+------------+---------------+-----+
|URLLength|DomainLength|IsDomainIP|TLD|URLSimilarityIndex|CharContinuationRate|TLDLegitimateProb|URLCharProb|TLDLength|NoOfSubDomain|HasObfuscation|NoOfObfuscatedChar|ObfuscationRatio|NoOfLettersInURL|LetterRatioInURL|NoOfDegitsInURL|DegitRatioInURL|NoOfEqualsInURL|

##  Exploratory Data Analysis

>We will use pandas api for pyspark for EDA.<br>
>We can take out summary from this EDA which column need to apply which feature Engineering.

In [17]:
import pyspark.pandas as ps



In [18]:
df1=df.pandas_api()

In [19]:
type(df1)

pyspark.pandas.frame.DataFrame

In [20]:
df1.describe()

                                                                                

Unnamed: 0,URLLength,DomainLength,IsDomainIP,URLSimilarityIndex,CharContinuationRate,TLDLegitimateProb,URLCharProb,TLDLength,NoOfSubDomain,HasObfuscation,NoOfObfuscatedChar,ObfuscationRatio,NoOfLettersInURL,LetterRatioInURL,NoOfDegitsInURL,DegitRatioInURL,NoOfEqualsInURL,NoOfQMarkInURL,NoOfAmpersandInURL,NoOfOtherSpecialCharsInURL,SpacialCharRatioInURL,IsHTTPS,LineOfCode,LargestLineLength,HasTitle,DomainTitleMatchScore,URLTitleMatchScore,HasFavicon,IsRobots,IsResponsive,NoOfURLRedirect,NoOfSelfRedirect,HasDescription,NoOfPopup,NoOfiFrame,HasExternalFormSubmit,HasSocialNet,HasSubmitButton,HasHiddenFields,HasPasswordField,IsBank,IsPay,IsCrypto,HasCopyrightInfo,NoOfImage,NoOfCSS,NoOfJS,NoOfSelfRef,NoOfEmptyRef,NoOfExternalRef,label
count,235795.0,235795.0,235795.0,235795.0,235795.0,235795.0,235795.0,235795.0,235795.0,235795.0,235795.0,235795.0,235795.0,235795.0,235795.0,235795.0,235795.0,235795.0,235795.0,235795.0,235795.0,235795.0,235795.0,235795.0,235795.0,235795.0,235795.0,235795.0,235795.0,235795.0,235795.0,235795.0,235795.0,235795.0,235795.0,235795.0,235795.0,235795.0,235795.0,235795.0,235795.0,235795.0,235795.0,235795.0,235795.0,235795.0,235795.0,235795.0,235795.0,235795.0,235795.0
mean,34.573095,21.470396,0.002706,78.430778,0.845508,0.260423,0.055747,2.764456,1.164758,0.002057,0.024861,0.000138,19.428919,0.515946,1.881011,0.028616,0.062241,0.029403,0.025056,2.340198,0.063309,0.782625,1141.900443,12789.53,0.861261,50.131427,52.122098,0.361768,0.266541,0.624513,0.133438,0.040107,0.440183,0.221765,1.588638,0.043987,0.45657,0.414301,0.377799,0.102263,0.127089,0.237007,0.023474,0.486775,26.075689,6.333111,10.522305,65.071113,2.377629,49.262516,0.571895
std,41.314153,9.150793,0.051946,28.976055,0.216632,0.251628,0.010587,0.599739,0.600969,0.045306,1.876249,0.003817,29.09033,0.123315,11.886695,0.070897,0.934704,0.193505,0.836448,3.527603,0.032393,0.412461,3419.950513,152201.1,0.345675,49.676981,49.600564,0.480513,0.442151,0.484249,0.340048,0.19621,0.49641,3.87054,5.762561,0.205067,0.498111,0.492602,0.484838,0.302994,0.333074,0.425247,0.151403,0.499826,79.411815,74.866296,22.312192,176.687539,17.641097,161.02743,0.494805
min,13.0,4.0,0.0,0.155574,0.0,0.0,0.001083,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,22.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,23.0,16.0,0.0,57.02381,0.68,0.005977,0.050744,2.0,1.0,0.0,0.0,0.0,10.0,0.435,0.0,0.0,0.0,0.0,0.0,1.0,0.038,1.0,18.0,200.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
50%,27.0,20.0,0.0,100.0,1.0,0.079963,0.057969,3.0,1.0,0.0,0.0,0.0,14.0,0.519,0.0,0.0,0.0,0.0,0.0,1.0,0.05,1.0,429.0,1089.0,1.0,75.0,100.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,2.0,6.0,12.0,0.0,10.0,1.0
75%,34.0,24.0,0.0,100.0,1.0,0.522907,0.062874,3.0,1.0,0.0,0.0,0.0,20.0,0.594,0.0,0.0,0.0,0.0,0.0,3.0,0.083,1.0,1277.0,8047.0,1.0,100.0,100.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,29.0,7.0,15.0,88.0,1.0,57.0,1.0
max,6097.0,110.0,1.0,100.0,1.0,0.522907,0.090824,13.0,10.0,1.0,447.0,0.348,5191.0,0.926,2011.0,0.684,176.0,4.0,149.0,499.0,0.397,1.0,442666.0,13975730.0,1.0,100.0,100.0,1.0,1.0,1.0,1.0,1.0,1.0,602.0,1602.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,8956.0,35820.0,6957.0,27397.0,4887.0,27516.0,1.0


In [21]:
df1.info

                                                                                

<bound method DataFrame.info of      URLLength  DomainLength  IsDomainIP        TLD  URLSimilarityIndex  CharContinuationRate  TLDLegitimateProb  URLCharProb  TLDLength  NoOfSubDomain  HasObfuscation  NoOfObfuscatedChar  ObfuscationRatio  NoOfLettersInURL  LetterRatioInURL  NoOfDegitsInURL  DegitRatioInURL  NoOfEqualsInURL  NoOfQMarkInURL  NoOfAmpersandInURL  NoOfOtherSpecialCharsInURL  SpacialCharRatioInURL  IsHTTPS  LineOfCode  LargestLineLength  HasTitle  DomainTitleMatchScore  URLTitleMatchScore  HasFavicon  IsRobots  IsResponsive  NoOfURLRedirect  NoOfSelfRedirect  HasDescription  NoOfPopup  NoOfiFrame  HasExternalFormSubmit  HasSocialNet  HasSubmitButton  HasHiddenFields  HasPasswordField  IsBank  IsPay  IsCrypto  HasCopyrightInfo  NoOfImage  NoOfCSS  NoOfJS  NoOfSelfRef  NoOfEmptyRef  NoOfExternalRef  label
0           58            22           0         io           36.743529              0.684211           0.012927     0.061131          2              1               0       

In [22]:
df1.columns

Index(['URLLength', 'DomainLength', 'IsDomainIP', 'TLD', 'URLSimilarityIndex',
       'CharContinuationRate', 'TLDLegitimateProb', 'URLCharProb', 'TLDLength',
       'NoOfSubDomain', 'HasObfuscation', 'NoOfObfuscatedChar',
       'ObfuscationRatio', 'NoOfLettersInURL', 'LetterRatioInURL',
       'NoOfDegitsInURL', 'DegitRatioInURL', 'NoOfEqualsInURL',
       'NoOfQMarkInURL', 'NoOfAmpersandInURL', 'NoOfOtherSpecialCharsInURL',
       'SpacialCharRatioInURL', 'IsHTTPS', 'LineOfCode', 'LargestLineLength',
       'HasTitle', 'DomainTitleMatchScore', 'URLTitleMatchScore', 'HasFavicon',
       'IsRobots', 'IsResponsive', 'NoOfURLRedirect', 'NoOfSelfRedirect',
       'HasDescription', 'NoOfPopup', 'NoOfiFrame', 'HasExternalFormSubmit',
       'HasSocialNet', 'HasSubmitButton', 'HasHiddenFields',
       'HasPasswordField', 'IsBank', 'IsPay', 'IsCrypto', 'HasCopyrightInfo',
       'NoOfImage', 'NoOfCSS', 'NoOfJS', 'NoOfSelfRef', 'NoOfEmptyRef',
       'NoOfExternalRef', 'label'],
      dtype='

## Univariant Analysis

### URLLength

Its is numerical column lets check Distribution

In [23]:
df1['URLLength'].hist(bins=1000)

Check its relation with target column

In [24]:
df1[['URLLength','label']].groupby('label').mean()


Default value of `numeric_only` will be changed to `False` instead of `True` in 4.0.0.



Unnamed: 0_level_0,URLLength
label,Unnamed: 1_level_1
1,26.22861
0,45.720293


We can clearly see avg length of phising URL is Lower which means shorter the URL higher chances of phising 

### DomainLength

In [25]:
df1[['DomainLength']].hist(bins=100)

In [26]:
df1[['DomainLength','label']].groupby('label').mean()


Default value of `numeric_only` will be changed to `False` instead of `True` in 4.0.0.



Unnamed: 0_level_0,DomainLength
label,Unnamed: 1_level_1
1,19.22861
0,24.465144


### IsDomainIP

In [27]:
df1['IsDomainIP'].value_counts()


The resulting Series will have a fixed name of 'count' from 4.0.0.



0    235157
1       638
Name: IsDomainIP, dtype: int64

In [28]:
df1[['IsDomainIP','label']].groupby('label').count().reset_index()

Unnamed: 0,label,IsDomainIP
0,1,134850
1,0,100945


### URLSimilarityIndex

In [29]:
df1.columns

Index(['URLLength', 'DomainLength', 'IsDomainIP', 'TLD', 'URLSimilarityIndex',
       'CharContinuationRate', 'TLDLegitimateProb', 'URLCharProb', 'TLDLength',
       'NoOfSubDomain', 'HasObfuscation', 'NoOfObfuscatedChar',
       'ObfuscationRatio', 'NoOfLettersInURL', 'LetterRatioInURL',
       'NoOfDegitsInURL', 'DegitRatioInURL', 'NoOfEqualsInURL',
       'NoOfQMarkInURL', 'NoOfAmpersandInURL', 'NoOfOtherSpecialCharsInURL',
       'SpacialCharRatioInURL', 'IsHTTPS', 'LineOfCode', 'LargestLineLength',
       'HasTitle', 'DomainTitleMatchScore', 'URLTitleMatchScore', 'HasFavicon',
       'IsRobots', 'IsResponsive', 'NoOfURLRedirect', 'NoOfSelfRedirect',
       'HasDescription', 'NoOfPopup', 'NoOfiFrame', 'HasExternalFormSubmit',
       'HasSocialNet', 'HasSubmitButton', 'HasHiddenFields',
       'HasPasswordField', 'IsBank', 'IsPay', 'IsCrypto', 'HasCopyrightInfo',
       'NoOfImage', 'NoOfCSS', 'NoOfJS', 'NoOfSelfRef', 'NoOfEmptyRef',
       'NoOfExternalRef', 'label'],
      dtype='

In [30]:
df1[['URLSimilarityIndex']].hist(bins=100)

In [31]:
df1[['URLSimilarityIndex','label']].groupby('label').mean()


Default value of `numeric_only` will be changed to `False` instead of `True` in 4.0.0.



Unnamed: 0_level_0,URLSimilarityIndex
label,Unnamed: 1_level_1
1,100.0
0,49.616973


### CharContinuationRate

In [32]:
df1[['CharContinuationRate']].hist(bins=100)

In [33]:
df1[['CharContinuationRate','label']].groupby('label').mean()


Default value of `numeric_only` will be changed to `False` instead of `True` in 4.0.0.



Unnamed: 0_level_0,CharContinuationRate
label,Unnamed: 1_level_1
1,0.933176
0,0.728395


###

###TLDLegitimateProb

In [34]:
df1[['TLDLegitimateProb']].hist(bins=100)

In [35]:
df1[['TLDLegitimateProb','label']].groupby('label').mean()


Default value of `numeric_only` will be changed to `False` instead of `True` in 4.0.0.



Unnamed: 0_level_0,TLDLegitimateProb
label,Unnamed: 1_level_1
1,0.281625
0,0.232099


### URLCharProb

In [36]:
df1[['URLCharProb']].hist(bins=100)

In [37]:
df1[['URLCharProb','label']].groupby('label').mean()


Default value of `numeric_only` will be changed to `False` instead of `True` in 4.0.0.



Unnamed: 0_level_0,URLCharProb
label,Unnamed: 1_level_1
1,0.06005
0,0.049999


### TLDLength

In [38]:
df1[['TLDLength']].hist(bins=100)

In [39]:
df1[['TLDLength','label']].groupby('label').mean()


Default value of `numeric_only` will be changed to `False` instead of `True` in 4.0.0.



Unnamed: 0_level_0,TLDLength
label,Unnamed: 1_level_1
1,2.723382
0,2.819327


### NoOfSubDomain

In [40]:
df1[['NoOfSubDomain']].hist(bins=100)

In [41]:
df1[['NoOfSubDomain','label']].groupby('label').mean()


Default value of `numeric_only` will be changed to `False` instead of `True` in 4.0.0.



Unnamed: 0_level_0,NoOfSubDomain
label,Unnamed: 1_level_1
1,1.161661
0,1.168894


### HasObfuscation

In [42]:
df1[['HasObfuscation']].hist(bins=100)

In [43]:
df1[['HasObfuscation','label']].groupby('label').mean()


Default value of `numeric_only` will be changed to `False` instead of `True` in 4.0.0.



Unnamed: 0_level_0,HasObfuscation
label,Unnamed: 1_level_1
1,0.0
0,0.004805


### NoOfObfuscatedChar

In [44]:
df1[['NoOfObfuscatedChar']].hist(bins=100)

In [45]:
df1[['NoOfObfuscatedChar','label']].groupby('label').mean()


Default value of `numeric_only` will be changed to `False` instead of `True` in 4.0.0.



Unnamed: 0_level_0,NoOfObfuscatedChar
label,Unnamed: 1_level_1
1,0.0
0,0.058071


### ObfuscationRatio

In [46]:
df1[['ObfuscationRatio']].hist(bins=100)

In [47]:
df1[['ObfuscationRatio','label']].groupby('label').mean()


Default value of `numeric_only` will be changed to `False` instead of `True` in 4.0.0.



Unnamed: 0_level_0,ObfuscationRatio
label,Unnamed: 1_level_1
1,0.0
0,0.000323


### NoOfLettersInURL

In [48]:
df1[['NoOfLettersInURL']].hist(bins=100)

In [49]:
df1[['NoOfLettersInURL','label']].groupby('label').mean()


Default value of `numeric_only` will be changed to `False` instead of `True` in 4.0.0.



Unnamed: 0_level_0,NoOfLettersInURL
label,Unnamed: 1_level_1
1,12.933059
0,28.106583


### LetterRatioInURL

In [50]:
df1[['LetterRatioInURL']].hist(bins=100)

In [51]:
df1[['LetterRatioInURL','label']].groupby('label').mean()


Default value of `numeric_only` will be changed to `False` instead of `True` in 4.0.0.



Unnamed: 0_level_0,LetterRatioInURL
label,Unnamed: 1_level_1
1,0.476705
0,0.568366


### NoOfDegitsInURL

In [52]:
df1[['NoOfDegitsInURL']].hist(bins=100)

In [53]:
df1[['NoOfDegitsInURL','label']].groupby('label').mean()


Default value of `numeric_only` will be changed to `False` instead of `True` in 4.0.0.



Unnamed: 0_level_0,NoOfDegitsInURL
label,Unnamed: 1_level_1
1,0.050597
0,4.326217


### DegitRatioInURL

In [54]:
df1[['DegitRatioInURL']].hist(bins=100)

In [55]:
df1[['DegitRatioInURL','label']].groupby('label').mean()


Default value of `numeric_only` will be changed to `False` instead of `True` in 4.0.0.



Unnamed: 0_level_0,DegitRatioInURL
label,Unnamed: 1_level_1
1,0.002115
0,0.064018


### NoOfEqualsInURL

In [56]:
df1[['NoOfEqualsInURL']].hist(bins=100)

In [57]:
df1[['NoOfEqualsInURL','label']].groupby('label').mean()


Default value of `numeric_only` will be changed to `False` instead of `True` in 4.0.0.



Unnamed: 0_level_0,NoOfEqualsInURL
label,Unnamed: 1_level_1
1,0.0
0,0.145386


### NoOfQMarkInURL

In [58]:
df1[['NoOfQMarkInURL']].hist(bins=100)

In [59]:
df1[['NoOfQMarkInURL','label']].groupby('label').mean()


Default value of `numeric_only` will be changed to `False` instead of `True` in 4.0.0.



Unnamed: 0_level_0,NoOfQMarkInURL
label,Unnamed: 1_level_1
1,0.0
0,0.068681


### NoOfAmpersandInURL

In [60]:
df1[['NoOfAmpersandInURL']].hist(bins=100)

In [61]:
df1[['NoOfAmpersandInURL','label']].groupby('label').mean()


Default value of `numeric_only` will be changed to `False` instead of `True` in 4.0.0.



Unnamed: 0_level_0,NoOfAmpersandInURL
label,Unnamed: 1_level_1
1,0.0
0,0.058527


### NoOfOtherSpecialCharsInURL

In [62]:
df1[['NoOfOtherSpecialCharsInURL']].hist(bins=100)

In [63]:
df1[['NoOfOtherSpecialCharsInURL','label']].groupby('label').mean()


Default value of `numeric_only` will be changed to `False` instead of `True` in 4.0.0.



Unnamed: 0_level_0,NoOfOtherSpecialCharsInURL
label,Unnamed: 1_level_1
1,1.244835
0,3.803467


### SpacialCharRatioInURL

In [64]:
df1[['SpacialCharRatioInURL']].hist(bins=100)

In [65]:
df1[['SpacialCharRatioInURL','label']].groupby('label').mean()


Default value of `numeric_only` will be changed to `False` instead of `True` in 4.0.0.



Unnamed: 0_level_0,SpacialCharRatioInURL
label,Unnamed: 1_level_1
1,0.048356
0,0.083285


### IsHTTPS

In [66]:
df1['IsHTTPS'].value_counts()


The resulting Series will have a fixed name of 'count' from 4.0.0.



1    184539
0     51256
Name: IsHTTPS, dtype: int64

In [67]:
df1[['IsHTTPS','label']].groupby('label').count().reset_index()

Unnamed: 0,label,IsHTTPS
0,1,134850
1,0,100945


### LineOfCode

In [68]:
df1[['LineOfCode']].hist(bins=100)

In [69]:
df1[['LineOfCode','label']].groupby('label').mean()


Default value of `numeric_only` will be changed to `False` instead of `True` in 4.0.0.



Unnamed: 0_level_0,LineOfCode
label,Unnamed: 1_level_1
1,1947.49168
0,65.730467


### LargestLineLength

In [70]:
df1[['LargestLineLength']].hist(bins=100)

In [71]:
df1[['LargestLineLength','label']].groupby('label').mean()


Default value of `numeric_only` will be changed to `False` instead of `True` in 4.0.0.



Unnamed: 0_level_0,LargestLineLength
label,Unnamed: 1_level_1
1,7375.919555
0,20021.448244


### HasTitle

In [72]:
df1[['HasTitle']].hist(bins=100)

In [73]:
df1[['HasTitle','label']].groupby('label').mean()


Default value of `numeric_only` will be changed to `False` instead of `True` in 4.0.0.



Unnamed: 0_level_0,HasTitle
label,Unnamed: 1_level_1
1,0.998754
0,0.677587


### DomainTitleMatchScore

In [74]:
df1[['DomainTitleMatchScore']].hist(bins=100)


In [75]:
df1[['DomainTitleMatchScore','label']].groupby('label').mean()


Default value of `numeric_only` will be changed to `False` instead of `True` in 4.0.0.



Unnamed: 0_level_0,DomainTitleMatchScore
label,Unnamed: 1_level_1
1,75.270904
0,16.548205


### URLTitleMatchScore

In [76]:
df1[['URLTitleMatchScore']].hist(bins=100)


In [77]:
df1[['URLTitleMatchScore','label']].groupby('label').mean()


Default value of `numeric_only` will be changed to `False` instead of `True` in 4.0.0.



Unnamed: 0_level_0,URLTitleMatchScore
label,Unnamed: 1_level_1
1,75.270904
0,21.198164


### HasFavicon

In [78]:
df1[['HasFavicon']].hist(bins=100)


In [79]:
df1[['HasFavicon','label']].groupby('label').mean()


Default value of `numeric_only` will be changed to `False` instead of `True` in 4.0.0.



Unnamed: 0_level_0,HasFavicon
label,Unnamed: 1_level_1
1,0.567023
0,0.087572


### IsRobots

In [80]:
df1['IsRobots'].value_counts()


The resulting Series will have a fixed name of 'count' from 4.0.0.



0    172946
1     62849
Name: IsRobots, dtype: int64

In [81]:
df1[['IsRobots','label']].groupby('label').count().reset_index()

Unnamed: 0,label,IsRobots
0,1,134850
1,0,100945


### IsResponsive

In [82]:
df1['IsResponsive'].value_counts()


The resulting Series will have a fixed name of 'count' from 4.0.0.



1    147257
0     88538
Name: IsResponsive, dtype: int64

In [83]:
df1[['IsResponsive','label']].groupby('label').count().reset_index()

Unnamed: 0,label,IsResponsive
0,1,134850
1,0,100945


### NoOfURLRedirect

In [84]:
df1[['NoOfURLRedirect']].hist(bins=100)


In [85]:
df1[['NoOfURLRedirect','label']].groupby('label').mean()


Default value of `numeric_only` will be changed to `False` instead of `True` in 4.0.0.



Unnamed: 0_level_0,NoOfURLRedirect
label,Unnamed: 1_level_1
1,0.11977
0,0.151696


### NoOfSelfRedirect

In [86]:
df1[['NoOfSelfRedirect']].hist(bins=100)


In [87]:
df1[['NoOfSelfRedirect','label']].groupby('label').mean()


Default value of `numeric_only` will be changed to `False` instead of `True` in 4.0.0.



Unnamed: 0_level_0,NoOfSelfRedirect
label,Unnamed: 1_level_1
1,0.027126
0,0.057447


### HasDescription

In [88]:
df1[['HasDescription']].hist(bins=100)


In [89]:
df1[['HasDescription','label']].groupby('label').mean()


Default value of `numeric_only` will be changed to `False` instead of `True` in 4.0.0.



Unnamed: 0_level_0,HasDescription
label,Unnamed: 1_level_1
1,0.736633
0,0.044163


### NoOfPopup

In [90]:
df1[['DomainLength']].hist(bins=100)

In [91]:
df1[['DomainLength','label']].groupby('label').mean()


Default value of `numeric_only` will be changed to `False` instead of `True` in 4.0.0.



Unnamed: 0_level_0,DomainLength
label,Unnamed: 1_level_1
1,19.22861
0,24.465144


In [92]:
df1[['DomainLength']].hist(bins=100)

In [93]:
df1[['DomainLength','label']].groupby('label').mean()


Default value of `numeric_only` will be changed to `False` instead of `True` in 4.0.0.



Unnamed: 0_level_0,DomainLength
label,Unnamed: 1_level_1
1,19.22861
0,24.465144


In [94]:
df1[['DomainLength']].hist(bins=100)

In [95]:
df1[['DomainLength','label']].groupby('label').mean()


Default value of `numeric_only` will be changed to `False` instead of `True` in 4.0.0.



Unnamed: 0_level_0,DomainLength
label,Unnamed: 1_level_1
1,19.22861
0,24.465144


In [96]:
df1[['DomainLength']].hist(bins=100)

In [97]:
df1[['DomainLength','label']].groupby('label').mean()


Default value of `numeric_only` will be changed to `False` instead of `True` in 4.0.0.



Unnamed: 0_level_0,DomainLength
label,Unnamed: 1_level_1
1,19.22861
0,24.465144


In [98]:
df1[['DomainLength']].hist(bins=100)

In [99]:
df1[['DomainLength','label']].groupby('label').mean()


Default value of `numeric_only` will be changed to `False` instead of `True` in 4.0.0.



Unnamed: 0_level_0,DomainLength
label,Unnamed: 1_level_1
1,19.22861
0,24.465144


In [100]:
df1[['DomainLength']].hist(bins=100)

In [101]:
df1[['DomainLength','label']].groupby('label').mean()


Default value of `numeric_only` will be changed to `False` instead of `True` in 4.0.0.



Unnamed: 0_level_0,DomainLength
label,Unnamed: 1_level_1
1,19.22861
0,24.465144


In [102]:
df1[['DomainLength']].hist(bins=100)

In [103]:
df1[['DomainLength','label']].groupby('label').mean()


Default value of `numeric_only` will be changed to `False` instead of `True` in 4.0.0.



Unnamed: 0_level_0,DomainLength
label,Unnamed: 1_level_1
1,19.22861
0,24.465144


In [104]:
df1[['DomainLength']].hist(bins=100)

In [105]:
df1[['DomainLength','label']].groupby('label').mean()


Default value of `numeric_only` will be changed to `False` instead of `True` in 4.0.0.



Unnamed: 0_level_0,DomainLength
label,Unnamed: 1_level_1
1,19.22861
0,24.465144


In [106]:
df1[['DomainLength']].hist(bins=100)

In [107]:
df1[['DomainLength','label']].groupby('label').mean()


Default value of `numeric_only` will be changed to `False` instead of `True` in 4.0.0.



Unnamed: 0_level_0,DomainLength
label,Unnamed: 1_level_1
1,19.22861
0,24.465144


In [108]:
df1[['DomainLength']].hist(bins=100)

In [109]:
df1[['DomainLength','label']].groupby('label').mean()


Default value of `numeric_only` will be changed to `False` instead of `True` in 4.0.0.



Unnamed: 0_level_0,DomainLength
label,Unnamed: 1_level_1
1,19.22861
0,24.465144


In [110]:
df1[['DomainLength']].hist(bins=100)

In [111]:
df1[['DomainLength','label']].groupby('label').mean()


Default value of `numeric_only` will be changed to `False` instead of `True` in 4.0.0.



Unnamed: 0_level_0,DomainLength
label,Unnamed: 1_level_1
1,19.22861
0,24.465144


In [112]:
df1[['DomainLength']].hist(bins=100)

In [113]:
df1[['DomainLength','label']].groupby('label').mean()


Default value of `numeric_only` will be changed to `False` instead of `True` in 4.0.0.



Unnamed: 0_level_0,DomainLength
label,Unnamed: 1_level_1
1,19.22861
0,24.465144


In [114]:
df1[['DomainLength']].hist(bins=100)

In [115]:
df1[['DomainLength','label']].groupby('label').mean()


Default value of `numeric_only` will be changed to `False` instead of `True` in 4.0.0.



Unnamed: 0_level_0,DomainLength
label,Unnamed: 1_level_1
1,19.22861
0,24.465144


In [116]:
df1[['DomainLength']].hist(bins=100)

In [117]:
df1[['DomainLength','label']].groupby('label').mean()


Default value of `numeric_only` will be changed to `False` instead of `True` in 4.0.0.



Unnamed: 0_level_0,DomainLength
label,Unnamed: 1_level_1
1,19.22861
0,24.465144


## Feature Engineering

In [118]:
from pyspark.ml.feature import VectorAssembler, StandardScaler, OneHotEncoder, StringIndexer
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [119]:
df.columns

['URLLength',
 'DomainLength',
 'IsDomainIP',
 'TLD',
 'URLSimilarityIndex',
 'CharContinuationRate',
 'TLDLegitimateProb',
 'URLCharProb',
 'TLDLength',
 'NoOfSubDomain',
 'HasObfuscation',
 'NoOfObfuscatedChar',
 'ObfuscationRatio',
 'NoOfLettersInURL',
 'LetterRatioInURL',
 'NoOfDegitsInURL',
 'DegitRatioInURL',
 'NoOfEqualsInURL',
 'NoOfQMarkInURL',
 'NoOfAmpersandInURL',
 'NoOfOtherSpecialCharsInURL',
 'SpacialCharRatioInURL',
 'IsHTTPS',
 'LineOfCode',
 'LargestLineLength',
 'HasTitle',
 'DomainTitleMatchScore',
 'URLTitleMatchScore',
 'HasFavicon',
 'IsRobots',
 'IsResponsive',
 'NoOfURLRedirect',
 'NoOfSelfRedirect',
 'HasDescription',
 'NoOfPopup',
 'NoOfiFrame',
 'HasExternalFormSubmit',
 'HasSocialNet',
 'HasSubmitButton',
 'HasHiddenFields',
 'HasPasswordField',
 'IsBank',
 'IsPay',
 'IsCrypto',
 'HasCopyrightInfo',
 'NoOfImage',
 'NoOfCSS',
 'NoOfJS',
 'NoOfSelfRef',
 'NoOfEmptyRef',
 'NoOfExternalRef',
 'label']

In [120]:
len(df.columns)

52

In [121]:
# Define categorical, numeric columns
categorical_cols = ['TLD','TLDLength','NoOfSubDomain']
numeric_cols = ['URLLength','DomainLength','URLSimilarityIndex','CharContinuationRate','TLDLegitimateProb','URLCharProb','NoOfObfuscatedChar','ObfuscationRatio','NoOfLettersInURL','LetterRatioInURL','NoOfDegitsInURL','DegitRatioInURL','NoOfEqualsInURL','NoOfQMarkInURL','NoOfAmpersandInURL','NoOfOtherSpecialCharsInURL','SpacialCharRatioInURL','LineOfCode','LargestLineLength','DomainTitleMatchScore','URLTitleMatchScore','NoOfPopup','NoOfiFrame','NoOfImage','NoOfCSS','NoOfJS','NoOfSelfRef','NoOfEmptyRef','NoOfExternalRef']
binary_cols = ['IsHTTPS','HasTitle','HasFavicon','IsRobots','IsResponsive','NoOfURLRedirect','NoOfSelfRedirect','HasDescription','HasExternalFormSubmit','HasSocialNet','HasSubmitButton','HasHiddenFields','HasPasswordField','IsBank','IsPay','IsCrypto','HasCopyrightInfo']

In [122]:
len(categorical_cols)+len(numeric_cols)+len(binary_cols)

49

In [123]:
# Define stages for the preprocessing pipeline
stages = []

# StringIndexer for categorical columns
for col in categorical_cols:
    indexer = StringIndexer(inputCol=col, outputCol=col+'_index')
    stages.append(indexer)

# StandardScaler for numeric columns
numeric_assembler = VectorAssembler(inputCols=numeric_cols, outputCol='numeric_features')
scaler = StandardScaler(inputCol='numeric_features', outputCol='scaled_numeric_features', withStd=True, withMean=False)
stages += [numeric_assembler, scaler]

# OneHotEncoder for binary columns
for col in binary_cols:
    encoder = OneHotEncoder(inputCol=col, outputCol=col+'_onehot')
    stages.append(encoder)

# VectorAssembler for all features
assembler_inputs = [col + '_index' for col in categorical_cols] + [col + '_onehot' for col in binary_cols] + ['scaled_numeric_features']
assembler = VectorAssembler(inputCols=assembler_inputs, outputCol='features')
stages.append(assembler)

# Pipeline for preprocessing
pipeline = Pipeline(stages=stages)

In [124]:
# Fit the pipeline to the data
pipeline_model = pipeline.fit(df)
transformed_df = pipeline_model.transform(df)

                                                                                

In [125]:
transformed_df.show()

+---------+------------+----------+---+------------------+--------------------+-----------------+-----------+---------+-------------+--------------+------------------+----------------+----------------+----------------+---------------+---------------+---------------+--------------+------------------+--------------------------+---------------------+-------+----------+-----------------+--------+---------------------+------------------+----------+--------+------------+---------------+----------------+--------------+---------+----------+---------------------+------------+---------------+---------------+----------------+------+-----+--------+----------------+---------+-------+------+-----------+------------+---------------+-----+---------+---------------+-------------------+--------------------+-----------------------+--------------+---------------+-----------------+---------------+-------------------+----------------------+-----------------------+---------------------+----------------------

In [126]:
train_df, test_df = transformed_df.randomSplit([0.8, 0.2], seed=42)

## Logistic Regression

In [127]:
# Instantiate the logistic regression model
lr = LogisticRegression(featuresCol='features', labelCol='label')

# Fit the model on the training data
lr_model = lr.fit(train_df)

                                                                                

In [128]:
# Make predictions on the test data
predictions = lr_model.transform(test_df)


# Evaluate the model
evaluator = MulticlassClassificationEvaluator(labelCol='label', predictionCol='prediction', metricName='f1')
f1_score = evaluator.evaluate(predictions)

# Print the accuracy
print("f1_score for Logistic Regression:", f1_score)



f1_score for Logistic Regression: 0.9999575848832525


                                                                                

## Decision Tree Classifier

In [129]:
from pyspark.ml.classification import DecisionTreeClassifier

In [130]:
dt = DecisionTreeClassifier(labelCol="label", featuresCol="features",maxBins=1000)

In [131]:
dt_model = dt.fit(train_df)

                                                                                

In [132]:
# Make predictions on the test data
predictions = dt_model.transform(test_df)

# Evaluate the model
evaluator = MulticlassClassificationEvaluator(labelCol='label', predictionCol='prediction', metricName='f1')
f1_score = evaluator.evaluate(predictions)

# Print the accuracy
print("f1_score for DT:", f1_score)



f1_score for DT: 1.0


                                                                                

## Gradient-boosted tree classifier

In [133]:
from pyspark.ml.classification import GBTClassifier

In [134]:
gbt = GBTClassifier(labelCol="label", featuresCol="features", maxIter=10,maxBins=1000)

In [135]:
gbt_model = gbt.fit(train_df)

                                                                                

In [136]:
# Evaluate the model
evaluator = MulticlassClassificationEvaluator(labelCol='label', predictionCol='prediction', metricName='f1')
f1_score = evaluator.evaluate(predictions)

# Print the accuracy
print("f1_score for Gradient Boosting:", f1_score)




f1_score for Gradient Boosting: 1.0


                                                                                