In [102]:
import pandas as pd
import numpy as np
import random

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error


In [103]:
# 1. Load data
url_data = pd.read_csv('urldata.csv')


In [104]:
url_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 420464 entries, 0 to 420463
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   url     420464 non-null  object
 1   label   420464 non-null  object
dtypes: object(2)
memory usage: 6.4+ MB


In [105]:
#check null values
url_data.isnull().sum()


url      0
label    0
dtype: int64

In [106]:
#check dublicate values
url_data.duplicated().sum()

9216

In [107]:
#show dublicate values
url_data[url_data.duplicated()]


Unnamed: 0,url,label
125,stormpages.com/script/PHP.txt,bad
442,penwithian.co.uk/hyperventilate/sporran.js,bad
473,ukonline.hc0.me/Host.exe,bad
475,ukonline.hc0.me/new.exe,bad
506,praxisww.com/wp-stc.php,bad
...,...,...
420389,www.eye-watch.in/design/fancybox/Pnf.action,bad
420390,www.eye-watch.in/design/fancybox/Pnf.action,bad
420406,www.fapoergol.top/read.php?f=1.gif,bad
420439,cureeczemafast.org/wp-conf.gbn,bad


In [108]:
from sklearn.preprocessing import LabelEncoder


# Create a LabelEncoder object
label_encoder = LabelEncoder()

# Apply label encoding to the "label" column
url_data['label_encoded'] = label_encoder.fit_transform(url_data['label'])
url_data.head()


Unnamed: 0,url,label,label_encoded
0,diaryofagameaddict.com,bad,0
1,espdesign.com.au,bad,0
2,iamagameaddict.com,bad,0
3,kalantzis.net,bad,0
4,slightlyoffcenter.net,bad,0


In [109]:
url_data.drop_duplicates(inplace=True)

In [110]:
#dublicate values
url_data.duplicated().sum()


0

In [111]:
url_data.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 411248 entries, 0 to 420463
Data columns (total 3 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   url            411248 non-null  object
 1   label          411248 non-null  object
 2   label_encoded  411248 non-null  int32 
dtypes: int32(1), object(2)
memory usage: 11.0+ MB


### Data Vectorizaion

In [112]:
#data vectorization
def maketokens(f):
    tkns_by_slash=str(f.encode('utf-8')).split('/')	#get tokens after splitting by slash
    total_tokens=[]
    for i in tkns_by_slash:
        tokens = str(i).split('-')	#get tokens after splitting by dash
        tkns_by_dot=[]
        for j in range(0,len(tokens)):
            temp_tokens=str(tokens[j]).split('.')	#get tokens after splitting by dot
            tkns_by_dot=tkns_by_dot+temp_tokens
        total_tokens=total_tokens+tokens+tkns_by_dot

    total_tokens=list(set(total_tokens))	#remove redundant tokens
    if 'com' in total_tokens:
        total_tokens.remove('com')	#removing .com since it occurs a lot of times and it should not be included in our features
    return total_tokens

In [125]:
y = url_data['label'] #target variable


In [114]:
url_list = url_data['url'] #predictor variable

In [115]:
vectorizer = TfidfVectorizer(tokenizer=maketokens)

In [116]:
X = vectorizer.fit_transform(url_list)



### spliting the data into train and test

In [117]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [118]:
# from sklearn.preprocessing import LabelEncoder

# # Create a LabelEncoder object
# label_encoder = LabelEncoder()

# # Fit the encoder on your target variable and transform it
# y_train_encoded = label_encoder.fit_transform(y_train)


In [126]:
#model building 
model = LogisticRegression()
model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [127]:
#acccuracy 
print("Accuracy --> ", model.score(X_test, y_test))

Accuracy -->  0.9615440729483282


### Prediction

In [128]:
X_predict = ["google.com/search=jcharistech",
"google.com/search=faizanahmad",
"pakistanifacebookforever.com/getpassword.php/", 
"www.radsport-voggel.de/wp-admin/includes/log.exe", 
"ahrenhei.without-transfer.ru/nethost.exe ",
"www.itidea.it/centroesteticosothys/img/_notes/gum.exe"]

In [129]:
X_predict = vectorizer.transform(X_predict)
New_predict = model.predict(X_predict)

In [130]:
print(New_predict)

[1 1 1 0 0 0]


In [131]:
# https://db.aa419.org/fakebankslist.php
X_predict1 = ["www.buyfakebillsonlinee.blogspot.com", 
"www.unitedairlineslogistics.com",
"www.stonehousedelivery.com",
"www.silkroadmeds-onlinepharmacy.com" ]

In [132]:
X_predict1 = vectorizer.transform(X_predict1)
New_predict1 = model.predict(X_predict1)

In [133]:
print(New_predict1)

[0 0 0 0]
