#### Importing all the required libraries

In [1]:
import pandas as pd
import numpy as np
import random
import pickle
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

#### Importing the Dataset
### Source : 

In [2]:
url = 'data_url.csv'
url_csv = pd.read_csv(url,',',error_bad_lines=False)

#converting the data from csv to dataframe for easy handling
url_df = pd.DataFrame(url_csv)

#to convert into array 
url_df = np.array(url_df)  
random.shuffle(url_df)

#### Seperating the data according to it's characteristics

In [None]:
y = [d[1] for d in url_df]                 
urls = [d[0] for d in url_df]

#### Since the urls are different from our normal text documents, we need to use a sanitization method to get the relevant data from raw urls.

In [4]:
def sanitization(web):
    web = web.lower()
    token = []
    dot_token_slash = []
    raw_slash = str(web).split('/')
    for i in raw_slash:
        # removing slash to get token
        raw1 = str(i).split('-')
        slash_token = []
        for j in range(0,len(raw1)):
            # removing dot to get the tokens
            raw2 = str(raw1[j]).split('.')
            slash_token = slash_token + raw2
        dot_token_slash = dot_token_slash + raw1 + slash_token
    # to remove same words
    token = list(set(dot_token_slash))  
    if 'com' in token:
        #remove com
        token.remove('com')
    return token

#### We will have to pass the data to our custom vectorizer function using Tf-idf approach 

In [5]:
# term-frequency and inverse-document-frequency
vectorizer = TfidfVectorizer(tokenizer=sanitization)

#### Splitting the test set and train set

In [None]:
x = vectorizer.fit_transform(urls)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

#### Training

In [7]:
lgr = LogisticRegression(solver='lbfgs', max_iter=1000)                  # Logistic regression
lgr.fit(x_train, y_train)
score = lgr.score(x_test, y_test)
print("score: {0:.2f} %".format(100 * score))
vectorizer_save = vectorizer

score: 98.46 %


#### Saving the modle and vectors

In [8]:
file = "pickel_model.pkl"
with open(file, 'wb') as f:
    pickle.dump(lgr, f)
f.close()

file2 = "pickel_vector.pkl"
with open(file2,'wb') as f2:
    pickle.dump(vectorizer_save, f2)
f2.close()