In [1]:
!pip install pandas
!pip install numpy
!pip install sklearn



In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import joblib
import re
import random
from sklearn.metrics import accuracy_score

In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [4]:
def getTokens(input):
    tokensBySlash = str(input.encode('utf-8')).split('/')
    allTokens = []
    for i in tokensBySlash:
        tokens = str(i).split('-')
        tokensByDot = []
        for j in range(0,len(tokens)):
            tempTokens = str(tokens[j]).split('.')
            tokensByDot = tokensByDot + tempTokens
        allTokens = allTokens + tokens + tokensByDot
    allTokens = list(set(allTokens))
    if 'com' in allTokens:
        allTokens.remove('com')
    return allTokens

#function to remove "http://" from URL
def trim(url):
    return re.match(r'(?:\w*://)?(?:.*\.)?([a-zA-Z-1-9]*\.[a-zA-Z]{1,}).*', url).groups()[0]

In [5]:
#read from a file
data = pd.read_csv("data.csv",',',error_bad_lines=False)

In [6]:
data.head()

Unnamed: 0,url,label
0,diaryofagameaddict.com,bad
1,espdesign.com.au,bad
2,iamagameaddict.com,bad
3,kalantzis.net,bad
4,slightlyoffcenter.net,bad


In [7]:
data['url'].values

array(['diaryofagameaddict.com', 'espdesign.com.au', 'iamagameaddict.com',
       ..., 'apple-iclods.org/', 'apple-uptoday.org/',
       'apple-search.info'], dtype=object)

In [8]:
#convert it into numpy array and shuffle the dataset
data = np.array(data)
random.shuffle(data)

In [9]:
#convert text data into numerical data for machine learning models
y = [d[1] for d in data]
corpus = [d[0] for d in data]

In [10]:
print(corpus[0:20])

['diaryofagameaddict.com', 'diaryofagameaddict.com', 'diaryofagameaddict.com', 'iamagameaddict.com', 'kalantzis.net', 'iamagameaddict.com', 'slightlyoffcenter.net', 'iamagameaddict.com', 'toddscarwash.com', 'slightlyoffcenter.net', 'crackspider.us/toolbar/install.php?pack=exe', 'iamagameaddict.com', 'pos-kupang.com/', 'sn-gzzx.com', 'kalantzis.net', 'toddscarwash.com', 'rupor.info', 'sn-gzzx.com', 'pos-kupang.com/', 'tubemoviez.com']


In [11]:
print(y[0:20])

['bad', 'bad', 'bad', 'bad', 'bad', 'bad', 'bad', 'bad', 'bad', 'bad', 'bad', 'bad', 'bad', 'bad', 'bad', 'bad', 'bad', 'bad', 'bad', 'bad']


In [12]:
vectorizer = TfidfVectorizer(tokenizer=getTokens)
X = vectorizer.fit_transform(corpus)

In [13]:
#split the data set inot train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## LogisticRegression

In [14]:
model = LogisticRegression(C=1)
model.fit(X_train, y_train)



LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [15]:
print(model.score(X_test,y_test))

0.9836609468088902


In [16]:
# #save the model and vectorizer
# joblib.dump(model, "mal-logireg1.pkl", protocol=2)
# joblib.dump(vectorizer, "vectorizer1.pkl", protocol=2)

In [17]:
#make prediction
a = "http://www.savanvisalpara.com"
aa = vectorizer.transform([trim(a)])
s = model.predict(aa)
s[0] #0 for good

'bad'

## Random Forest Classifier

In [18]:
from sklearn.ensemble import RandomForestClassifier

In [19]:
m = RandomForestClassifier(n_estimators=10)
m.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

## Support Vector Machine

In [None]:
#2 - SVM
from sklearn.svm import SVC
svcModel = SVC()
svcModel.fit(X_train, y_train)
# lsvcModel = svm.LinearSVC.fit(X_train, y_train)

In [None]:
svcModel.score(X_test, y_test)