In [1]:
import re
import os
import pickle
import pandas as pd
from sklearn.preprocessing import StandardScaler
from urllib.parse import urlparse

In [26]:
len(re.findall("(\d)","https://finance.sina.com.cn/stock/kechuangban/zcfx/2021-03-16/doc-ikkntiam2634520.shtm"))

15

In [10]:


items = ['17ok.com/stock2018531.shtml', '17ok.com']
if not items[0].startswith("http"):
    url = "http://" + items[0]
else:
    url = items[0]
if not items[1].startswith("http"):
    upurl = "http://" + items[1]
else:
    upurl = items[1]

current_url = "http://finance.17ok.com/news/4/2020/1019/2703664.html"
origin_url = url

temp_c = current_url.replace("http://", "").replace("https://", "")
temp_o = origin_url.replace("http://", "").replace("https://", "")
temp_up = upurl.replace("http://", "").replace("https://", "")
print(temp_c, temp_o, temp_up)
print(re.search(temp_o, temp_c))
print(re.search(temp_up, temp_c))

finance.17ok.com/news/4/2020/1019/2703664.html 17ok.com/stock2018531.shtml 17ok.com
None
<re.Match object; span=(8, 16), match='17ok.com'>


In [5]:
print(re.search(temp_c, temp_o))

None


In [2]:
def build_features(df, load_scaler_from_file=False):
    processed_features = df[["url"]].copy()
    processed_features["path"] = processed_features["url"].map(
        lambda x: urlparse(x).path + urlparse(x).params + urlparse(x).query + urlparse(x).fragment)
    processed_features["path_len"] = processed_features["path"].map(
        lambda x: len(x))
    processed_features["num_hyphen"] = processed_features["path"].map(
        lambda x: x.count("-"))
    processed_features["num_slash"] = processed_features["path"].map(
        lambda x: x.rstrip("/").count("/"))
    processed_features["num_number"] = processed_features["path"].map(
        lambda x: len(re.findall("(\d)",x)))
    cols_to_drop = ['url', 'path']

    processed_features.drop(cols_to_drop, axis=1, inplace=True)
    scaled_features = processed_features.copy()
    features = scaled_features[["path_len","num_hyphen","num_slash","num_number"]]
    scaler_filename = 'StandardScaler.est'
    if load_scaler_from_file and os.path.isfile(scaler_filename):
        scaler = pickle.load(open(scaler_filename, 'rb'))
    else:
        scaler = StandardScaler()
        scaler = StandardScaler().fit(features.values)
        pickle.dump(scaler, open(scaler_filename, 'wb'))

    features = scaler.transform(features.values)
    scaled_features[["path_len","num_hyphen","num_slash","num_number"]] = features
    return scaled_features

In [3]:
links_df = pd.read_csv("links.csv")

In [4]:
links_df.head()

Unnamed: 0,url,label
0,http://stock.cnfol.com,1
1,http://www.cnfol.com/,1
2,http://gold.cnfol.com/,1
3,http://money.cnfol.com/,1
4,http://video.cnfol.com/,1


In [5]:
links_sca = build_features(links_df)

In [6]:
pd.DataFrame(links_sca).head()

Unnamed: 0,path_len,num_hyphen,num_slash,num_number
0,-1.297699,-0.236514,-1.468311,-0.973847
1,-1.245219,-0.236514,-1.468311,-0.973847
2,-1.245219,-0.236514,-1.468311,-0.973847
3,-1.245219,-0.236514,-1.468311,-0.973847
4,-1.245219,-0.236514,-1.468311,-0.973847


In [7]:
output_targets = pd.DataFrame()
output_targets["label"] = links_df["label"].astype(int)
output_targets.head()

Unnamed: 0,label
0,1
1,1
2,1
3,1
4,1


In [8]:
links_ = links_sca.copy()
links_["label"] = output_targets
links_

Unnamed: 0,path_len,num_hyphen,num_slash,num_number,label
0,-1.297699,-0.236514,-1.468311,-0.973847,1
1,-1.245219,-0.236514,-1.468311,-0.973847,1
2,-1.245219,-0.236514,-1.468311,-0.973847,1
3,-1.245219,-0.236514,-1.468311,-0.973847,1
4,-1.245219,-0.236514,-1.468311,-0.973847,1
...,...,...,...,...,...
4432,-0.720414,-0.236514,-0.748194,-0.973847,1
4433,-0.720414,-0.236514,-0.748194,-0.973847,1
4434,-0.720414,-0.236514,-0.748194,-0.973847,1
4435,-1.245219,-0.236514,-1.468311,-0.973847,1


In [9]:
import math
import numpy as np
# Choose the first 90% of the examples for training.
n_links = len(links_df)
train_len = int(math.floor(0.85*n_links))
validation_len = int(n_links - train_len)

links_input = links_.reindex(np.random.permutation(links_.index))

training_input = links_input.head(train_len)
validation_input = links_input.tail(validation_len)

In [10]:
training_examples = training_input[["path_len","num_hyphen","num_slash","num_number"]]
training_targets = training_input[["label"]]

validation_examples = validation_input[["path_len","num_hyphen","num_slash","num_number"]]
validation_targets = validation_input[["label"]]

In [11]:
from IPython import display

print("Training examples summary:")
display.display(training_examples.describe())
print("Validation examples summary:")
display.display(validation_examples.describe())

print("Training targets summary:")
display.display(training_targets.describe())
print("Validation targets summary:")
display.display(validation_targets.describe())

Training examples summary:


Unnamed: 0,path_len,num_hyphen,num_slash,num_number
count,3771.0,3771.0,3771.0,3771.0
mean,-0.012478,-0.00455,-0.008027,-0.004101
std,1.01095,0.990985,1.003711,1.008366
min,-1.297699,-0.236514,-1.468311,-0.973847
25%,-0.615454,-0.236514,-0.748194,-0.973847
50%,-0.090649,-0.236514,-0.028078,-0.218677
75%,0.381675,-0.236514,0.692039,0.914079
max,15.548522,9.169237,5.012739,12.996799


Validation examples summary:


Unnamed: 0,path_len,num_hyphen,num_slash,num_number
count,666.0,666.0,666.0,666.0
mean,0.070653,0.025766,0.045448,0.02322
std,0.933963,1.050707,0.97899,0.952425
min,-1.245219,-0.236514,-1.468311,-0.973847
25%,-0.510493,-0.236514,-0.748194,-0.973847
50%,0.014312,-0.236514,-0.028078,-0.218677
75%,0.539116,-0.236514,0.692039,0.914079
max,7.728937,9.169237,5.012739,2.927865


Training targets summary:


Unnamed: 0,label
count,3771.0
mean,0.477327
std,0.499552
min,0.0
25%,0.0
50%,0.0
75%,1.0
max,1.0


Validation targets summary:


Unnamed: 0,label
count,666.0
mean,0.451952
std,0.49806
min,0.0
25%,0.0
50%,0.0
75%,1.0
max,1.0


In [12]:
from sklearn.model_selection import train_test_split, GroupKFold, cross_val_score
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV, SGDClassifier

In [13]:
gkf = GroupKFold(n_splits=5)

clf = LogisticRegression(solver="lbfgs", C=0.05, penalty="l2").fit(training_examples, training_targets.values)
sgd_clf = SGDClassifier(loss="log", max_iter=10000, alpha=0.01,
                        learning_rate="optimal").\
                        fit(training_examples, training_targets.values)
# logit_scores = cross_val_score(clf, training_examples,
#                                training_targets.values,
#                                cv=gkf, groups=training_groups)

# sgd_scores = cross_val_score(sgd_clf, training_examples,
#                              training_targets.values,
#                              cv=gkf, groups=training_groups)

# print("Logit Accuracy: %0.2f (+/- %0.2f)" % (logit_scores.mean(), logit_scores.std() * 2))
# print("SGD Accuracy: %0.2f (+/- %0.2f)" % (sgd_scores.mean(), sgd_scores.std() * 2))

print("Logit", "%0.5f" % clf.score(validation_examples, validation_targets))
print("SGD", "%0.5f" % sgd_clf.score(validation_examples, validation_targets))

Logit 0.81832
SGD 0.80030


  return f(*args, **kwargs)
  return f(*args, **kwargs)


In [14]:
from sklearn import svm

In [15]:
svm_clf = svm.SVC().fit(training_examples, training_targets.values)
print("SVM", "%0.5f" % svm_clf.score(validation_examples, validation_targets))

  return f(*args, **kwargs)


SVM 0.87387


In [16]:
from sklearn import tree

In [17]:
tree_clf = tree.DecisionTreeClassifier().fit(training_examples, training_targets.values)
print("Tree", "%0.5f" % tree_clf.score(validation_examples, validation_targets))

Tree 0.93544


In [18]:
import pickle
model_filename = 'DTClassifier.est'

final_tree_clf = tree.DecisionTreeClassifier().fit(links_input[["path_len","num_hyphen","num_slash","num_number"]]
                                                   , links_input[["label"]])

pickle.dump(tree_clf, open(model_filename, 'wb'))

In [19]:
test=[["http://www.17ok.com/foundation201862.shtml",1],
    ["http://www.17ok.com/thirdboard201862.shtml",1],
    ["http://stock.17ok.com/news/335/2020/1020/2703697.html",0],
    ["http://www.17ok.com/money201862.shtml",1],
    ["http://finance.17ok.com/news/4/2021/0317/2708464.html",0],
    ["http://finance.17ok.com/news/4/2021/0317/2708440.html",0],
    ["http://finance.17ok.com/news/4/2021/0317/2708438.html",0],
    ["http://finance.17ok.com/news/4/2021/0317/2708462.html",0],
    ["http://finance.17ok.com/news/4/2021/0317/2708448.html",0],
    ["http://finance.17ok.com/news/4/2021/0317/2708457.html",0],
    ["http://finance.17ok.com/news/4/2021/0317/2708458.html",0],
    ["http://finance.17ok.com/news/4/2021/0317/2708447.html",0],
    ["http://finance.17ok.com/news/4/2021/0317/2708449.html",0],
    ["http://finance.17ok.com/news/4/2021/0317/2708437.html",0],
    ["http://www.17ok.com/financial201862.shtml",1],
    ["http://www.17ok.com/internet201862.shtml",1],
    ["http://finance.17ok.com/news/4/2020/1020/2703669.html",0],
    ["http://finance.17ok.com/news/4/2020/1019/2703656.html",0],
    ["http://hk.eastmoney.com/",1],
    ["http://futures.eastmoney.com/",1],
    ["http://forex.eastmoney.com/",1],
    ["http://bank.eastmoney.com/",1],
    ["http://money.eastmoney.com/",1],
    ["http://bond.eastmoney.com/",1],
    ["http://video.eastmoney.com/",1],
    ["http://guba.eastmoney.com/",1],
    ["http://guba.eastmoney.com/jj.html",1],
    ["http://blog.eastmoney.com/",1],
    ["http://caifuhao.eastmoney.com/",1],
    ["http://so.eastmoney.com/",1],
    ["http://kuaixun.eastmoney.com/",1],
    ["http://quote.eastmoney.com/center/",1],
    ["http://data.eastmoney.com/center/",1],
    ["https://acttg.eastmoney.com/pub/pctg_hskh_act_gfcgrj_01_01_01_0",1],
    ["http://stock.eastmoney.com/bidu.html",1]]
test_url = pd.DataFrame(test)
test_url = test_url.rename(columns={0:"url", 1:"label"})
test_url.head()

Unnamed: 0,url,label
0,http://www.17ok.com/foundation201862.shtml,1
1,http://www.17ok.com/thirdboard201862.shtml,1
2,http://stock.17ok.com/news/335/2020/1020/27036...,0
3,http://www.17ok.com/money201862.shtml,1
4,http://finance.17ok.com/news/4/2021/0317/27084...,0


In [20]:
test_examples = build_features(test_url[["url"]])
test_targets = test_url[["label"]]

In [24]:
file_tree_est = pickle.load(open(model_filename, 'rb'))

print("Tree", "%0.5f" % file_tree_est.score(test_examples, test_targets))
file_tree_est.predict(test_examples)

Tree 0.91429


array([1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [25]:
print("SVM", "%0.5f" % svm_clf.score(test_examples, test_targets))
print("Logit", "%0.5f" % clf.score(test_examples, test_targets))
print("SGD", "%0.5f" % sgd_clf.score(test_examples, test_targets))

SVM 0.85714
Logit 1.00000
SGD 1.00000


In [26]:
svm_clf.predict(test_examples)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [27]:
clf.predict(test_examples)

array([1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [29]:
sgd_clf.predict(test_examples)

array([1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])