In [47]:
import re
import os
import pickle
import pandas as pd
from sklearn.preprocessing import StandardScaler
from urllib.parse import urlparse

In [26]:
len(re.findall("(\d)","https://finance.sina.com.cn/stock/kechuangban/zcfx/2021-03-16/doc-ikkntiam2634520.shtm"))

15

In [48]:
def build_features(df, load_scaler_from_file=False):
    processed_features = df[["url"]].copy()
    processed_features["path"] = processed_features["url"].map(
        lambda x: urlparse(x).path + urlparse(x).params + urlparse(x).query + urlparse(x).fragment)
    processed_features["path_len"] = processed_features["path"].map(
        lambda x: len(x))
    processed_features["num_hyphen"] = processed_features["path"].map(
        lambda x: x.count("-"))
    processed_features["num_slash"] = processed_features["path"].map(
        lambda x: x.rstrip("/").count("/"))
    processed_features["num_number"] = processed_features["path"].map(
        lambda x: len(re.findall("(\d)",x)))
    cols_to_drop = ['url', 'path']

    processed_features.drop(cols_to_drop, axis=1, inplace=True)
    scaled_features = processed_features.copy()
    features = scaled_features[["path_len","num_hyphen","num_slash","num_number"]]
    scaler_filename = 'StandardScaler.est'
    if load_scaler_from_file and os.path.isfile(scaler_filename):
        scaler = pickle.load(open(scaler_filename, 'rb'))
    else:
        scaler = StandardScaler()
        scaler = StandardScaler().fit(features.values)
        pickle.dump(scaler, open(scaler_filename, 'wb'))

    features = scaler.transform(features.values)
    scaled_features[["path_len","num_hyphen","num_slash","num_number"]] = features
    return scaled_features

In [49]:
links_df = pd.read_csv("links.csv")

In [50]:
links_df.head()

Unnamed: 0,url,label
0,http://stock.cnfol.com,1
1,http://www.cnfol.com/,1
2,http://gold.cnfol.com/,1
3,http://money.cnfol.com/,1
4,http://video.cnfol.com/,1


In [51]:
links_sca = build_features(links_df)

In [52]:
pd.DataFrame(links_sca).head()

Unnamed: 0,path_len,num_hyphen,num_slash,num_number
0,-1.297699,-0.236514,-1.468311,-0.973847
1,-1.245219,-0.236514,-1.468311,-0.973847
2,-1.245219,-0.236514,-1.468311,-0.973847
3,-1.245219,-0.236514,-1.468311,-0.973847
4,-1.245219,-0.236514,-1.468311,-0.973847


In [56]:
output_targets = pd.DataFrame()
output_targets["label"] = links_df["label"].astype(int)
output_targets.head()

Unnamed: 0,label
0,1
1,1
2,1
3,1
4,1


In [63]:
links_ = links_sca.copy()
links_["label"] = output_targets
links_

Unnamed: 0,path_len,num_hyphen,num_slash,num_number,label
0,-1.297699,-0.236514,-1.468311,-0.973847,1
1,-1.245219,-0.236514,-1.468311,-0.973847,1
2,-1.245219,-0.236514,-1.468311,-0.973847,1
3,-1.245219,-0.236514,-1.468311,-0.973847,1
4,-1.245219,-0.236514,-1.468311,-0.973847,1
...,...,...,...,...,...
4432,-0.720414,-0.236514,-0.748194,-0.973847,1
4433,-0.720414,-0.236514,-0.748194,-0.973847,1
4434,-0.720414,-0.236514,-0.748194,-0.973847,1
4435,-1.245219,-0.236514,-1.468311,-0.973847,1


In [69]:
import math
import numpy as np
# Choose the first 90% of the examples for training.
n_links = len(links_df)
train_len = int(math.floor(0.85*n_links))
validation_len = int(n_links - train_len)

links_input = links_.reindex(np.random.permutation(links_.index))

training_input = links_input.head(train_len)
validation_input = links_input.tail(validation_len)

In [71]:
training_examples = training_input[["path_len","num_hyphen","num_slash","num_number"]]
training_targets = training_input[["label"]]

validation_examples = validation_input[["path_len","num_hyphen","num_slash","num_number"]]
validation_targets = validation_input[["label"]]

In [73]:
from IPython import display

print("Training examples summary:")
display.display(training_examples.describe())
print("Validation examples summary:")
display.display(validation_examples.describe())

print("Training targets summary:")
display.display(training_targets.describe())
print("Validation targets summary:")
display.display(validation_targets.describe())

Training examples summary:


Unnamed: 0,path_len,num_hyphen,num_slash,num_number
count,3771.0,3771.0,3771.0,3771.0
mean,0.005892,0.004357,0.002858,-0.000296
std,1.024305,1.014908,1.003229,1.004429
min,-1.297699,-0.236514,-1.468311,-0.973847
25%,-0.562973,-0.236514,-0.748194,-0.973847
50%,-0.090649,-0.236514,-0.028078,-0.218677
75%,0.434155,-0.236514,0.692039,0.914079
max,15.548522,9.169237,5.012739,12.996799


Validation examples summary:


Unnamed: 0,path_len,num_hyphen,num_slash,num_number
count,666.0,666.0,666.0,666.0
mean,-0.033362,-0.024673,-0.016184,0.001676
std,0.850161,0.91214,0.982869,0.976051
min,-1.245219,-0.236514,-1.468311,-0.973847
25%,-0.615454,-0.236514,-0.748194,-0.973847
50%,-0.090649,-0.236514,-0.028078,-0.218677
75%,0.434155,-0.236514,0.692039,0.914079
max,4.212747,9.169237,4.292622,2.676142


Training targets summary:


Unnamed: 0,label
count,3771.0
mean,0.474145
std,0.499397
min,0.0
25%,0.0
50%,0.0
75%,1.0
max,1.0


Validation targets summary:


Unnamed: 0,label
count,666.0
mean,0.46997
std,0.499473
min,0.0
25%,0.0
50%,0.0
75%,1.0
max,1.0


In [76]:
from sklearn.model_selection import train_test_split, GroupKFold, cross_val_score
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV, SGDClassifier

In [82]:
gkf = GroupKFold(n_splits=5)

clf = LogisticRegression(solver="lbfgs", C=0.05, penalty="l2").fit(training_examples, training_targets.values)
sgd_clf = SGDClassifier(loss="log", max_iter=10000, alpha=0.01,
                        learning_rate="optimal").\
                        fit(training_examples, training_targets.values)
# logit_scores = cross_val_score(clf, training_examples,
#                                training_targets.values,
#                                cv=gkf, groups=training_groups)

# sgd_scores = cross_val_score(sgd_clf, training_examples,
#                              training_targets.values,
#                              cv=gkf, groups=training_groups)

# print("Logit Accuracy: %0.2f (+/- %0.2f)" % (logit_scores.mean(), logit_scores.std() * 2))
# print("SGD Accuracy: %0.2f (+/- %0.2f)" % (sgd_scores.mean(), sgd_scores.std() * 2))

print("Logit", "%0.5f" % clf.score(validation_examples, validation_targets))
print("SGD", "%0.5f" % sgd_clf.score(validation_examples, validation_targets))

Logit 0.82432
SGD 0.80631


  return f(*args, **kwargs)
  return f(*args, **kwargs)


In [80]:
from sklearn import svm

In [83]:
svm_clf = svm.SVC().fit(training_examples, training_targets.values)
print("SVM", "%0.5f" % svm_clf.score(validation_examples, validation_targets))

  return f(*args, **kwargs)


SVM 0.88


In [84]:
from sklearn import tree

In [85]:
tree_clf = tree.DecisionTreeClassifier().fit(training_examples, training_targets.values)
print("Tree", "%0.5f" % tree_clf.score(validation_examples, validation_targets))

Tree 0.93694


In [86]:
import pickle
model_filename = 'DTClassifier.est'

final_tree_clf = tree.DecisionTreeClassifier().fit(links_input[["path_len","num_hyphen","num_slash","num_number"]]
                                                   , links_input[["label"]])

pickle.dump(tree_clf, open(model_filename, 'wb'))

In [88]:
file_tree_est = pickle.load(open(model_filename, 'rb'))
file_tree_est.score(validation_examples, validation_targets)

0.9369369369369369