In [None]:
!pip install gdown

!gdown --id 1QA78ukWX-XEmOVbfsVF_Skw6uHqxIfTm
!gdown --id 1BK40Wm7y4dVTxA87_X4Dgu5Rfnfl2hsE

!unzip -qq ./trainset_doms.zip

In [None]:
!pip install openpyxl
!pip install Arabic-Stopwords
!pip install googletrans
!pip install python-bidi
!pip install arabic_reshaper

In [None]:
import pandas as pd
import numpy as np
from os.path import exists, join
from matplotlib import pyplot as plt
import seaborn as sns

In [None]:
df = pd.read_excel("./Links_Classification_Task.xlsx")
df.head()

In [None]:
df.sample(5)

In [None]:
df[df["link_id"]==2684733]["link_url"]

In [None]:
df.info()

In [None]:
df.isna().sum()

In [None]:
# from googletrans import Translator

# translator = Translator()
# def translate_text(text, source_lang="ar", dest_lang="en"):
#     return translator.translate(text)

In [None]:
#df = df.apply(lambda x: translate_text(x["content_name"]) if pd.isnull(x['trans_content_name']) else x['trans_content_name'], axis=1)

In [None]:
# Fall back to the primary content name where no alternative names are given.
# The result is assigned back to the column: an in-place fillna on the selected
# column acts on a temporary under pandas Copy-on-Write and would leave `df`
# unchanged.
df["alt_content_names"] = df["alt_content_names"].fillna(df["content_name"])

# Rows without a label are treated as "unrelated"; a missing translation
# becomes an empty string.
values = {'class': "unrelated",
          "trans_content_name" : ""}
df = df.fillna(value=values)


In [None]:
df["class"].value_counts()/len(df)

In [None]:
import requests

def check_response_status(url, timeout=10):
    """Fetch ``url`` and report the HTTP status together with the response.

    Parameters
    ----------
    url : str
        Address requested with ``requests.get``.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 10), so a
        silent host cannot block the caller indefinitely.

    Returns
    -------
    tuple
        ``(status, response)``. When the server answered, ``status`` is its
        HTTP status code (4xx/5xx included) and ``response`` is the response
        object. When no response was obtained (connection error, timeout,
        malformed URL, ...), the result is ``(0, None)``.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except Exception:
        # Best-effort probe: any failure to obtain a response is reported as
        # status 0 rather than raised. There is no response object to read a
        # status code from in this case.
        return 0, None
    return response.status_code, response

In [None]:
def check_domain_url(soup, domain_name):
    """Tell whether ``domain_name`` appears in any link target of the page.

    The ``href`` of every ``<link>`` and ``<a>`` element and the ``action`` of
    every ``<form>`` element are inspected, in that order. Elements lacking
    the attribute are ignored. Returns ``True`` on the first substring match
    and ``False`` when nothing matches.
    """
    attribute_by_tag = (('link', 'href'), ('a', 'href'), ('form', 'action'))
    for tag_name, attribute in attribute_by_tag:
        targets = [element.get(attribute) for element in soup.find_all(tag_name)]
        if any(target is not None and domain_name in target for target in targets):
            return True
    return False

In [None]:
from bs4.element import Comment

# def tag_visible(element):
#     if element.parent.name in ['style', 'script', 'head', 'title', 'meta', '[document]', 'comment']:
#         return False
#     if isinstance(element, Comment):
#         return False
#     return True


# def text_from_html(soup):
#     texts = soup.findAll(text=True)
#     visible_texts = filter(tag_visible, texts)  
#     return " ".join(t.strip() for t in visible_texts)

def text_from_html(soup):
    """Return all text contained in the parsed page, as given by ``soup.get_text()``."""
    page_text = soup.get_text()
    return page_text

In [None]:
def get_key_occurance_text(text, key):
    """Count the non-overlapping occurrences of ``key`` inside ``text``.

    A missing key counts as zero occurrences: ``None``, the empty string, or
    a float NaN (the value pandas reports for an empty cell). Without the NaN
    guard such a key would reach ``str.count``, which rejects non-string
    arguments with ``TypeError``.
    """
    # NaN is the only float that differs from itself.
    key_is_nan = isinstance(key, float) and key != key
    if key is None or key_is_nan or key == "":
        return 0
    return text.count(key)

In [None]:
from bs4 import BeautifulSoup as bs

def check_sample_online_request(url):
    """Request ``url`` and parse the body of the response, if one came back.

    Returns ``(response_status, soup)``. ``soup`` is the response text parsed
    with ``html.parser``, or ``None`` when ``check_response_status`` returned
    no response object.
    """
    response_status, response = check_response_status(url)
    if response is None:
        return response_status, None
    return response_status, bs(response.text, 'html.parser')

In [None]:
# def jaccard_similarity(query, document):
#     intersection = set(query).intersection(set(document))
#     union = set(query).union(set(document))
#     return len(intersection)/len(union)

In [None]:
import re

def get_n_related_scores(text, keys, delim="-|_|,", n=3):
    """Count how often each of the first ``n`` names in ``keys`` occurs in ``text``.

    ``keys`` is split with the regular expression ``delim``; every piece is
    stripped of surrounding whitespace and counted with
    ``get_key_occurance_text``.

    Returns a float NumPy array of length ``n``. Slots beyond the number of
    names stay zero; names beyond the first ``n`` are ignored.
    """
    res = np.zeros((n,))
    # Slice to n: extra names would otherwise index past the end of `res`.
    for i, key in enumerate(re.split(delim, keys)[:n]):
        res[i] = get_key_occurance_text(text, key.strip())
    return res

In [None]:
def get_page_features(soup, row, n_content_names=3):
    """Compute the name-based features of one page for one dataframe row.

    Returns a 5-tuple:
    ``(domain_name_status, content_name_freq, trans_content_name_freq,
    alt_content_names_freq_list, campaign_name_freq)``.

    ``domain_name_status`` is 1/0 depending on whether ``link_domain_name``
    shows up in the page's link targets. The ``*_freq`` values count
    occurrences of the respective row field in the page text. The list holds
    one count per alternative content name, ``n_content_names`` slots long.
    """
    domain_flag = int(check_domain_url(soup, row["link_domain_name"]))
    page_text = text_from_html(soup)

    def occurrences(column):
        # How often the value of `column` for this row appears in the page text.
        return get_key_occurance_text(page_text, row[column])

    return (
        domain_flag,
        occurrences("content_name"),
        occurrences("trans_content_name"),
        get_n_related_scores(page_text, row["alt_content_names"], n=n_content_names),
        occurrences("campaign_name"),
    )
    

In [None]:
# Try the live fetch and the feature extraction on one row (position 2).
# `row` and `soup` remain defined as notebook globals afterwards; the title,
# header and text cells further down reuse this `soup`.
row = df.iloc[2]
#url = "http://survey.ispp.edu.kh/space/moh.ramadan-v-alprince-h9.html"
response_status, soup = check_sample_online_request(row["link_url"])
if soup is not None:
    # The returned tuple is not displayed because the call sits inside the `if`.
    get_page_features(soup, row)   
    

In [None]:
df["class"].value_counts()

In [None]:
df.isna().sum()

In [None]:
# Prefix every page_source_path with the directory the archive was extracted to.
# NOTE(review): the column is overwritten in place, so running this cell a
# second time prefixes relative paths with the directory twice.
dir_path = "./trainset_doms"
df["page_source_path"] = df["page_source_path"].apply(lambda x: join(dir_path, x))

In [None]:
def is_file_exist(file_path):
    """Report whether ``file_path`` refers to an existing filesystem entry."""
    found = exists(file_path)
    return found

In [None]:
import codecs

def read_file(file_path):
    """Read a text file, trying UTF-8 first and Windows-1256 second.

    Returns the decoded content. A ``UnicodeDecodeError`` from the UTF-8
    attempt triggers one retry with the ``windows-1256`` code page; an error
    raised by that retry propagates to the caller.
    """
    try:
        # The context manager closes the handle even when decoding fails.
        with codecs.open(file_path, 'r', encoding="utf-8") as handle:
            return handle.read()
    except UnicodeDecodeError:
        with codecs.open(file_path, 'r', encoding="windows-1256") as handle:
            return handle.read()


In [None]:
def read_html_page(html_path):
    """Parse ``html_path`` with the ``html.parser`` backend and return the soup.

    The argument is handed to the parser as markup: ``generate_df`` calls this
    with the text returned by ``read_file``, not with a filesystem path.
    """
    parsed = bs(html_path, 'html.parser')
    return parsed

In [None]:
file_index = 2

In [None]:
is_file_exist(df["page_source_path"][3])

In [None]:
import arabicstopwords.arabicstopwords as stp
import nltk
from nltk.corpus import stopwords
# English stop words used by remove_stop_words.
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    # The corpus is downloaded separately from the nltk package; fetch it
    # once on a fresh environment and retry.
    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))

def remove_emoji(text):
    """Strip emoji and other symbol characters from ``text``.

    Every run of characters falling in one of the listed code-point ranges is
    deleted (replaced by the empty string).

    NOTE(review): ``\\U000024C2-\\U0001F251`` and ``\\U00010000-\\U0010ffff``
    are very wide ranges; they also cover CJK ideographs, Arabic presentation
    forms and all supplementary-plane characters, not only emoji.
    """
    code_point_ranges = (
        u"\U0001F600-\U0001F64F",  # emoticons
        u"\U0001F300-\U0001F5FF",  # miscellaneous symbols and pictographs
        u"\U0001F680-\U0001F6FF",  # transport and map symbols
        u"\U0001F1E0-\U0001F1FF",  # regional indicator symbols (flags)
        u"\U00002500-\U00002BEF",
        u"\U00002702-\U000027B0",
        u"\U00002702-\U000027B0",  # same range listed twice in the class
        u"\U000024C2-\U0001F251",
        u"\U0001f926-\U0001f937",
        u"\U00010000-\U0010ffff",
        u"\u2640-\u2642",
        u"\u2600-\u2B55",
        u"\u200d",
        u"\u23cf",
        u"\u23e9",
        u"\u231a",
        u"\ufe0f",
        u"\u3030",
    )
    emoji_run = re.compile("[" + "".join(code_point_ranges) + "]+", flags=re.UNICODE)
    return emoji_run.sub(r'', text)


def remove_email(text):
    """Delete every e-mail address (``local@domain.tld``) found in ``text``."""
    # Raw string: "\." in a plain literal is an invalid escape sequence and
    # triggers a SyntaxWarning on recent Python versions.
    return re.sub(r'([A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,})', '', text)


def remove_repeated_char(text):
    """Shorten every run of three or more identical characters to two."""
    run_of_three_plus = re.compile(r'(.)\1\1{1,}')
    return run_of_three_plus.sub(r'\1\1', text)


def remove_account_tag(text):
    """Delete ``@name`` mentions (an ``@`` followed by word characters)."""
    mention = re.compile(r'@[\w]+')
    return mention.sub('', text)


def remove_hashtag(text):
    """Delete ``#tag`` hashtags (a ``#`` followed by word characters)."""
    hashtag = re.compile(r'#[\w]+')
    return hashtag.sub('', text)


def remove_more_spaces(text):
    """Collapse every run of whitespace (spaces, tabs, newlines) into one space."""
    # `\s` already covers tab, newline and carriage return. Listing them after
    # `\s+` would require that exact sequence to be present before anything
    # matched.
    return re.sub(r'\s+', ' ', text)


def remove_stop_words(text):
    """Drop stop words from whitespace-separated ``text``.

    A word is removed when ``stp.is_stop`` flags it or when it is a member of
    the module-level ``stop_words`` set. The surviving words are re-joined
    with single spaces.
    """
    kept = [
        word
        for word in text.split()
        if not stp.is_stop(word) and word not in stop_words
    ]
    return " ".join(kept)


In [None]:
import re
import string


def clean_text(text):
    """Normalise raw text into a space-separated string of word tokens.

    Steps, in order: lower-case; remove emoji, e-mail addresses, ``@`` tags,
    hashtags and stop words; remove URLs and HTML-like tags; replace
    punctuation and digit-bearing tokens with spaces; finally keep only
    tokens longer than three characters that are not purely digits.
    """
    text = text.lower()
    for scrub in (remove_emoji, remove_email, remove_account_tag,
                  remove_hashtag, remove_stop_words):
        text = scrub(text)

    # URLs first, then anything that looks like an HTML tag.
    text = re.sub(r'http\S+', '', text)
    text = re.compile('<.*?>').sub('', text)

    # Non-word, non-space characters become spaces, as do tokens holding a digit.
    text = re.sub(r'[^\w\s]', ' ', text)
    text = re.sub(r'\w*\d\w*', ' ', text)

    # Strip remaining ASCII punctuation (at this point only "_", which \w kept).
    text = text.translate(str.maketrans('', '', string.punctuation))

    text = remove_more_spaces(text)

    tokens = [token for token in text.split()
              if len(token) > 3 and not token.isdigit()]
    return " ".join(tokens)

In [None]:
def get_unique_words(text):
    """Keep the first occurrence of each whitespace-separated word, in order."""
    first_seen = {}
    for word in text.split():
        first_seen.setdefault(word, None)
    return " ".join(first_seen)

In [None]:
def get_page_tage_text(soup_src, tag_name):
    """Return the cleaned text of every ``tag_name`` element in the page.

    The sequence returned by ``soup_src.find_all`` is reused: each element is
    replaced, position by position, with ``clean_text`` of its text.
    """
    nodes = soup_src.find_all(tag_name)
    for position, node in enumerate(nodes):
        nodes[position] = clean_text(node.get_text())
    return nodes

In [None]:
def get_tag_unique_content(soup, tag):
    """Return the distinct cleaned words found in all ``tag`` elements.

    The per-element texts from ``get_page_tage_text`` are joined with spaces,
    passed through ``clean_text`` once more and de-duplicated with
    ``get_unique_words``.
    """
    joined = " ".join(get_page_tage_text(soup, tag))
    return get_unique_words(clean_text(joined))

### Find Title

In [None]:
print(get_page_tage_text(soup, "title"))

### Find Headers

In [None]:
# Distinct cleaned words for each heading level of the sample page, gathered
# in one pass instead of six copy-pasted cells.
headings = {
    tag: get_tag_unique_content(soup, tag)
    for tag in ("h1", "h2", "h3", "h4", "h5", "h6")
}
headings

In [None]:
text = text_from_html(soup)
text = clean_text(text)
text

In [None]:
from collections import Counter

def get_n_top_freq_words(text, n):
    """Return the ``n`` most frequent whitespace-separated words of ``text``.

    Words are ordered most common first, as produced by
    ``Counter.most_common``.
    """
    frequencies = Counter(text.split())
    return [word for word, _count in frequencies.most_common(n)]

In [None]:
get_n_top_freq_words(text, 5)

In [None]:
def check_site_online(url):
    """Fetch ``url`` live and return ``(response_status, soup)``.

    Thin wrapper around ``check_sample_online_request``.
    """
    # Request the address that was passed in, not the link of whatever
    # notebook-global `row` happens to exist.
    response_status, soup = check_sample_online_request(url)
    return response_status, soup

In [None]:
def _page_needs_refetch(soup):
    """Tell whether a locally saved page is unusable and must be fetched online."""
    if soup is None:
        return True
    title = soup.title
    # No title, an empty title or the anti-bot interstitial all mean the
    # saved file does not hold the real page.
    if title is None or title.string is None or "I am not a bot. Open Website" in title.string:
        return True
    return len(text_from_html(soup)) == 0


def generate_df(df, n_content_names=5, top_n_word=5):
    """Add the page-derived feature columns to ``df`` and return it.

    For every row the saved page source is parsed when the file exists and
    looks usable; otherwise the page is requested online from ``link_url``.
    Rows for which no page could be obtained keep the initial column values,
    with ``response_status`` set to 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide ``page_source_path`` and ``link_url`` plus the columns
        read by ``get_page_features``. It is modified in place.
    n_content_names : int, optional
        Number of ``alt_content_names_<i>`` count columns to create.
    top_n_word : int, optional
        Number of most frequent words stored in ``top_<n>_word``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the added columns.
    """
    alt_columns = ["alt_content_names_{}".format(i) for i in range(n_content_names)]
    header_tags = ("h1", "h2", "h3", "h4", "h5", "h6")
    # Built once so the column that is created is also the one written below.
    top_words_column = "top_{}_word".format(top_n_word)

    df["response_status"] = 200
    df["domain_name_status"] = 0
    df["content_name_freq"] = 0
    df["trans_content_name_freq"] = 0
    for column in alt_columns:
        df[column] = None
    df["campaign_name_freq"] = None
    df["title"] = None
    for tag in header_tags:
        df[tag] = None
    df["content"] = None
    df[top_words_column] = None

    for index, row in df.iterrows():
        soup = None
        if is_file_exist(row["page_source_path"]):
            soup = read_html_page(read_file(row["page_source_path"]))
            if _page_needs_refetch(soup):
                soup = None

        if soup is None:
            # No usable local copy: fall back to the live page.
            response_status, soup = check_sample_online_request(row["link_url"])
            df.at[index, "response_status"] = response_status
            if soup is None:
                df.at[index, "response_status"] = 0
                continue

        (domain_name_status, content_name_freq, trans_content_name_freq,
         alt_content_names_freq_list, campaign_name_freq) = get_page_features(
            soup, row, n_content_names)
        df.at[index, "domain_name_status"] = domain_name_status
        df.at[index, "content_name_freq"] = content_name_freq
        df.at[index, "trans_content_name_freq"] = trans_content_name_freq
        for i, column in enumerate(alt_columns):
            df.at[index, column] = alt_content_names_freq_list[i]
        df.at[index, "campaign_name_freq"] = campaign_name_freq

        df.at[index, "title"] = " ".join(get_page_tage_text(soup, "title"))
        for tag in header_tags:
            df.at[index, tag] = get_tag_unique_content(soup, tag)

        text = clean_text(text_from_html(soup))
        df.at[index, "content"] = text
        df.at[index, top_words_column] = " ".join(get_n_top_freq_words(text, top_n_word))
    return df

In [None]:
df = generate_df(df)

In [None]:
df[df["content"].isnull()]

In [None]:
df.info()

In [None]:
df.isna().sum()

In [None]:
df["link_domain_name"].value_counts()

In [None]:
df["content_name"].value_counts()

In [None]:
df["campaign_name"].value_counts()

In [None]:
df.to_csv("preprocessed_df.csv", index=False)

In [None]:
df = pd.read_csv("preprocessed_df.csv")

In [None]:
df.info()

In [None]:
from bidi.algorithm import get_display
from arabic_reshaper import reshape

def right_2_left(w):
    """Return ``w`` (formatted as a string) reshaped and reordered for display."""
    return get_display(reshape(f'{w}'))

In [None]:
sns.set(rc={'figure.figsize':(20,20)})
ax = sns.countplot(data=df, x='campaign_name', hue='class')
labels = [item.get_text() for item in ax.get_xticklabels()]
ax.set_xticklabels(labels, rotation = 45)
ax.legend()
plt.title('campaign_name vs class');

In [None]:
sns.set(rc={'figure.figsize':(20,20)})
ax = sns.countplot(data=df, x='content_name', hue='class')
labels = [item.get_text() for item in ax.get_xticklabels()]
ax.set_xticklabels(labels, rotation = 45)
ax.legend()
plt.title('content_name vs class');

In [None]:
# Bar plot of number_of_episodes for each content_name, split by class.
sns.set(rc={'figure.figsize':(20,20)})
ax = sns.barplot(data=df, x='content_name', y="number_of_episodes", hue='class')
labels = [item.get_text() for item in ax.get_xticklabels()]
ax.set_xticklabels(labels, rotation = 45)
ax.legend()
# The title names the plotted quantity so this figure is distinguishable from
# the content_name count plot.
plt.title('number_of_episodes per content_name vs class');

In [None]:
sns.set(rc={'figure.figsize':(20,20)})
ax = sns.countplot(data=df, x='response_status', hue='class')
labels = [item.get_text() for item in ax.get_xticklabels()]
ax.set_xticklabels(labels, rotation = 45)
ax.legend()
plt.title('response_status vs class');

In [None]:
# Class counts for each value of domain_name_status.
sns.set(rc={'figure.figsize':(20,20)})
ax = sns.countplot(data=df, x='domain_name_status', hue='class')
labels = [item.get_text() for item in ax.get_xticklabels()]
ax.set_xticklabels(labels, rotation = 45)
ax.legend()
# The title must name the column on the x axis (domain_name_status).
plt.title('domain_name_status vs class');

In [None]:
sns.set(rc={'figure.figsize':(20,20)})
ax = sns.countplot(data=df[df["content"].isna()], x='content_name', hue='class')
labels = [item.get_text() for item in ax.get_xticklabels()]
ax.set_xticklabels(labels, rotation = 45)
ax.legend()
plt.title('When content is null vs class');

In [None]:
cols = ["number_of_episodes", "response_status", "domain_name_status", "content_name_freq", 
        "trans_content_name_freq", "alt_content_names_0", "alt_content_names_1", "alt_content_names_2", 
        "alt_content_names_3", "alt_content_names_4", "campaign_name_freq", "class"]

df[cols].info()

In [None]:
df[~df["alt_content_names_0"].isnull()]

In [None]:
tmp_df = df[cols][~df["alt_content_names_0"].isnull()].copy()
tmp_df["domain_name_status"] = tmp_df["domain_name_status"].astype(int)

In [None]:
# pairplot returns a grid spanning many axes, so the heading is set on the
# figure; plt.title would label a single subplot only.
pair_grid = sns.pairplot(tmp_df, hue="class")
# y > 1 places the heading above the top row of subplots.
pair_grid.figure.suptitle('all numeric vs class', y=1.02);

In [None]:
from time import time
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import RidgeClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import BernoulliNB, ComplementNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils.extmath import density
from sklearn import metrics
from sklearn.model_selection import train_test_split


In [None]:
df = df[~df["content"].isna()]

In [None]:
data_train, data_test = train_test_split(df, test_size=0.2,random_state=2, stratify=df["class"], shuffle=True)

In [None]:
def trim(s):
    """Cap ``s`` at 80 characters; a shortened string ends with ``"..."``."""
    if len(s) > 80:
        return s[:77] + "..."
    return s

In [None]:
# Turn the cleaned page text into sparse feature matrices for train and test.
use_hashing = False
n_features = 100

target_names = np.unique(data_train["class"])

y_train, y_test = data_train["class"], data_test["class"]

print("Extracting features from the training data using a sparse vectorizer")
t0 = time()


if use_hashing:
    vectorizer = HashingVectorizer(
        stop_words="english", alternate_sign=False, n_features=n_features
    )
    X_train = vectorizer.transform(data_train["content"])
    # Hashing keeps no vocabulary, so columns cannot be mapped back to terms;
    # benchmark() skips the keyword listing when this is None.
    feature_names = None
else:
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words="english")
    X_train = vectorizer.fit_transform(data_train["content"])
    feature_names = vectorizer.get_feature_names_out()
duration = time() - t0
print("done in %fs" % duration)
print("n_samples: %d, n_features: %d" % X_train.shape)
print()

print("Extracting features from the test data using the same vectorizer")
t0 = time()
# transform only: the test set must not influence the fitted vocabulary.
X_test = vectorizer.transform(data_test["content"])
duration = time() - t0
print("done in %fs" % duration)
print("n_samples: %d, n_features: %d" % X_test.shape)
print()

In [None]:
print_top10 = True

def benchmark(clf):
    """Fit ``clf``, score it on the test split and print a report.

    Reads the notebook globals ``X_train``, ``y_train``, ``X_test``,
    ``y_test``, ``feature_names``, ``target_names`` and ``print_top10``.

    Prints training/prediction time, accuracy, the classification report and
    the confusion matrix. For models exposing ``coef_`` it also prints the
    weight dimensionality and density, plus the ten strongest keywords per
    class when feature names are available.

    Returns
    -------
    tuple
        ``(clf_descr, score, train_time, test_time)`` where ``clf_descr`` is
        the estimator's class name taken from its ``str()``.
    """
    print("_" * 80)
    print("Training: ")
    print(clf)
    t0 = time()
    clf.fit(X_train, y_train)
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)

    t0 = time()
    pred = clf.predict(X_test)
    test_time = time() - t0
    print("test time:  %0.3fs" % test_time)

    score = metrics.accuracy_score(y_test, pred)
    print("accuracy:   %0.3f" % score)

    if hasattr(clf, "coef_"):
        print("dimensionality: %d" % clf.coef_.shape[1])
        print("density: %f" % density(clf.coef_))

        if print_top10 and feature_names is not None:
            print("top 10 keywords per class:")
            if clf.coef_.shape[0] == 1:
                # Binary problem: one weight row. Large positive weights push
                # towards the second class, large negative weights towards
                # the first, so the first class gets the most negative ones.
                weights = clf.coef_[0]
                keyword_idx = [np.argsort(-weights)[-10:], np.argsort(weights)[-10:]]
            else:
                # Multi-class: one weight row per class, in class order.
                keyword_idx = [np.argsort(class_weights)[-10:] for class_weights in clf.coef_]
            for label, top10 in zip(target_names, keyword_idx):
                print(trim("%s: %s" % (label, " ".join(feature_names[top10]))))
        print()

    print("classification report:")
    print(metrics.classification_report(y_test, pred, target_names=target_names))

    print("confusion matrix:")
    print(metrics.confusion_matrix(y_test, pred))

    print()
    clf_descr = str(clf).split("(")[0]
    return clf_descr, score, train_time, test_time


results = []
for clf, name in (
    (RidgeClassifier(tol=1e-2, solver="sag"), "Ridge Classifier"),
    (Perceptron(max_iter=50), "Perceptron"),
    (PassiveAggressiveClassifier(max_iter=50), "Passive-Aggressive"),
    (KNeighborsClassifier(n_neighbors=10), "kNN"),
    (RandomForestClassifier(), "Random forest"),
):
    print("=" * 80)
    print(name)
    results.append(benchmark(clf))

for penalty in ["l2", "l1"]:
    print("=" * 80)
    print("%s penalty" % penalty.upper())
    # Train Liblinear model
    results.append(benchmark(LinearSVC(penalty=penalty, dual=False, tol=1e-3)))

    # Train SGD model
    results.append(benchmark(SGDClassifier(alpha=0.0001, max_iter=50, penalty=penalty)))

# Train SGD with Elastic Net penalty
print("=" * 80)
print("Elastic-Net penalty")
results.append(
    benchmark(SGDClassifier(alpha=0.0001, max_iter=50, penalty="elasticnet"))
)

# Train NearestCentroid without threshold
print("=" * 80)
print("NearestCentroid (aka Rocchio classifier)")
results.append(benchmark(NearestCentroid()))

# Train sparse Naive Bayes classifiers
print("=" * 80)
print("Naive Bayes")
results.append(benchmark(MultinomialNB(alpha=0.01)))
results.append(benchmark(BernoulliNB(alpha=0.01)))
results.append(benchmark(ComplementNB(alpha=0.1)))

print("=" * 80)
print("LinearSVC with L1-based feature selection")
# The smaller C, the stronger the regularization.
# The more regularization, the more sparsity.
results.append(
    benchmark(
        Pipeline(
            [
                (
                    "feature_selection",
                    SelectFromModel(LinearSVC(penalty="l1", dual=False, tol=1e-3)),
                ),
                ("classification", LinearSVC(penalty="l2")),
            ]
        )
    )
)

In [None]:
# Unpack the per-classifier result tuples into parallel sequences. `results`
# itself is left untouched, so this cell can be re-run without re-running the
# benchmarks.
clf_names, score, training_time, test_time = zip(*results)
indices = np.arange(len(clf_names))

# Times are divided by their maximum so they share an axis with the score.
training_time = np.array(training_time) / np.max(training_time)
test_time = np.array(test_time) / np.max(test_time)

plt.figure(figsize=(12, 8))
plt.title("Score")
plt.barh(indices, score, 0.2, label="score", color="navy")
plt.barh(indices + 0.3, training_time, 0.2, label="training time", color="c")
plt.barh(indices + 0.6, test_time, 0.2, label="test time", color="darkorange")
plt.yticks(())
plt.legend(loc="best")
plt.subplots_adjust(left=0.25)
plt.subplots_adjust(top=0.95)
plt.subplots_adjust(bottom=0.05)

# Classifier names are drawn as text to the left of each bar group.
for i, c in zip(indices, clf_names):
    plt.text(-0.3, i, c)

plt.show()