## Part 1: Scrape, normalize, POS-tag and index Digikala comments

In [12]:
from hazm import *
from elasticsearch import Elasticsearch
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as ec
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
import time
import pandas as pd
import re

In [2]:
def get_digikala_comments(url, page_nums):
    """Collect the comment <article> elements of a Digikala product page.

    Opens the page in Edge, clicks the "more comments" button and then walks
    through ``page_nums`` pagination steps, parsing the page source after
    each step.

    Args:
        url: Address of the product page.
        page_nums: Number of comment pages to collect.

    Returns:
        A list with one entry per collected page; each entry is the result
        of ``soup.find_all`` for the comment <article> elements of that page.
        If a wait times out, the pages collected so far are returned (the
        list may be empty) instead of ``None``, so callers can always
        iterate over the result.
    """
    # The absolute XPaths below mirror the page layout the notebook was
    # written against; they all share this prefix.
    comments_root = '/html/body/div[1]/div[1]/div[3]/div[3]/div[2]/div[9]/div/div[2]/div/section/div[2]/div[2]/div[2]'
    more_comments_xpath = f'{comments_root}/div[4]/button'
    first_comment_xpath = f'{comments_root}/article[1]/div[2]/div[1]/div/div/div[1]/div[2]/div/p'
    article_class = 'py-3 lg:mt-0 flex items-start br-list-vertical-no-padding-200'
    timeout_in_seconds = 30
    # The pagination span index advances up to this value and then stays there.
    max_span_number = 4

    comments = []
    driver = webdriver.Edge()
    try:
        # Everything that touches the browser sits inside try/finally so the
        # driver is closed even when loading the page fails.
        driver.maximize_window()
        driver.get(url)
        wait = WebDriverWait(driver, timeout_in_seconds)

        # until() returns the located element, so no separate find_element
        # call (which could fire before the element exists) is needed.
        more_comments_button = wait.until(
            ec.element_to_be_clickable((By.XPATH, more_comments_xpath))
        )
        driver.execute_script("arguments[0].click();", more_comments_button)
        time.sleep(3)

        span_number = 2
        for _ in range(page_nums):
            soup = BeautifulSoup(driver.page_source, features="html.parser")
            comments.append(soup.find_all('article', attrs={'class': article_class}))

            next_page_xpath = f'{comments_root}/div[4]/div[2]/span[{span_number}]'
            next_page_button = wait.until(
                ec.element_to_be_clickable((By.XPATH, next_page_xpath))
            )
            driver.execute_script("arguments[0].click();", next_page_button)

            # NOTE(review): this only waits for *a* first comment to be
            # present; it does not prove the new page has replaced the old
            # one - confirm whether a staleness check is needed.
            wait.until(ec.presence_of_element_located((By.XPATH, first_comment_xpath)))

            span_number = min(span_number + 1, max_span_number)
    except TimeoutException:
        print("I give up...")
    finally:
        driver.quit()

    return comments

In [5]:
def get_comment_texts(comment_list):
    """Extract the body text of each comment <article> element.

    Args:
        comment_list: Iterable of comment elements (anything whose ``str()``
            is the element's HTML).

    Returns:
        A list with the text of the comment paragraph of every element that
        has one. Elements without the expected paragraph are skipped instead
        of raising ``AttributeError`` on ``None.text``.
    """
    text_class = 'text-body-1 text-neutral-900 mb-1 pt-3 break-words'

    comments = []
    for element in comment_list:
        soup = BeautifulSoup(str(element), 'html.parser')
        paragraph = soup.find('p', attrs={'class': text_class})
        if paragraph is None:
            # find() returns None when no <p> with this class exists.
            continue
        comments.append(paragraph.text)

    return comments

In [6]:
def write_comments_to_file(comments, file_path):
    """Normalize the comments and write them to ``file_path``, one per line.

    Each comment is passed through ``Normalizer.normalize`` and then has
    carriage returns, newlines and zero-width non-joiners removed. All
    comments are cleaned before the file is opened for writing (UTF-8).
    """
    normalizer = Normalizer()
    strip_patterns = ('\r', '\n', '\u200c')

    def _clean(raw_comment):
        cleaned = normalizer.normalize(raw_comment)
        for pattern in strip_patterns:
            cleaned = re.sub(pattern, '', cleaned)
        return cleaned

    cleaned_lines = [_clean(comment) for comment in comments]

    with open(file_path, 'w', encoding='utf-8') as out_file:
        out_file.writelines(f'{line}\n' for line in cleaned_lines)

In [3]:
# Load the comments from a CSV file.
def get_comments_from_file(file_path):
    """Read the CSV at ``file_path`` and return it as a pandas DataFrame."""
    return pd.read_csv(file_path)

In [5]:
def normalize_comment(comment):
    """Normalize, tokenize, lemmatize and POS-tag a single comment.

    The normalizer, lemmatizer and POS tagger (loaded from
    'pos_tagger.model') are built on the first call and reused afterwards,
    so calling this function in a loop no longer reloads the tagger model
    for every comment. Re-running the defining cell resets the cache.

    Args:
        comment: Raw comment text.

    Returns:
        The result of ``POSTagger.tag`` applied to the list of lemmatized
        tokens of the comment.
    """
    tools = getattr(normalize_comment, '_tools', None)
    if tools is None:
        tools = (Normalizer(), Lemmatizer(), POSTagger(model='pos_tagger.model'))
        normalize_comment._tools = tools
    normalizer, lemmatizer, tagger = tools

    normalized_text = normalizer.normalize(comment)
    normalized_text = re.sub('\r\n', '', normalized_text)
    normalized_text = re.sub('\u200c', '', normalized_text)

    lemmatized_text = [
        lemmatizer.lemmatize(token) for token in word_tokenize(normalized_text)
    ]

    # Note: the tagger receives lemmas, not the original surface tokens.
    return tagger.tag(lemmatized_text)

In [None]:
def write_to_elasticsearch(dataset, name):
    """Index every element of ``dataset`` into the Elasticsearch index ``name``.

    The index is created when it does not exist yet. Element ``i`` of
    ``dataset`` is stored as document ``{'text': element}`` under id ``i``,
    so writing again to the same index overwrites documents with equal ids.

    Connection settings are read from the environment instead of being
    hard-coded in the notebook:

    * ``ELASTIC_URL``      - defaults to ``https://localhost:9200``
    * ``ELASTIC_USER``     - defaults to ``elastic``
    * ``ELASTIC_PASSWORD`` - required

    Args:
        dataset: Iterable of JSON-serializable elements.
        name: Name of the target index.

    Raises:
        RuntimeError: If ``ELASTIC_PASSWORD`` is not set.
    """
    import os

    password = os.environ.get('ELASTIC_PASSWORD')
    if not password:
        raise RuntimeError(
            "Set the ELASTIC_PASSWORD environment variable before writing to Elasticsearch."
        )

    # NOTE(review): certificate verification stays disabled as in the
    # original code; enable it for anything other than a local test node.
    es = Elasticsearch(
        os.environ.get('ELASTIC_URL', 'https://localhost:9200'),
        basic_auth=(os.environ.get('ELASTIC_USER', 'elastic'), password),
        verify_certs=False,
    )

    if not es.indices.exists(index=name):
        es.indices.create(index=name)

    for i, element in enumerate(dataset):
        es.index(
            index=name,
            id=i,
            document={
                'text': element
            },
        )

In [None]:
# Scrape 10 comment pages of the product below. The result is passed to the
# next cell, which iterates over it and hands each entry to get_comment_texts.
url = "https://www.digikala.com/product/dkp-9510040/"
digikala_comment_elements = get_digikala_comments(url, page_nums=10)

In [None]:
# Flatten the scraped <article> lists into a single list of comment texts.
comments = [
    text
    for comment_list in digikala_comment_elements
    for text in get_comment_texts(comment_list)
]

In [13]:
# Load the comments from the CSV file and POS-tag the first 100 of them.
comments_df = get_comments_from_file(file_path='comments_partial.csv')
comments = list(comments_df['comment'])

tagged_comments = [normalize_comment(comment) for comment in comments[:100]]

In [None]:
# Store every comment text as one document in the 'comments' index.
write_to_elasticsearch(comments, name='comments')

## Part 2-4: Keyphrase extraction (candidate phrases, embeddings, keyword selection)

In [14]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import nltk

In [15]:
# Chunk grammars handed to nltk.RegexpParser (see extract_candidates); every
# chunk labelled NP becomes a keyphrase candidate.
#   1. an optional <NOUN,EZ> tag followed by a tag starting with NOUN
#      (the optional part comes first, unlike what the inline note suggests)
#   2. a tag starting with NOUN, optionally followed by a tag starting with ADJ
grammers = [
"""
NP:
        {<NOUN,EZ>?<NOUN.*>}    # Noun(s) + Noun(optional)

""",

"""
NP:
        {<NOUN.*><ADJ.*>?}    # Noun(s) + Adjective(optional)

"""
]

In [16]:
# Load the sentence-embedding model; get_candidates_vector indexes it with a
# string to obtain the vector of a candidate phrase or of the joined text.
# NOTE(review): SentEmbedding presumably comes from `from hazm import *` - confirm.
sent2vec_model_path = 'sent2vec-naab.model'
sent2vec_model = SentEmbedding(sent2vec_model_path)

In [17]:
# TODO - use the normalize method in part 1

def normalize_and_tokenize(comment):
    """Normalize a comment and split it into tokenized sentences.

    The text is normalized, stripped of '\\r\\n' sequences and zero-width
    non-joiners, split into sentences, and each sentence is word-tokenized.

    Returns:
        A list with one token list per sentence.
    """
    cleaned = Normalizer().normalize(comment)
    for pattern in ('\r\n', '\u200c'):
        cleaned = re.sub(pattern, '', cleaned)
    return [word_tokenize(sentence) for sentence in sent_tokenize(cleaned)]

def tag_comment(tokenized_text, model_path):
    """POS-tag the tokenized sentences of one comment.

    The tagger for a given ``model_path`` is loaded once and cached on the
    function, so tagging many comments in a loop does not reload the model
    from disk for each of them. Re-running the defining cell clears the
    cache; a model file replaced on disk under the same path is not picked
    up until then.

    Args:
        tokenized_text: List of sentences, each a list of tokens.
        model_path: Path of the POS tagger model file.

    Returns:
        The result of ``POSTagger.tag_sents`` for ``tokenized_text``.
    """
    tagger_cache = tag_comment.__dict__.setdefault('_tagger_cache', {})
    tagger = tagger_cache.get(model_path)
    if tagger is None:
        tagger = POSTagger(model=model_path)
        tagger_cache[model_path] = tagger
    return tagger.tag_sents(tokenized_text)

In [18]:
def extract_candidates(tagged, grammer):
    """Return the distinct NP chunks of at most five words found by one grammar.

    Args:
        tagged: POS-tagged sentences (lists of ``(word, tag)`` pairs).
        grammer: Chunk grammar string for ``nltk.RegexpParser``.

    Returns:
        A list of unique candidate phrases; each phrase is the chunk's words
        joined by single spaces. The order of the list is not defined.
    """
    chunker = nltk.RegexpParser(grammer)

    # Every subtree labelled NP yields one phrase.
    phrases = {
        ' '.join(token for token, _tag in chunk.leaves())
        for sentence_tree in chunker.parse_sents(tagged)
        for chunk in sentence_tree.subtrees(filter=lambda node: node.label() == 'NP')
    }

    # Drop phrases longer than five words.
    short_phrases = {phrase for phrase in phrases if len(phrase.split()) <= 5}
    return list(short_phrases)

def get_candidates(tagged, grammers):
    """Collect the unique keyphrase candidates produced by all grammars.

    Args:
        tagged: POS-tagged sentences of one comment.
        grammers: Iterable of chunk grammar strings; each one is passed to
            ``extract_candidates``.

    Returns:
        A NumPy array with the distinct candidates in sorted order. Sorting
        replaces the arbitrary set iteration order, so the candidate indices
        (and everything derived from them, such as the joined text that is
        embedded and the argmax tie-breaks) are reproducible across kernel
        restarts. With no candidates the array is empty.
    """
    unique_candidates = set()
    for grammer in grammers:
        unique_candidates.update(extract_candidates(tagged, grammer))

    return np.array(sorted(unique_candidates))

In [19]:
def get_candidates_vector(candidates):
    """Embed every candidate and the space-joined text of all candidates.

    Returns:
        A tuple ``(candidate_vectors, whole_text_vector)``: an array with
        one ``sent2vec_model`` vector per candidate, and the vector of the
        candidates joined with single spaces.
    """
    per_candidate = [sent2vec_model[phrase] for phrase in candidates]
    joined_vector = sent2vec_model[' '.join(candidates)]
    return np.array(per_candidate), joined_vector

In [20]:
def cosine_similarity_candidate_and_whole(candidates_vectors, whole_text_vector):
    """Standardized cosine similarity between each candidate and the whole text.

    The similarities are scaled by the magnitude of their maximum and then
    standardized to mean 0.5 and unit standard deviation.

    Args:
        candidates_vectors: 2-D array with one row per candidate.
        whole_text_vector: 1-D vector of the whole text.

    Returns:
        An array of shape ``(n_candidates, 1)``. When all similarities are
        equal (for example with a single candidate) every entry is 0.5
        instead of the NaN produced by dividing by a zero standard deviation.
    """
    similarity = cosine_similarity(candidates_vectors, whole_text_vector.reshape(1, -1))

    # Scale by the magnitude of the maximum: a zero maximum is skipped and a
    # negative one no longer flips the ranking of the candidates.
    peak = np.abs(np.max(similarity))
    if peak > 0:
        similarity = similarity / peak

    spread = np.std(similarity)
    if spread == 0:
        return np.full_like(similarity, 0.5)

    return 0.5 + (similarity - np.average(similarity)) / spread

def cosine_similarity_between_candidates(candidates_vectors):
    """Column-wise standardized cosine similarity between all candidates.

    The diagonal (self-similarity) is replaced by NaN and ignored by the
    nan-aware column statistics; it stays NaN in the result.

    Args:
        candidates_vectors: 2-D array with one row per candidate.

    Returns:
        A square array of shape ``(n_candidates, n_candidates)``. Columns
        whose off-diagonal values are all equal (always the case with fewer
        than three candidates) come out as NaN because their standard
        deviation is zero.
    """
    similarity = cosine_similarity(candidates_vectors)
    # np.nan instead of np.NaN: the capitalized alias was removed in NumPy 2.0.
    np.fill_diagonal(similarity, np.nan)
    scaled = similarity / np.nanmax(similarity, axis=0)
    return 0.5 + (scaled - np.nanmean(scaled, axis=0)) / np.nanstd(scaled, axis=0)

In [14]:
def find_keywords(beta, N, candidates, candidates_sim_whole_norm, candidate_sim_candidate_norm):
    """Greedily pick up to ``N`` keywords, trading relevance against redundancy.

    The candidate most similar to the whole text is chosen first. Every
    further pick maximizes
    ``beta * relevance - (1 - beta) * max similarity to an already selected candidate``.

    Args:
        beta: Weight of relevance versus redundancy (0..1).
        N: Number of keywords to return.
        candidates: Array (or sequence) of candidate phrases.
        candidates_sim_whole_norm: Relevance of each candidate, shape
            ``(n_candidates, 1)``.
        candidate_sim_candidate_norm: Pairwise candidate similarity, shape
            ``(n_candidates, n_candidates)``; the diagonal is never read.

    Returns:
        A list of exactly ``max(N, 0)`` strings: the selected keywords in
        selection order, padded with ``'null'`` when there are fewer than
        ``N`` candidates (including none at all).
    """
    candidates = np.asarray(candidates)
    relevance = np.asarray(candidates_sim_whole_norm).reshape(-1)
    redundancy = np.asarray(candidate_sim_candidate_norm)

    keyword_num = min(len(candidates), N)
    if keyword_num <= 0:
        # Nothing to select: no candidates, or no keywords requested.
        return ['null'] * max(N, 0)

    selected = [int(np.argmax(relevance))]
    remaining = [i for i in range(len(candidates)) if i != selected[0]]

    while len(selected) < keyword_num:
        # Rows: still unselected candidates; columns: already selected ones.
        max_sim_to_selected = np.max(redundancy[np.ix_(remaining, selected)], axis=1)
        scores = beta * relevance[remaining] - (1 - beta) * max_sim_to_selected
        pick = remaining[int(np.argmax(scores))]
        selected.append(pick)
        remaining.remove(pick)

    keywords = candidates[selected].tolist()
    keywords.extend(['null'] * (N - len(keywords)))
    return keywords

In [None]:
# POS-tag the first 100 comments, sentence by sentence.
model_path = 'pos_tagger.model'
tagged_comments = [
    tag_comment(normalize_and_tokenize(comment), model_path)
    for comment in comments[:100]
]

In [None]:
# Store the tagged sentences of each comment as one document in the 'tagged_comments' index.
write_to_elasticsearch(tagged_comments, name='tagged_comments')

In [None]:
# Keyword selection settings: relevance/redundancy weight and keywords per comment.
beta = 0.82
N = 5

keywords_list = []
candidates_list = []
for comment in tagged_comments:
    # get candidate keywords for a comment
    candidates = get_candidates(comment, grammers)
    candidates_list.append(candidates.tolist())

    if len(candidates) > 0:
        # turn candidates into vectors and also get a vector for the whole text
        candidates_vectors, whole_text_vector = get_candidates_vector(candidates)

        # standardized cosine similarity between each candidate and the whole
        # text, and between the candidates themselves
        candidates_sim_whole_norm = cosine_similarity_candidate_and_whole(candidates_vectors, whole_text_vector)
        candidate_sim_candidate_norm = cosine_similarity_between_candidates(candidates_vectors)

        # fewer than N keywords are padded with 'null' by find_keywords
        keywords = find_keywords(beta, N, candidates, candidates_sim_whole_norm, candidate_sim_candidate_norm)
    else:
        # Keep one row per comment so that row i of keywords_list still
        # belongs to comment i (Part 5 pairs rows with labels by position).
        keywords = ['null'] * N

    keywords_list.append(keywords)

# Index the candidates themselves (previously tagged_comments was written to
# the 'candidates' index, once per loop iteration), and do it once after the loop.
write_to_elasticsearch(candidates_list, name='candidates')
write_to_elasticsearch(keywords_list, name='keywords')

## Part 5: Classify comments with k-NN on keyword embeddings

In [None]:
import csv

# Persist the keyword rows (one CSV row per entry of keywords_list).
with open('keywords.csv', 'w', encoding='utf-8', newline='') as keywords_file:
    csv.writer(keywords_file).writerows(keywords_list)

In [17]:
# Load the fastText word-embedding model that the next cell uses to turn each keyword into a vector.
wordEmbedding = WordEmbedding(model_type = 'fasttext')
wordEmbedding.load_model('fasttext_skipgram_300.bin')

In [31]:
# One list of word vectors per keyword row.
keyword_vectors = [
    [wordEmbedding.get_normal_vector(word) for word in row]
    for row in keywords_list
]

In [36]:
# Feature matrix: one row per keyword row, made by concatenating that row's
# keyword vectors. The width is derived from the data instead of the
# hard-coded 1500, and the `newshape=` keyword (deprecated in NumPy 2.1) is
# no longer used.
X = np.asarray(keyword_vectors)
X = X.reshape(len(keyword_vectors), -1)

# NOTE(review): labels are paired with rows purely by position - this assumes
# row i of keyword_vectors was built from row i of comments_df; confirm.
y = np.array(comments_df['label'][:len(keyword_vectors)])

X.shape, y.shape

((413, 1500), (413,))

In [45]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; random_state fixes the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# k-nearest-neighbours classifier with k = 5, fitted on the training split.
neigh = KNeighborsClassifier(n_neighbors=5)
neigh.fit(X_train, y_train)

In [46]:
# Mean accuracy of the classifier: first on the training split, then on the held-out split.
print(neigh.score(X_train, y_train))
print(neigh.score(X_test, y_test))

0.9151515151515152
0.9156626506024096
