# Automatic filtering 

In [1]:
# import packages
import csv
import pandas as pd
import re
from bs4 import BeautifulSoup
import requests
import os
import numpy as np
import string
import math
from nltk import ngrams
import json
import time
import seaborn as sns
from matplotlib import pyplot as plt
import shutil
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.tokenize import PunktSentenceTokenizer

In [2]:
# import internal modules
import file_path_management as fpath
import public_library as plib
import extract_info
import parameters as params
import download_and_process_pdf as dpp
import dataframe_columns as df_col



## Predefined functions:

In [8]:
def calcul_index(counts_dict, ranking_params_weights):
    """Compute a weighted, log-damped relevance index from keyword counts.

    The index is the sum over every key of ``counts_dict`` of
    ``log(1 + count) * weight``, where ``weight`` is looked up in
    ``ranking_params_weights`` under the same key.

    Args:
        counts_dict: mapping from key to a numeric count (must be > -1 so the
            logarithm is defined).
        ranking_params_weights: mapping from the same keys to numeric weights.

    Returns:
        The relevance index as a float; ``0.0`` when ``counts_dict`` is empty.

    Raises:
        KeyError: if a key of ``counts_dict`` has no weight.
    """
    # The accumulator must exist before the first "+=", otherwise Python
    # raises UnboundLocalError on the very first iteration.
    index = 0.0
    for key, count in counts_dict.items():
        index += math.log(1 + count) * ranking_params_weights[key]
    return index
# --------------------start of test code--------------------
# keywords_count_or_fre = {}
# index = calcul_index(keywords_count_or_fre, params.on_topic_kws_weights)
# print(index)
# ---------------------end of test code---------------------

In [9]:
def rank(db_path, db_ranked_path, ranking_params_weights):
    """Score every article by its keyword counts and write a ranked csv.

    Reads the header-less csv at ``db_path`` (its columns are named after
    ``df_col.db_columns``), computes one relevance index per row with
    ``calcul_index``, hands one output row per article to
    ``plib.add_row_to_csv`` for ``db_ranked_path``, then re-reads that file,
    sorts it by ``RELEVANCE_INDEX`` in descending order and overwrites it,
    this time with a header line.

    Args:
        db_path: path of the csv holding the keyword counts.
        db_ranked_path: path of the csv that receives the ranked rows.
            NOTE(review): the re-read below uses ``header=None``, so the file
            presumably has to be empty/absent before this function runs and
            ``plib.add_row_to_csv`` presumably writes no header - confirm.
        ranking_params_weights: mapping passed to ``calcul_index`` as weights.
            NOTE(review): assumed to be keyed by the "<group>_COUNT" column
            names built below - confirm against the parameters module.
    """
    df = pd.read_csv(db_path, header=None, sep=",")
    df.columns = df_col.db_columns

    # csv column names of the ranked database (identical for every row)
    columns = df_col.db_ranked_columns

    for ind in df.index:
        # Keyword counts of this article, keyed by the csv column name so the
        # same dict can be merged into the output row below. (The key used to
        # be the tuple (ind, name), which matched neither weights nor columns.)
        count_dict = {}
        for key in params.ranking_kw_groups.keys():
            count_dict[key + "_COUNT"] = int(df.at[ind, key + "_COUNT"])
        # print(count_dict)

        relev_index = calcul_index(count_dict, ranking_params_weights)

        # specify rows
        row = {
            "INDEX": [df.at[ind, "INDEX"]],
            "DOI": [df.at[ind, "DOI"]],
            "PMID": [df.at[ind, "PMID"]],
            "PMCID": [df.at[ind, "PMCID"]],
            "TITLE": [df.at[ind, "TITLE"]],
        }
        # merge dicts row and count_dict into one
        row = {**row, **count_dict}
        row["RELEVANCE_INDEX"] = [relev_index]
        print(row)

        # save to csv
        if not plib.add_row_to_csv(db_ranked_path, row, columns):
            print("Error detected when adding a row to csv!")

        line_number_in_csv = ind + 1
        print("Line number:", line_number_in_csv, " INDEX:", int(df.at[ind, "INDEX"]))

    # sort and save to csv
    # read_csv is a pandas module-level function, not a DataFrame method
    df_to_rank = pd.read_csv(db_ranked_path, header=None, sep=",")
    df_to_rank.columns = df_col.db_ranked_columns

    df_ranked = df_to_rank.sort_values(by="RELEVANCE_INDEX", ascending=False)
    df_ranked = df_ranked.reset_index(drop=True)
    df_ranked.to_csv(db_ranked_path, header=True, index=False)
    print("Weighting and ranking the potentially related literature succeded!")
    print("Enjoy reading!")
# --------------------start of test code--------------------
# input_path = fpath.poten_litera_db_kw_count
# output_path = fpath.poten_litera_db_ranked
# rank(input_path, output_path, params.ranking_params_weights)
# ---------------------end of test code---------------------

## Main program:

### Clustering

In [None]:
# Clustering
from sklearn.cluster import KMeans
# Choose the number of clusters
n_clusters = 3  # Replace with the number of clusters you want to find
# Create a K-Means clustering model; a fixed random_state makes the centroid
# initialisation, and therefore the resulting labels, reproducible across runs
kmeans = KMeans(n_clusters=n_clusters, random_state=0)

# Fit the model to the lower-dimensional data and predict cluster labels
# NOTE(review): dim_2_data is not defined in the visible part of this notebook;
# it must be created by an earlier cell or this line raises NameError.
cluster_labels = kmeans.fit_predict(dim_2_data)

### Plotting clustering results

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Sample data (replace with your data); the generator is seeded so that the
# figure is identical on every "Restart & Run All"
rng = np.random.default_rng(0)
data = rng.random((100, 2))  # Example 2D data, uniform in [0, 1)

# Define the indices to label as "red" (replace with your specific indices)
red_indices = [5, 12, 25, 38]

# Mark the selected rows with a boolean mask, then derive the string labels.
# Building the labels with np.where avoids assigning into a fixed-width string
# array, which would silently truncate any label longer than 'blue'.
is_red = np.zeros(len(data), dtype=bool)
is_red[red_indices] = True
labels = np.where(is_red, 'red', 'blue')

# Separate "red" and "blue" data points
red_data = data[is_red]
blue_data = data[~is_red]

# Visualize the labeled data
plt.scatter(blue_data[:, 0], blue_data[:, 1], c='blue', label='Blue')
plt.scatter(red_data[:, 0], red_data[:, 1], c='red', label='Red')
plt.legend()
plt.title('Data with Red and Blue Labels')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.show()

### Candidate articles ranking 

In [None]:
# Rank the candidate articles: the keyword counts are read from
# poten_litera_db_kw_count and the ranked result is saved to poten_litera_db_ranked
input_path, output_path = fpath.poten_litera_db_kw_count, fpath.poten_litera_db_ranked

# clear the output file before ranking
plib.clear_file(output_path)

rank(input_path, output_path, ranking_params_weights=params.ranking_kw_groups_weights)

### Ranking results analysis

In [None]:
# # read the ranked database and obtain the relevance_index of YESs and NOs 
# # of the test data set and draw a violin plot, and calculate the difference between the two distributions
# # the difference is defined as 1. t-statistic 2. 
# db_ranked_path = fpath.poten_litera_db_ranked
# test_path = fpath.poten_litera_testing_set_1000_read

# df_db_ranked = pd.read_csv(db_ranked_path, header=0, sep=',')
# df_db_ranked.columns = [
#     "INDEX", "DOI", "PMID", "PMCID", "FULL_TEXT_URL", "FULL_TEXT_SOURCE", "PDF_URL", "PDF_SOURCE", 
#     "TITLE", "ABSTRACT", "KEYWORDS", 
#     "SPECIES_RELATED", "TC_CT_RELATED", "THALAM_RELATED", "CORTEX_RELATED", "METHOD_RELATED", "CONNECTIVITY_RELATED",
#     "RELEVANCE_INDEX"]

# df_test = pd.read_csv(test_path, header=0, sep=',')
# df_test.columns = ["INDEX", "DOI", "PMID", "PMCID", "FULL_TEXT_URL", "FULL_TEXT_SOURCE", "PDF_URL", "PDF_SOURCE", "TITLE", "ABSTRACT", "KEYWORDS", "RELEVANT"]

In [None]:
# # save the counts of the keywords in the respective lists
# relevant_species = []
# relevant_tc_ct = []
# relevant_thalam = []
# relevant_cortex = []
# relevant_method = []
# relevant_connectivity = []

# non_relevant_species = []
# non_relevant_tc_ct = []
# non_relevant_thalam = []
# non_relevant_cortex = []
# non_relevant_method = []
# non_relevant_connectivity = []

# relvant_index = []
# relevant_relevance_index_list = []

# non_relevant_index = []
# non_relevant_relevance_index_list = []

# for ind in df_test.index:
#     index = int(df_test.at[ind, "INDEX"])
#     # print(ind, index)
#     # print(df_db_ranked.loc[df_db_ranked["INDEX"].apply(int) == index, "RELEVANCE_INDEX"].values[0])

#     # if df_test.at[ind, "RELEVANT"] == "YES" and df_db_ranked.loc[df_db_ranked["INDEX"].apply(int) == index, "LENGTH_TEXT"].values[0] > 100:
#     if df_test.at[ind, "RELEVANT"] == "YES":
#         relvant_index.append(index)
#         relevant_relevance_index_list.append(df_db_ranked.loc[df_db_ranked["INDEX"].apply(int) == index, "RELEVANCE_INDEX"].values[0])
#         relevant_species.append(df_db_ranked.loc[df_db_ranked["INDEX"].apply(int) == index, "SPECIES_RELATED"].values[0])
#         relevant_tc_ct.append(df_db_ranked.loc[df_db_ranked["INDEX"].apply(int) == index, "TC_CT_RELATED"].values[0])
#         relevant_thalam.append(df_db_ranked.loc[df_db_ranked["INDEX"].apply(int) == index, "THALAM_RELATED"].values[0])
#         relevant_cortex.append(df_db_ranked.loc[df_db_ranked["INDEX"].apply(int) == index, "CORTEX_RELATED"].values[0])
#         relevant_method.append(df_db_ranked.loc[df_db_ranked["INDEX"].apply(int) == index, "METHOD_RELATED"].values[0])
#         relevant_connectivity.append(df_db_ranked.loc[df_db_ranked["INDEX"].apply(int) == index, "CONNECTIVITY_RELATED"].values[0])
#     # elif df_db_ranked.loc[df_db_ranked["INDEX"].apply(int) == index, "LENGTH_TEXT"].values[0] > 100:
#     else:
#         non_relevant_index.append(index)
#         non_relevant_relevance_index_list.append(df_db_ranked.loc[df_db_ranked["INDEX"].apply(int) == index, "RELEVANCE_INDEX"].values[0])
#         non_relevant_species.append(df_db_ranked.loc[df_db_ranked["INDEX"].apply(int) == index, "SPECIES_RELATED"].values[0])
#         non_relevant_tc_ct.append(df_db_ranked.loc[df_db_ranked["INDEX"].apply(int) == index, "TC_CT_RELATED"].values[0])
#         non_relevant_thalam.append(df_db_ranked.loc[df_db_ranked["INDEX"].apply(int) == index, "THALAM_RELATED"].values[0])
#         non_relevant_cortex.append(df_db_ranked.loc[df_db_ranked["INDEX"].apply(int) == index, "CORTEX_RELATED"].values[0])
#         non_relevant_method.append(df_db_ranked.loc[df_db_ranked["INDEX"].apply(int) == index, "METHOD_RELATED"].values[0])
#         non_relevant_connectivity.append(df_db_ranked.loc[df_db_ranked["INDEX"].apply(int) == index, "CONNECTIVITY_RELATED"].values[0])

In [None]:
# for i in range(len(relvant_index)):
#     print(relvant_index[i], relevant_relevance_index_list[i])
#     # print(relevant_relevance_index_list)

In [None]:
# # plot the dot plot of the relevance_index of YESs and NOs of the test data set
# plt.figure(figsize=(10, 5))
# plt.plot(relvant_index, relevant_relevance_index_list, 'ro', label="YES")
# plt.plot(non_relevant_index, non_relevant_relevance_index_list, 'bo', label="NO")
# plt.xlabel("Index")
# plt.ylabel("Relevance Index")
# plt.legend()

# # add labels for the relevant index points
# for i, index in enumerate(relvant_index):
#     plt.text(index, relevant_relevance_index_list[i]+0.1, str(index), color='black', fontsize=10)

# # # add labels for the relevant index points
# # for i, index in enumerate(non_relevant_index):
# #     plt.text(index, non_relevant_relevance_index_list[i], str(index), color='black', fontsize=10)

# plt.show()

In [None]:
# # plot the 6 dot plots of the species_related, tc_ct_related, thalam_related, cortex_related, method_related, connectivity_related of YESs and NOs of the test data set in 2 rows in the same figure
# plt.figure(figsize=(10, 10))
# plt.subplot(3, 2, 1)
# plt.plot(relvant_index, relevant_species, 'ro', label="YES")
# plt.plot(non_relevant_index, non_relevant_species, 'bo', label="NO")
# plt.xlabel("Index")
# plt.ylabel("Species Related")
# plt.legend()

# plt.subplot(3, 2, 2)
# plt.plot(relvant_index, relevant_tc_ct, 'ro', label="YES")
# plt.plot(non_relevant_index, non_relevant_tc_ct, 'bo', label="NO")
# plt.xlabel("Index")
# plt.ylabel("TC_CT Related")
# plt.legend()

# plt.subplot(3, 2, 3)
# plt.plot(relvant_index, relevant_thalam, 'ro', label="YES")
# plt.plot(non_relevant_index, non_relevant_thalam, 'bo', label="NO")
# plt.xlabel("Index")
# plt.ylabel("Thalam Related")
# plt.legend()

# plt.subplot(3, 2, 4)
# plt.plot(relvant_index, relevant_cortex, 'ro', label="YES")
# plt.plot(non_relevant_index, non_relevant_cortex, 'bo', label="NO")
# plt.xlabel("Index")
# plt.ylabel("Cortex Related")
# plt.legend()

# plt.subplot(3, 2, 5)
# plt.plot(relvant_index, relevant_method, 'ro', label="YES")
# plt.plot(non_relevant_index, non_relevant_method, 'bo', label="NO")
# plt.xlabel("Index")
# plt.ylabel("Method Related")
# plt.legend()

# plt.subplot(3, 2, 6)
# plt.plot(relvant_index, relevant_connectivity, 'ro', label="YES")
# plt.plot(non_relevant_index, non_relevant_connectivity, 'bo', label="NO")
# plt.xlabel("Index")
# plt.ylabel("Connectivity Related")
# plt.legend()

# plt.show()


In [None]:
# def pick_values_uniformly(data, n):
#     """Pick up `n` values uniformly from `data`."""
#     if n <= 0:
#         return []

#     # Determine the range of the data
#     min_val, max_val = min(data), max(data)

#     threshold = (max_val - min_val) / n / 2

#     # If n is 1, just return the midpoint
#     if n == 1:
#         return [(min_val + max_val) / 2]

#     # Calculate the interval size
#     interval = (max_val - min_val) / (n - 1)

#     # Get the uniform values
#     return [min_val + i * interval for i in range(n)], threshold

# # data = [1, 3, 5, 2, 8, 10, 2]
# n = 5
# density_display_index, thres = pick_values_uniformly(relevant_relevance_index_list + non_relevant_relevance_index_list, n)
# print(density_display_index)

In [None]:
# import pandas as pd
# import seaborn as sns
# import matplotlib.pyplot as plt

# # Printing the length of lists
# print("Number of relevant literature:", len(relevant_relevance_index_list))
# print("Number of not relevant literature:", len(non_relevant_relevance_index_list))
# print()

# # Create a DataFrame for plotting
# df = pd.DataFrame({'Relevance Index': relevant_relevance_index_list + non_relevant_relevance_index_list, 
#                    'Label': ['Relevant'] * len(relevant_relevance_index_list) + ['Not Relevant'] * len(non_relevant_relevance_index_list)})

# # Draw the violin plot
# plt.figure(figsize=(10, 6))
# ax = sns.violinplot(x='Label', y='Relevance Index', data=df, bw='scott', cut=0)

# relevance_indices = density_display_index  # Replace with your relevance indices

# threshold = thres  # Adjust this based on your desired range around the relevance index

# for index in relevance_indices:
#     ax.axhline(index, color='gray', linestyle='--')
    
#     for i, label in enumerate(df['Label'].unique()):
#         # Filter data points close to the current relevance index
#         close_points = df[(df['Label'] == label) & (np.abs(df['Relevance Index'] - index) < threshold)]
#         density = len(close_points)
        
#         ax.text(i, index + 0.1, str(density), ha='center', va='center', color='red', fontsize=9)  # adjust the vertical offset (0.1 here) as necessary

# plt.title('Distribution of Relevance Index')
# plt.show()

In [None]:
# import scipy.stats as stats

# # Check the equality of variances
# var_relevant = np.var(relevant_relevance_index_list)
# var_non_relevant = np.var(non_relevant_relevance_index_list)
# print('Variance of relevant:', var_relevant)
# print('Variance of non-relevant:', var_non_relevant)
# print(var_relevant/var_non_relevant)
# # statistic, p_value = stats.levene(relevant_relevance_index_list, non_relevant_relevance_index_list)

# # # Print the results
# # print('Levene test statistic:', statistic)
# # print('p-value:', p_value)

In [None]:
# # Calculate the t-statistic and p-value
# t_statistic, p_value = stats.ttest_ind(relevant_relevance_index_list, non_relevant_relevance_index_list)

# # Print the results
# print('t-statistic:', t_statistic)
# print('p-value:', p_value)

<h3> Next step: manually read papers and find all actually related literature </h3>