In [1]:
import numpy as np
import pandas as pd
import random
import math
import copy
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
from wordcloud import WordCloud
from matplotlib import pyplot as plt
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, roc_curve, roc_auc_score
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, StratifiedKFold, GridSearchCV
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import make_scorer
from wordcloud import WordCloud
# NOTE(review): this rebinds the attribute `random.seed` to the int 456; it does
# not call `random.seed(456)`, so the `random` module's generator is not seeded
# by this line. generate_word_cloud reads `random.seed` as its `random_state`,
# so turning this into a real call requires updating that function as well.
random.seed = 456

In [2]:
!pip install wordcloud



In [3]:
def normalize_keywords_list(list_of_keyword_dicts):
    """Scale every keyword-count dict so that its values sum to 1.

    Args:
        list_of_keyword_dicts: iterable of dicts mapping keyword -> numeric count.

    Returns:
        A new list holding one copied dict per input dict; the inputs are not
        modified. Each value is divided by the sum of that dict's values. A
        dict whose values sum to 0 (all-zero counts or an empty dict) is copied
        unchanged instead of raising ZeroDivisionError.
    """
    normalized_list = []
    for keyword_dict in list_of_keyword_dicts:
        total_value = sum(keyword_dict.values())
        normalized_keywords = keyword_dict.copy()
        # Nothing to scale when the counts sum to zero; dividing would raise.
        if total_value != 0:
            for keyword in normalized_keywords:
                normalized_keywords[keyword] /= total_value
        normalized_list.append(normalized_keywords)
    return normalized_list

In [4]:
def calculate_tfidf(list_of_keyword_dicts):
    """Compute TF-IDF weights for a list of keyword-count dicts.

    For every keyword with a positive count in a document the weight is
    ``count * (idf / total_count_of_document)`` where
    ``idf = log(number_of_documents / number_of_documents_containing_keyword)``.
    Keywords whose count is not positive keep their original value.

    Args:
        list_of_keyword_dicts: list of dicts mapping keyword -> count, one dict
            per document.

    Returns:
        A new list of copied dicts with the weighted values; the inputs are not
        modified. An empty input list yields an empty list.
    """
    if not list_of_keyword_dicts:
        return []

    document_count = len(list_of_keyword_dicts)

    # Document frequency is collected over the keys of every document, so a
    # keyword that is missing from the first dict no longer raises KeyError.
    document_frequency = {}
    for keyword_dict in list_of_keyword_dicts:
        for word, frequency in keyword_dict.items():
            if frequency > 0:
                document_frequency[word] = document_frequency.get(word, 0) + 1

    idf_values = {
        word: math.log(document_count / count)
        for word, count in document_frequency.items()
    }

    tfidf_results = []
    for keyword_dict in list_of_keyword_dicts:
        weighted = keyword_dict.copy()
        total_frequency = sum(weighted.values())
        for word, frequency in keyword_dict.items():
            if frequency > 0:
                weighted[word] = frequency * (idf_values[word] / total_frequency)
        tfidf_results.append(weighted)
    return tfidf_results

In [5]:
def normalize_tfidf(list_of_keyword_dicts):
    """Compute TF-IDF weights and scale each document vector to unit L2 norm.

    Args:
        list_of_keyword_dicts: list of dicts mapping keyword -> count, one dict
            per document. It is passed to ``calculate_tfidf`` first.

    Returns:
        A list with one new dict per document. Each dict keeps its own keys and
        has its TF-IDF values divided by the Euclidean norm of that document's
        values. Documents whose norm is 0 are returned with their TF-IDF values
        unscaled. An empty input list yields an empty list.
    """
    if not list_of_keyword_dicts:
        return []

    tfidf_dicts = calculate_tfidf(list_of_keyword_dicts)
    normalized_list = []

    for value_dict in tfidf_dicts:
        norm = 0
        for v in value_dict.values():
            norm += v**2
        norm = math.sqrt(norm)

        if norm == 0:
            # An all-zero vector cannot be scaled; keep it as is.
            normalized_list.append(dict(value_dict))
            continue

        # Pair every value with its own key rather than with the key order of
        # the first document, so differing key sets are not mislabelled.
        normalized_list.append(
            {keyword: value / norm for keyword, value in value_dict.items()}
        )

    return normalized_list

In [6]:
def generate_word_cloud(word_frequency_dict, title, axis=None):
    """Draw a word cloud built from a word -> frequency mapping.

    Args:
        word_frequency_dict: mapping passed to
            ``WordCloud.generate_from_frequencies``.
        title: title shown above the image.
        axis: optional matplotlib axes to draw on. When omitted, a new
            15x10 inch figure (dpi 80) is created and drawn via ``pyplot``.

    Returns:
        None. The image is drawn on ``axis`` or on the newly created figure.
    """
    # This notebook stores its seed by assigning an int to `random.seed`.
    # Forward it only when it really is an int: if `random.seed` is still the
    # standard-library function, passing it as `random_state` would hand
    # WordCloud an object that is not a random generator.
    seed = random.seed if isinstance(random.seed, int) else None

    wordcloud = WordCloud(background_color="white", height=500, width=1000, random_state=seed)
    wordcloud.generate_from_frequencies(word_frequency_dict)

    if axis is not None:
        axis.imshow(wordcloud, interpolation='bilinear')
        axis.set_title(title)
        axis.axis(False)
    else:
        plt.subplots(num=None, figsize=(15, 10), dpi=80)
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.title(title)
        plt.axis("off")

In [7]:
# Fetch the NLTK data packages used below: `stopwords` for the stop-word list,
# `wordnet` for the WordNet lemmatizer and `punkt` for tokenization.
# The value of the last call is what the cell displays as its output.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\linag\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\linag\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\linag\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [8]:
# Module-level NLP resources read by the preprocessing functions below.
punctuation = string.punctuation                    # ASCII punctuation characters as one string
stopwords_set = set(stopwords.words('english'))     # set for constant-time membership tests
lemmatizer = nltk.WordNetLemmatizer()
stemmer = SnowballStemmer('english')

In [9]:
# Display the punctuation characters that the preprocessing functions filter on.
punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [10]:
def preprocess_and_stem_nltk(text):
    """Tokenize ``text`` and return the stems of its content tokens.

    The text is lower-cased and split with ``word_tokenize``. Tokens found in
    ``stopwords_set`` and tokens made up only of characters from
    ``punctuation`` are dropped; the remaining tokens are stemmed.

    Args:
        text: the string to process.

    Returns:
        A list of stemmed tokens in their original order.
    """
    tokens = word_tokenize(text.lower())
    # `punctuation` is a string, so `word not in punctuation` is a substring
    # test: it lets multi-character punctuation tokens such as "..." or "--"
    # through. Checking character by character removes those tokens as well,
    # while still removing everything the substring test removed.
    return [
        stemmer.stem(word)
        for word in tokens
        if word not in stopwords_set
        and not all(char in punctuation for char in word)
    ]

In [11]:
def preprocess_and_lemmatize_nltk(text):
    """Tokenize ``text`` and return the lemmas of its content tokens.

    The text is lower-cased and split with ``word_tokenize``. Tokens found in
    ``stopwords_set`` and tokens made up only of characters from
    ``punctuation`` are dropped; the remaining tokens are lemmatized.

    Args:
        text: the string to process.

    Returns:
        A list of lemmatized tokens in their original order.
    """
    tokens = word_tokenize(text.lower())
    # `punctuation` is a string, so `word not in punctuation` is a substring
    # test: it lets multi-character punctuation tokens such as "..." or "--"
    # through. Checking character by character removes those tokens as well,
    # while still removing everything the substring test removed.
    return [
        lemmatizer.lemmatize(word)
        for word in tokens
        if word not in stopwords_set
        and not all(char in punctuation for char in word)
    ]

In [12]:
def create_total_bag(results):
    """Count how often each lemma occurs across the abstracts of ``results``.

    Args:
        results: iterable of mappings, each providing an ``'abstract'`` entry
            that is passed to ``preprocess_and_lemmatize_nltk``.

    Returns:
        A dict mapping each lemma to its total number of occurrences.
    """
    bag = {}
    for entry in results:
        for lemma in preprocess_and_lemmatize_nltk(entry['abstract']):
            bag[lemma] = bag.get(lemma, 0) + 1
    return bag

In [13]:
def update_word_frequency(abstract, word_frequency_dict):
    """Increment the counters of known words for every occurrence in ``abstract``.

    Args:
        abstract: text that is passed to ``preprocess_and_lemmatize_nltk``.
        word_frequency_dict: dict of counters; it is updated in place and only
            keys that already exist are incremented.

    Returns:
        The same ``word_frequency_dict`` object that was passed in.
    """
    known_lemmas = (
        lemma
        for lemma in preprocess_and_lemmatize_nltk(abstract)
        if lemma in word_frequency_dict
    )
    for lemma in known_lemmas:
        word_frequency_dict[lemma] += 1
    return word_frequency_dict

In [14]:
def remove_low_frequency_words(word_frequency_dict, threshold=0):
    """Keep only the words whose frequency is strictly above ``threshold``.

    Args:
        word_frequency_dict: dict mapping word -> frequency.
        threshold: exclusive lower bound for a word to be kept (default 0).

    Returns:
        A new dict with the surviving entries in their original order.
    """
    return {
        word: count
        for word, count in word_frequency_dict.items()
        if count > threshold
    }

In [15]:
def generate_list_keywords(word_frequency_dict, articles):
    """Build one keyword-count dict per article over a fixed vocabulary.

    Args:
        word_frequency_dict: its keys define the vocabulary; its values are
            ignored.
        articles: iterable of mappings, each providing an ``'abstract'`` entry.

    Returns:
        A list with one dict per article. Every dict starts with all vocabulary
        keys at 0 and is filled by ``update_word_frequency``.
    """
    zeroed_bag = dict.fromkeys(word_frequency_dict, 0)
    # Each article gets its own copy so the counts do not leak between articles.
    return [
        update_word_frequency(entry['abstract'], dict(zeroed_bag))
        for entry in articles
    ]

In [16]:
def extract_list_keywords_weights(list_of_keyword_weights):
    """Turn a sequence of keyword -> weight dicts into a list of value lists.

    Args:
        list_of_keyword_weights: iterable of dicts.

    Returns:
        A list holding, for each dict, the list of its values in the dict's
        iteration order.
    """
    return [list(weights.values()) for weights in list_of_keyword_weights]

In [17]:
def binary_precision_positive_class(y_true, y_pred, pos_label=0):
    """Precision with ``pos_label`` treated as the positive class.

    Args:
        y_true: ground-truth labels.
        y_pred: predicted labels.
        pos_label: label reported as the positive class. Defaults to 0, the
            value that was previously hard-coded.

    Returns:
        The value returned by ``precision_score``.
    """
    return precision_score(y_true, y_pred, pos_label=pos_label)

def binary_recall_positive_class(y_true, y_pred, pos_label=0):
    """Recall with ``pos_label`` treated as the positive class.

    Args:
        y_true: ground-truth labels.
        y_pred: predicted labels.
        pos_label: label reported as the positive class. Defaults to 0, the
            value that was previously hard-coded.

    Returns:
        The value returned by ``recall_score``.
    """
    return recall_score(y_true, y_pred, pos_label=pos_label)

def binary_f1_positive_class(y_true, y_pred, pos_label=0):
    """F1 score with ``pos_label`` treated as the positive class.

    Args:
        y_true: ground-truth labels.
        y_pred: predicted labels.
        pos_label: label reported as the positive class. Defaults to 0, the
            value that was previously hard-coded.

    Returns:
        The value returned by ``f1_score``.
    """
    return f1_score(y_true, y_pred, pos_label=pos_label)

In [18]:
# Wrap the three metric functions as scorer objects (higher is better) so they
# can be passed through the `scoring` argument of cross-validation helpers.
precision_binary_positive, recall_binary_positive, f1_binary_positive = (
    make_scorer(metric, greater_is_better=True)
    for metric in (
        binary_precision_positive_class,
        binary_recall_positive_class,
        binary_f1_positive_class,
    )
)

In [19]:
def evaluate_classifier(model, X_train, y_train, X_test=None, y_test=None):
    """Cross-validate ``model`` with a shuffled, stratified 5-fold split.

    Args:
        model: estimator handed to ``cross_validate``.
        X_train: training features.
        y_train: training labels.
        X_test: accepted but not used by this function.
        y_test: accepted but not used by this function.

    Returns:
        A ``(cross_validator, scores)`` tuple: the ``StratifiedKFold`` instance
        (``random_state=0``) and the result of ``cross_validate``, which was
        asked to return the fitted estimator of every fold.
    """
    scoring_metrics = {
        'accuracy': 'accuracy',
        'precision_binary_positive': precision_binary_positive,
        'recall_binary_positive': recall_binary_positive,
        'f1_binary_positive': f1_binary_positive,
    }
    cross_validator = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
    scores = cross_validate(
        model,
        X_train,
        y_train,
        cv=cross_validator,
        scoring=scoring_metrics,
        return_estimator=True,
    )
    return cross_validator, scores

In [20]:
def plot_classifier_roc(list_of_cross_validators, list_of_scores, x_train, y_train, x_test=None, y_test=None):
    """Plot one ROC curve per classifier, using each classifier's best fold model.

    For every entry of ``list_of_scores`` the fold with the highest
    ``'test_f1_binary_positive'`` value is selected and the estimator fitted on
    that fold is evaluated.

    Args:
        list_of_cross_validators: splitters, one per entry of
            ``list_of_scores``; each must reproduce the folds used to obtain
            the matching scores when ``split`` is called again.
        list_of_scores: ``cross_validate``-style result dicts containing
            ``'test_f1_binary_positive'`` and ``'estimator'``.
        x_train: training samples, indexed by integer position.
        y_train: training labels, indexed by integer position.
        x_test: optional evaluation samples. When given together with
            ``y_test`` they are used for every model; otherwise each model is
            evaluated on the held-out part of its best fold.
        y_test: optional evaluation labels, see ``x_test``.

    Raises:
        ValueError: if only one of ``x_test`` / ``y_test`` is provided.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    if (x_test is None) != (y_test is None):
        raise ValueError("x_test and y_test must be provided together")
    use_external_test_set = x_test is not None

    # Index of the best fold (highest F1) and the estimator fitted on it.
    best_fold_indices = [
        scores['test_f1_binary_positive'].argmax() for scores in list_of_scores
    ]
    best_models = [
        scores['estimator'][fold]
        for scores, fold in zip(list_of_scores, best_fold_indices)
    ]

    list_x_test = []
    list_y_test = []
    for j, fold in enumerate(best_fold_indices):
        if use_external_test_set:
            # Previously this case left the lists empty and the function failed
            # with IndexError further down.
            list_x_test.append(x_test)
            list_y_test.append(y_test)
            continue
        # Re-run the split to recover the held-out indices of the best fold.
        _, test_indices = list(list_of_cross_validators[j].split(x_train, y_train))[fold]
        list_x_test.append([x_train[i] for i in test_indices])
        list_y_test.append([y_train[i] for i in test_indices])

    # Column 1 of predict_proba is the probability of the model's second class.
    # NOTE(review): the scorers in this notebook use pos_label=0 while the ROC
    # here is built on the second class - confirm this is intended.
    list_y_proba = [
        model.predict_proba(samples)[:, 1]
        for model, samples in zip(best_models, list_x_test)
    ]

    plt.figure(figsize=(15, 10))
    lw = 2
    for model, labels, probabilities in zip(best_models, list_y_test, list_y_proba):
        fpr, tpr, _ = roc_curve(labels, probabilities)
        model_name = str(model).split("(")[0]
        auc = round(roc_auc_score(labels, probabilities), 5)
        plt.plot(fpr, tpr, lw=lw, label=f'{model_name}\nAUC = {auc}')
    # Diagonal reference line of a random classifier.
    plt.plot([0, 1], [0, 1], color="navy", lw=lw, linestyle="--")
    plt.xlim([-0.1, 1.1])
    plt.ylim([-0.1, 1.1])
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title('ROC curve')
    plt.legend(loc="lower right")
    plt.show()