In [182]:
import os
import time
import deepl
import random
import re as re
from sklearn.model_selection import KFold
import numpy as np
import pandas as pd
import seaborn as sns
import translators as ts
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker 
# check 
%matplotlib inline
import numbers
import string
from langdetect import detect
from tabulate import tabulate
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from pandas.errors import ParserError
from langdetect import DetectorFactory
from stop_words import get_stop_words
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import StandardScaler

from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn import preprocessing
from sklearn.metrics import accuracy_score 
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import MultiLabelBinarizer
#from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import LabelEncoder #maybe dont' need
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score, classification_report, hamming_loss, accuracy_score, balanced_accuracy_score, make_scorer, confusion_matrix



#part 1 - dataset processing 



#functions adapted from: https://www.kaggle.com/code/abdmental01/text-preprocessing-nlp-steps-to-process-text

def removing_html_tags(text):
    """
    Strip HTML-style tags from a value.

    input : any value; it is converted with str() before matching
    return : the resulting string with every non-greedy '<...>' match removed

    """
    as_text = str(text)
    return re.sub('<.*?>', '', as_text)

def removing_url(text):
    """
    Delete URLs from a string.

    Anything starting with 'http://', 'https://' or 'www.' is removed up to
    the next whitespace character.

    input : text string
    return : the text with those URL matches removed

    """
    return re.sub(r'https?://\S+|www\.\S+', '', text)

punc = string.punctuation  # characters deleted by removing_string_punc

def removing_string_punc(text):
    """
    Delete every character contained in `punc` (string.punctuation) from a string.

    input : text string
    return : the text with those punctuation characters removed

    """
    # Mapping a code point to None makes str.translate drop that character.
    deletion_table = dict.fromkeys(map(ord, punc))
    return text.translate(deletion_table)

stop_words = get_stop_words('en')  # English stop words consulted by remove_stopwords
def remove_stopwords(text):
    """
    Blank out English stop words in a string.

    The text is split on whitespace; every token found in `stop_words` is
    replaced by an empty string and the tokens are re-joined with single
    spaces, so a removed word still leaves its separating space behind.
    """
    kept_tokens = ('' if token in stop_words else token for token in text.split())
    return " ".join(kept_tokens)

def data_processing_function(list_of_interest):
    """
    Run the full text-cleaning pipeline over a pandas Series.

    Null entries are dropped, the remaining values are cast to str and
    lower-cased, and then each cleaning step is applied element-wise in
    order: HTML tags, URLs, punctuation, stop words.

    return : the cleaned Series (null rows removed)
    """
    cleaned = list_of_interest.dropna().astype(str).str.lower()
    cleaning_steps = (
        removing_html_tags,    # strip '<...>' tags
        removing_url,          # strip http(s):// and www. links
        removing_string_punc,  # strip string.punctuation characters
        remove_stopwords,      # blank out English stop words
    )
    for step in cleaning_steps:
        cleaned = cleaned.apply(step)
    return cleaned


def find_category(category_col, cat_dictionary):

    """
    Map every entry of `category_col` to a key of `cat_dictionary`.

    For each entry the dictionary is scanned in its iteration order and the
    first key whose value contains the entry (tested with `in`) is used.
    Entries that match no value are mapped to the string "NA".

    return : list with one resolved key (or "NA") per entry, in input order
    """
    resolved = []
    for entry in category_col:
        first_match = next(
            (label for label, members in cat_dictionary.items() if entry in members),
            "NA",
        )
        resolved.append(first_match)

    return resolved




#part 2 - dataset distribution and feature analysis

def tf_idf_fun(df_column):
    """
    Vectorize a column of text samples and compute the IDF weight of every word.

    Null entries are dropped, the remaining texts are turned into a
    document-term count matrix with CountVectorizer, and a TfidfTransformer
    is fitted on that matrix.

    return (in this order):
    1. cv_vec       : the count matrix returned by CountVectorizer.fit_transform
    2. tfidfvec     : the value returned by TfidfTransformer.fit (the fitted
                      transformer, not a transformed matrix)
    3. result_tfidf : DataFrame indexed by feature (word) whose single column
                      "tfidf_values" holds the transformer's idf_ values,
                      sorted ascending

    """
    df_column = df_column.dropna()
    cv = CountVectorizer()
    cv_vec = cv.fit_transform(df_column)
    print(cv_vec.shape)

    # The count matrix is deliberately left in the form CountVectorizer
    # returns it; a dense copy was never used and only cost memory.
    tf_transform = TfidfTransformer(smooth_idf=True, use_idf=True)
    tfidfvec = tf_transform.fit(cv_vec)

    # One IDF weight per vocabulary entry, labelled with the word itself.
    idf_data = pd.DataFrame(tf_transform.idf_, index=cv.get_feature_names_out(), columns=["tfidf_values"])
    result_tfidf = idf_data.sort_values(by=['tfidf_values'])
    return cv_vec, tfidfvec, result_tfidf

def make_distribution(labeled1_df, name):

    """
    Plot how often each class occurs in the "sentiment" column.

    The per-class counts (ordered by class label) are printed, drawn as a
    bar chart, saved to "distributions_<name>.jpg" and shown.
    """
    class_counts = labeled1_df["sentiment"].value_counts().sort_index()
    print(class_counts)

    bar_positions, bar_heights = class_counts.index, class_counts.values
    plt.bar(bar_positions, bar_heights)
    plt.xticks(rotation=90)
    plt.title(f" {name} - Data Distribution")
    plt.savefig(f"distributions_{name}.jpg", bbox_inches='tight')
    plt.show()

def make_distribution2(grouped_function, text):

    """
    Draw a bar chart from an already aggregated object.

    `grouped_function` must expose `.index` (bar positions) and `.values`
    (bar heights). The figure is saved to
    "distributions_prediction<text>.jpg" and shown.
    """
    bar_positions, bar_heights = grouped_function.index, grouped_function.values
    plt.bar(bar_positions, bar_heights)
    plt.xticks(rotation=90)
    plt.title(f" {text} - Category Data Distribution")
    plt.savefig(f"distributions_prediction{text}.jpg", bbox_inches='tight')
    plt.show()

def ca_class_weights(df_annotation):
    """
    Return a dictionary mapping every class label to a balancing weight:

    weight = total_samples / (n_classes * class_n_samples)

    where n_classes is the number of distinct labels found in the input.
    For two classes this equals total_samples / (2 * class_n_samples).

    adapted from :
    https://medium.com/@ravi.abhinav4/improving-class-imbalance-with-class-weights-in-machine-learning-af072fdd4aa4

    input : sequence of class labels (one per sample)
    return : dict {class_label: weight}; empty when the input is empty
    """
    unique_classes, class_counts = np.unique(df_annotation, return_counts=True)
    total_samples = len(df_annotation)
    # Use the observed number of classes instead of a constant 2 so the
    # formula above also holds for multi-class label sets.
    n_classes = len(unique_classes)

    return {
        class_label: total_samples / (n_classes * class_count)
        for class_label, class_count in zip(unique_classes, class_counts)
    }



#Part 3  --- prediction
def conf_matrix_crossval(X, y, model, all_labels):
    """
    Collect one confusion matrix per fold of a shuffled 5-fold split.

    In every fold `model` is fitted in place on the training rows and asked
    to predict the held-out rows. The confusion matrix is built over the
    label ids 0 .. len(all_labels) - 1, so `y` is expected to hold such
    integer ids.

    inspired by:
    https://stackoverflow.com/questions/59282807/creating-k-dataframe-using-train-index-test-index-of-kfold-cross-validation-in
    https://scikit-learn.org/1.5/modules/generated/sklearn.model_selection.KFold.html

    return : list with the 5 per-fold confusion matrices
    """
    splitter = KFold(n_splits=5, shuffle=True, random_state=37)
    label_ids = range(len(all_labels))
    fold_matrices = []

    for fit_rows, eval_rows in splitter.split(X):
        model.fit(X[fit_rows], y[fit_rows])
        fold_predictions = model.predict(X[eval_rows])
        fold_matrices.append(
            confusion_matrix(y[eval_rows], fold_predictions, labels=label_ids)
        )

    return fold_matrices


def _split_labelled_text(df, label_column):
    """
    Binarize `label_column` and split texts and labels into train/test parts.

    The labels are encoded with a LabelBinarizer, the texts are taken from
    df["merged_text"], and both are split with test_size=0.21 and
    random_state=37.

    return : X_train, X_test, y_train, y_test, all binarized labels,
             the fitted LabelBinarizer
    """
    le = LabelBinarizer()
    label_transformed = le.fit_transform(df[label_column])

    text_data = df["merged_text"]
    X_train, X_test, y_train, y_test = train_test_split(
        text_data, label_transformed, test_size=0.21, random_state=37)

    return X_train, X_test, y_train, y_test, label_transformed, le


def fun_multilabel_test_train_data(df):
    """Train/test split of df["merged_text"] against the binarized "sentiment" column."""
    return _split_labelled_text(df, "sentiment")


def fun_multilabel_test_train_data2(df):
    """Train/test split of df["merged_text"] against the binarized "pred_merge" column."""
    return _split_labelled_text(df, "pred_merge")
    

def random_forest_hyperparameter_selection(x_training, x_testing, y_training, y_testing, lab):

    """
    Tune a RandomForestClassifier with grid search and evaluate the best configuration.

    Searched hyperparameters (3-fold grid search scored with 'f1_weighted'):

    n_estimators : Number of trees in the forest
    max_depth: Max depth of the tree
    min_samples_split: Minimum number of samples to split an internal node
    min_samples_leaf: Minimum number of samples to be at a leaf node
    source : https://scikit-learn.org/1.5/modules/generated/sklearn.ensemble.RandomForestClassifier.html

    The estimator refit by the grid search predicts `x_testing` for the
    printed classification report. Training and testing parts are then
    stacked (densified with .toarray()), the label rows are reduced to class
    indices with argmax, and 5-fold cross-validation provides the weighted
    F1-score, the accuracy and the per-fold confusion matrices.

    input:
    x_training, x_testing : feature matrices exposing .toarray()
    y_training, y_testing : 2-D label arrays (argmax over axis 1 is taken)
    lab : fitted label binarizer; only its classes_ attribute is read

    return (in this order): mean CV weighted F1, its std, mean CV accuracy,
    its std, element-wise mean of the fold confusion matrices, the tuned
    estimator, and [("Random Forest", tuned estimator)].

    NOTE: conf_matrix_crossval fits the estimator it receives in place, so
    the returned estimator was last fitted on a cross-validation training
    fold using class-index labels.
    """

    parameter_grid = {
        'n_estimators': [50, 100, 150],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
    }

    random_forest = RandomForestClassifier(random_state=37, bootstrap=True)
    grid_search = GridSearchCV(random_forest, parameter_grid, cv=3, scoring='f1_weighted', n_jobs=-1)
    grid_search.fit(x_training, y_training)
    best_params = grid_search.best_params_

    # GridSearchCV (refit=True by default) has already refit a forest with the
    # best parameters on the full training data, so no second fit is needed.
    optimized_random_forest = grid_search.best_estimator_
    # Predict the held-out test features before the estimator is refit below.
    predictions = optimized_random_forest.predict(x_testing)

    rflist = [("Random Forest", optimized_random_forest)]

    # Collapse the indicator rows to a single class index per sample and pool
    # train + test so cross-validation runs over every labelled sample.
    y_training_enc = np.argmax(y_training, axis=1)
    y_testing_enc = np.argmax(y_testing, axis=1)
    x_total = np.vstack([x_training.toarray(), x_testing.toarray()])
    y_total = np.hstack([y_training_enc, y_testing_enc])

    conf_res_rf = conf_matrix_crossval(x_total, y_total, optimized_random_forest, lab.classes_)
    RFmean_of_conf_matrix_arrays = np.mean(conf_res_rf, axis=0)

    scoring = {'f1_weighted': make_scorer(f1_score, average='weighted', zero_division=0),'accuracy': make_scorer(accuracy_score)}

    cv_results = cross_validate(
        optimized_random_forest,
        x_total,
        y_total,
        cv=5,
        scoring=scoring,
        n_jobs=-1,
        return_train_score=False)

    avg_cross_val_f1 = round(cv_results['test_f1_weighted'].mean(), 4)
    avg_std_f1 = round(cv_results['test_f1_weighted'].std(), 4)

    avg_cross_val_acc = round(cv_results['test_accuracy'].mean(), 4)
    avg_std_acc = round(cv_results['test_accuracy'].std(), 4)

    print(optimized_random_forest)
    print("Best hyperarameters from random forest classification optimization:", best_params)
    print("Weighted F1-score CV-avr:", avg_cross_val_f1, "std :", avg_std_f1)
    print("Accuracy CV-avr:", avg_cross_val_acc, "std :", avg_std_acc)
    print("Classification Report:\n", classification_report(y_testing, predictions, target_names= lab.classes_))

    return avg_cross_val_f1, avg_std_f1, avg_cross_val_acc, avg_std_acc, RFmean_of_conf_matrix_arrays, optimized_random_forest, rflist


def parameter_est(xtrain, xtest, ytrain, ytest, binlab):
    """
    Tune and evaluate three one-vs-rest classifiers:
    SVM (LinearSVC)
    Multinomial NB
    Logistic Regression

    For each model a 3-fold grid search (scored with 'f1_weighted') selects
    the hyperparameters, the tuned model is fitted on the training data and
    its test predictions are printed as a classification report. Weighted
    F1-score, accuracy and confusion matrices come from 5-fold
    cross-validation over the pooled (train + test) data with class-index
    labels.

    Input:
    xtrain, xtest : feature matrices exposing .toarray()
    ytrain, ytest : 2-D label arrays (argmax over axis 1 is taken)
    binlab : fitted label binarizer; only its classes_ attribute is read

    return (in this order, one list entry per model in the order above):
    mean CV weighted F1 list, F1 std list, mean CV accuracy list, accuracy
    std list, mean confusion matrix list, model name list, tuned model list,
    list of (model name, tuned model) tuples.

    NOTE: conf_matrix_crossval fits the model it receives in place, so each
    returned model was last fitted on a cross-validation training fold.
    """

    f1_score_list = []
    f1_std_list = []
    BA_list = []
    BA_std_list = []
    model_t = []
    model_choice_list = []
    conf_matrix_list = []
    models_passed = []

    # zero_division=0 matches the scorer used for the random forest evaluation.
    scoring = {'f1_weighted': make_scorer(f1_score, average='weighted', zero_division=0),'accuracy': make_scorer(accuracy_score)}

    # (display name, base estimator, grid over the wrapped estimator's parameters)
    candidates = [
        ("SVM",
         LinearSVC(penalty='l2', loss='squared_hinge'),
         {"estimator__C": [1, 0.5, 1.5, 2], "estimator__max_iter": [1000, 2000, 3000, 4000], "estimator__class_weight": [None, "balanced"]}),
        ("Multinomial NB",
         MultinomialNB(),
         {"estimator__alpha": [1, 10**-3, 10**-1, 10**1], "estimator__fit_prior": [True, False]}),
        ("Logistic Reg",
         LogisticRegression(solver="liblinear"),
         {"estimator__class_weight": [None, "balanced"], "estimator__max_iter": [3000, 4000], "estimator__C":[1, 1.5, 2]}),
    ]

    # The pooled dense data and class-index labels do not depend on the
    # model, so they are built once instead of once per loop iteration.
    ytrain_enc = np.argmax(ytrain, axis=1)
    ytest_enc = np.argmax(ytest, axis=1)
    x_total = np.vstack([xtrain.toarray(), xtest.toarray()])
    y_total = np.hstack([ytrain_enc, ytest_enc])

    for model_name, model_choice, param_grid in candidates:

        grid_search = GridSearchCV(OneVsRestClassifier(model_choice), param_grid, scoring='f1_weighted', cv=3, n_jobs=-1)
        grid_search.fit(xtrain, ytrain)
        best_params = grid_search.best_params_

        # Grid keys look like "estimator__C"; drop the wrapper prefix so the
        # values can be set directly on the base estimator.
        params_to_set = {key.split('__')[1]: value for key, value in best_params.items()}
        model_choice.set_params(**params_to_set)

        optimized_model = OneVsRestClassifier(model_choice)
        optimized_model.fit(xtrain, ytrain)

        model_choice_list.append(optimized_model)
        # Predict before the cross-validated confusion matrices refit the model.
        predictions = optimized_model.predict(xtest)

        models_passed.append((model_name, optimized_model))

        cv_results = cross_validate(
            optimized_model,
            x_total,
            y_total,
            cv=5,
            scoring=scoring,
            n_jobs=-1,
            return_train_score=False)

        avg_cross_val_f1 = round(cv_results['test_f1_weighted'].mean(), 4)
        avg_std_f1 = round(cv_results['test_f1_weighted'].std(), 4)

        avg_cross_val_BA = round(cv_results['test_accuracy'].mean(), 4)
        avg_std_BA = round(cv_results['test_accuracy'].std(), 4)

        conf_res = conf_matrix_crossval(x_total, y_total, optimized_model, binlab.classes_)
        mean_of_conf_matrix_arrays = np.mean(conf_res, axis=0)

        print(model_choice)
        print("Best hyperarameters :", best_params)
        print("Weighted F1-score CV-avr:", avg_cross_val_f1, "std :", avg_std_f1)
        print("Accuracy CV-avr:", avg_cross_val_BA, "std :", avg_std_BA)
        print("Classification Report:\n", classification_report(ytest, predictions, target_names= binlab.classes_))

        f1_score_list.append(avg_cross_val_f1)
        f1_std_list.append(avg_std_f1)
        BA_list.append(avg_cross_val_BA)
        BA_std_list.append(avg_std_BA)
        model_t.append(model_name)

        conf_matrix_list.append(mean_of_conf_matrix_arrays)

    return f1_score_list, f1_std_list, BA_list, BA_std_list, conf_matrix_list, model_t, model_choice_list, models_passed

def all_reports_cv(df, df_new_predictions, testname):
    """
    Run the complete model comparison for one labelled dataset.

    The data is split once with fun_multilabel_test_train_data. For each
    vectorizer (CountVectorizer first, then TfidfVectorizer) the texts are
    vectorized, the random forest is tuned and evaluated with
    random_forest_hyperparameter_selection, and the SVM / Multinomial NB /
    Logistic Regression models are tuned and evaluated with parameter_est.
    The cross-validated scores are collected as rows of
    [model name, score name, vectorizer name, value] and everything is handed
    to result_maker_v2.

    NOTE(review): `models_passed` and `feature_selection_method` are
    reassigned in every loop iteration, so only the values of the last
    vectorizer reach result_maker_v2 - confirm this is intended.

    return: the two values returned by result_maker_v2.

    """

    table_rf = []
    table_f = []
    confusion_rf = []
    confusion_rest = []
    model_choices_list = []

    x_train, x_test, y_train, y_test, binary_label, le_e = fun_multilabel_test_train_data(df)

    for vectorizer in (CountVectorizer(), TfidfVectorizer()):
        xtrain_vec = vectorizer.fit_transform(x_train)
        xtest_vec = vectorizer.transform(x_test)
        # Class name of the vectorizer, taken from its repr.
        feature_selection_method = str(vectorizer).split('(')[0]
        print("feature selection strategy :", feature_selection_method)

        (RF_cv_f1, RF_cv_f1_std, RF_cv_BA, RF_cv_std,
         rf_cv_confmatrix, best_model, rf_est) = random_forest_hyperparameter_selection(
            xtrain_vec, xtest_vec, y_train, y_test, le_e)

        rf_scores = (
            ("F1-score", RF_cv_f1),
            ("F1-score std.", RF_cv_f1_std),
            ("Accuracy", RF_cv_BA),
            ("Accuracy std.", RF_cv_std),
        )
        for score_name, score_value in rf_scores:
            table_rf.append(["Random Forest", score_name, feature_selection_method, score_value])

        model_choices_list.append(best_model)
        confusion_rf.append(rf_cv_confmatrix)

        (LMS_f1_list, LMS_f1_std_list, LMS_BA_list, LMS_BA_std_list,
         LMS_conf_matrix_list, LMS_model_t, best_model, LMS_est) = parameter_est(
            xtrain_vec, xtest_vec, y_train, y_test, le_e)

        models_passed = rf_est + LMS_est
        model_choices_list.append(best_model)

        for num, f1_value in enumerate(LMS_f1_list):
            model_scores = (
                ("F1-score", f1_value),
                ("F1-score std", LMS_f1_std_list[num]),
                ("Accuracy", LMS_BA_list[num]),
                ("Accuracy std.", LMS_BA_std_list[num]),
            )
            for score_name, score_value in model_scores:
                table_f.append([LMS_model_t[num], score_name, feature_selection_method, score_value])

            confusion_rest.append(LMS_conf_matrix_list[num])

    df_new_pred, df_top = result_maker_v2(0, df, df_new_predictions, testname, table_f, table_rf, confusion_rf, confusion_rest, binary_label, le_e, model_choices_list, feature_selection_method, x_train, x_test, y_train, y_test, binary_label, le_e, models_passed)

    return df_new_pred, df_top


def heatmap_maker(conf_matrix_2d, label_columns, modelname, feat_strat):

    """
    Show an annotated seaborn heatmap of a confusion matrix.

    `label_columns` supplies the tick labels of both axes (x = predicted,
    y = true); `modelname` and `feat_strat` only appear in the title.

    """
    heading = f"Heatmap - ({modelname} and {feat_strat}) - Predicted vs. True"
    axes = sns.heatmap(conf_matrix_2d, annot=True, fmt=".1f", cmap='YlGnBu', annot_kws={"fontsize":7})

    axes.set_title(heading, fontsize=14, pad=20)
    axes.set_xlabel("Predicted Label", fontsize=14, labelpad=20)
    axes.set_ylabel("True Label", fontsize=14, labelpad=20)
    axes.xaxis.set_ticklabels(label_columns, rotation=90)
    axes.yaxis.set_ticklabels(label_columns, rotation=0)
    plt.show()
 

                                                                                      
def coefficient_printerv2(df, df_new, class_names, method_inst, model_name, choosen_vec_strategy, x_train, x_test, y_train, y_test, binary_label, le_e, models_passed):
    """
    Refit the chosen model, list its strongest features per class and label new data.

    The first entry of `models_passed` (a list of (name, model) tuples) whose
    name equals `model_name` is selected. The vectorizer
    `choosen_vec_strategy` is fitted on `x_train`, the model is fitted on the
    result and `y_train`, and for every class the 10 features with the
    largest coefficients are collected (ascending, strongest last) from
    `model.estimators_[k].coef_[0]`. Finally df_new["merged_text"] is
    vectorized, predicted and decoded with `le_e.inverse_transform`; the
    result is written to df_new["prediction"] (df_new is modified in place).

    The parameters df, method_inst, x_test, y_test and binary_label are
    accepted for interface compatibility and are not used here.

    NOTE(review): estimators without a `coef_` attribute cannot be inspected
    this way - verify which model names callers pass.

    return : list of top-feature lists (one per class), class_names, df_new
    raises : ValueError if no entry of models_passed is named `model_name`
    """
    selected_model = next((model for name, model in models_passed if name == model_name), None)
    if selected_model is None:
        # Fail with a clear message instead of an AttributeError on None below.
        raise ValueError(f"No model named {model_name!r} found in models_passed")
    print(f"Selected model: {model_name}")

    str_vect = choosen_vec_strategy
    xtrain_vec = str_vect.fit_transform(x_train)
    selected_model.fit(xtrain_vec, y_train)

    # Look the vocabulary up once; it does not change between classes.
    feature_names = str_vect.get_feature_names_out()

    list_of_pos_corr = []
    for num, _class_name in enumerate(class_names):
        class_coefficients = selected_model.estimators_[num].coef_[0]
        # argsort is ascending, so the last 10 indices carry the largest coefficients.
        top_positive = np.argsort(class_coefficients)[-10:]
        list_of_pos_corr.append([feature_names[i] for i in top_positive])

    full_data_xtest = str_vect.transform(df_new["merged_text"])
    predictions = selected_model.predict(full_data_xtest)

    df_new["prediction"] = le_e.inverse_transform(predictions)
    print(len(list_of_pos_corr))

    return list_of_pos_corr, class_names, df_new


def make_table(table_rest, table_random_forest, testname):
    """
    Combine the score rows of all models and render them with tabulate.

    input:
    table_rest = rows [model name, score name, vectorizer name, value] of the
                 SVM / Multinomial NB / Logistic Regression tests
    table_random_forest = rows of the same shape from the random forest tests
    testname = label printed in the overview line

    return:
    table = tabulate string of the sorted results
    df_result_sort_out = results sorted by vectorizer and model, with the
                         columns ordered feat_strat, model_name, score_method, score
    df_results = unsorted results in the original column order
    """

    comb_tables = table_rest + table_random_forest
    df_results = pd.DataFrame(comb_tables, columns = ["model_name", "score_method", "feat_strat", "score"])
    df_result_sorted = df_results.sort_values(by=['feat_strat', 'model_name'])
    df_result_sort_out = df_result_sorted.loc[:,["feat_strat", 'model_name','score_method','score']]

    # Tabulate the reordered frame: its column order (strategy, model, scoring
    # method, value) is the one the headers below describe. Tabulating the
    # frame in its original column order put every header over the wrong column.
    df_table = df_result_sort_out.reset_index(drop = True)
    headers = ["Feature Selection Strategy", "Model", "Scoring Method", "Value"]
    table = tabulate(df_table, headers=headers, tablefmt="pretty", colalign=("center",) * len(headers), numalign="center", maxcolwidths = 20)
    print(f" score overview test : {testname}")
    return table, df_result_sort_out, df_results


def scatterplot_maker(df_sorted, name_string):

    """
    Plot F1-score against accuracy for every model / vectorizer pair.

    input:
    df_sorted = results frame from make_table with the columns feat_strat,
                model_name, score_method and score
    name_string = suffix of the saved file "scatterplot_<name_string>.jpg"

    Rows whose score_method is exactly "F1-score" give the x values, rows
    whose score_method is exactly "Accuracy" give the y values.
    CountVectorizer points are drawn in blue, TfidfVectorizer points in red,
    every point is labelled with its model name and a green arrow points at
    the pair with the highest F1 + accuracy sum.

    return:
    sorted_df = one row per (feat_strat, model_name) with score_f1,
                bal_accuracy (the accuracy value) and maxscore (their sum),
                sorted by maxscore descending with a fresh 0..n-1 index

    """

    #df_sorted = df_sorted[~((df_sorted["feat_strat"] == "TfidfVectorizer") & (df_sorted["model_name"] == "Logistic Reg"))]
    pair_keys = ["feat_strat", "model_name"]
    f1_part = df_sorted[df_sorted["score_method"] == "F1-score"].rename(columns={"score": "score_f1"})
    acc_part = df_sorted[df_sorted["score_method"] == "Accuracy"].rename(columns={"score": "bal_accuracy"})

    mergeddf = pd.merge(
        f1_part[pair_keys + ["score_f1"]],
        acc_part[pair_keys + ["bal_accuracy"]],
        on=pair_keys)
    mergeddf["maxscore"] = mergeddf["score_f1"] + mergeddf["bal_accuracy"]
    sorted_df = mergeddf.sort_values(by=["maxscore"], ascending=[False], ignore_index = True)

    def _strategy_points(strategy_name):
        # x values (F1), y values (accuracy) and point labels of one vectorizer.
        rows = df_sorted[df_sorted["feat_strat"] == strategy_name]
        f1_rows = rows[rows["score_method"] == "F1-score"]
        acc_rows = rows[rows["score_method"] == "Accuracy"]
        return np.array(f1_rows["score"]), np.array(acc_rows["score"]), list(f1_rows["model_name"])

    cv_f1, cv_acc, cv_names = _strategy_points("CountVectorizer")
    print(cv_acc)

    for pos, x_value in enumerate(cv_f1):
        plt.text(x_value, cv_acc[pos], cv_names[pos], fontsize = 7)

    tfidf_f1, tfidf_acc, tfidf_names = _strategy_points("TfidfVectorizer")

    for pos, x_value in enumerate(tfidf_f1):
        y_value = tfidf_acc[pos]
        if tfidf_names[pos] == "Random Forest":
            # Nudge this label downwards by a fixed offset.
            y_value = y_value - 0.0013
        plt.text(x_value, y_value, tfidf_names[pos], fontsize = 7)

    # Arrow towards the best pair (row label 0 after the descending sort).
    best_f1 = sorted_df["score_f1"][0]
    best_acc = sorted_df["bal_accuracy"][0]
    plt.arrow(best_f1 - 0.003, best_acc - 0.003, 0.0020, 0.0020, width = 0.0005, length_includes_head = True,
              head_width=0.002, head_length=0.001, fc='green', ec='green')

    plt.scatter(cv_f1, cv_acc, label = 'CountVectorizer', color = 'blue')
    plt.scatter(tfidf_f1, tfidf_acc, label = 'TF-IDF Vectorizer', color = 'red')

    plt.xlabel('F1-score')
    plt.ylabel('Accuracy')
    plt.title('F1-score vs. Accuracy')
    plt.legend()
    plt.tight_layout()
    #plt.subplots_adjust(top=0.9, bottom=0.1)
    plt.savefig(f"scatterplot_{name_string}.jpg", bbox_inches='tight')

    plt.show()

    return sorted_df

def heatmap_printer(df_res, con_m_rf, con_m_rest, label_encode_i):
    """
    Draw one heatmap per confusion matrix via heatmap_maker.

    The "F1-score" rows of `df_res` provide the titles: the k-th random
    forest matrix is titled with the vectorizer of the k-th "Random Forest"
    row, the k-th remaining matrix with the model name and vectorizer of the
    k-th non-random-forest row. Tick labels come from
    `label_encode_i.classes_`.
    """
    class_labels = label_encode_i.classes_

    rf_rows = df_res[df_res["model_name"] == "Random Forest"]
    other_rows = df_res[df_res["model_name"] != "Random Forest"]

    rf_f1_rows = rf_rows[rf_rows["score_method"] == "F1-score"]
    other_f1_rows = other_rows[other_rows["score_method"] == "F1-score"]

    fstrat_rf = list(rf_f1_rows["feat_strat"])
    fstrat_rest = list(other_f1_rows["feat_strat"])
    model_names = list(other_f1_rows["model_name"])

    for pos, matrix in enumerate(con_m_rf):
        heatmap_maker(matrix, class_labels, "Random Forest", fstrat_rf[pos])

    for pos, matrix in enumerate(con_m_rest):
        heatmap_maker(matrix, class_labels, model_names[pos], fstrat_rest[pos])


def result_maker_v2(row_dfsorted, df, df_new_predictions, testname, table_rest_out, table_random_forest_out, confusion_rf, confusion_rest, bin_label, label_encode, model_list, feat_strategy, x_train, x_test, y_train, y_test, binary_label, le_e, models_passed):
    """
    Report all scores, pick one model / vectorizer pair and predict new data with it.

    Steps: print the score table (make_table), draw the scatter plot
    (scatterplot_maker) and the heatmaps (heatmap_printer), select the row
    with index label `row_dfsorted` of the score ranking, refit that model
    and predict `df_new_predictions` (coefficient_printerv2), print the top
    features per class and plot the distribution of the predictions.

    input (only the non-obvious ones):
    row_dfsorted = index label in the ranking returned by scatterplot_maker
                   (0 is the best pair)
    model_list = [random forest (CountVectorizer), [SVM, Multinomial NB,
                 Logistic Reg] (CountVectorizer), random forest
                 (TfidfVectorizer), [SVM, Multinomial NB, Logistic Reg]
                 (TfidfVectorizer)], as assembled by all_reports_cv
    feat_strategy = ignored; it is replaced by a fresh vectorizer of the
                    selected strategy
    bin_label = not used

    return: the prediction DataFrame from coefficient_printerv2 and a
    DataFrame with one column per class holding that class's top features.
    """

    table_output, df_result_sorted, df_res = make_table(table_rest_out, table_random_forest_out, testname)

    print(table_output)

    scores_sorted = scatterplot_maker(df_result_sorted, testname)
    scores_sorted = scores_sorted.sort_index()
    print("this works")
    print(scores_sorted)
    heatmap_printer(df_res, confusion_rf, confusion_rest, label_encode)
    print("this works")
    feature_strategy_mapping = {
        "TfidfVectorizer": TfidfVectorizer,
        "CountVectorizer": CountVectorizer
    }

    print(model_list)

    def _models_by_name(forest_model, linear_models):
        # linear_models follows parameter_est's order: SVM, Multinomial NB, Logistic Reg.
        svm_model, nb_model, logreg_model = linear_models[0], linear_models[1], linear_models[2]
        return {
            "Logistic Reg": logreg_model,
            # The score tables call this model "SVM"; "SGD" is kept as a legacy alias.
            "SVM": svm_model,
            "SGD": svm_model,
            "Multinomial NB": nb_model,
            "Random Forest": forest_model,
        }

    model_strategy_mapping = {
        "TfidfVectorizer": _models_by_name(model_list[2], model_list[3]),
        "CountVectorizer": _models_by_name(model_list[0], model_list[1]),
    }

    top_feat_strat = scores_sorted["feat_strat"][row_dfsorted]
    top_model_name = scores_sorted["model_name"][row_dfsorted]

    # Instantiate a fresh vectorizer of the selected strategy (None if unknown).
    feat_strategy = feature_strategy_mapping.get(top_feat_strat, lambda: None)()
    model_strategy = model_strategy_mapping.get(top_feat_strat, {}).get(top_model_name, None)

    print(feat_strategy)
    print("NAME, :", top_model_name)
    print(model_strategy)

    list_of_corr, classnames, df_new_pred_n1_sgd = coefficient_printerv2(df, df_new_predictions, label_encode.classes_, model_strategy, top_model_name, feat_strategy, x_train, x_test, y_train, y_test, binary_label, le_e, models_passed)

    # One column per class, one row per top feature.
    df_top = pd.DataFrame(list(list_of_corr)).transpose()
    df_top.columns = classnames
    for num, top_words in enumerate(list_of_corr):
        print(f"Top positive correlations for {classnames[num]}:")
        print(top_words)
        print("\n")

    # Number of non-null "Year" entries per predicted class.
    groupin = df_new_pred_n1_sgd.groupby(["prediction"])["Year"].count()
    print(groupin)
    make_distribution2(groupin, "predictions")

    return df_new_pred_n1_sgd, df_top


def foundations_asso_print(df, foundation_name):
    """
    Train a one-vs-rest linear SVM on one foundation's rows and list its top words per class.

    Rows whose "Foundation_name" equals `foundation_name` are split with
    fun_multilabel_test_train_data2 (labels from "pred_merge", texts from
    "merged_text"), vectorized with TF-IDF and used to fit a
    OneVsRestClassifier around a class-weight-balanced LinearSVC. The
    weighted F1-score on the test part is printed.

    return (in this order):
    df_top = DataFrame with one column per class holding the 20 features
             with the largest coefficients, strongest first
    class_report = classification report as a dict
    f1score = weighted F1-score on the test part, rounded to 4 decimals
    class_reportfll = classification report as text
    """

    df_mask = df["Foundation_name"] == foundation_name

    df_foundation_data = df[df_mask]
    print(len(df_foundation_data))
    x_train, x_test, y_train, y_test, label_transformed, le_group = fun_multilabel_test_train_data2(df_foundation_data)

    tfidf_vect = TfidfVectorizer()
    xtrain_vec = tfidf_vect.fit_transform(x_train)
    xtest_vec = tfidf_vect.transform(x_test)

    sup_vm = LinearSVC(penalty='l2', loss='squared_hinge', C=1, class_weight='balanced', max_iter = 1000)

    ova = OneVsRestClassifier(sup_vm)

    ova.fit(xtrain_vec, y_train)

    predictions = ova.predict(xtest_vec)

    f1score = round(f1_score(y_test, predictions, average='weighted'), 4)

    print("F1-score :", f1score)
    print("Classification Report :",foundation_name)
    class_report = classification_report(y_test, predictions, target_names= le_group.classes_, output_dict=True)
    class_reportfll = classification_report(y_test, predictions, target_names= le_group.classes_)

    # Vocabulary of the vectorizer fitted above; the word lookup below must
    # use it (no other vectorizer is defined in this function).
    feature_names = tfidf_vect.get_feature_names_out()

    list_of_pos_correlations = []

    for class_pos, _class_name in enumerate(le_group.classes_):
        class_coefficients = ova.estimators_[class_pos].coef_[0]

        # Indices of the 20 largest coefficients, largest first.
        top_positive = np.argsort(class_coefficients)[-20:][::-1]

        top_positive_words = [feature_names[word_pos] for word_pos in top_positive]
        list_of_pos_correlations.append(top_positive_words)

    # One column per class, one row per rank.
    df_top = pd.DataFrame(list_of_pos_correlations).transpose()
    df_top.columns = le_group.classes_
    return df_top, class_report, f1score, class_reportfll



# Read in the translated dataset (df_full) and the annotated dataset (df_annotation).
# dtype=object stops pandas from inferring numeric dtypes, so the values stay
# as they are written in the CSV files.
df_full = pd.read_csv("df_translated_proc_7thnov.csv", dtype=object)
df_annotation = pd.read_csv("annotated_data/annotations_noVil_2024_11_04_13_35_81ae6912.csv", dtype=object)

# Build the free-text field used for vectorisation from the three translated text columns.
# The fields are joined with a single space: without a separator the last word
# of one field is glued to the first word of the next and becomes a single
# (bogus) token. The final strip keeps rows with no text at all as "".
df_full["merged_text"] = (
    df_full["Translated_descriptions_pro"].fillna('')
    + " "
    + df_full["Translated_title_pro"].fillna('')
    + " "
    + df_full["Translated_receiver_title_pro"].fillna('')
).str.strip()

# Dropping columns we don't need
df_full = df_full.drop(columns=["Translated_descriptions_pro", "Translated_title_pro",
                                "Translated_receiver_title_pro", "Unnamed: 0"])
df_annotation = df_annotation.drop(columns=["Unnamed: 0", 'Unnamed: 1', "annotation_id",
                                            "annotator", "created_at", "id", "lead_time", "updated_at"])

# Lower-case the annotation descriptions.
# NOTE(review): presumably needed so they compare equal to df_full's
# "Description" in the merge further down - confirm.
df_annotation["Description"] = df_annotation["Description"].str.lower()

# Exclude the VILLUM rows from the annotations, as that data was bugged
anno_novilmask = df_annotation["Foundation_name"] == "VILLUM"
df_annotation_novil = df_annotation[~anno_novilmask]


print("shape of full dataframe before merge :", df_full.shape)
print("shape of annotated dataframe before merge :", df_annotation_novil.shape)


### Next step: build the annotation data from the given categories

# MERGE

# Columns that have to be equal for an annotated row and a row of the full
# dataset to be treated as the same record.
merge_listnames = [
    "Description", "Grant_size_(DKK)", "Foundation_name", "Year", "Receiver",
    "Receiver_Title", "Title", "Country", "Institution", "Project_Category",
    "Project_Subcategory", "Receiver_Name", "Receiver_Profession", "Region",
]

# Annotated rows that also occur in the full dataset, one row per key combination.
merged_df = (
    df_annotation_novil
    .merge(df_full, on=merge_listnames, how="inner")
    .drop_duplicates(subset=merge_listnames)
)

# Anti-join: rows of the full dataset that have no annotated counterpart.
# indicator=True adds the "_merge" column that marks where each row came from.
df_remains = df_full.merge(merged_df, on=merge_listnames, how="left", indicator=True)
df_remains = df_remains.loc[df_remains["_merge"] == "left_only"].drop(columns=["_merge"])

# The right-hand ("_y") copies are dropped, together with "sentiment" ...
df_remains = df_remains.drop(
    columns=[
        'merged_text_y',
        'Translated_descriptions_y',
        'Translated_receiver_title_y',
        'Translated_title_y',
        "sentiment",
    ]
)

# ... and the left-hand ("_x") copies get their plain names back.
df_remains = df_remains.rename(
    columns={
        "Translated_title_x": "Translated_title",
        "Translated_receiver_title_x": "Translated_receiver_title",
        "Translated_descriptions_x": "Translated_descriptions",
        "merged_text_x": "merged_text",
    }
)


print("shape of full dataset (-annotations) after merge :", df_remains.shape)

print("shape of annotated dataframe after merge :", merged_df.shape)


print("Five lines where removed as they were either changed after remaking the dataset, or actual duplicates",
      "- but they were removed to make sure the data is consistent")

# Run the text processing on the remaining columns, for both frames

list_of_columns = ["Receiver", "Project_Subcategory", "Project_Category", "Title", "Institution", "Region",
                   "Receiver_Name", "Receiver_Profession"]

# Column by column, first df_remains and then merged_df; only non-missing
# values are handed to data_processing_function.
for column_name in list_of_columns:
    for frame in (df_remains, merged_df):
        frame[column_name] = data_processing_function(frame[column_name].dropna())


# Overview of which (sub)category names occur, used to choose the label lists.
# NOTE: despite its name, no_cat is True for rows that HAVE a Project_Category.
no_cat = df_remains["Project_Category"].notna()
# Rows that do NOT have a Project_Subcategory
no_sub_cat = df_remains["Project_Subcategory"].isna()

df_cat_and_no_sub_cat = df_remains[no_cat & no_sub_cat]
df_subcats = df_remains[~no_sub_cat]

# Row counts per name (non-missing "Year" values are counted).
df_cats = df_cat_and_no_sub_cat.groupby(["Project_Category"])["Year"].count()
df_subcategories = df_subcats.groupby(["Project_Subcategory"])["Year"].count()
# Print df_cats / df_subcategories to see how the categories were chosen.

# Lookup tables from label name to the list of source names that belong to it.
# The strings are matched against the data as written here, so spelling and the
# doubled spaces are significant.
# NOTE(review): they look like the output of the text processing step - confirm.

# Labels taken from Project_Subcategory.
subcategories_dict = {
    "Education_and_public_information": [
        "diversitet  didaktik og uddannelse",
        "science  fritiden",
        "undersøgende virkelighedsnær naturfagsundervisning",
        "viden og uddannelse",
        "viden uddannelse",
    ],
    "Culture": ["kunst og kultur"],
    "Nature_Preservation_and_environment": ["miljø og klima"],
    "Social_Purpose": ["sociale indsatser"],
    "Research_and_Science": [
        "teknisk og naturvidenskabelig forskning",
        "villum experiment",
        "villum international postdoc",
        "villum investigator",
        "villum kann rasmussen professorat",
        "villum kann rasmussens årslegat",
        "villum synergy",
        "villum young investigator",
    ],
}

# Choosing these categories as label info in Project_Category:
categories_dict = {
    "Research_and_Science": [
        "brain prize",
        "ascending investigators",
        "earlycareer clinician scientists",
        "education  awareness grants",
        "conferences",
        "experiment",
        "fellowship",
        "field trips research stays 100000",
        "forskning",
        "forskning og innovation",
        "forskning og læring",
        "frontier grant",
        "grants  excellence",
        "international neuroscience programme",
        "international postdocs",
        "internationalisation fellowships",
        "internationalisation programmes",
        "larger biomedical projects",
        "larger international meetings  conferences",
        "leo foundation awards",
        "leo foundation dr abildgaard fellowships",
        "lf nih brain initiative",
        "lfin investigator network",
        "monograph fellowships",
        "nordic research prize",
        "phd scholarships",
        "postdocs",
        "postdoctoral fellowship   danish institute  athens",
        "pregraduate scholarships",
        "professorships",
        "reintegration fellowships",
        "research grants  open competition",
        "research infrastructure",
        "research learning active ownership",
        "research networking",
        "semper ardens accelerate",
        "semper ardens accomplish",
        "semper ardens advance",
        "serendipity grants",
        "visiting fellowships  university  oxford",
        "visiting professorships",
        "young investigator prize",
    ],
    "Education_and_public_information": [
        "junior brain prize",
        "alle børn skal kunne læse og regne",
        "børn og unge parat til uddannelse",
        "godt på vej  ungdomsuddannelse",
        "sec science education communication",
        "science communication",
        "stærke rammer  tryghed styrk civilsamfund debat og demokrati",
        "uddannelse",
    ],
    "Culture": [
        "aktuel kunst",
        "klassisk musik",
        "kunst og kultur",
        "publication",
        "samtidskunst",
    ],
    "Health": [
        "sikkerhed",
        "sikkerhed akut hjælp",
        "sikkerhed forebyg brand",
        "sikkerhed respekt  vand",
        "sikkerhed sikker  trafikken",
        "skadeforebyggelse",
        "skadeforebyggelse medlemmer",
        "mere ungdom mindre sygdom",
        "sundhed",
        "sundhed akut hjælp",
        "sundhed lev med kronisk sygdom",
        "sundhed lev sundt",
        "sundhed mental sundhed",
        "sundhed patienten først",
        "tryghed  hverdagen hjertestart",
        "tryghed  hverdagen reager på stroke",
        "tryghed  hverdagen respekt  vand",
        "unges mentale sundhed",
        "voksne sundere liv",
    ],
    "Daughter_Foundation": ["bikubenfonden", "velux fonden"],
    "Social_Purpose": [
        "børn og unge alle med fra start",
        "et bedre liv som anbragt",
        "social responsibility inclusion",
        "sociale indsatser",
        "lige muligheder  børn  fattigdom",
        "sport",
        "trivsel",
        "trivsel en chance  livet",
        "trivsel en chance  livet",
        "trivsel en plads  fællesskabet",
        "trivsel et liv uden kriminalitet",
        "trivsel mental sundhed",
        "unge på kanten",
        "ældre et godt liv som pårørende",
        "voksne fællesskaber  alle",
    ],
    "International_Humanitarian": ["safe water"],
    "Nature_Preservation_and_environment": [
        "et hav  balance",
        "svanninge bjerge",
        "vand og bæredygtig udvikling",
    ],
    "Religion": ["folkekirken"],
}

# Module-level aliases for the individual lists. Social_Purpose refers to the
# subcategory list; every other name refers to the Project_Category list.
Social_Purpose = subcategories_dict["Social_Purpose"]
Research = categories_dict["Research_and_Science"]
Education_and_public_information = categories_dict["Education_and_public_information"]
Culture = categories_dict["Culture"]
Health = categories_dict["Health"]
Daughter_Foundation = categories_dict["Daughter_Foundation"]
Social = categories_dict["Social_Purpose"]
International_Humanitarian = categories_dict["International_Humanitarian"]
Nature_Preservation_and_environment = categories_dict["Nature_Preservation_and_environment"]
Religion = categories_dict["Religion"]

# Run the find_category function defined above on both columns.
# The "NA" string is treated below as "no label found".
annotation_subcat = find_category(df_remains["Project_Subcategory"], subcategories_dict)
annotation_cat = find_category(df_remains["Project_Category"], categories_dict)

# Combine the two results row by row: the subcategory label wins; if it is "NA"
# the category label is used instead (which may itself be the string "NA").
output_list = [
    subcat_label if subcat_label != "NA" else cat_label
    for subcat_label, cat_label in zip(annotation_subcat, annotation_cat)
]
df_remains["sentiment"] = output_list


# Split the remaining rows into those that got a label and those that did not.
# .copy() makes both parts independent frames, so that later column
# assignments do not act on a slice of df_remains (SettingWithCopyWarning).
mask_sentiment = df_remains["sentiment"] == "NA"
new_labeldf = df_remains[~mask_sentiment].copy()
label_remains = df_remains[mask_sentiment].copy()
# NOTE(review): the original had `label_remains.drop(columns=["sentiment"])`
# here with the result thrown away, i.e. a no-op. It has been removed rather
# than "fixed", because the code further down reads a "sentiment" column from
# a frame built from label_remains. label_remains therefore still carries the
# column, filled with "NA".

print("Are length of new_labeldf + label_remains equal to df_remains? :", (len(new_labeldf)+len(label_remains))==len(df_remains))

print("\n")
# Making sure every merged_text value is a string
all_strings = merged_df["merged_text"].apply(lambda x: isinstance(x, str)).all()
if all_strings:
    print("All rows in the column are strings.")
else:
    print("Not all rows in the column are strings.")

# Normalise the label spelling (spaces and hyphens become underscores) and drop
# rows without a label. assign() returns a new frame, so nothing is written
# into a frame that is itself a slice of another one (no SettingWithCopyWarning).
# regex=False: both patterns are plain characters.
merged_df = merged_df.assign(
    sentiment=merged_df["sentiment"]
    .str.replace(" ", "_", regex=False)
    .str.replace("-", "_", regex=False)
).dropna(subset=['sentiment'])

new_labeldf = new_labeldf.assign(
    sentiment=new_labeldf["sentiment"]
    .str.replace(" ", "_", regex=False)
    .str.replace("-", "_", regex=False)
).dropna(subset=['sentiment'])


# Self-annotated rows plus the rows labelled from the source categories
extra_label_df = pd.concat([merged_df, new_labeldf])

print("\n")
print("self annotated dataframe :", merged_df.shape)
print("\n")
print("remains from self annotated dataframe :", df_remains.shape)
print("\n")
print("self annotated dataframe + lables from selected project categories :", extra_label_df.shape)
print("\n")
print("remains from self annotated dataframe + lables from selected project categories :", label_remains.shape)
print("\n")
# Both sides must add up to the same number of rows.
print("Are the dataset lengths coherent? :", len(df_remains)+len(merged_df)==len(extra_label_df)+len(label_remains))
print("\n")

print("tf_idf values for self annotated df :")
tf_idf_fun(merged_df["merged_text"])
print("tf_idf values for self annotated df + added labels from categories:")
tf_idf_fun(extra_label_df["merged_text"])


# Short versions of the long label names; labels that are not listed here are
# left unchanged by replace().
abbreviations = {
    "Business__and_regional_development": "Bus._&_reg._dev.",
    "Daughter_Foundation": "Daughter_found.",
    "Education_and_public_information": "Edu_.&_public_info.",
    "International_Other": "Int._other",
    "International_Humanitarian": "Int._humanitarian",
    "Nature_Preservation_and_environment": "Nature_pres._&_env.",
    "Research_and_Science": "Research",
}

# Shorten the labels in both labelled frames.
for labelled_frame in (merged_df, extra_label_df):
    labelled_frame["sentiment"] = labelled_frame["sentiment"].replace(abbreviations)

# One distribution plot per labelled frame.
for labelled_frame, plot_title in (
    (merged_df, "Self-labeled data"),
    (extra_label_df, "Labels from source + Self-labeled data"),
):
    make_distribution(labelled_frame, plot_title)


# Class weights for both label sets.
class_w = ca_class_weights(merged_df["sentiment"])
class_w_extra = ca_class_weights(extra_label_df["sentiment"])

print("Class weights merged_df:", class_w)
print("\n")
print("Class weights extra labels:", class_w_extra)


# Part 3 ----- Prediction


# Shapes of the four frames that go into the tests. The last two (the extended
# label set and its remainder) are the ones used from test 2 onwards.
for caption, frame in (
    ("self annotated dataframe :", merged_df),
    ("remains from self annotated dataframe :", df_remains),
    ("self annotated dataframe + lables from selected project categories :", extra_label_df),
    ("remains from self annotated dataframe + lables from selected project categories :", label_remains),
):
    print(caption, frame.shape)


# Shortcut versions, part 1

# Test 1: self-annotated rows as labelled data, everything else as the remainder
test1_df_new_pred, test1_top = all_reports_cv(merged_df, df_remains, "self_annotated")
# Test 2: extended label set as labelled data, still-unlabelled rows as the remainder
test2_df_new_pred, test2_top = all_reports_cv(extra_label_df, label_remains, "TEST2_Extended_Label_data")


# Tests 3-5: the same as test 2, but with extra metadata columns appended to
# the text of both the labelled frame and the remaining frame.


def _append_text_columns(frame, extra_columns):
    """Return ``frame["merged_text"]`` with the given columns appended.

    Missing values are treated as empty strings and every appended column is
    preceded by a single space. ``frame`` itself is not modified.

    frame : DataFrame that has "merged_text" and all of ``extra_columns``
    extra_columns : column names to append, in this order
    return : Series with the extended text, indexed like ``frame``
    """
    text = frame["merged_text"].fillna('')
    for column in extra_columns:
        text = text + " " + frame[column].fillna('')
    return text


df_extra_label_df_v1 = extra_label_df.copy()
df_extra_label_df_v2 = extra_label_df.copy()
df_extra_label_df_v3 = extra_label_df.copy()
label_remains_v1 = label_remains.copy()
label_remains_v2 = label_remains.copy()
label_remains_v3 = label_remains.copy()

# The `abbreviations` mapping defined earlier in this file is reused here
# instead of being written out a second time. The labels were already
# abbreviated there, so this call changes nothing when the file is run top to
# bottom; it is kept for safety.
extra_label_df["sentiment"] = extra_label_df["sentiment"].replace(abbreviations)

# Extra columns per variant:
#   v1 (test 3): receiver, institution, profession and country
#   v2 (test 4): as v1, plus the receiver name
#   v3 (test 5): receiver and institution only
columns_v1 = ["Receiver", "Institution", "Receiver_Profession", "Country"]
columns_v2 = ["Receiver", "Institution", "Receiver_Name", "Receiver_Profession", "Country"]
columns_v3 = ["Receiver", "Institution"]

df_extra_label_df_v1["merged_text"] = _append_text_columns(df_extra_label_df_v1, columns_v1)
label_remains_v1["merged_text"] = _append_text_columns(label_remains_v1, columns_v1)

df_extra_label_df_v2["merged_text"] = _append_text_columns(df_extra_label_df_v2, columns_v2)
label_remains_v2["merged_text"] = _append_text_columns(label_remains_v2, columns_v2)

df_extra_label_df_v3["merged_text"] = _append_text_columns(df_extra_label_df_v3, columns_v3)
label_remains_v3["merged_text"] = _append_text_columns(label_remains_v3, columns_v3)

test3_df_new_pred, test3_top  = all_reports_cv(df_extra_label_df_v1, label_remains_v1, "TEST3_extended_version3_1")
test4_df_new_pred, test4_top  = all_reports_cv(df_extra_label_df_v2, label_remains_v2, "TEST4_extended_version4_2")
test5_df_new_pred, test5_top  = all_reports_cv(df_extra_label_df_v3, label_remains_v3, "TEST5_extended_version5_3")

# Build the combined frame for test 2

# NOTE(review): this stacks label_remains (the rows WITHOUT a label) on top of
# the test-2 output. The plot title below says "labeled + predicted", which
# would suggest extra_label_df as the first part - confirm which is intended.
df_test2_combined = pd.concat([label_remains, test2_df_new_pred])

# Final category per row: the prediction where there is one, otherwise the
# value already present in "sentiment".
df_test2_combined["pred_merge"] = df_test2_combined["prediction"].fillna(df_test2_combined["sentiment"])

# Work on a copy with numeric grant size and year; values that cannot be
# parsed become NaN (errors='coerce').
df_combined2 = df_test2_combined.copy()
for numeric_column in ('Grant_size_(DKK)', 'Year'):
    df_combined2[numeric_column] = pd.to_numeric(df_combined2[numeric_column], errors='coerce')

# Number of rows and summed grant size per final category.
rows_by_category = df_combined2.groupby('pred_merge')
grouped_category2 = rows_by_category.size()
grouped_category_grantsize2 = rows_by_category['Grant_size_(DKK)'].sum()

make_distribution2(grouped_category2, "D_total (labeled + predicted) Categories")
make_distribution2(grouped_category_grantsize2, "D_total - Grant Size in DKK")


def _show_foundation_report(heading, weighted_f1, report_text):
    """
    This function prints the report heading, the weighted F1 score and the report text

    input : heading string, F1 score, classification report text
    return : None
    """
    print(heading)
    print("Weighted F-1 score :", weighted_f1)
    print(report_text)


# One run of foundations_asso_print per foundation on the combined test-2 frame.
# Each call returns four values; the bare variable lines only have an effect
# when they are the last expression of a notebook cell.

(df_fullLUNDBECK2,
 class_report_dictLUNDBECK2,
 f1score_LUNDBECK2,
 class_report_LUNDBECK2) = foundations_asso_print(df_combined2, "LUNDBECK")
df_fullLUNDBECK2
_show_foundation_report("Classification Report : Lundbeck Foundation", f1score_LUNDBECK2, class_report_LUNDBECK2)


(df_fullveluxtest2,
 class_report_dictveluxtest2,
 f1score_veluxtest2,
 class_report_veluxtest2) = foundations_asso_print(df_combined2, "VELUX")
df_fullveluxtest2
_show_foundation_report("Classification Report : Velux Foundation", f1score_veluxtest2, class_report_veluxtest2)


(df_full_novo_test2,
 class_report_dictnovo_test2,
 f1score_novo_test2,
 class_report_novo_test2) = foundations_asso_print(df_combined2, "NOVO")
df_full_novo_test2

_show_foundation_report("Classification Report : Novo Nordisk Foundation", f1score_novo_test2, class_report_novo_test2)

(df_full_carls2,
 class_report_dictcarls2,
 f1score_carls2,
 class_report_carls2) = foundations_asso_print(df_combined2, "CARLSBERG")
_show_foundation_report("Classification Report : Carlsberg Foundation", f1score_carls2, class_report_carls2)
df_full_carls2