In [None]:
import pandas as pd
import numpy as np
import itertools
import warnings
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.metrics import silhouette_score, davies_bouldin_score
from sklearn.cluster import AgglomerativeClustering, KMeans, OPTICS

import pyPLNmodels
from pyPLNmodels.models import PlnPCAcollection, Pln, ZIPln

import predictive_plots,predictive_clustering,utils


In [ ]:
warnings.filterwarnings('ignore', category=FutureWarning)

# Load the questionnaire scores of both studies. Each frame is named after the
# file it is read from (the two names used to be swapped).
scales_RETOS = pd.read_excel("../Datasets/RETOS_scales.xlsx")
scales_BEBRASK = pd.read_excel("../Datasets/BEBRASK_scales.xlsx")

# RETOS rows first, then BEBRASK rows: this is the same row order the notebook
# always produced, so seeded steps further down keep seeing the same data.
scales = pd.concat([scales_RETOS, scales_BEBRASK])

# Remove one participant by its EPRIME code. The explicit copy makes `scales`
# an independent frame, so the column assignment below cannot trigger a
# SettingWithCopyWarning.
scales = scales.query("EPRIME_CODE != 'PREDWELL_RETOS-307-1'").copy()
cols = scales.columns

# Convert every column from the fourth one onwards to numeric; values that
# cannot be converted become NaN.
scales[cols[3:]] = scales[cols[3:]].apply(pd.to_numeric, errors='coerce')
scales.reset_index(drop=True, inplace=True)
nan_counts = scales.isna().sum()

# Keep only the columns with at most this many missing values.
MAX_NAN_PER_COLUMN = 50
columns_with_fewer_nans = nan_counts[nan_counts <= MAX_NAN_PER_COLUMN].index.tolist()

# Copy again so later cells can assign into `scales` without writing to a
# slice of the unfiltered frame.
scales = scales[columns_with_fewer_nans].copy()


In [ ]:
# Fill the missing values of these three columns with the column mean.
# The filled column is assigned back to the frame: calling
# `fillna(..., inplace=True)` on `scales[col]` is chained assignment, which
# leaves `scales` untouched once pandas Copy-on-Write is active.
for column in ["UPPSP_PMD", "ASI_P", "ASI_C"]:
    scales[column] = scales[column].fillna(scales[column].mean())

# Predictor matrix: everything except the identifiers, Age and the five
# columns dropped here (SPQ*, MSSB_*).
aux = scales.drop(["SUBJECT_CODE","EPRIME_CODE","Age","SPQ","SPQ_IR","MSSB_POS","MSSB_NEG","MSSB_DES"],axis=1)


In [ ]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.linear_model import LinearRegression, Lasso

# Predictor matrix: everything except the identifiers, Age and the target
# columns that are imputed below.
aux = scales.drop(["SUBJECT_CODE", "EPRIME_CODE", "Age", "SPQ", "SPQ_IR", "MSSB_POS", "MSSB_NEG", "MSSB_DES"], axis=1)

# Since there are plenty of missing values, we use a regression approach to
# fill them: one random forest per target variable, trained on the rows where
# that variable is observed.
variables = ['SPQ', 'SPQ_IR', 'MSSB_POS', 'MSSB_NEG', 'MSSB_DES']

# Number of observed rows held out to evaluate each model.
TEST_SIZE = 30
# Single seed for the split and the forest, so that the reported RMSE and the
# imputed values are reproducible from run to run.
RANDOM_STATE = 42

for var in variables:
    # Rows where the current variable is observed
    observed_mask = scales[var].notna()
    X = aux[observed_mask]
    y = scales.loc[X.index, var]

    # Hold out TEST_SIZE rows; the remaining rows form the training set
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
    )

    # Initialize and train the model
    model = RandomForestRegressor(n_estimators=1000, criterion="absolute_error", max_features="sqrt",
                                  min_samples_split=15, random_state=RANDOM_STATE)
    model.fit(X_train, y_train)

    # RMSE of the model on the held-out rows
    y_pred = model.predict(X_test)
    rmse = sqrt(mean_squared_error(y_test, y_pred))
    print(f"Test RMSE for {var}: {rmse}")

    # Baseline for comparison: always predict the training mean. The baseline
    # vector is sized from y_test instead of repeating the literal 30.
    mean_value = y_train.mean()
    repeated_means = np.full(len(y_test), mean_value)
    rmse = sqrt(mean_squared_error(y_test, repeated_means))
    print(f"Means RMSE for {var}: {rmse}")

    # Rows where the current variable is missing
    var_NA = aux[~observed_mask]
    if var_NA.empty:
        # Nothing to impute; predict() would raise on an empty input
        continue

    # Predict the missing values, rounded to whole numbers
    pred_var = np.round(model.predict(var_NA))

    # Convert predictions to a pandas Series with an index matching that of var_NA
    pred_var_series = pd.Series(pred_var, index=var_NA.index)

    # Assign the predicted values back into the original 'scales' DataFrame
    scales.loc[pred_var_series.index, var] = pred_var_series


In [ ]:
# Column names of `scales`, grouped by questionnaire, for the metrics that are
# evaluated on the BEBRASK and RETOS datasets. The remaining metrics were not
# used or contain too many NaNs.
PANAS = [
    "PA",
    "NA.",
]
ERQ = [
    "ERQ_CR",
    "ERQ_ES",
]
UPPSP = [
    "UPPSP_NU",
    "UPPSP_PU",
    "UPPSP_SS",
    "UPPSP_PMD",
    "UPPSP_PSV",
]
BIS_BAS = [
    "BIS",
    "BAS_D",
    "BAS_RR",
    "BAS_FS",
]
TEPS = [
    "TEPS_AF",
    "TEPS_CF",
]
SHS = ["SHS"]
FS = ["FS"]
LOTR = ["LOT_R"]
RRQ = [
    "RRQ_Rum",
    "RRQ_Ref",
]
ASI3 = [
    "ASI_P",
    "ASI_C",
    "ASI_S",
]
SPQ = [
    "SPQ",
    "SPQ_IR",
]
MSSB = [
    "MSSB_POS",
    "MSSB_NEG",
    "MSSB_DES",
]

# All groups, in a fixed order; each entry is the very list object defined above.
list_metrics = [
    PANAS,
    ERQ,
    UPPSP,
    BIS_BAS,
    TEPS,
    SHS,
    FS,
    LOTR,
    RRQ,
    ASI3,
    SPQ,
    MSSB,
]
