In [None]:
# Importing libraries
# NOTE: the original statement order is kept on purpose. `from AnalysisUtils import *`
# may re-export names, so reordering could change which definition wins.
import pandas as pd
from scipy.stats import zscore
from AnalysisUtils import *
from sklearn.preprocessing import StandardScaler
from scipy.stats import spearmanr
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import display
from sklearn.impute import KNNImputer
from sklearn.metrics import silhouette_score, davies_bouldin_score
from sklearn import cluster
from sklearn.cluster import KMeans, OPTICS
from scipy.cluster.hierarchy import linkage, fcluster
from itertools import product
from tqdm import tqdm
from sklearn.neighbors import NearestNeighbors
import pandas as pd
from random import sample
from numpy.random import uniform
from matplotlib.colors import Normalize
from matplotlib import colormaps
# Names used by the functions below that were previously only available if the
# star import happened to provide them.
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder

In [None]:
def encode_string_columns(df):
  """Return a copy of ``df`` extended with numeric ``dummy_*`` encodings.

  Every column of dtype ``object`` or ``category`` is encoded with the first
  rule that applies:

  * ``come_entre_refeicoes`` / ``consumo_alcool``: fixed ordinal scale
    (answers outside the scale become NaN);
  * columns whose values are all 'sim'/'nao' (case-insensitive): 1 / 0;
  * columns with exactly two distinct non-null values: ``LabelEncoder`` codes;
  * anything else: one-hot columns named ``dummy_<col>_<value>``.

  The original text columns are kept. The input frame is never modified:
  the previous implementation added some of the new columns to the caller's
  frame in place and others only to the returned frame.

  Args:
    df: input DataFrame.

  Returns:
    A new DataFrame containing the original columns plus the encodings.
  """
  # Hand-written ordinal scales for the two frequency-style answers.
  ordinal_scales = {
    'come_entre_refeicoes': {'nao': 0, 'as vezes': 1, 'frequentemente': 2, 'sempre': 3},
    'consumo_alcool': {'nao': 0, 'as vezes': 1, 'frequentemente': 2},
  }
  yes_no_scale = {'sim': 1, 'nao': 0}

  encoded = df.copy()
  text_columns = df.select_dtypes(include=['object', 'category']).columns

  for col in text_columns:
    lowered = df[col].str.lower()

    if col in ordinal_scales:
      encoded[f"dummy_{col}"] = lowered.map(ordinal_scales[col])
    elif lowered.isin(['sim', 'nao']).all():
      encoded[f"dummy_{col}"] = lowered.map(yes_no_scale)
    elif len(df[col].dropna().unique()) == 2:
      encoded[f"dummy_{col}"] = LabelEncoder().fit_transform(df[col])
    else:
      one_hot = pd.get_dummies(df[col], prefix=f"dummy_{col}")
      encoded = pd.concat([encoded, one_hot], axis=1)

  return encoded

def fill_missing_data(df, method='simple', n_neighbors=5):
  """Return a new DataFrame in which missing values have been imputed.

  Args:
    df: input DataFrame; it is left untouched.
    method: ``'simple'`` fills float64/int64 columns with the column mean and
      every other column with its most frequent value (columns that are
      entirely missing are left as they are). ``'knn'`` imputes float64/int64
      columns with ``KNNImputer(n_neighbors=n_neighbors)`` and the remaining
      columns through a one-hot encoding and a 1-nearest-neighbour imputer.
    n_neighbors: neighbours used for the numeric KNN imputation.

  Returns:
    The imputed DataFrame. With ``'knn'`` the numeric columns come first,
    followed by the non-numeric ones.

  Raises:
    ValueError: if ``method`` is neither ``'simple'`` nor ``'knn'``.
  """
  if method == 'simple':
    # Work on a copy so the caller still holds the raw data afterwards
    # (e.g. to compare it against the imputed frame).
    filled = df.copy()
    for col in filled.columns:
      column = filled[col]
      if column.dtype in ['float64', 'int64']:
        filled[col] = column.fillna(column.mean())
      elif column.notna().any():
        filled[col] = column.fillna(column.mode()[0])
    return filled

  if method == 'knn':
    df_numeric = df.select_dtypes(include=['float64', 'int64'])
    df_non_numeric = df.select_dtypes(exclude=['float64', 'int64'])
    imputed_parts = []

    if df_numeric.shape[1] > 0:
      imputed_parts.append(pd.DataFrame(
        KNNImputer(n_neighbors=n_neighbors).fit_transform(df_numeric),
        columns=df_numeric.columns,
        index=df_numeric.index
      ))

    if df_non_numeric.shape[1] > 0:
      # One-hot encode every non-numeric column as floats so NaN can be stored.
      dummies = pd.get_dummies(
        df_non_numeric,
        columns=list(df_non_numeric.columns),
        drop_first=False,
        dtype=float
      )

      # get_dummies turns a missing value into an all-zero row, which the
      # imputer would treat as observed data. Mark those cells as NaN so they
      # are really imputed, and remember which dummy belongs to which value.
      value_by_dummy = {}
      for col in df_non_numeric.columns:
        observed_values = df_non_numeric[col].dropna().unique()
        value_by_dummy[col] = {f"{col}_{val}": val for val in observed_values}
        dummies.loc[df_non_numeric[col].isna(), list(value_by_dummy[col])] = float('nan')

      imputed_dummies = pd.DataFrame(
        KNNImputer(n_neighbors=1).fit_transform(dummies),
        columns=dummies.columns,
        index=dummies.index
      )

      # Reverse the one-hot encoding: the strongest dummy of each group wins.
      # Mapping the dummy name back through the dictionary (instead of
      # splitting on '_') keeps values that themselves contain underscores.
      decoded = pd.DataFrame(index=dummies.index)
      for col, dummy_names in value_by_dummy.items():
        if not dummy_names:
          # Column without a single observed value: nothing to impute from.
          decoded[col] = df_non_numeric[col]
          continue
        strongest = imputed_dummies[list(dummy_names)].idxmax(axis=1)
        decoded[col] = strongest.map(dummy_names)
      imputed_parts.append(decoded)

    if not imputed_parts:
      return df.copy()

    # Combine numeric and non-numeric parts back into one DataFrame
    return pd.concat(imputed_parts, axis=1)

  raise ValueError("Invalid method. Choose from 'simple' or 'knn'.")

def detect_outliers(df):
  """Count values of the numeric columns whose absolute z-score exceeds 3.

  Missing values are ignored when computing each column's mean and standard
  deviation (``nan_policy='omit'``). With the default policy a single NaN
  turns every z-score of that column into NaN, so the column would silently
  report zero outliers.

  Args:
    df: DataFrame; only its numeric columns are inspected.

  Returns:
    The result of ``.sum()`` on the boolean outlier mask. Its exact type
    follows whatever ``scipy.stats.zscore`` returns for a DataFrame input.
  """
  numeric = df.select_dtypes(include=[np.number])
  z_scores = np.abs(zscore(numeric, nan_policy='omit'))
  outliers = z_scores > 3  # NaN compares as False, so missing cells never count
  return outliers.sum()

def analyze_numerical_column(row):
  """Run the three numeric diagnostics on ``row``, in a fixed order.

  Calls ``print_simple_metrics``, ``boxplot_with_quartiles`` and
  ``create_filtered_histograms``. These helpers are not defined in this
  cell (presumably they come from ``AnalysisUtils`` - confirm).
  """
  histogram_options = dict(log=False, filters=None, color='blue', bins=100)

  print_simple_metrics(row)
  boxplot_with_quartiles(row, yscale='linear')
  create_filtered_histograms(row, **histogram_options)

def calculate_group_metric(df, category_columns, numerical_columns, metric='median'):
  """Summarise the numeric columns for every value of every category column.

  Args:
    df: source DataFrame.
    category_columns: columns whose distinct values define the groups.
    numerical_columns: columns to aggregate inside each group.
    metric: ``'median'`` uses the median; any other value uses the mean.

  Returns:
    DataFrame with one row per (category column, value) pair. It holds the
    aggregated numeric columns plus a ``category`` column formatted as
    ``"<column>: <value>"``.
  """
  use_median = metric == 'median'
  summaries = []

  for column in category_columns:
    for level in df[column].unique():
      members = df.loc[df[column] == level, numerical_columns]
      summary = members.median() if use_median else members.mean()
      summary['category'] = f"{column}: {level}"
      summaries.append(summary)

  return pd.DataFrame(summaries)

def plot_sorted_group_metric(result_df, numerical_columns, metric='median', df=None):
  """Draw, per numeric column, one boxplot per group ordered by the group metric.

  Args:
    result_df: frame with a ``category`` column formatted as
      ``"<column>: <value>"`` and one aggregated column per numeric column
      (the format produced by ``calculate_group_metric``).
    numerical_columns: numeric columns to plot, one figure each.
    metric: name of the aggregation, used only in axis label and title.
    df: raw data the groups are taken from. When omitted, the notebook-global
      ``df`` is used, which is what the previous implementation always did
      implicitly.

  Raises:
    ValueError: if ``df`` is not given and no global ``df`` exists.
  """
  if df is None:
    # Backward compatibility with callers that relied on the global frame.
    df = globals().get('df')
    if df is None:
      raise ValueError("plot_sorted_group_metric needs the raw data: pass df=...")

  for col in numerical_columns:
    # Sort the groups by their aggregated value for the current column
    sorted_df = result_df.set_index('category').sort_values(by=col).reset_index()

    plt.figure(figsize=(12, 8))

    sorted_categories = sorted_df['category'].values
    boxplot_data = []

    for category in sorted_categories:
      # Split on the first ':' only, so values containing ':' stay intact.
      group_column, _, group_value = category.partition(":")
      group_value = group_value.strip()
      # The label is text, so compare against the text form of the column;
      # otherwise numeric or boolean categories would never match.
      in_group = df[group_column].astype(str) == group_value
      boxplot_data.append(df.loc[in_group, col].values)

    # Colour each box according to its group's aggregated value
    group_metrics = sorted_df[col].values
    norm = Normalize(vmin=np.min(group_metrics), vmax=np.max(group_metrics))
    colors_list = colormaps['viridis'](norm(group_metrics)).tolist()

    sns.boxplot(
      data=boxplot_data,
      order=range(len(sorted_categories)),
      palette=colors_list
    )

    plt.xticks(range(len(sorted_categories)), sorted_categories, rotation=45, ha='right')
    plt.xlabel('Category')
    plt.ylabel(f'{metric} of {col}')
    plt.title(f'Sorted {metric}s of {col} by Category')
    plt.tight_layout()

    plt.show()

def plot_all_category_columns(df, category_columns, color='skyblue', top=10):
  """Plot a grid of bar charts with the value counts of each category column.

  The grid has three charts per row. Each bar is annotated with its count and
  its share of all non-null values of the column; only the ``top`` most
  frequent values are drawn.

  Args:
    df: source DataFrame.
    category_columns: columns to plot; an empty list draws nothing.
    color: bar colour.
    top: maximum number of values shown per column.
  """
  num_columns = len(category_columns)
  if num_columns == 0:
    # Nothing to draw; plt.subplots(0, 3) would raise.
    return

  rows = -(-num_columns // 3)  # ceiling division

  fig, axes = plt.subplots(rows, 3, figsize=(15, 5 * rows))
  axes = axes.flatten()

  for ax, column in zip(axes, category_columns):
    # Count the values; the percentage is taken over all values, before
    # truncating to the top N
    result_df = df[column].value_counts().reset_index()
    result_df.columns = [column, 'Count']
    result_df['Percentage'] = (result_df['Count'] / result_df['Count'].sum()) * 100
    result_df = result_df.head(top)

    result_df.plot(kind='bar', x=column, y='Count', color=color, ax=ax, legend=False)

    ax.set_title(f"Value Counts of {column}", fontsize=12, pad=20)
    ax.set_xlabel('Values', fontsize=10)
    ax.set_ylabel('Count', fontsize=10)
    ax.tick_params(axis='x', rotation=45)

    for i, (count, pct) in enumerate(zip(result_df['Count'], result_df['Percentage'])):
      ax.text(i, count + 0.5, f'{count} / {pct:.1f}%', ha='center', va='bottom')

  # Remove the unused axes of the last row
  for spare_ax in axes[num_columns:]:
    fig.delaxes(spare_ax)

  plt.subplots_adjust(hspace=0.5, wspace=0.3)
  plt.tight_layout()
  plt.show()

def scale_data(df, columns):
  """Return a copy of ``df`` in which ``columns`` were passed through StandardScaler.

  Args:
    df: source DataFrame (left unchanged).
    columns: columns to scale; all other columns are copied as they are.
  """
  standardized = StandardScaler().fit_transform(df[columns])
  result = df.copy()
  result[columns] = standardized
  return result

def scale_and_prepare_data(df, categorical_columns, numerical_columns):
  """Build a fully scaled feature matrix from numeric and categorical columns.

  The categorical columns are one-hot encoded (first level dropped) and both
  the numeric columns and the dummies go through ``StandardScaler``.

  The input frame is not modified. The previous implementation wrote the
  scaled numbers back into the caller's ``df``, destroying the raw values.

  Args:
    df: source DataFrame.
    categorical_columns: columns to one-hot encode.
    numerical_columns: columns to scale directly.

  Returns:
    New DataFrame with the scaled numeric columns followed by the scaled
    dummy columns, sharing the index of ``df``.
  """
  # One-hot encode categorical columns
  df_dummies = pd.get_dummies(df[categorical_columns], drop_first=True)

  numeric_part = df[numerical_columns]
  numeric_scaled = pd.DataFrame(
    StandardScaler().fit_transform(numeric_part),
    columns=numeric_part.columns,
    index=df.index
  )

  # fit_transform returns a bare array, so the column names are re-attached.
  dummies_scaled = pd.DataFrame(
    StandardScaler().fit_transform(df_dummies),
    columns=df_dummies.columns,
    index=df.index
  )

  return pd.concat([numeric_scaled, dummies_scaled], axis=1)

def add_health_columns(df):
    """Add the body-mass index and its class to ``df`` (in place) and return it.

    ``IMC`` is ``peso / altura ** 2``. ``Class_IMC`` uses contiguous ranges so
    that every value belongs to exactly one class:

    * below 18.5      -> 'Baixo'
    * 18.5 up to 25   -> 'Normal'
    * 25 up to 30     -> 'Sobrepeso'
    * 30 and above    -> 'Obesidade'

    The previous thresholds (<= 24.9, <= 29.9) pushed values such as 24.95 or
    29.95 into the next class, and a missing IMC fell through every
    comparison and was labelled 'Obesidade'. A missing IMC now stays missing.

    Args:
        df: DataFrame with ``peso`` and ``altura`` columns.

    Returns:
        The same DataFrame object with ``IMC`` and ``Class_IMC`` added.
    """
    # Compute the BMI
    df['IMC'] = df['peso'] / (df['altura'] ** 2)

    # Classify the BMI
    def classify_imc(imc):
        if pd.isna(imc):
            return float('nan')
        if imc < 18.5:
            return 'Baixo'
        if imc < 25:
            return 'Normal'
        if imc < 30:
            return 'Sobrepeso'
        return 'Obesidade'

    df['Class_IMC'] = df['IMC'].apply(classify_imc)

    return df

def plot_and_return_correlation(df, category_columns_dummies, numeric_columns, method='spearman', plot=True):
  """Compute the correlation matrix and a ranked list of variable pairs.

  Args:
    df: source DataFrame.
    category_columns_dummies: encoded categorical columns to include.
    numeric_columns: numeric columns to include (listed first in the matrix).
    method: correlation method forwarded to ``DataFrame.corr``.
    plot: when true, show the matrix as an annotated heatmap.

  Returns:
    Tuple ``(pairs, correlation_matrix)``. ``pairs`` has the columns
    ``Variable 1``, ``Variable 2`` and ``Correlation``, holds each unordered
    pair once and is ordered by decreasing absolute correlation.
  """
  selected_columns = numeric_columns + category_columns_dummies
  correlation_matrix = df[selected_columns].corr(method=method)

  # Plot the heatmap
  if plot:
    plt.figure(figsize=(8, 6))
    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
    plt.title(f'Matriz de Correlação ({method.capitalize()})')
    plt.show()

  # Long format, strongest relationships first
  ranked = correlation_matrix.unstack().sort_values(key=lambda values: values.abs(), ascending=False)

  # Drop the diagonal (a variable against itself)
  first_names = ranked.index.get_level_values(0)
  second_names = ranked.index.get_level_values(1)
  off_diagonal = ranked[first_names != second_names].reset_index()

  # [A, B] and [B, A] share the same sorted key; keep the first occurrence
  pair_keys = pd.Series(
    [tuple(sorted(pair)) for pair in zip(off_diagonal['level_0'], off_diagonal['level_1'])],
    index=off_diagonal.index
  )
  unique_pairs = off_diagonal.loc[~pair_keys.duplicated()].copy()

  # Rename columns for clarity
  unique_pairs.columns = ['Variable 1', 'Variable 2', 'Correlation']

  return unique_pairs, correlation_matrix

def show_changed_missing_values(df1, df2):
  """Show, cell by cell, where two frames with the same rows disagree.

  ``df1`` is first re-ordered to the column order of ``df2``. Every cell
  that differs and is non-null in both frames is rendered as
  ``"<df1 value> | <df2 value>"``; all other cells of the result are None.
  Only rows containing at least one differing cell are kept.

  Args:
    df1: first DataFrame; must contain every column of ``df2``.
    df2: second DataFrame, labelled identically to ``df1``.

  Returns:
    DataFrame of the formatted differences, indexed by the original row
    labels.
  """
  df1_ordered = df1[df2.columns]
  changed = df1_ordered != df2
  changed_rows_df1 = df1_ordered[changed].dropna(how='all')
  changed_rows_df2 = df2[changed].dropna(how='all')

  def describe_pairs(left, right):
    # Keep the row labels of the inputs: a Series with a fresh 0..n-1 index
    # can be re-aligned by pandas and end up on the wrong rows.
    return pd.Series(
      [f"{v1} | {v2}" if pd.notna(v1) and pd.notna(v2) else None for v1, v2 in zip(left, right)],
      index=left.index,
      dtype=object
    )

  return changed_rows_df1.combine(changed_rows_df2, describe_pairs)

def pca_analysis(df, columns, plot=True):
  """Fit a full PCA on ``columns`` and report scores, loadings and variance.

  Args:
    df: source DataFrame.
    columns: feature columns fed to the PCA.
    plot: when true, scatter the first two components (this needs at least
      two components).

  Returns:
    Tuple ``(pca_df, sorted_loadings, pca_contributions)``:
    ``pca_df`` is a copy of the features plus one ``pca<i>`` score column per
    component; ``sorted_loadings`` maps ``'PCA<i>'`` to its five largest
    absolute loadings by feature; ``pca_contributions`` holds the explained
    variance ratio of the first five components, rounded to 4 decimals.
  """
  pca_df = df[columns].copy()
  feature_names = pca_df.columns  # captured before the score columns are added

  pca = PCA()
  scores = pca.fit_transform(pca_df)
  for position, component_scores in enumerate(scores.T, start=1):
    pca_df[f'pca{position}'] = component_scores

  component_names = [f'PCA{i + 1}' for i in range(pca.n_components_)]
  loadings = pd.DataFrame(pca.components_, columns=feature_names, index=component_names)

  pca_contributions = pca.explained_variance_ratio_[:5].round(4)

  if plot:
    plt.figure(figsize=(8, 6))
    plt.scatter(pca_df['pca1'], pca_df['pca2'], s=50)
    plt.title("PCA Clustering")
    plt.xlabel("PCA Component 1")
    plt.ylabel("PCA Component 2")
    plt.show()

  # For every component, keep the five features with the largest |loading|
  sorted_loadings = {}
  for row_number, component_name in enumerate(component_names):
    strongest = loadings.iloc[row_number].abs().sort_values(ascending=False).head(5)
    sorted_loadings[component_name] = {feature: strongest[feature] for feature in strongest.index}

  return pca_df, sorted_loadings, pca_contributions

def pca_explained_variance_plot(df, columns):
  """Plot cumulative explained variance and per-feature PCA contributions.

  Args:
    df: source DataFrame.
    columns: feature columns fed to the PCA.

  Returns:
    Tuple ``(cumulative_explained_variance, feature_contributions)``. The
    second element is, per original feature, the sum of its absolute
    loadings over all components, sorted in decreasing order.
  """
  features = df[columns].copy()
  pca = PCA()
  pca.fit_transform(features)

  variance_ratio = pca.explained_variance_ratio_
  cumulative_explained_variance = variance_ratio.cumsum()
  component_numbers = range(1, len(cumulative_explained_variance) + 1)

  # Figure 1: cumulative explained variance against the number of components
  plt.figure(figsize=(8, 6))
  plt.plot(component_numbers, cumulative_explained_variance, marker='o', linestyle='-')
  plt.title("Cumulative Explained Variance vs. Number of Principal Components")
  plt.xlabel("Number of Principal Components")
  plt.ylabel("Cumulative Explained Variance (%)")
  plt.xticks(range(1, len(variance_ratio) + 1))
  plt.yticks([tick / 10 for tick in range(0, 11)])
  plt.grid(True)
  plt.show()

  # Loadings: one row per component, one column per original feature
  component_names = [f'PCA{i + 1}' for i in range(pca.n_components_)]
  loadings = pd.DataFrame(pca.components_, columns=features.columns, index=component_names)

  # Total absolute loading of each feature across every component
  feature_contributions = loadings.abs().sum(axis=0).sort_values(ascending=False)

  # Figure 2: bar chart of those totals
  plt.figure(figsize=(12, 8))
  feature_contributions.plot(kind='bar', stacked=True, colormap='viridis', figsize=(12, 8))
  plt.title("Feature Contribution to PCA Components", fontsize=16, fontweight='bold')
  plt.xlabel("Original Features", fontsize=12)
  plt.ylabel("Total Contribution to PCA Components", fontsize=12)
  plt.xticks(rotation=45, ha='right', fontsize=10)
  plt.grid(axis='y', linestyle='--', alpha=0.7)
  plt.tight_layout()
  plt.show()

  return cumulative_explained_variance, feature_contributions

def kmeans_elbow(df, max_clusters):
  """Plot the KMeans inertia for k = 1..max_clusters (elbow method).

  Args:
    df: feature matrix to cluster.
    max_clusters: largest number of clusters to try.
  """
  k_values = range(1, max_clusters + 1)

  # One fit per k with a fixed seed; keep only the inertia of each model
  inertias = [KMeans(n_clusters=k, random_state=0).fit(df).inertia_ for k in k_values]

  # Draw the elbow curve
  plt.figure(figsize=(8, 5))
  plt.plot(k_values, inertias, 'o-', color='blue')
  plt.xlabel('Número de clusters (k)')
  plt.ylabel('Inércia')
  plt.title('Método do Cotovelo')
  plt.xticks(k_values)
  plt.grid(True)
  plt.show()

def silhouette_score_clusters_plot(df, max_clusters):
  """Plot the mean silhouette score of KMeans for k = 2..max_clusters.

  For every k, KMeans is fitted five times without a fixed seed and the
  silhouette scores are averaged, so results can vary between calls.

  Args:
    df: feature matrix to cluster.
    max_clusters: largest number of clusters to try.

  Returns:
    Tuple ``(best_k, best_average_score)``; ``(0, -1)`` when no k was tried.
  """
  k_values = range(2, max_clusters + 1)
  silhouette_scores = []
  best_clusters_amount, max_silhouette_score = 0, -1

  for k in k_values:
    # Five unseeded runs for this k
    run_scores = []
    for _ in range(5):
      labels = KMeans(n_clusters=k, random_state=None).fit_predict(df)
      run_scores.append(silhouette_score(df, labels))

    avg_score = sum(run_scores) / 5
    silhouette_scores.append(avg_score)

    # Strict comparison: on ties the smaller k is kept
    if avg_score > max_silhouette_score:
      best_clusters_amount, max_silhouette_score = k, avg_score

  # Plot the silhouette curve
  plt.figure(figsize=(8, 5))
  plt.plot(k_values, silhouette_scores, 'o-', color='green')
  plt.xticks(k_values)
  plt.xlabel('Número de clusters (k)')
  plt.ylabel('Coeficiente de Silhueta')
  plt.title('Coeficiente de Silhueta para diferentes k')
  plt.grid(True)
  plt.show()

  return best_clusters_amount, max_silhouette_score

def davies_bouldin_index(df, max_clusters):
  """Plot the Davies-Bouldin index of KMeans for k = 2..max_clusters.

  Args:
    df: feature matrix to cluster.
    max_clusters: largest number of clusters to try.

  Returns:
    Tuple ``(best_k, lowest_index)``; ``(0, 99999999)`` when no k was tried.
  """
  k_values = range(2, max_clusters + 1)
  db_scores = []
  best_clusters_amount, min_davies_bouldin_index = 0, 99999999

  for k in k_values:
    labels = KMeans(n_clusters=k, random_state=0).fit_predict(df)
    score = davies_bouldin_score(df, labels)
    db_scores.append(score)

    # Lower is better; strict comparison keeps the smaller k on ties
    if score < min_davies_bouldin_index:
      best_clusters_amount, min_davies_bouldin_index = k, score

  # Plot the Davies-Bouldin curve
  plt.figure(figsize=(8, 5))
  plt.plot(k_values, db_scores, 'o-', color='purple')
  plt.xlabel('Número de clusters (k)')
  plt.ylabel('Davies-Bouldin Index')
  plt.title('Davies-Bouldin Index para diferentes k')
  plt.grid(True)
  plt.show()

  return best_clusters_amount, min_davies_bouldin_index

def kmeans_clusters_amount_analysis(df, max_clusters):
  """Run the KMeans diagnostics: elbow plot, silhouette search, final clustering plot.

  Args:
    df: feature matrix to cluster.
    max_clusters: largest number of clusters to try.
  """
  kmeans_elbow(df, max_clusters)
  print("depois do elbow")

  silhouette_result = silhouette_score_clusters_plot(df, max_clusters)
  best_clusters_amount_silhouette, max_silhouette_score = silhouette_result

  # Cluster once more with the best k and show it
  kmeans_plot(df, best_clusters_amount_silhouette)

  print(f'Melhor quantidade de clusters: {best_clusters_amount_silhouette} | Silhueta: {max_silhouette_score}')

def plot_clusters_pca(df, labels, cluster_centers=None):
  """Scatter the clusters on the first two principal components of ``df``.

  Args:
    df: feature matrix that was clustered.
    labels: cluster label per row; ``-1`` is drawn in black as 'Noise'.
    cluster_centers: optional centroids in the original feature space; they
      are projected with the same PCA and drawn as black crosses.
  """
  pca = PCA(n_components=2)
  projected = pca.fit_transform(df)

  plt.figure(figsize=(8, 6))

  for label in np.unique(labels):
    is_noise = label == -1
    members = projected[labels == label]
    plt.scatter(
      members[:, 0],
      members[:, 1],
      label='Noise' if is_noise else f'Cluster {label}',
      c='black' if is_noise else None
    )

  if cluster_centers is not None:
    projected_centers = pca.transform(cluster_centers)
    plt.scatter(projected_centers[:, 0], projected_centers[:, 1], s=200, c='black', marker='x', label='Centroids')

  plt.title("Clusters visualizados com PCA")
  plt.xlabel("PCA Componente 1")
  plt.ylabel("PCA Componente 2")
  plt.legend()
  plt.show()

def kmeans_plot(df, n_clusters, plot=True, random_state=None):
  """Cluster ``df`` with KMeans and optionally show the result on a PCA plot.

  Args:
    df: feature matrix to cluster.
    n_clusters: number of clusters.
    plot: when true, draw clusters and centroids via ``plot_clusters_pca``.
    random_state: seed forwarded to KMeans.

  Returns:
    The cluster label of every row.
  """
  model = cluster.KMeans(n_clusters=n_clusters, random_state=random_state)
  assignments = model.fit_predict(df)

  if not plot:
    return assignments

  plot_clusters_pca(df, model.labels_, model.cluster_centers_)
  return assignments

def dbscan_analysis(df, eps, min_samples, plot=True):
  """Cluster ``df`` with DBSCAN and optionally show the result on a PCA plot.

  The labels are returned whether or not the plot is drawn. Previously the
  function returned ``None`` whenever ``plot`` was true, unlike
  ``kmeans_plot``.

  Args:
    df: feature matrix to cluster.
    eps: DBSCAN neighbourhood radius.
    min_samples: DBSCAN minimum number of samples for a core point.
    plot: when true, draw the clusters via ``plot_clusters_pca``.

  Returns:
    The cluster label of every row (``-1`` marks noise).
  """
  dbscan = cluster.DBSCAN(eps=eps, min_samples=min_samples)
  y_dbscan = dbscan.fit_predict(df)

  if plot:
    plot_clusters_pca(df, y_dbscan)

  return y_dbscan

def clustering_analysis(df, method, columns, max_clusters=50):
  """Scale ``columns``, run the parameter search for ``method`` and the PCA reports.

  Args:
    df: source DataFrame.
    method: ``'kmeans'``, ``'optics'`` or ``'hierarchy'``; any other value
      skips the search and only produces the PCA reports.
    columns: feature columns to use.
    max_clusters: upper bound on k, used only by the KMeans search.
  """
  scaled_df = scale_data(df, list(set(columns)))[columns]

  # Lambdas defer the lookup of each search function until it is selected
  searches = {
    'kmeans': lambda data: kmeans_clusters_amount_analysis(data, max_clusters=max_clusters),
    'optics': lambda data: optics_hyperparam_search(data),
    'hierarchy': lambda data: hierarchical_hyperparam_search(data),
  }
  run_search = searches.get(method)
  if run_search is not None:
    run_search(scaled_df)

  pca_analysis(scaled_df, columns=columns, plot=False)
  pca_explained_variance_plot(scaled_df, columns=columns)

def clustering_method(df, method, columns, **kwargs):
  """Scale ``columns`` and fit one clustering model on them.

  Args:
    df: source DataFrame.
    method: ``'kmeans'``, ``'hierarchy'`` or ``'optics'``.
    columns: feature columns to use.
    **kwargs: model parameters, all required for the chosen method:
      ``n_clusters`` (kmeans); ``linkage_method`` (hierarchy);
      ``min_samples``, ``xi`` and ``min_cluster_size`` (optics).

  Returns:
    Dict with the model outputs: ``labels`` and ``inertia`` (kmeans);
    ``linkage_matrix`` (hierarchy); ``labels``, ``reachability`` and
    ``core_distances`` (optics).

  Raises:
    ValueError: if ``method`` is not supported. This is checked before any
      data is touched.
    KeyError: if a required keyword argument is missing.
  """
  supported_methods = ('kmeans', 'hierarchy', 'optics')
  if method not in supported_methods:
    # The message lists the methods that actually exist here.
    raise ValueError(f"Method '{method}' is not supported. Choose from 'kmeans', 'hierarchy', or 'optics'.")

  results = {}
  scaled_df = scale_data(df, list(set(columns)))
  scaled_df = scaled_df[columns]

  if method == 'kmeans':
    kmeans = KMeans(n_clusters=kwargs['n_clusters'], random_state=0)
    kmeans.fit(scaled_df)
    results['labels'] = kmeans.labels_
    results['inertia'] = kmeans.inertia_

  elif method == 'hierarchy':
    results['linkage_matrix'] = linkage(scaled_df, method=kwargs['linkage_method'])

  else:  # 'optics'
    optics = OPTICS(
      min_samples=kwargs['min_samples'],
      xi=kwargs['xi'],
      min_cluster_size=kwargs['min_cluster_size']
    )
    optics.fit(scaled_df)
    results['labels'] = optics.labels_
    results['reachability'] = optics.reachability_
    results['core_distances'] = optics.core_distances_

  return results

def plot_best_silhouette(results_df):
  """Plot the best silhouette score reached for each number of clusters.

  Adds an ``n_clusters`` column to ``results_df`` in place: the number of
  distinct labels of each run, not counting the noise label ``-1``.

  Args:
    results_df: one row per run, with ``labels`` and ``silhouette_score``.
  """
  def count_clusters(labels):
    distinct = set(labels)
    return len(distinct) - (1 if -1 in distinct else 0)

  results_df['n_clusters'] = results_df['labels'].apply(count_clusters)

  # Row holding the highest score within each cluster count
  winning_rows = results_df.groupby('n_clusters')['silhouette_score'].idxmax()
  best_results = results_df.loc[winning_rows]

  plt.figure(figsize=(8, 6))
  plt.plot(best_results['n_clusters'], best_results['silhouette_score'], marker='o', linestyle='-', color='b')
  plt.title("Best Silhouette Score by Number of Clusters (OPTICS)")
  plt.xlabel("Number of Clusters")
  plt.ylabel("Silhouette Score")
  plt.grid(True)
  plt.show()

def optics_hyperparam_search(df):
  """Grid-search OPTICS parameters by silhouette score and plot the winner.

  Every combination of ``min_samples``, ``xi`` and ``min_cluster_size`` is
  fitted. Runs that yield fewer than two real clusters are skipped and runs
  that raise are reported and skipped. The best run per number of clusters
  is plotted, followed by a PCA view of the overall best clustering.

  Args:
    df: (scaled) feature matrix to cluster.

  Returns:
    Dict with the best ``min_samples``, ``xi``, ``min_cluster_size`` and
    ``silhouette_score``; an empty dict when no combination produced a
    usable clustering.
  """
  # 4 was missing from this grid and 5 was listed twice, so a fifth of the
  # search repeated work already done.
  min_samples_range = [2, 3, 4, 5, 6, 7, 8, 9, 10]
  xi_range = [0.025, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.4]
  min_cluster_size_range = [0.001, 0.025, 0.05, 0.1, 0.15]

  best_score = -2
  best_params = {}
  best_labels = None
  results = []

  param_grid = list(product(min_samples_range, xi_range, min_cluster_size_range))

  for min_samples, xi, min_cluster_size in tqdm(param_grid, desc="Hyperparameter search", total=len(param_grid)):
    try:
      optics = OPTICS(min_samples=min_samples, xi=xi, min_cluster_size=min_cluster_size)
      optics.fit(df)
      labels = optics.labels_

      # Need at least two clusters besides the noise label (-1)
      distinct_labels = set(labels)
      if len(distinct_labels) <= 1 or (-1 in distinct_labels and len(distinct_labels) == 2):
        continue

      # NOTE(review): noise points (-1) are scored as if they formed a
      # cluster of their own - confirm this is intended.
      score = silhouette_score(df, labels)

      results.append({
        'min_samples': min_samples,
        'xi': xi,
        'min_cluster_size': min_cluster_size,
        'silhouette_score': score,
        'labels': labels
      })

      if score > best_score:
        best_score = score
        best_labels = labels
        best_params = {
          'min_samples': min_samples,
          'xi': xi,
          'min_cluster_size': min_cluster_size,
          'silhouette_score': score
        }

    except Exception as e:
      print(f"Skipping parameters (min_samples={min_samples}, xi={xi}, min_cluster_size={min_cluster_size}): {e}")

  if not results:
    # Without this guard the result frame has no columns and the code below
    # fails with a KeyError.
    print("No parameter combination produced at least two clusters.")
    return best_params

  # Best silhouette score for each number of clusters
  plot_best_silhouette(pd.DataFrame(results))

  # The labels of the best run were kept, so no refit is needed
  plot_clusters_pca(df, best_labels)

  print(best_params)

  return best_params

def hierarchical_hyperparam_search(df):
  """Grid-search linkage method and distance threshold by silhouette score.

  For every linkage method the hierarchy is built once and then cut at each
  threshold. Cuts that yield a single cluster are skipped and cuts that
  raise are reported and skipped. The best run per number of clusters is
  plotted, followed by a PCA view of the overall best clustering.

  Args:
    df: (scaled) feature matrix to cluster.

  Returns:
    Dict with the best ``linkage_method``, ``threshold`` and
    ``silhouette_score``; an empty dict when no combination produced a
    usable clustering.
  """
  linkage_methods = ['single', 'complete', 'average', 'ward']
  thresholds = [0.5, 1, 1.5, 2, 2.5]  # distance cut-offs to search over

  best_score = -2
  best_params = {}
  best_labels = None
  results = []

  # The linkage matrix depends only on the method, so it is built once per
  # method and reused for every threshold.
  linkage_by_method = {}

  param_grid = list(product(linkage_methods, thresholds))
  for linkage_method, threshold in tqdm(param_grid, desc="Hyperparameter search", total=len(param_grid)):
    try:
      if linkage_method not in linkage_by_method:
        linkage_by_method[linkage_method] = linkage(df, method=linkage_method)
      linkage_matrix = linkage_by_method[linkage_method]

      # Generate flat clusters based on the threshold
      labels = fcluster(linkage_matrix, t=threshold, criterion='distance')

      # Flat cluster ids start at 1, so there is no noise label to discount
      if len(set(labels)) <= 1:
        continue

      score = silhouette_score(df, labels)

      results.append({
        'linkage_method': linkage_method,
        'threshold': threshold,
        'silhouette_score': score,
        'labels': labels
      })

      if score > best_score:
        best_score = score
        best_labels = labels
        best_params = {
          'linkage_method': linkage_method,
          'threshold': threshold,
          'silhouette_score': score
        }

    except Exception as e:
      print(f"Skipping parameters (linkage_method={linkage_method}, threshold={threshold}): {e}")

  if not results:
    # Without this guard the result frame has no columns and the code below
    # fails with a KeyError.
    print("No parameter combination produced at least two clusters.")
    return best_params

  # Best silhouette score for each number of clusters
  plot_best_silhouette(pd.DataFrame(results))

  # The labels of the best run were kept, so no refit is needed
  plot_clusters_pca(df, best_labels)

  return best_params

def load_and_preprocess_data(inputting_method, path='data/trabalho2_dados_4.csv'):
  """Load the survey CSV and run the full preprocessing pipeline.

  Steps, in order: impute missing values, derive integer versions of the
  numeric answers, group the transport types, add the BMI columns, encode
  the text columns and drop fully duplicated rows.

  Args:
    inputting_method: imputation method forwarded to ``fill_missing_data``
      (``'simple'`` or ``'knn'``).
    path: CSV file to read; defaults to the project data set.

  Returns:
    The preprocessed DataFrame.
  """
  df = pd.read_csv(path)
  df = fill_missing_data(df, method=inputting_method)

  # Age is truncated; the other numeric answers are rounded to the nearest integer
  df['idade_int'] = np.floor(df['idade'])
  rounded_columns = [
    'n_refeicoes',
    'consome_vegetais',
    'consumo_diario_agua',
    'frequencia_atividade_fisica',
    'tempo_usando_eletronicos',
  ]
  for column in rounded_columns:
    df[f'{column}_int'] = df[column].round()

  # Collapse transport into active / motorised; unknown values pass through
  def adapt_transport(transport):
    if transport in ['bicicleta', 'andando']:
      return 'ativo'
    if transport in ['carro', 'moto', 'transporte publico']:
      return 'automovel'
    return transport

  df['tipo_transporte_adaptado'] = df['tipo_transporte'].apply(adapt_transport)

  df = add_health_columns(df)
  df = encode_string_columns(df)  # encode the text columns

  # Drop rows duplicated across all columns. The frame itself used to be
  # passed as `subset`, which only worked because iterating a DataFrame
  # yields its column names.
  df = df.drop_duplicates(keep='first')
  return df

def get_generic_df(df, separate_cols=False):
  """Select the rows matching the 'generic' respondent profile.

  A row is kept when the person does not walk, cycle or ride a motorbike,
  does not smoke, has ``dummy_consumo_alcool`` below 2, has
  ``dummy_come_entre_refeicoes`` equal to 1 or 2, has
  ``consome_vegetais_int`` above 1, has exactly 3 meals and has
  ``frequencia_atividade_fisica_int`` below 3.

  The result is an independent copy with an extra ``dummy_tipo_transporte``
  column mirroring ``dummy_tipo_transporte_transporte publico``. The copy is
  made before the column is added, which avoids the
  ``SettingWithCopyWarning`` the previous version triggered.

  Args:
    df: preprocessed DataFrame (left unchanged).
    separate_cols: when true, return numeric and categorical column lists
      separately instead of concatenated.

  Returns:
    ``(frame, columns)`` or ``(frame, numerical_columns, category_columns)``.
  """
  numerical_columns_test = [
    'idade',
    'consome_vegetais_int',
    'consumo_diario_agua_int',
    'frequencia_atividade_fisica_int',
    'IMC',
  ]
  category_columns_test = [
    'dummy_sexo',
    'dummy_historico_obesidade_familia',
    'dummy_consome_comida_calorica',
    'dummy_come_entre_refeicoes',
    'dummy_consumo_alcool',
    'dummy_tipo_transporte_transporte publico',
    'dummy_tipo_transporte_carro',
  ]

  generic_profile = (
    (df['dummy_tipo_transporte_andando'] == 0) &
    (df['dummy_tipo_transporte_bicicleta'] == 0) &
    (df['dummy_tipo_transporte_moto'] == 0) &
    (df['dummy_fuma'] == 0) &
    (df['dummy_consumo_alcool'] < 2) &
    (
      (df['dummy_come_entre_refeicoes'] == 2)
      | (df['dummy_come_entre_refeicoes'] == 1)
    ) &
    (df['consome_vegetais_int'] > 1) &
    (df['n_refeicoes_int'] == 3) &
    (df['frequencia_atividade_fisica_int'] < 3)
  )

  # Copy first, then add the column: assigning onto a filtered slice warns
  test_df = df.loc[generic_profile].copy()
  test_df['dummy_tipo_transporte'] = test_df['dummy_tipo_transporte_transporte publico']

  if separate_cols:
    return test_df, numerical_columns_test, category_columns_test

  return test_df, numerical_columns_test + category_columns_test

def get_generic_women_df(df, separate_cols=False):
  """Select the rows matching the 'generic' profile with ``dummy_sexo == 0``.

  Besides ``dummy_sexo == 0`` (women, per the original inline comment), a
  row is kept when the person does not walk, cycle or ride a motorbike, does
  not smoke, has ``dummy_consumo_alcool`` below 2, has
  ``dummy_come_entre_refeicoes`` equal to 1 or 2, has
  ``consome_vegetais_int`` above 1, has exactly 3 meals and has
  ``frequencia_atividade_fisica_int`` below 3.

  The result is an independent copy with an extra ``dummy_tipo_transporte``
  column mirroring ``dummy_tipo_transporte_transporte publico``. The copy is
  made before the column is added, which avoids the
  ``SettingWithCopyWarning`` the previous version triggered.

  Args:
    df: preprocessed DataFrame (left unchanged).
    separate_cols: when true, return numeric and categorical column lists
      separately instead of concatenated.

  Returns:
    ``(frame, columns)`` or ``(frame, numerical_columns, category_columns)``.
    ``dummy_sexo`` is not among the returned columns because it is constant
    in this subset.
  """
  numerical_columns_test = [
    'idade',
    'consome_vegetais_int',
    'consumo_diario_agua_int',
    'frequencia_atividade_fisica_int',
    'IMC',
  ]

  category_columns_test = [
    # 'dummy_sexo',
    'dummy_historico_obesidade_familia',
    'dummy_consome_comida_calorica',
    'dummy_come_entre_refeicoes',
    'dummy_consumo_alcool',
    'dummy_tipo_transporte_transporte publico',
    'dummy_tipo_transporte_carro',
  ]

  generic_women = (
    (df['dummy_sexo'] == 0) &  # woman
    (df['dummy_tipo_transporte_andando'] == 0) &
    (df['dummy_tipo_transporte_bicicleta'] == 0) &
    (df['dummy_tipo_transporte_moto'] == 0) &
    (df['dummy_fuma'] == 0) &
    (df['dummy_consumo_alcool'] < 2) &
    (
      (df['dummy_come_entre_refeicoes'] == 2)
      | (df['dummy_come_entre_refeicoes'] == 1)
    ) &
    (df['consome_vegetais_int'] > 1) &
    (df['n_refeicoes_int'] == 3) &
    (df['frequencia_atividade_fisica_int'] < 3)
  )

  # Copy first, then add the column: assigning onto a filtered slice warns
  test_df = df.loc[generic_women].copy()
  test_df['dummy_tipo_transporte'] = test_df['dummy_tipo_transporte_transporte publico']

  if separate_cols:
    return test_df, numerical_columns_test, category_columns_test

  return test_df, numerical_columns_test + category_columns_test

def get_generic_men_df(df, separate_cols=False):
  """Select the rows matching the 'generic' profile with ``dummy_sexo == 1``.

  Besides ``dummy_sexo == 1`` (men, per the original inline comment), a row
  is kept when the person does not walk, cycle or ride a motorbike, does not
  smoke, has ``dummy_consumo_alcool`` below 2, has
  ``dummy_come_entre_refeicoes`` equal to 1 or 2, has
  ``consome_vegetais_int`` above 1, has exactly 3 meals and has
  ``frequencia_atividade_fisica_int`` below 3.

  The result is an independent copy with an extra ``dummy_tipo_transporte``
  column mirroring ``dummy_tipo_transporte_transporte publico``. The copy is
  made before the column is added, which avoids the
  ``SettingWithCopyWarning`` the previous version triggered.

  Args:
    df: preprocessed DataFrame (left unchanged).
    separate_cols: when true, return numeric and categorical column lists
      separately instead of concatenated.

  Returns:
    ``(frame, columns)`` or ``(frame, numerical_columns, category_columns)``.
    ``dummy_sexo`` is not among the returned columns because it is constant
    in this subset.
  """
  numerical_columns_test = [
    'idade',
    'consome_vegetais_int',
    'consumo_diario_agua_int',
    'frequencia_atividade_fisica_int',
    'IMC',
  ]

  category_columns_test = [
    # 'dummy_sexo',
    'dummy_historico_obesidade_familia',
    'dummy_consome_comida_calorica',
    'dummy_come_entre_refeicoes',
    'dummy_consumo_alcool',
    'dummy_tipo_transporte_transporte publico',
    'dummy_tipo_transporte_carro',
  ]

  generic_men = (
    (df['dummy_sexo'] == 1) &  # man
    (df['dummy_tipo_transporte_andando'] == 0) &
    (df['dummy_tipo_transporte_bicicleta'] == 0) &
    (df['dummy_tipo_transporte_moto'] == 0) &
    (df['dummy_fuma'] == 0) &
    (df['dummy_consumo_alcool'] < 2) &
    (
      (df['dummy_come_entre_refeicoes'] == 2)
      | (df['dummy_come_entre_refeicoes'] == 1)
    ) &
    (df['consome_vegetais_int'] > 1) &
    (df['n_refeicoes_int'] == 3) &
    (df['frequencia_atividade_fisica_int'] < 3)
  )

  # Copy first, then add the column: assigning onto a filtered slice warns
  test_df = df.loc[generic_men].copy()
  test_df['dummy_tipo_transporte'] = test_df['dummy_tipo_transporte_transporte publico']

  if separate_cols:
    return test_df, numerical_columns_test, category_columns_test

  return test_df, numerical_columns_test + category_columns_test

def get_unusual_df(df, separate_cols=False):
  """Select the rows that do NOT match the 'generic' respondent profile.

  The generic profile is: no walking, cycling or motorbike, non-smoker,
  ``dummy_consumo_alcool`` below 2, ``dummy_come_entre_refeicoes`` equal to
  1 or 2, ``consome_vegetais_int`` above 1, exactly 3 meals and
  ``frequencia_atividade_fisica_int`` below 3. Every other row is returned.

  Args:
    df: preprocessed DataFrame (left unchanged).
    separate_cols: when true, return numeric and categorical column lists
      separately instead of concatenated.

  Returns:
    ``(frame, columns)`` or ``(frame, numerical_columns, category_columns)``.
  """
  numerical_columns_test = [
    'idade', 'consome_vegetais_int', 'consumo_diario_agua_int',
    'frequencia_atividade_fisica_int', 'IMC',
  ]
  category_columns_test = [
    'dummy_sexo', 'dummy_historico_obesidade_familia', 'dummy_consome_comida_calorica',
    'dummy_come_entre_refeicoes', 'dummy_consumo_alcool',
    'dummy_tipo_transporte_transporte publico', 'dummy_tipo_transporte_carro',
  ]

  test_df = df.copy()

  passive_transport = (
    test_df['dummy_tipo_transporte_andando'].eq(0)
    & test_df['dummy_tipo_transporte_bicicleta'].eq(0)
    & test_df['dummy_tipo_transporte_moto'].eq(0)
  )
  moderate_habits = (
    test_df['dummy_fuma'].eq(0)
    & test_df['dummy_consumo_alcool'].lt(2)
    & test_df['dummy_come_entre_refeicoes'].isin([1, 2])
  )
  usual_routine = (
    test_df['consome_vegetais_int'].gt(1)
    & test_df['n_refeicoes_int'].eq(3)
    & test_df['frequencia_atividade_fisica_int'].lt(3)
  )
  generic_profile = passive_transport & moderate_habits & usual_routine

  unusual = test_df[~generic_profile].copy()

  if separate_cols:
    return unusual, numerical_columns_test, category_columns_test
  return unusual, numerical_columns_test + category_columns_test

def get_unrefined_df(df, separate_cols=False):
  """Return an unfiltered copy of ``df`` together with the full feature list.

  Args:
    df: preprocessed DataFrame (left unchanged).
    separate_cols: when true, return numeric and categorical column lists
      separately instead of concatenated.

  Returns:
    ``(frame, columns)`` or ``(frame, numerical_columns, category_columns)``.
  """
  numerical_columns_test = [
    'idade', 'consome_vegetais', 'n_refeicoes', 'consumo_diario_agua',
    'frequencia_atividade_fisica', 'tempo_usando_eletronicos',
    'IMC', 'peso', 'altura',
  ]
  category_columns_test = [
    'dummy_sexo', 'dummy_historico_obesidade_familia', 'dummy_consome_comida_calorica',
    'dummy_come_entre_refeicoes', 'dummy_consumo_alcool', 'dummy_fuma',
    'dummy_tipo_transporte_andando', 'dummy_tipo_transporte_bicicleta',
    'dummy_tipo_transporte_carro', 'dummy_tipo_transporte_moto',
    'dummy_tipo_transporte_transporte publico',
  ]

  snapshot = df.copy()
  if not separate_cols:
    return snapshot, numerical_columns_test + category_columns_test
  return snapshot, numerical_columns_test, category_columns_test

def hopkins_statistic(df, columns, random_state=None):
  """Estimate the Hopkins statistic of ``df[columns]`` as a cluster-tendency check.

  The selected columns are first passed through ``scale_data`` (defined
  elsewhere in this notebook). A random sample of 40% of the rows is then
  compared with the same number of points drawn uniformly inside the
  per-feature min/max box of the data:

      H = sum(u) / (sum(u) + sum(w))

  where ``u`` are the nearest-neighbour distances from the uniform points to
  the data and ``w`` are the nearest-neighbour distances from the sampled data
  points to the rest of the data. With this formulation values near 0.5 point
  to uniformly spread data and values near 1 to clustered data.

  Args:
    df: DataFrame holding the features.
    columns: feature column names to evaluate.
    random_state: optional seed (or anything accepted by
      ``np.random.default_rng``) that makes the sampling reproducible. The
      default ``None`` keeps the result random from call to call.

  Returns:
    The Hopkins statistic as a float.

  Raises:
    ValueError: if the frame has too few rows for a 40% sample to contain at
      least one point.
  """
  X = scale_data(df, list(set(columns)))
  X = X[columns].values  # restore the caller's column order, then work on a numpy array
  n_rows, n_features = X.shape
  sample_size = int(n_rows * 0.4)
  if sample_size < 1:
    raise ValueError("hopkins_statistic needs enough rows for a non-empty 40% sample.")

  # A single generator drives both draws so that one seed reproduces the whole estimate.
  rng = np.random.default_rng(random_state)

  # Uniform random points inside the bounding box of the (scaled) data.
  X_uniform_random_sample = rng.uniform(X.min(axis=0), X.max(axis=0), (sample_size, n_features))

  # Random sample (without replacement) of real data points.
  random_indices = rng.choice(n_rows, size=sample_size, replace=False)
  X_sample = X[random_indices]

  # Neighbour searches are always answered against the full data set.
  nbrs = NearestNeighbors(n_neighbors=2).fit(X)

  # Uniform points are not part of X, so their nearest neighbour is column 0.
  u_distances, _ = nbrs.kneighbors(X_uniform_random_sample, n_neighbors=2)
  u_distances = u_distances[:, 0]

  # Sampled points belong to X: column 0 is the point itself (distance 0),
  # so the real nearest neighbour is column 1.
  w_distances, _ = nbrs.kneighbors(X_sample, n_neighbors=2)
  w_distances = w_distances[:, 1]

  u_sum = np.sum(u_distances)
  w_sum = np.sum(w_distances)

  return u_sum / (u_sum + w_sum)

def clusters_analysis(df, numerical_columns, category_columns, n_clusters, random_state=0, metric='mean'):
  """Cluster ``df`` with k-means, keep the best of ten seeds and plot per-cluster summaries.

  The features (numerical + categorical columns) are passed through
  ``scale_data``; ``kmeans_plot`` is then run with seeds ``random_state`` ..
  ``random_state + 9`` and the labelling with the highest silhouette score is
  kept. The winning seed is re-run with ``plot=True``, the best silhouette is
  printed, and the categorical / numerical cluster plots are drawn.

  Args:
    df: DataFrame with the features.
    numerical_columns: numeric feature names.
    category_columns: dummy-encoded categorical feature names.
    n_clusters: number of clusters passed to ``kmeans_plot``.
    random_state: first seed of the ten that are tried.
    metric: 'mean' or 'median', forwarded to the numeric summary helpers.

  Returns:
    ``(cluster_df, discriminative_metrics)`` where ``cluster_df`` is a copy of
    ``df`` with a ``cluster`` column and ``discriminative_metrics`` is the
    output of ``find_discriminative_metrics`` over all feature columns.
  """
  feature_cols = numerical_columns + category_columns
  cluster_df = df.copy()
  features = scale_data(df, list(set(feature_cols)))[feature_cols]

  top_score, top_seed, top_labels = -1, None, None
  for offset in range(10):
    seed = random_state + offset
    candidate_labels = kmeans_plot(features, n_clusters, plot=False, random_state=seed)
    score = silhouette_score(features, candidate_labels)
    if score > top_score:
      top_score, top_seed, top_labels = score, seed, candidate_labels

  # Re-run the winning seed only to produce its plot.
  kmeans_plot(features, n_clusters, plot=True, random_state=top_seed)
  cluster_df['cluster'] = top_labels

  print(f" best silhouette: {top_score}")

  numeric_summary = calculate_cluster_metric(cluster_df, numerical_columns, metric=metric)
  plot_stacked_bar_for_clusters(cluster_df, category_columns)
  plot_sorted_cluster_metric(cluster_df, numeric_summary, numerical_columns, metric=metric)

  discriminative = find_discriminative_metrics(cluster_df, feature_cols)
  return cluster_df, discriminative

def calculate_cluster_metric(df, numerical_columns, metric='mean'):
  """Summarise each cluster's numerical columns by their mean or median.

  Args:
    df: DataFrame containing a ``cluster`` column.
    numerical_columns: list of column names to aggregate.
    metric: 'mean' or 'median'.

  Returns:
    DataFrame with one row per cluster (in order of first appearance in
    ``df``), one column per numerical column and a ``cluster`` column holding
    the label ``"Cluster <id>"``.

  Raises:
    ValueError: if ``metric`` is neither 'mean' nor 'median' (only raised when
      there is at least one cluster to process).
  """
  rows = []
  for cluster_id in df['cluster'].unique():
    if metric not in ('mean', 'median'):
      raise ValueError("Invalid metric. Use 'mean' or 'median'.")

    members = df[df['cluster'] == cluster_id][numerical_columns]
    summary = getattr(members, metric)()  # members.mean() or members.median()
    summary['cluster'] = f"Cluster {cluster_id}"
    rows.append(summary)

  return pd.DataFrame(rows)

def plot_sorted_cluster_metric(df, result_df, numerical_columns, metric='mean'):
  """Draw one box plot per numerical column, with clusters ordered by their summary value.

  Args:
    df: DataFrame with the raw values and an integer-like ``cluster`` column.
    result_df: per-cluster summary as produced by ``calculate_cluster_metric``
      (a ``cluster`` column with labels of the form ``"Cluster <id>"`` plus
      one column per numerical column).
    numerical_columns: columns to plot; one figure is shown for each.
    metric: name of the summary ('mean' or 'median'), used only in the axis
      label and the title.
  """
  for col in numerical_columns:
    ranked = result_df.set_index('cluster').sort_values(by=col).reset_index()
    cluster_names = ranked['cluster'].values
    summary_values = ranked[col].values
    positions = range(len(cluster_names))

    plt.figure(figsize=(12, 8))

    # "Cluster 3" -> 3, so the label can be matched against df['cluster'].
    members_per_cluster = [
      df[df['cluster'] == int(name.split(" ")[1])] for name in cluster_names
    ]
    samples = [members[col].values for members in members_per_cluster]
    sizes = [len(members) for members in members_per_cluster]

    # Colour each box by where its summary value falls between the min and max.
    scale = Normalize(vmin=np.min(summary_values), vmax=np.max(summary_values))
    box_colors = colormaps['viridis'](scale(summary_values)).tolist()

    sns.boxplot(data=samples, order=positions, palette=box_colors)

    tick_labels = [f"{name} (n={count})" for name, count in zip(cluster_names, sizes)]
    plt.xticks(positions, tick_labels, rotation=45, ha='right')
    plt.xlabel('Cluster')
    plt.ylabel(f'{metric} of {col}')
    plt.title(f'Sorted {metric}s of {col} by Cluster')
    plt.tight_layout()

    plt.show()

def plot_stacked_bar_for_clusters(df, category_columns):
  """Plot, per categorical feature, the percentage of each category inside every cluster.

  Dummy column names are translated back to their original categorical column
  through the module-level ``category_dict``. Several dummy columns can map to
  the same original column (for example the one-hot transport columns), so
  each original column is plotted only once. A name that has no entry in
  ``category_dict`` is used as-is.

  Args:
    df: DataFrame with a ``cluster`` column and the original categorical
      columns.
    category_columns: dummy-encoded column names used for clustering.
  """
  clusters = sorted(df['cluster'].unique())  # ascending cluster order on the x axis

  # dict.fromkeys removes repeated source columns while keeping first-seen order.
  source_columns = list(dict.fromkeys(category_dict.get(col, col) for col in category_columns))

  for category in source_columns:
    levels = df[category].unique()
    category_counts = []

    for cluster in clusters:
      cluster_df = df[df['cluster'] == cluster]
      category_count = cluster_df[category].value_counts(normalize=True) * 100
      # A category missing from this cluster is 0%, not NaN.
      category_counts.append(category_count.reindex(levels, fill_value=0))

    stacked_df = pd.DataFrame(category_counts, index=clusters)

    stacked_df.plot(kind='bar', stacked=True, figsize=(12, 8), colormap='viridis')

    plt.title(f"Stacked Bar Plot of Clusters by {category}")
    plt.xlabel('Cluster')
    plt.ylabel('Percentage (%)')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()

def find_discriminative_metrics(df, cols):
  """Rank columns by how strongly their values are associated with the cluster label.

  For every column in ``cols`` (``'cluster'`` itself is skipped) a contingency
  table of cluster x column value is built and a chi-square test of
  independence is run on it. Every distinct value of a column becomes its own
  category in the table, which also holds for numeric columns.

  Args:
    df: DataFrame with a ``cluster`` column.
    cols: column names to test.

  Returns:
    DataFrame with the columns ``column``, ``statistic``, ``p_value`` and
    ``test``, sorted by ascending ``p_value``. It is empty (but still has
    those four columns) when there is nothing to test.
  """
  result_fields = ["column", "statistic", "p_value", "test"]
  results = []

  for col in cols:
    if col == 'cluster':
      continue

    contingency_table = pd.crosstab(df['cluster'], df[col])
    stat, p_value, _, _ = stats.chi2_contingency(contingency_table)

    results.append({
      "column": col,
      "statistic": stat,
      "p_value": p_value,
      "test": "Chi-Square"
    })

  # Passing the columns explicitly keeps sort_values working on an empty result.
  results_df = pd.DataFrame(results, columns=result_fields).sort_values("p_value")
  return results_df

In [None]:
# Numeric features used by the exploratory analyses below.
numerical_columns = [
  'idade', 'idade_int',
  'altura', 'peso',
  'consome_vegetais', 'consome_vegetais_int',
  'n_refeicoes_int',
  'consumo_diario_agua', 'consumo_diario_agua_int',
  'frequencia_atividade_fisica', 'frequencia_atividade_fisica_int',
  'tempo_usando_eletronicos', 'tempo_usando_eletronicos_int',
  'IMC',
]

# Original (string-valued) categorical features.
category_columns = [
  'sexo',
  'historico_obesidade_familia',
  'consome_comida_calorica',
  'come_entre_refeicoes',
  'fuma',
  'consumo_alcool',
  'tipo_transporte',
  'Class_IMC',
  'tipo_transporte_adaptado',
]

# Numeric encodings of the categorical features, used in the correlation analysis.
category_columns_dummies = [
  'dummy_sexo', 'dummy_historico_obesidade_familia', 'dummy_consome_comida_calorica',
  'dummy_come_entre_refeicoes', 'dummy_consumo_alcool', 'dummy_fuma',
  'dummy_tipo_transporte_adaptado',
]

# Comparação de estratégias de imputação

In [None]:
# Load the raw data once and impute it with both strategies so they can be compared.
df_test = pd.read_csv('data/trabalho2_dados_4.csv')
# Index labels of the rows that have at least one missing value before imputation.
missing_rows_idx = df_test[df_test.isna().any(axis=1)].index.tolist()

# Each strategy receives its own copy of the raw frame. fill_missing_data is
# defined outside this cell; if it fills values in place, sharing one frame
# would hand the second strategy data that has already been imputed.
df_test_knn = fill_missing_data(df_test.copy(), method='knn')
df_test_simple = fill_missing_data(df_test.copy(), method='simple')

show_changed_missing_values(df_test_knn, df_test_simple)

## Buscando nova duplicata gerada pela imputação (gerada por ambos os métodos)
O registro tinha transporte público como NaN, mas ao substituí-lo virou uma duplicata perfeita.

In [None]:
# Inspect the new duplicate row that appears after fill_missing_data runs.
# Row 602 is kept aside before imputation and row 1 is the reference the frame
# is compared against (presumably the row that 602 ends up duplicating -- confirm).
df_test = pd.read_csv('data/trabalho2_dados_4.csv')

print('Antes do Input')
new_duplicate = df_test.loc[602].copy()
# Rows that match row 1 in every column before imputation.
display(df_test[df_test.eq(df_test.loc[1]).all(axis=1)])

df_test = fill_missing_data(df_test) 

print('Depois do Input')
display(df_test.loc[602])
# Same comparison against row 1, now on the imputed frame.
display(df_test[df_test.eq(df_test.loc[1]).all(axis=1)])
# Row 602 as it was before imputation, shown as the cell's output.
new_duplicate

## Pre processando Dados

In [None]:
# Importando dados
df = load_and_preprocess_data(inputting_method='knn')
simple_df = load_and_preprocess_data(inputting_method='simple')
df.columns

## Detecção de outliers

In [None]:
detect_outliers(df) 
# No relevant outliers were detected in the numerical metrics.
# The only metric with z-score outliers was age (idade), but that distribution is not normal and only 7 were found.
# For the categorical metrics, removing outliers would drop some categories from the dataset, so outlier removal is not needed here.

## Análises simples

In [None]:
# Plot every original categorical column of the KNN-imputed frame
# (top=10 presumably limits each plot to the ten most frequent categories -- confirm in the helper).
plot_all_category_columns(df, category_columns, color='skyblue', top=10)

In [None]:
# Run the univariate analysis helper once for each numeric feature.
for column_name in numerical_columns:
  analyze_numerical_column(df[column_name])

## Gráficos categorias 

In [None]:
# Compute the mean of the numerical columns for the categorical groups, then plot the sorted results.
result_df = calculate_group_metric(df, category_columns, numerical_columns, metric='mean')
plot_sorted_group_metric(result_df, numerical_columns, metric='mean')

## Análise de correlação com Spearman

In [None]:
# Spearman correlation between the dummy-encoded categorical features and the
# numeric features (integer-rounded variants where they exist).
sorted_corr, corr_matrix = plot_and_return_correlation(
  df,
  category_columns_dummies=category_columns_dummies,
  numeric_columns=[
    'idade',
    'altura',
    'peso',
    'consome_vegetais_int',
    'n_refeicoes_int',
    'consumo_diario_agua_int',
    'frequencia_atividade_fisica_int',
    'tempo_usando_eletronicos_int',
    'IMC',
  ],
  method='spearman',
)
sorted_corr

# Análise da influência das colunas int na correlação

In [None]:
# Compare the Spearman correlations obtained with the integer-rounded ("_int")
# features against those obtained with the raw features.
sorted_corr, corr_matrix = plot_and_return_correlation(df, category_columns_dummies=category_columns_dummies,
                            numeric_columns=['idade', 'consome_vegetais_int', 'n_refeicoes_int',
                                             'consumo_diario_agua_int', 'frequencia_atividade_fisica_int',
                                             'tempo_usando_eletronicos_int'], method='spearman', plot=False)

sorted_corr2, corr_matrix2 = plot_and_return_correlation(df, category_columns_dummies=category_columns_dummies,
                                                       numeric_columns=['idade', 'consome_vegetais', 'n_refeicoes', 'consumo_diario_agua','frequencia_atividade_fisica', 'tempo_usando_eletronicos'],
                                                       method='spearman', plot=False)

# Axis labels use the raw feature names; this assumes both matrices list the
# numeric features first and the dummies after, in the order given above.
labels = ['idade', 'consome_vegetais', 'n_refeicoes', 'consumo_diario_agua','frequencia_atividade_fisica', 'tempo_usando_eletronicos'] + category_columns_dummies

# Positive diff: the "_int" variant has the stronger absolute correlation.
diff = abs(corr_matrix.to_numpy()) - abs(corr_matrix2.to_numpy())

# First plot: relevant correlations (|rho| >= 0.15) that the "_int" variant improves by more than 0.03.
# seaborn hides the cells where `mask` is True, so the selection has to be inverted.
mask1 = (abs(corr_matrix.to_numpy()) >= 0.15) & (diff > 0.03)
plt.figure(figsize=(8, 6))
sns.heatmap(diff, mask=~mask1, annot=True, cmap='coolwarm', fmt=".2f", xticklabels=labels, yticklabels=labels)
plt.title('int vs sem int  (diff>0)')
plt.show()


# Second plot: relevant correlations that the "_int" variant worsens by more than 0.03.
mask2 = (abs(corr_matrix.to_numpy()) >= 0.15) & (diff < -0.03)
plt.figure(figsize=(8, 6))
sns.heatmap(diff, mask=~mask2, annot=True, cmap='coolwarm', fmt=".2f", xticklabels=labels, yticklabels=labels)
plt.title('int vs sem int (diff<0)')
plt.show()


# começando analises para agrupamento

In [None]:
# Refined numeric features: integer-rounded variants, IMC instead of peso/altura,
# and no electronics-usage time.
numerical_columns_clustering_refinado = [
  'idade',
  'consome_vegetais_int',
  'n_refeicoes_int',
  'consumo_diario_agua_int',
  'frequencia_atividade_fisica_int',
  'IMC'
]
# Unrefined numeric features: raw values plus peso and altura.
numerical_columns_clustering = [
  'idade',
  'consome_vegetais',
  'n_refeicoes',
  'consumo_diario_agua',
  'frequencia_atividade_fisica',
  'tempo_usando_eletronicos',
  'IMC',
  'peso',
  'altura'
]
# The two categorical lists below currently hold the same columns.
category_columns_clustering_refinado = [
  'dummy_sexo',
  'dummy_historico_obesidade_familia',
  'dummy_consome_comida_calorica',
  'dummy_come_entre_refeicoes',
  'dummy_consumo_alcool',
  'dummy_fuma',
  'dummy_tipo_transporte_andando',
  'dummy_tipo_transporte_bicicleta',
  'dummy_tipo_transporte_carro',
  'dummy_tipo_transporte_moto',
  'dummy_tipo_transporte_transporte publico',
]
category_columns_clustering = [
  'dummy_sexo',
  'dummy_historico_obesidade_familia',
  'dummy_consome_comida_calorica',
  'dummy_come_entre_refeicoes',
  'dummy_consumo_alcool',
  'dummy_fuma',
  'dummy_tipo_transporte_andando',
  'dummy_tipo_transporte_bicicleta',
  'dummy_tipo_transporte_carro',
  'dummy_tipo_transporte_moto',
  'dummy_tipo_transporte_transporte publico',
]

print(f"dimensionalidade: {len(numerical_columns_clustering + category_columns_clustering)}")

# Scale the union of all feature sets. dict.fromkeys removes duplicates like
# set() would, but keeps a fixed first-seen order, so the column list handed to
# scale_data is the same on every kernel restart.
scaled_df = scale_data(df, list(dict.fromkeys(numerical_columns_clustering + category_columns_clustering_refinado + numerical_columns_clustering_refinado + category_columns_clustering)))
scaled_df

In [None]:
# PCA over the refined feature set of the scaled data.
pca_analysis_df, loadings, pca_contributions = pca_analysis(scaled_df, columns=numerical_columns_clustering_refinado + category_columns_clustering_refinado)

print(f"contribuiçoes de cada pca: {pca_contributions}")
# Display the loadings returned by pca_analysis as the cell's output.
loadings

In [None]:
# Same PCA as the previous cell, now over the unrefined feature set.
# Note that pca_analysis_df, loadings and pca_contributions are overwritten.
pca_analysis_df, loadings, pca_contributions = pca_analysis(scaled_df, columns=numerical_columns_clustering + category_columns_clustering)

print(f"contribuiçoes de cada pca: {pca_contributions}")
# Display the loadings returned by pca_analysis as the cell's output.
loadings

In [None]:
# Explained-variance plot for the refined feature set.
# The printed message (Portuguese) says the last principal component can be dropped safely.
print("Baseado no gráfico, podemos remover o último pca sem problemas.")
cumulative_explained_variance, feature_contributions = pca_explained_variance_plot(scaled_df, columns=numerical_columns_clustering_refinado + category_columns_clustering_refinado)


# Explained-variance plot for the unrefined feature set; the two result variables are overwritten.
print("Baseado no gráfico, podemos remover o último pca sem problemas.")
cumulative_explained_variance, feature_contributions = pca_explained_variance_plot(scaled_df, columns=numerical_columns_clustering + category_columns_clustering)

In [None]:
# Project the refined feature set onto its principal components (no plot this time).
pca_df, loadings, pca_contributions = pca_analysis(scaled_df, columns=numerical_columns_clustering_refinado + category_columns_clustering_refinado, plot=False)

# Use the scaled frame here as well, like every other PCA call in this notebook,
# so the explained variance refers to the same data pca_df was computed from.
cumulative_explained_variance, feature_contributions = pca_explained_variance_plot(scaled_df, columns=numerical_columns_clustering_refinado + category_columns_clustering_refinado)


# Keep the components pca1..pca14 and drop the remaining ones.
pca_df = pca_df[[f"pca{i}" for i in range(1, 15)]]
pca_df

# Clustering
algoritmos selecionados:
particionamento (kmeans)
hierárquica (AGNES),
densidade (optics)
grade (clique)

## Análise de estratégias
idade > idade_int
sem eletronicos > eletronicos
IMC > PESO + ALTURA
INT >= SEM INT (INT MELHOR QUANDO < 10 GRUPOS, SEM INT MELHOR PARA > 10 GRUPOS)


---------------
consumo alcool frequentemente
consome entre refeiçoes -> sempre e não
consome comida calorica -> nao (analisar)


consome vegetais > 1
n_refeicoes == 3
atividade fisica < 3

## Analisando "Genéricos"
> excelente para resultado removendo os "diferentes" vs "genericos"
> 
> verificar grupos com homem vs mulher
> 
> verificar por faixas etárias
> 
> remover os que não consomem comida calorica atrapalhou o resultado
> 
> int ajudou bastante no resultado
> 
> remoção de subgrupos tratando int como categoria ajudou muito o resultado
> 
> remoção de tempo usando eletronicos ajudou bastante
> 
> separação por sexo ajudou muito o resultado


In [None]:
# K-means analysis of the "generic" subset, once per imputation strategy.
# `cols` comes from the KNN-based call and is reused for the simple-imputation frame.
generic_df, cols = get_generic_df(df)
generic_simple_df, _ = get_generic_df(simple_df)
clustering_analysis(generic_df, method='kmeans', columns=cols)
clustering_analysis(generic_simple_df, method='kmeans', columns=cols)
# Alternative algorithms, currently disabled:
# clustering_analysis(generic_df, method='optics', columns=cols)
# clustering_analysis(generic_df, method='hierarchy', columns=cols)

## Analisando "Genéricos" mulheres

In [None]:
# K-means analysis of the "generic" women subset, once per imputation strategy.
# `cols` comes from the KNN-based call and is reused for the simple-imputation frame.
generic_women_df, cols = get_generic_women_df(df)
generic_women_simple_df, _ = get_generic_women_df(simple_df)
clustering_analysis(generic_women_df, method='kmeans', columns=cols)
clustering_analysis(generic_women_simple_df, method='kmeans', columns=cols)
# Alternative algorithms, currently disabled:
# clustering_analysis(generic_women_df, method='optics', columns=cols)
# clustering_analysis(generic_women_df, method='hierarchy', columns=cols)

## Analisando "Genéricos" homens

In [None]:
# K-means analysis of the "generic" men subset, once per imputation strategy.
# `cols` comes from the KNN-based call and is reused for the simple-imputation frame.
generic_men_df, cols = get_generic_men_df(df)
generic_men_simple_df, _ = get_generic_men_df(simple_df)
clustering_analysis(generic_men_df, method='kmeans', columns=cols)
clustering_analysis(generic_men_simple_df, method='kmeans', columns=cols)
# Alternative algorithms, currently disabled:
# clustering_analysis(generic_men_df, method='optics', columns=cols)
# clustering_analysis(generic_men_df, method='hierarchy', columns=cols)

## Analisando "diferentes"
> pouca diferenciação entre os transportes não usuais, mas incluí-los na base atrapalha muito
> 
> remover os fumantes ajuda, mas não tanto quanto transporte; porém são perceptíveis 2 subgrupos de fumantes
> 
> consumo alto de álcool é parecido com fuma, porém não é tão diferente internamente
> 
> pessoas com consumo entre refeições não usual atrapalham o resultado, mas aparentam ter 2 subgrupos
> 
> pessoas que comem poucos vegetais fazem muita diferença, mas entre si não têm subgrupos
> 
> atrapalha muito o resultado e há pouquíssima diferenciação interna para pessoas que não comem 3 refeições por dia
> 
> atrapalha muito o resultado e há pouquíssima diferenciação interna para pessoas que realizam alta atividade física

In [None]:
# K-means analysis of the "unusual" ("diferentes") subset, once per imputation strategy.
# `cols` comes from the KNN-based call and is reused for the simple-imputation frame.
unusual_df, cols = get_unusual_df(df)
unusual_simple_df, _ = get_unusual_df(simple_df)
clustering_analysis(unusual_df, method='kmeans', columns=cols)
clustering_analysis(unusual_simple_df, method='kmeans', columns=cols)
# Alternative algorithms, currently disabled:
# clustering_analysis(unusual_df, method='optics', columns=cols)
# clustering_analysis(unusual_df, method='hierarchy', columns=cols)

In [None]:
# K-means analysis of the unfiltered data with the full (unrefined) feature list,
# once per imputation strategy; serves as the baseline for the refined subsets.
unrefined_df, cols = get_unrefined_df(df)
unrefined_simple_df, _ = get_unrefined_df(simple_df)
clustering_analysis(unrefined_df, method='kmeans', columns=cols)
clustering_analysis(unrefined_simple_df, method='kmeans', columns=cols)
# Alternative algorithms, currently disabled:
# clustering_analysis(unrefined_df, method='optics', columns=cols)
# clustering_analysis(unrefined_df, method='hierarchy', columns=cols)

In [None]:
# Maps each dummy-encoded column back to the original categorical column it was
# derived from; plot_stacked_bar_for_clusters reads this mapping.
category_dict = {
  'dummy_sexo':'sexo',
  'dummy_historico_obesidade_familia':'historico_obesidade_familia',
  'dummy_consome_comida_calorica':'consome_comida_calorica',
  'dummy_come_entre_refeicoes':'come_entre_refeicoes',
  # These two entries were swapped before, so the alcohol plot showed smoking and vice versa.
  'dummy_consumo_alcool':'consumo_alcool',
  'dummy_fuma':'fuma',
  'dummy_tipo_transporte_transporte publico': 'tipo_transporte',
  'dummy_tipo_transporte_carro': 'tipo_transporte',
  'dummy_tipo_transporte_andando': 'tipo_transporte',
  'dummy_tipo_transporte_bicicleta': 'tipo_transporte',
  'dummy_tipo_transporte_moto': 'tipo_transporte',
  'dummy_tipo_transporte_adaptado':'tipo_transporte_adaptado'
}

# Rebuild every subset from the KNN-imputed frame, keeping numeric and categorical columns apart.
generic_df, generic_numeric_cols, generic_category_cols = get_generic_df(df, separate_cols=True)
unrefined_df, unrefined_numeric_cols, unrefined_category_cols = get_unrefined_df(df, separate_cols=True)
unusual_df, unusual_numeric_cols, unusual_category_cols = get_unusual_df(df, separate_cols=True)
generic_men_df, generic_men_numeric_cols, generic_men_category_cols = get_generic_men_df(df, separate_cols=True)
generic_women_df, generic_women_numeric_cols, generic_women_category_cols = get_generic_women_df(df, separate_cols=True)

# Cluster-tendency check (Hopkins statistic) for each subset before clustering it.
print(f"Hopkins test for generic_df, with value: {hopkins_statistic(generic_df, columns=generic_numeric_cols + generic_category_cols)}")
print(f"Hopkins test for unrefined_df, with value: {hopkins_statistic(unrefined_df, columns=unrefined_numeric_cols + unrefined_category_cols)}")
print(f"Hopkins test for unusual_df, with value: {hopkins_statistic(unusual_df, columns=unusual_numeric_cols + unusual_category_cols)}")
print(f"Hopkins test for generic_men_df, with value: {hopkins_statistic(generic_men_df, columns=generic_men_numeric_cols +  generic_men_category_cols)}")
print(f"Hopkins test for generic_women_df, with value: {hopkins_statistic(generic_women_df, columns=generic_women_numeric_cols + generic_women_category_cols)}")

In [None]:
# Final k-means run on the "generic" subset with 8 clusters; plots the
# per-cluster summaries and shows the chi-square ranking of the features.
cluster_df, discriminative_metrics = clusters_analysis(generic_df, n_clusters=8, numerical_columns=generic_numeric_cols, category_columns=generic_category_cols)
discriminative_metrics

In [None]:
# Same analysis for the "generic" women subset with 3 clusters
# (cluster_df and discriminative_metrics are overwritten).
cluster_df, discriminative_metrics = clusters_analysis(generic_women_df, n_clusters=3, numerical_columns=generic_women_numeric_cols, category_columns=generic_women_category_cols)
discriminative_metrics

In [None]:
# Same analysis for the "generic" men subset with 3 clusters
# (cluster_df and discriminative_metrics are overwritten).
cluster_df, discriminative_metrics = clusters_analysis(generic_men_df, n_clusters=3, numerical_columns=generic_men_numeric_cols, category_columns=generic_men_category_cols)
discriminative_metrics

In [None]:
# Same analysis for the "unusual" subset with 3 clusters
# (cluster_df and discriminative_metrics are overwritten).
cluster_df, discriminative_metrics = clusters_analysis(unusual_df, n_clusters=3, numerical_columns=unusual_numeric_cols, category_columns=unusual_category_cols)
discriminative_metrics