In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os 
import plotly.express as px

## **load data function**

In [2]:
def load_data(filepath: str):
    # To Handle if the file is not found
    try:
        if not os.path.exists(filepath):
            raise FileNotFoundError(f"The file {filepath} does not exist.")

        # Check the extention if it excel or csv
        if filepath.endswith('.csv'):
            data = pd.read_csv(filepath)
        elif filepath.endswith(('.xls', '.xlsx')):
            data = pd.read_excel(filepath)
        else:
            raise ValueError("Unsupported file format. Please provide a CSV or Excel file.")

        # Optional: Check if the DataFrame is empty
        if data.empty:
            raise ValueError("The loaded file is empty.")

        return data
    #print the error that occured in file reading or loading
    except FileNotFoundError as e:
        print(f"Error: {e}")
    except ValueError as e:
        print(f"Error: {e}")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")




In [11]:
df = load_data('knesset_25.xlsx')
df

Unnamed: 0,city_name,ballot_code,party_avoda,party_shahar_kalkali_hadash,party_bayit_yehudi,party_agudat_israel,party_daled,party_vavmem,party_shahar_koach_hevrati,party_kama,...,party_tze'irim_bo'arim,party_manhigut_hevratit,party_kol_hasviva_vehachai,party_halev_hayehudi,party_seder_chadash,party_kol,party_beometz_bishvilech,party_kavod_umasoret,party_shas,party_daat_tov_vera
0,אבו גווייעד שבט,3.1,0,0,0,0,4,21,0,0,...,0,0,0,0,0,0,0,0,0,2
1,אבו גווייעד שבט,3.2,1,0,0,0,0,2,0,0,...,0,0,0,0,0,0,0,0,0,1
2,אבו גווייעד שבט,3.3,0,0,0,0,0,12,0,0,...,0,0,0,0,0,0,0,0,2,0
3,אבו גווייעד שבט,3.4,0,0,0,0,0,3,0,0,...,1,0,0,0,0,0,0,0,2,0
4,אבו גוש,1.1,1,0,0,0,171,43,0,0,...,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12540,תקוע,3.0,4,3,66,1,0,1,0,0,...,1,0,0,0,0,0,6,0,12,0
12541,תקוע,4.0,6,3,66,3,0,0,0,0,...,0,0,0,1,0,0,1,0,3,0
12542,תראבין אצאנע שבט,1.0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
12543,תראבין אצאנעישוב,1.0,1,0,0,0,6,1,0,0,...,0,0,0,0,0,0,1,0,1,0


## **The second function**  
this function aggregate the data and group it by specific column

ok we should be careful about the 3rd function if it gets its input form the 2nd function or not 
 and the ballot code is deleted from the 2nd function so there is no need to put it in the 3rd fucntion 
 and there is no need to keep keep list in the 3rd function because the city name is alrady an index for the data frame 
 and idont know if we should keep the agg function as string or functoion 
 in python ithnk we can get rid from loops 
 

In [30]:
def group_and_aggregate_data(df: pd.DataFrame, group_by_column: str, agg_func) -> pd.DataFrame:
    # Performing the functionality
    result = df.drop(columns='ballot_code').groupby(group_by_column).agg(agg_func)
    return result  


Demonstration : group the data but city name and aggregate them by mean value 

In [None]:
df2 = group_and_aggregate_data(df, 'city_name','sum')
df2

In [42]:
def remove_sparse_columns_with_grouping(df: pd.DataFrame, threshold: int):
    #always_keep = ['city_name']  # Column to always keep
    
    # Aggregate the data by the group-by column
    grouped_df =group_and_aggregate_data(df, 'city_name', 'sum')
    
    # Filter numeric columns by sum and threshold
    numeric_df = grouped_df.select_dtypes(include='number')
    filtered_columns = numeric_df.columns[numeric_df.sum(axis=0) >= threshold].tolist()
    
    # Combine the always-keep columns with filtered columns
    #filtered_columns = list(set(always_keep + filtered_columns))
    
    # Ensure the correct column order
    ordered_columns = list(set(grouped_df.columns).intersection(filtered_columns))
    
    return grouped_df[ordered_columns]

In [43]:
df3= remove_sparse_columns_with_grouping(df, 1000)

df3

Unnamed: 0_level_0,party_israel_hofshit_demokratit,party_shas,party_beometz_bishvilech,party_pesofit,party_kahol_lavan,party_tze'irim_bo'arim,party_bayit_yehudi,party_kol,party_vavmem,party_daat_tov_vera,...,party_raam,party_israel_beitenu,party_yesh_atid,party_daled,party_nativ,party_kol_hasviva_vehachai,party_avoda,party_agudat_israel,party_shahar_kalkali_hadash,party_likud
city_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
אבו גווייעד שבט,0,4,0,0,5,1,0,0,38,3,...,468,0,1,4,0,0,1,0,0,12
אבו גוש,0,4,3,0,8,2,1,1,312,0,...,838,2,26,1263,0,1,14,3,1,208
אבו סנאן,4,12,9,0,401,1,3,6,2030,1,...,1160,355,163,677,0,1,34,0,0,405
אבו עבדון שבט,0,0,0,0,0,0,0,0,1,0,...,39,0,0,1,0,0,0,0,0,0
אבו קורינאת שבט,0,3,0,1,0,0,1,1,65,0,...,1096,0,4,10,0,0,5,0,0,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
תקומה,0,13,1,0,44,0,42,0,0,0,...,0,1,25,0,0,0,3,1,2,142
תקוע,0,27,26,5,94,1,266,0,1,0,...,1,28,48,0,0,1,25,13,18,353
תראבין אצאנע שבט,0,0,0,0,0,0,0,0,1,0,...,42,0,0,0,0,0,0,0,0,28
תראבין אצאנעישוב,0,1,1,0,0,0,0,0,1,0,...,87,0,1,6,0,0,1,0,0,143


In [44]:
def remove_sparse_columns2(df: pd.DataFrame, threshold: int):
        always_keep = ['city_name', 'ballot_code'] # ok i think here there is no need to put ballot code
        numeric_df = df.select_dtypes(include='number')
        filtered_columns = [col for col in numeric_df.columns
                                if numeric_df[col].sum() >= threshold] # i think and dont know but there is way much easier chat gpt:     filtered_columns = numeric_df.columns[numeric_df.sum() >= threshold].tolist()---- see below

        filtered_columns = always_keep + filtered_columns
        filtered_columns = list(set(filtered_columns))
        ordered_columns = [col for col in df.columns
                            if col in filtered_columns]
        return df[ordered_columns]


In [45]:
df4= remove_sparse_columns_with_grouping(df, 1000)

df4

Unnamed: 0_level_0,party_israel_hofshit_demokratit,party_shas,party_beometz_bishvilech,party_pesofit,party_kahol_lavan,party_tze'irim_bo'arim,party_bayit_yehudi,party_kol,party_vavmem,party_daat_tov_vera,...,party_raam,party_israel_beitenu,party_yesh_atid,party_daled,party_nativ,party_kol_hasviva_vehachai,party_avoda,party_agudat_israel,party_shahar_kalkali_hadash,party_likud
city_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
אבו גווייעד שבט,0,4,0,0,5,1,0,0,38,3,...,468,0,1,4,0,0,1,0,0,12
אבו גוש,0,4,3,0,8,2,1,1,312,0,...,838,2,26,1263,0,1,14,3,1,208
אבו סנאן,4,12,9,0,401,1,3,6,2030,1,...,1160,355,163,677,0,1,34,0,0,405
אבו עבדון שבט,0,0,0,0,0,0,0,0,1,0,...,39,0,0,1,0,0,0,0,0,0
אבו קורינאת שבט,0,3,0,1,0,0,1,1,65,0,...,1096,0,4,10,0,0,5,0,0,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
תקומה,0,13,1,0,44,0,42,0,0,0,...,0,1,25,0,0,0,3,1,2,142
תקוע,0,27,26,5,94,1,266,0,1,0,...,1,28,48,0,0,1,25,13,18,353
תראבין אצאנע שבט,0,0,0,0,0,0,0,0,1,0,...,42,0,0,0,0,0,0,0,0,28
תראבין אצאנעישוב,0,1,1,0,0,0,0,0,1,0,...,87,0,1,6,0,0,1,0,0,143


In [46]:
if df4.equals(df3):
    print("The DataFrames are identical.")
else:
    print("The DataFrames are not identical.")


The DataFrames are identical.


## the pca function 

In [47]:
def dimensionality_reduction(df: pd.DataFrame, num_components: int, meta_columns: list[str]) -> pd.DataFrame:

    metadata = df[meta_columns]
    data = df.drop(columns=meta_columns)
    centered_data = data - data.mean()
    covariance_matrix = np.cov(centered_data, rowvar=False)
    eigenvalues, eigenvectors = np.linalg.eigh(covariance_matrix)
    sorted_indices = np.argsort(eigenvalues)[::-1]
    eigenvalues = eigenvalues[sorted_indices]
    eigenvectors = eigenvectors[:, sorted_indices]
    principal_components = eigenvectors[:, :num_components]
    #to make the custom function match the pca function
    for i in range(principal_components.shape[1]):
        if np.sum(principal_components[:, i]) < 0:
            principal_components[:, i] *= -1
    reduced_data = np.dot(centered_data, principal_components)
    reduced_df = pd.DataFrame(reduced_data, columns=[f"PC{i + 1}" for i in range(num_components)])
    result = pd.concat([metadata.reset_index(drop=True), reduced_df], axis=1)
    return result
