# Load Data

In [1]:
import pandas as pd
from dask.array import where
from holoviews.operation import threshold
from twisted.names.client import theResolver

from Data_Manager import load_data

# Read the election workbook through the project's loader.
knesset_path = 'Data_Files/knesset_25.xlsx'
data = load_data(knesset_path)

# Report the frame's dimensions, then preview its first five rows
# (the bare last expression is rendered by the notebook).
print("Data shape: ", data.shape)
print("\nFirst 5 rows of the data:")
data.head()

Data shape:  (12545, 42)

First 5 rows of the data:


Unnamed: 0,city_name,ballot_code,party_avoda,party_shahar_kalkali_hadash,party_bayit_yehudi,party_agudat_israel,party_daled,party_vavmem,party_shahar_koach_hevrati,party_kama,...,party_tze'irim_bo'arim,party_manhigut_hevratit,party_kol_hasviva_vehachai,party_halev_hayehudi,party_seder_chadash,party_kol,party_beometz_bishvilech,party_kavod_umasoret,party_shas,party_daat_tov_vera
0,אבו גווייעד שבט,3.1,0,0,0,0,4,21,0,0,...,0,0,0,0,0,0,0,0,0,2
1,אבו גווייעד שבט,3.2,1,0,0,0,0,2,0,0,...,0,0,0,0,0,0,0,0,0,1
2,אבו גווייעד שבט,3.3,0,0,0,0,0,12,0,0,...,0,0,0,0,0,0,0,0,2,0
3,אבו גווייעד שבט,3.4,0,0,0,0,0,3,0,0,...,1,0,0,0,0,0,0,0,2,0
4,אבו גוש,1.1,1,0,0,0,171,43,0,0,...,0,1,0,0,0,0,0,0,0,0


# Data Preprocessing

In [3]:
from Data_Manager import group_and_aggregate_data

# Aggregate the ballot-level rows per 'city_name' using the 'sum' aggregation.
# NOTE(review): the rendered result shows 'ballot_code' summed as well
# (e.g. 3.1 + 3.2 + 3.3 + 3.4 -> 13.0), which is not a meaningful value;
# presumably only the party_* columns are real vote totals - confirm in Data_Manager.
agg_city_votes = group_and_aggregate_data(data, 'city_name', 'sum')
print("\nTotal votes per city:")
# Show the aggregated frame (the notebook truncates long frames in the rendered output)
agg_city_votes



Total votes per city:


Unnamed: 0,city_name,ballot_code,party_avoda,party_shahar_kalkali_hadash,party_bayit_yehudi,party_agudat_israel,party_daled,party_vavmem,party_shahar_koach_hevrati,party_kama,...,party_tze'irim_bo'arim,party_manhigut_hevratit,party_kol_hasviva_vehachai,party_halev_hayehudi,party_seder_chadash,party_kol,party_beometz_bishvilech,party_kavod_umasoret,party_shas,party_daat_tov_vera
0,אבו גווייעד שבט,13.0,1,0,0,0,4,38,0,0,...,1,0,0,0,0,0,0,0,4,3
1,אבו גוש,38.2,14,1,1,3,1263,312,0,0,...,2,7,1,0,1,1,3,0,4,0
2,אבו סנאן,170.2,34,0,3,0,677,2030,4,1,...,1,4,1,3,1,6,9,0,12,1
3,אבו עבדון שבט,1.0,0,0,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,אבו קורינאת שבט,26.5,5,0,1,0,10,65,0,0,...,0,1,0,0,2,1,0,0,3,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1211,תקומה,1.0,3,2,42,1,0,0,0,0,...,0,0,0,0,0,0,1,0,13,0
1212,תקוע,10.0,25,18,266,13,0,1,0,0,...,1,1,1,3,0,0,26,0,27,0
1213,תראבין אצאנע שבט,1.0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1214,תראבין אצאנעישוב,1.0,1,0,0,0,6,1,0,0,...,0,0,0,0,0,0,1,0,1,0


# Removing sparse columns

In [5]:
# Import required functions
from Data_Manager import remove_sparse_columns

# Cutoff handed to remove_sparse_columns. Presumably columns whose overall
# total falls below it are dropped - confirm against Data_Manager.
# NOTE: this rebinds the name `threshold` that the first cell imports from
# holoviews.operation; from here on `threshold` is the plain integer.
threshold = 100_000
filtered_df = remove_sparse_columns(agg_city_votes, threshold)

# Display the filtered per-city frame.
filtered_df

Unnamed: 0,city_name,ballot_code,party_avoda,party_agudat_israel,party_daled,party_vavmem,party_tet,party_kahol_lavan,party_israel_beitenu,party_likud,party_meretz,party_raam,party_yesh_atid,party_shas
0,אבו גווייעד שבט,13.0,1,0,4,38,0,5,0,12,0,468,1,4
1,אבו גוש,38.2,14,3,1263,312,13,8,2,208,58,838,26,4
2,אבו סנאן,170.2,34,0,677,2030,6,401,355,405,82,1160,163,12
3,אבו עבדון שבט,1.0,0,0,1,1,0,0,0,0,0,39,0,0
4,אבו קורינאת שבט,26.5,5,0,10,65,0,0,0,9,6,1096,4,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1211,תקומה,1.0,3,1,0,0,138,44,1,142,2,0,25,13
1212,תקוע,10.0,25,13,0,1,749,94,28,353,9,1,48,27
1213,תראבין אצאנע שבט,1.0,0,0,0,1,0,0,0,28,1,42,0,0
1214,תראבין אצאנעישוב,1.0,1,0,6,1,0,0,0,143,2,87,1,1


# PCA

In [11]:
# Import the dimensionality-reduction (PCA) helper
from Data_Manager import dimensionality_reduction

# Identifier columns handed to dimensionality_reduction as metadata, i.e. the
# columns that should not be reduced.
meta_cols = ['city_name', 'ballot_code']

# Reduce the filtered per-city frame to 2 components. The metadata list is
# passed by name (previously the literal was repeated in the call and
# `meta_cols` was never used, so the two could silently drift apart).
# 'ballot_code' is dropped from the result: after the per-city aggregation it
# is only a sum of ballot codes, so it carries no information here.
pca_df = dimensionality_reduction(filtered_df, 2, meta_cols).drop(columns='ballot_code')

# Display the reduced frame.
pca_df


Unnamed: 0,city_name,PC1,PC2
0,אבו גווייעד שבט,0.432395,-0.165294
1,אבו גוש,-0.130266,-1.773289
2,אבו סנאן,-0.577928,-2.601544
3,אבו עבדון שבט,0.516284,0.161418
4,אבו קורינאת שבט,0.316654,-0.627522
...,...,...,...
1211,תקומה,0.482694,0.203172
1212,תקוע,0.359274,0.239777
1213,תראבין אצאנע שבט,0.513864,0.160957
1214,תראבין אצאנעישוב,0.495599,0.127004
