In [None]:
# Initialize Spark session
# Imported here as well so this cell runs on a fresh kernel even when it is
# executed before the notebook's import cell.
from pyspark.sql import SparkSession

# spark.jars.packages points Spark at the spark-excel connector that provides
# the 'com.crealytics.spark.excel' data source used when loading the workbooks.
spark = (
    SparkSession.builder
    .appName("ExcelProcessing")
    .config("spark.jars.packages", "com.crealytics:spark-excel_2.12:0.13.5")
    .getOrCreate()
)

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType

In [None]:
def get_string_schema(columns):
    """Build a Spark schema in which every column is a nullable string.

    Args:
        columns: Iterable of column labels (for example a pandas ``Index``).

    Returns:
        StructType: One nullable ``StringType`` field per label, in the
        order the labels were given.
    """
    # Labels read from a spreadsheet header are not guaranteed to be strings
    # (numeric or date headers), while StructField requires a str name, so
    # each label is coerced. String labels pass through unchanged.
    return StructType(
        [StructField(str(column), StringType(), True) for column in columns]
    )

In [None]:
def load_data_to_dataframe(file_path, schema, data_address="'Sheet1'!A1"):
    """Read one Excel workbook into a Spark DataFrame with a fixed schema.

    Args:
        file_path: Path of the Excel file to read.
        schema: Spark schema applied to the sheet; schema inference is
            switched off, so the columns get exactly these types.
        data_address: Value for the spark-excel ``dataAddress`` option, which
            selects the sheet and start cell. Defaults to cell A1 of the
            sheet named "Sheet1", matching the previous hard-coded value.

    Returns:
        The loaded DataFrame, or ``None`` if the read raised. In that case
        the error is printed rather than propagated.
    """
    try:
        reader = (
            spark.read.format('com.crealytics.spark.excel')
            .option('dataAddress', data_address)
            .option('header', 'true')
            .option('inferSchema', 'false')
            .schema(schema)
        )
        return reader.load(file_path)
    except Exception as e:
        # Deliberately best-effort: report the failing file and let the
        # caller decide what to do with the None result.
        print(f"Error loading {file_path}: {e}")
        return None

In [None]:
# Load data for each cluster
# Start with an empty list for every expected cluster id so clusters without
# any loadable file still appear in the mapping.
dataframes = {i: [] for i in range(num_clusters)}
for cluster_id, files in subsets.items():
    if not files:
        continue
    # Use the first file to infer the schema: nrows=0 reads only the header
    # row, and every file of the cluster is then loaded with that same schema.
    sample_df = pd.read_excel(os.path.join(excel_dir, files[0]), nrows=0)
    schema = get_string_schema(sample_df.columns)
    for file in files:
        file_path = os.path.join(excel_dir, file)
        df = load_data_to_dataframe(file_path, schema)
        # The loader signals failure with None; test for that explicitly
        # instead of relying on the truthiness of a DataFrame object.
        if df is not None:
            # setdefault tolerates cluster ids outside range(num_clusters)
            # instead of raising KeyError.
            dataframes.setdefault(cluster_id, []).append(df)

In [None]:
# Merge the DataFrames of each cluster into a single DataFrame.
# Clusters whose list is empty get no entry in combined_dfs.
combined_dfs = {}
for cluster_id in dataframes:
    dfs = dataframes[cluster_id]
    if not dfs:
        continue
    # Fold the remaining frames onto the first one with successive unions.
    combined_df = dfs[0]
    for df in dfs[1:]:
        combined_df = combined_df.union(df)
    combined_dfs[cluster_id] = combined_df

In [None]:
# Preview the combined DataFrame of every cluster, labelled by cluster id.
for cluster_id in combined_dfs:
    df = combined_dfs[cluster_id]
    print(f"Cluster {cluster_id} Combined DataFrame:")
    df.show()