In [1]:
import pandas as pd
import os
from glob import glob

# Directory that holds the input xlsx files; the merged workbook is written
# into the same directory.
input_directory = "/content"

# Step 1: Merge the first column of every xlsx file into a single file
merged_filename = os.path.join(input_directory, "merged.xlsx")

# The merged workbook lives next to the inputs and also matches "*.xlsx", so
# it has to be excluded or a second run would count every topic twice.
# Excel's "~$..." lock files match the pattern too but are not workbooks.
# Sorting makes the row order of the merged file reproducible, because glob
# returns paths in arbitrary order.
all_files = sorted(
    path
    for path in glob(os.path.join(input_directory, "*.xlsx"))
    if os.path.abspath(path) != os.path.abspath(merged_filename)
    and not os.path.basename(path).startswith("~$")
)
if not all_files:
    # Fail with a clear message instead of the "No objects to concatenate"
    # error that pd.concat raises for an empty list.
    raise FileNotFoundError(f"No .xlsx files to merge in {input_directory!r}")

# Read each file and collect its first column; header=None keeps the first
# row as data and labels the column 0.
merged_data = []
for file in all_files:
    df = pd.read_excel(file, usecols=[0], header=None)
    merged_data.append(df)

# Concatenate all dataframes vertically with a fresh 0..n-1 index
merged_df = pd.concat(merged_data, ignore_index=True)

# Save the merged data to a new xlsx file (no index column, no header row)
merged_df.to_excel(merged_filename, index=False, header=False)

# Step 2: Calculate the total number of topics and the number of question topics
# Total number of topics (one per row of the merged column)
total_topics = len(merged_df)

# Flag the topics ending with a question mark.
# astype(str) lets the .str accessor work even when the column holds numbers,
# and na=False maps any remaining missing value to False; without these a
# blank or non-text cell produces NaN in the mask, which cannot be used for
# boolean selection.
is_question = merged_df[0].astype(str).str.endswith("?", na=False)
question_topics = int(is_question.sum())

# Step 3: Calculate the percentage of question topics
# Guard against an empty merged sheet to avoid a ZeroDivisionError.
if total_topics:
    percentage_question_topics = (question_topics / total_topics) * 100
else:
    percentage_question_topics = 0.0

# Print the results
print(f"Total number of topics: {total_topics}")
print(f"Number of question topics: {question_topics}")
print(f"Percentage of question topics: {percentage_question_topics:.2f}%")

Total number of topics: 3352
Number of question topics: 1365
Percentage of question topics: 40.72%
