In [36]:
import pandas as pd
import os

In [37]:
# Specify the folder where your CSV files are located
folder_path = 'Data/raw'  # Replace with the actual path


In [38]:
# List all the CSV files in the folder
csv_files = [f for f in os.listdir(folder_path) if f.endswith('.csv')]


In [39]:
print(csv_files)

['community_issues_letters.csv', 'uk_issues_positive_sentiment.csv', 'community_issues_dataset_final.csv', 'community_issues_extended_dataset.csv', 'community_issues_dataset(1).csv', 'community_issues_dataset_updated.csv', 'community_issues_dataset_long.csv', 'uk_issues_negative_sentiment.csv', 'copilot.csv', 'community_issues_dataset-2.csv', 'community_issues_dataset copy.csv', 'community_issues_further_extended_dataset.csv', 'uk_issues_full_dataset.csv', 'community_issues_dataset.csv', 'community_issues_dataset_long copy.csv']


In [40]:
# Initialize an empty list to store each CSV data as DataFrames
dataframes = []

In [41]:
# Initialize variables for column consistency check
all_columns_match = True
expected_columns = None  # This will store the expected columns if the first file is correct


In [42]:

# Loop through each CSV file and read it into a DataFrame
for file in csv_files:
    file_path = os.path.join(folder_path, file)
    df = pd.read_csv(file_path)

    # Check the column names of each file to debug
    print(f"Checking columns in {file}: {df.columns.tolist()}")

    # Check if the required 'Text' or 'Letter Text' column exists and standardize it to 'letter_text'
    if 'Text' in df.columns:
        df.rename(columns={'Text': 'letter_text'}, inplace=True)
    elif 'Letter Text' in df.columns:
        df.rename(columns={'Letter Text': 'letter_text'}, inplace=True)
    else:
        print(f"Warning: Neither 'Text' nor 'Letter Text' column is present in {file}. Skipping this file.")
        continue  # Skip this file if neither column is found

    # Check if the number of columns and their names match
    if expected_columns is None:
        # Set the columns of the first file as the expected structure
        expected_columns = df.columns
    elif list(df.columns) != list(expected_columns):
        print(f"Warning: Column mismatch in {file}. Skipping this file.")
        all_columns_match = False
        continue  # Skip this file if columns do not match

    # If checks pass, append the DataFrame to the list
    dataframes.append(df)


Checking columns in community_issues_letters.csv: ['Letter_ID', 'issue_category', 'Category', 'Severity', 'Frequency', 'sentiment', 'letter_text', 'Document Style']
Checking columns in uk_issues_positive_sentiment.csv: ['Letter ID', 'Severity', 'Frequency', 'Category', 'Sentiment', 'Issue Name', 'Content']
Checking columns in community_issues_dataset_final.csv: ['letter_text', 'issue_category', 'sentiment']
Checking columns in community_issues_extended_dataset.csv: ['letter_text', 'issue_category', 'sentiment']
Checking columns in community_issues_dataset(1).csv: ['letter_text', 'issue_category', 'sentiment']
Checking columns in community_issues_dataset_updated.csv: ['letter_text', 'issue_category', 'sentiment']
Checking columns in community_issues_dataset_long.csv: ['Issue Name', 'Category', 'Severity', 'Frequency', 'Sentiment', 'Letter Text']
Checking columns in uk_issues_negative_sentiment.csv: ['Letter ID', 'Severity', 'Frequency', 'Category', 'Sentiment', 'Issue Name', 'Content']


In [43]:

# If columns mismatch, notify the user
if not all_columns_match:
    print("Not all files have matching columns or required 'Text'/'Letter Text' column. Skipped incompatible files.")


In [44]:

# Concatenate all DataFrames into one single DataFrame
if dataframes:
    merged_df = pd.concat(dataframes, ignore_index=True)

    # Print out the columns of the merged dataset for debugging
    print("Merged Dataset Columns:", merged_df.columns.tolist())

    # Check if the necessary columns are present before selecting them
    columns_to_select = ['letter_text']

    # If 'issue_category' exists, add it to the selected columns
    if 'issue_category' in merged_df.columns:
        columns_to_select.append('issue_category')

    # If 'sentiment' exists, add it to the selected columns
    if 'sentiment' in merged_df.columns:
        columns_to_select.append('sentiment')

    # Select only the columns you need
    merged_df = merged_df[columns_to_select]

    # Check the first few rows of the merged dataset
    print("Merged Dataset:")
    print(merged_df.head())
else:
    print("No valid files to merge.")

Merged Dataset Columns: ['Issue Name', 'Category', 'Severity', 'Frequency', 'Sentiment', 'letter_text']
Merged Dataset:
                                         letter_text
0  Dear Council,\n\nI am writing to formally rais...
1  Dear Council,\n\nI am writing to formally rais...
2  Dear Council,\n\nI am writing to formally rais...
3  Dear Council,\n\nI am writing to formally rais...
4  Dear Council,\n\nI am writing to formally rais...


In [45]:
merged_df.describe()

Unnamed: 0,letter_text
count,97000
unique,97
top,"Dear Council,\n\nI am writing to formally rais..."
freq,1314


In [46]:

# Optionally: Save the merged dataset to a new CSV file
# merged_df.to_csv('merged_data.csv', index=False)