In [16]:
import pandas as pd
import os
import warnings
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType
# Suppress every warning for the rest of the session: no category, module or
# message filter is passed, so this is a blanket ignore rather than a targeted one.
# NOTE(review): this also hides deprecation and convergence warnings raised by
# the libraries imported above - consider narrowing it to the categories intended.
warnings.filterwarnings('ignore')
 

In [2]:
# Request a cap of 3 OpenMP threads through the OMP_NUM_THREADS environment variable.
# NOTE(review): this assignment runs after the scikit-learn imports in the first
# cell; OpenMP runtimes commonly read this variable when they are first loaded,
# so the cap may not take effect here - confirm, or set it before those imports.
os.environ["OMP_NUM_THREADS"] = "3"

In [3]:
# Folder that is scanned for the Excel workbooks to cluster.
# NOTE(review): absolute, user-specific Windows path - the notebook only runs on
# a machine where this folder exists; consider making it configurable.
excel_dir = r'C:\Users\hparnell\Desktop\MH10010\Resources'

In [4]:
# Parallel accumulators for the loading step: data_frames collects the parsed
# workbooks and file_names collects the matching file names, index for index.
data_frames, file_names = [], []
 

In [5]:
# Load every Excel workbook found directly inside excel_dir.
# - Names are processed in sorted order because os.listdir() returns entries in
#   arbitrary order; a stable order keeps file_names (and everything derived
#   from row positions downstream) reproducible between runs and machines.
# - Extension matching is case-insensitive so e.g. 'REPORT.XLSX' is not skipped.
# - '~$...' entries are the lock files Excel creates next to an open workbook;
#   they carry an Excel extension but are not readable workbooks, so skip them.
# pd.read_excel is called with its defaults, so only the first sheet is read.
# data_frames[i] always corresponds to file_names[i].
for file in sorted(os.listdir(excel_dir)):
    if file.startswith('~$'):
        continue
    if file.lower().endswith(('.xlsx', '.xls')):
        df = pd.read_excel(os.path.join(excel_dir, file))
        data_frames.append(df)
        file_names.append(file)

In [6]:
# Flatten each DataFrame into one space-separated string (one "document" per
# workbook) so it can be passed to a text vectorizer.
text_data = []
for df in data_frames:
    # Stringify every cell and lay the cells out row by row in a 1-D array.
    cell_text = df.astype(str).to_numpy().ravel()
    # Missing cells stringify to the literal "nan"; left in, that token appears
    # in every sparse sheet and makes unrelated workbooks look alike. Keep only
    # the positions that held a real value in the original frame.
    has_value = df.notna().to_numpy().ravel()
    text = ' '.join(cell_text[has_value])
    text_data.append(text)

In [7]:
# Preprocessing and vectorization: fit a TF-IDF model on text_data and build a
# matrix X with one row per entry of text_data. English stop words are removed
# by the vectorizer (stop_words='english').
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(text_data)
 

In [8]:
# Pairwise cosine similarity between all rows of X; the result is a square
# matrix with one row and one column per document.
similarity_matrix = cosine_similarity(X)

In [9]:
# Perform K-means clustering on the similarity matrix: each document is
# represented by its row of similarities to every other document.
# K-means cannot form more clusters than there are samples (scikit-learn raises
# ValueError in that case), so the requested count is capped at the number of
# documents. With 10 or more documents this is exactly the original behaviour.
max_clusters = 10  # Adjust based on your needs
num_clusters = min(max_clusters, similarity_matrix.shape[0])
# random_state is fixed so repeated runs on the same input give the same labels.
kmeans = KMeans(n_clusters=num_clusters, random_state=0).fit(similarity_matrix)

In [10]:
# Cluster index assigned by the fitted model to each row of similarity_matrix,
# in row order.
labels = kmeans.labels_

In [11]:
# Map every cluster id (0 .. num_clusters - 1) to its own, initially empty,
# list of member file names.
subsets = {}
for cluster_id in range(num_clusters):
    subsets[cluster_id] = []

In [12]:
# Walk the labels in order and file each workbook name under its cluster;
# the label at a given position belongs to the file name at that same position.
for position, cluster_label in enumerate(labels):
    member = file_names[position]
    subsets[cluster_label].append(member)

In [13]:
# Destination folder for the per-cluster CSV listings. It is created (together
# with any missing parent folders) if absent; exist_ok=True makes the call a
# no-op when the folder already exists.
# NOTE(review): absolute, user-specific Windows path - consider making it configurable.
output_dir = r'C:\Users\hparnell\Desktop\MH10010\Clustered_Files'
os.makedirs(output_dir, exist_ok=True)

In [14]:
# Write one CSV per cluster into output_dir. Each file has a single
# "File Name" column listing the workbooks assigned to that cluster, and the
# destination path is echoed after every write.
for cluster, files in subsets.items():
    file_path = os.path.join(output_dir, f'cluster_{cluster}_files.csv')
    pd.DataFrame({"File Name": files}).to_csv(file_path, index=False)
    print(f"Cluster {cluster} DataFrame saved to {file_path}")

Cluster 0 DataFrame saved to C:\Users\hparnell\Desktop\MH10010\Clustered_Files\cluster_0_files.csv
Cluster 1 DataFrame saved to C:\Users\hparnell\Desktop\MH10010\Clustered_Files\cluster_1_files.csv
Cluster 2 DataFrame saved to C:\Users\hparnell\Desktop\MH10010\Clustered_Files\cluster_2_files.csv
Cluster 3 DataFrame saved to C:\Users\hparnell\Desktop\MH10010\Clustered_Files\cluster_3_files.csv
Cluster 4 DataFrame saved to C:\Users\hparnell\Desktop\MH10010\Clustered_Files\cluster_4_files.csv
Cluster 5 DataFrame saved to C:\Users\hparnell\Desktop\MH10010\Clustered_Files\cluster_5_files.csv
Cluster 6 DataFrame saved to C:\Users\hparnell\Desktop\MH10010\Clustered_Files\cluster_6_files.csv
Cluster 7 DataFrame saved to C:\Users\hparnell\Desktop\MH10010\Clustered_Files\cluster_7_files.csv
Cluster 8 DataFrame saved to C:\Users\hparnell\Desktop\MH10010\Clustered_Files\cluster_8_files.csv
Cluster 9 DataFrame saved to C:\Users\hparnell\Desktop\MH10010\Clustered_Files\cluster_9_files.csv


In [None]:
# Show combined DataFrames for each cluster
# NOTE(review): combined_dfs is not assigned in any cell visible above, and this
# cell has never been executed (its prompt is "In [None]"); on a fresh kernel it
# would raise NameError unless a cell that builds it exists elsewhere - confirm.
# NOTE(review): .show() is presumably the PySpark DataFrame method (pandas
# DataFrames have no .show()) - verify that combined_dfs holds Spark DataFrames.
for cluster_id, df in combined_dfs.items():
    print(f"Cluster {cluster_id} Combined DataFrame:")
    df.show()