# Cluster Target Fill Values

This notebook runs the syllabus text through TF-IDF vectorization and clusters the resulting document vectors with K-Means. Some documents have no value for the target variable "RATING". The number of clusters k is chosen as the value with the highest silhouette score among those for which every cluster contains at least one document with an assigned target value. A document without a target value is then assigned the average of the assigned values in its cluster.


### Input:  InstructorRatingsCSV.csv, TextFiles_Combo folder
### Output: targetFillDocs.csv

In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

In [2]:
# Step 1: Load data and preprocess
# csv_path is the ratings table; folder_path is the directory holding one
# "<ID>.txt" syllabus file per row (the files are read in a later step).
csv_path = "InstructorRatingsCSV.csv"
folder_path = "TextFiles_Combo"

# Load CSV
# Later steps read the ID, COURSENAME, INSTRUCTOR, LEVEL, LOCATION and RATING
# columns of this frame.
df = pd.read_csv(csv_path)

In [3]:
# Label encode categorical features
# Every listed column is overwritten in place with integer codes. The fitted
# encoder of each column is kept in label_encoders, keyed by column name
# (e.g. to map codes back to the original values with inverse_transform).
categorical_columns = ("COURSENAME", "INSTRUCTOR", "LEVEL", "LOCATION")
label_encoders = {name: LabelEncoder() for name in categorical_columns}
for name, encoder in label_encoders.items():
    df[name] = encoder.fit_transform(df[name])

In [4]:
# Load text documents
# Each row's ID names a "<ID>.txt" file inside folder_path. The texts are read
# in row order so that documents[i] belongs to row i of df.
documents = []
for file_name in df["ID"]:
    # Decode explicitly: without an encoding argument open() falls back to the
    # platform's locale default, so the same file can decode differently (or
    # fail) from one machine to the next. errors="replace" keeps a single stray
    # byte from aborting the whole load.
    # NOTE(review): assumes the syllabus files are UTF-8 -- confirm.
    with open(os.path.join(folder_path, file_name + ".txt"), 'r',
              encoding="utf-8", errors="replace") as file:
        documents.append(file.read())

df["Document"] = documents

In [5]:
# Step 2: TF-IDF vectorization
# Unigrams, bigrams and trigrams are used as features, with scikit-learn's
# built-in English stop-word list removed. No "top n-gram" selection happens
# here: the full sparse matrix is what gets clustered below.
vectorizer = TfidfVectorizer(ngram_range=(1, 3), stop_words='english')
# One row per document, in the same order as the rows of df.
tfidf_matrix = vectorizer.fit_transform(df["Document"])

# Show the non-zero entries of the sparse matrix.
print(tfidf_matrix)

  (0, 60985)	0.038234915894566554
  (0, 18766)	0.1360222878319367
  (0, 64969)	0.011342809294227338
  (0, 22382)	0.012443112229009368
  (0, 17236)	0.01675614143591706
  (0, 15882)	0.016829831729980534
  (0, 12748)	0.0150782292387542
  (0, 65998)	0.05428162525951512
  (0, 25234)	0.0028794757230244396
  (0, 66101)	0.0028794757230244396
  (0, 62341)	0.00558538047863902
  (0, 69041)	0.012365662530176064
  (0, 20217)	0.06403123334178575
  (0, 988)	0.07140368625866389
  (0, 1067)	0.16188304119952796
  (0, 70066)	0.028357023235568345
  (0, 35925)	0.0433415553578456
  (0, 59748)	0.02200330752405097
  (0, 29739)	0.0029240378746932874
  (0, 73195)	0.007944373329833469
  (0, 2416)	0.0029694177259913523
  (0, 39908)	0.04564995092185721
  (0, 59288)	0.0315398268522168
  (0, 93)	0.03746930039760649
  (0, 160)	0.01261593074088672
  :	:
  (64, 8797)	0.018784038677380557
  (64, 1606)	0.018784038677380557
  (64, 26305)	0.018784038677380557
  (64, 27694)	0.018784038677380557
  (64, 3130)	0.01878403867738

In [6]:
# Step 3: K-Means clustering
# silhouette_scores[i] will hold the score recorded for k_values[i].
silhouette_scores = []
# Candidate cluster counts: k = 2 through 10 inclusive.
k_values = range(2, 11)

# Ensure every cluster has at least one labeled document
def ensure_clusters_have_labels(cluster_labels, df):
    """Return True when every cluster id has at least one rated document.

    The distinct ids are taken from ``cluster_labels``; cluster membership and
    ratings are read from the ``Cluster`` and ``RATING`` columns of ``df``, so
    ``df`` must already carry the assignment being checked. An empty
    ``cluster_labels`` yields True.
    """
    return all(
        ((df['Cluster'] == cluster_id) & df['RATING'].notna()).any()
        for cluster_id in set(cluster_labels)
    )

for k in k_values:
    # Fit K-Means for this candidate k and record the assignment on df, which
    # is where ensure_clusters_have_labels reads cluster membership from.
    kmeans = KMeans(n_clusters=k, random_state=42)
    cluster_labels = kmeans.fit_predict(tfidf_matrix)
    df['Cluster'] = cluster_labels

    # Score only clusterings in which every cluster has a rated document;
    # otherwise record -1 (the lowest possible silhouette value) as a penalty.
    every_cluster_rated = ensure_clusters_have_labels(cluster_labels, df)
    score = silhouette_score(tfidf_matrix, cluster_labels) if every_cluster_rated else -1
    silhouette_scores.append(score)

# Find optimal k ensuring valid clustering
# np.argmax returns the first maximum, so ties go to the smaller k.
# NOTE: if no k passed the check, every score is -1 and k = 2 is selected.
best_position = int(np.argmax(silhouette_scores))
optimal_k = k_values[best_position]

# Refit with the chosen k so that kmeans, cluster_labels and df['Cluster']
# describe the selected clustering rather than the last candidate tried.
kmeans = KMeans(n_clusters=optimal_k, random_state=42)
cluster_labels = kmeans.fit_predict(tfidf_matrix)
df['Cluster'] = cluster_labels

print(df)

                           ID  COURSENAME  INSTRUCTOR  LEVEL  LOCATION  \
0          CSCI-1100-003-Haas          42           9      1         0   
1          CSCI-1100-901-Haas          42           9      1         1   
2       CSCI-1120-901-Hendrix           0          11      1         1   
3    CSCI-1200-001-Desjardins           2           5      1         0   
4        CSCI-1210-001-Ramsey          16          16      1         0   
..                        ...         ...         ...    ...       ...   
60  CSCI-5607-001-Bajracharya          19           0      0         1   
61    CSCI-5757-001-Battleson          20           1      0         0   
62      CSCI-5927-201-Rezwana          18          17      0         0   
63  CSCI-5957-002-Bajracharya          39           0      0         0   
64      CSCI-5989-001-Bennett          22           2      0         1   

    RATING                                           Document  Cluster  
0     40.0  On-site Course Syllabus DE

In [7]:
# Final check and reassignment for unlabeled clusters
# If the selected clustering still has a cluster without any rated document,
# refit K-Means with a different seed until every cluster has one.
#
# The rows of df are deliberately left in their original order: the rows of
# tfidf_matrix (and therefore cluster_labels) are in that order, and
# df['Cluster'] = cluster_labels assigns by position, so shuffling df would
# attach each label to the wrong document.
max_refits = 20  # bound the search so an unsatisfiable k cannot loop forever
refit_count = 0
while not ensure_clusters_have_labels(cluster_labels, df):
    refit_count += 1
    if refit_count > max_refits:
        raise RuntimeError(
            f"No K-Means fit with k={optimal_k} gave every cluster a rated "
            f"document after {max_refits} reseeded attempts."
        )
    print("Recomputing clusters to ensure each cluster has at least one labeled document.")
    # Refitting with the same seed reproduces the same clusters, so the seed
    # has to change for a retry to have any chance of a different outcome.
    kmeans = KMeans(n_clusters=optimal_k, random_state=42 + refit_count)
    cluster_labels = kmeans.fit_predict(tfidf_matrix)
    df['Cluster'] = cluster_labels

print(optimal_k)

5


In [8]:
# Compute averages for missing ratings
# For every row, look up the mean RATING of its cluster (the mean ignores the
# missing values), then give that mean only to the rows whose RATING is missing.
cluster_mean_rating = df.groupby('Cluster')['RATING'].transform('mean')
df['RATING'] = df['RATING'].fillna(cluster_mean_rating)

In [9]:
print(df)

                           ID  COURSENAME  INSTRUCTOR  LEVEL  LOCATION  \
0          CSCI-1100-003-Haas          42           9      1         0   
1          CSCI-1100-901-Haas          42           9      1         1   
2       CSCI-1120-901-Hendrix           0          11      1         1   
3    CSCI-1200-001-Desjardins           2           5      1         0   
4        CSCI-1210-001-Ramsey          16          16      1         0   
..                        ...         ...         ...    ...       ...   
60  CSCI-5607-001-Bajracharya          19           0      0         1   
61    CSCI-5757-001-Battleson          20           1      0         0   
62      CSCI-5927-201-Rezwana          18          17      0         0   
63  CSCI-5957-002-Bajracharya          39           0      0         0   
64      CSCI-5989-001-Bennett          22           2      0         1   

       RATING                                           Document  Cluster  
0   40.000000  On-site Course Sylla

In [11]:
# The raw syllabus text was only needed for vectorization, so drop it before
# saving.
df = df.drop(columns=["Document"])

# Write the remaining columns (including the filled RATING and the Cluster id)
# without the DataFrame index.
df.to_csv("targetFillDocs.csv", index=False)