In [None]:
import Pkg
Pkg.add(["DataFrames", "CSV", "StatsBase", "Clustering", "TextAnalysis", "Plots", "Distances", "UMAP", "TSVD"])

In [14]:
import Pkg
Pkg.add(["StatFiles", "DataFrames", "FileIO"])


[32m[1m   Resolving[22m[39m package versions...


[32m[1m    Updating[22m[39m `~/.julia/environments/v1.9/Project.toml`
  [90m[5789e2e9] [39m[92m+ FileIO v1.16.1[39m
[32m[1m  No Changes[22m[39m to `~/.julia/environments/v1.9/Manifest.toml`


In [15]:
using DataFrames, CSV, StatsBase, Clustering, TextAnalysis, Plots, Distances, UMAP, TSVD

function elbow_method(inertias)
    diffs = diff(inertias)
    second_diffs = diff(diffs)
    return argmax(second_diffs) + 1  # Index where the second derivative is highest
end

function determine_optimal_clusters(X, max_k=20)
    println("Determining optimal number of clusters...")

    inertias = Float64[]
    silhouette_scores = Float64[]
    ch_scores = Float64[]

    # Reduce dimensionality
    svd = TSVD.tsvd(X, 100)
    X_reduced = svd.U * Diagonal(svd.S)

    for k in 2:max_k
        println("Testing k=", k)
        kmeans_result = kmeans(X', k; maxiter=300, display=:none)
        push!(inertias, sum(kmeans_result.costs))
        push!(silhouette_scores, mean(silhouettes(X_reduced, kmeans_result.assignments, Euclidean())))
        push!(ch_scores, calinski_harabasz_score(X_reduced, kmeans_result.assignments))
    end

    k_elbow = elbow_method(inertias) + 1  # Use elbow method replacement
    k_silhouette = argmax(silhouette_scores) + 1
    k_ch = argmax(ch_scores) + 1

    println("\nOptimal number of clusters:")
    println("Elbow method: ", k_elbow)
    println("Silhouette score: ", k_silhouette)
    println("Calinski-Harabasz score: ", k_ch)

    return mode([k_elbow, k_silhouette, k_ch])
end

using DataFrames, FileIO, StatFiles

function load_and_preprocess(file_path)
    println("Loading and preprocessing data...")

    # Read Stata (.dta) file correctly
    df = DataFrame(load(file_path))  # Use StatFiles for compatibility

    # Define columns of interest
    cols = ["MeasureDescription_EN", "CauseDescription_EN"]
    for col in cols
        df[!, col] .= coalesce.(df[!, col], "")  # Replace missing values with empty string
    end

    # Filter rows where all columns are empty
    df[!, "all_empty"] = [all(strip(cell) == "" for cell in row) for row in eachrow(df[:, cols])]
    df_filtered = filter(row -> !row.all_empty, df)

    # Combine text fields
    df_filtered[!, "text_combined"] = [join(filter(!isempty, strip.(row)), " ") for row in eachrow(df_filtered[:, cols])]

    println("Processed ", size(df_filtered, 1), " valid records out of ", size(df, 1), " total records")
    return df_filtered
end



# Vectorize text
function vectorize_text(df, max_features=1000)
    println("Vectorizing text...")
    
    vectorizer = TFIDF()
    corpus = StringDocument.(df.text_combined)
    for doc in corpus
        prepare!(doc, strip=true, lowercase=true, stopwords=true, stem=false)
    end
    
    fit!(vectorizer, corpus)
    X = transform(vectorizer, corpus)
    
    println("Created ", size(X, 2), " features from text")
    return X, vectorizer
end

# Perform clustering
function perform_clustering(X, optimal_k, vectorizer)
    println("Performing clustering with k=", optimal_k, "...")
    kmeans_result = kmeans(X', optimal_k; maxiter=300, display=:none)
    labels = kmeans_result.assignments
    
    println("\nTop terms per cluster:")
    terms = vectorizer.vocab
    for i in 1:optimal_k
        top_indices = sortperm(kmeans_result.centers[i, :], rev=true)[1:10]
        println("\nCluster ", i, ":")
        println(join(terms[top_indices], ", "))
    end
    
    return labels
end

# Visualize clusters
function visualize_clusters(X, labels)
    println("\nCreating cluster visualization...")
    reducer = UMAPReducer()
    X_reduced = reduce(reducer, X)
    
    scatter(X_reduced[:, 1], X_reduced[:, 2], group=labels, xlabel="UMAP1", ylabel="UMAP2", legend=false, title="UMAP visualization of clusters")
end

# Main execution
file_path = "/export/projects1/rsadun_bmw/03 Workplace/Clean Data/Production/Translation/Maintenance_Full_Step2.dta"
data = read(file_path, String)  # Read as raw text
clean_data = transcode(String, data)  # Ensure UTF-8 encoding
df = DataFrame(CSV.File(IOBuffer(clean_data), normalizenames=true))
X, vectorizer = vectorize_text(df)
optimal_k = determine_optimal_clusters(X)
labels = perform_clustering(X, optimal_k, vectorizer)
visualize_clusters(X, labels)

df.cluster = labels
output_path = "/export/projects1/rsadun_bmw/03 Workplace/Clean Data/Production/Translation/Maintenance_Full_Translated_Clustered_Optimal.csv"
CSV.write(output_path, df)
println("\nResults saved to: ", output_path)

ErrorException: Invalid UTF-8 string