## ⛏️ YANG - TopicModelling

This notebook uses LDA (Latent Dirichlet Allocation) to extract topics from the app descriptions and labels each app with its most probable topic.

#### Imports

In [None]:
# IMPORT
from    sklearn.metrics                    import adjusted_rand_score
from    sklearn.feature_extraction.text    import CountVectorizer
from    tqdm                               import tqdm
import  pandas                             as pd
import  lda
import  os

In [None]:
# Register tqdm's pandas integration (adds the `progress_apply` family of
# methods to pandas objects).
tqdm.pandas()

In [None]:
# Banner marking the beginning of the notebook run.
print("⚡ START ⚡\n")

#### Parameters

In [None]:
# Pre-processed app descriptions (with ground-truth classes) read by this notebook
INPUT_PATH  = "../TMP/1e_YangPreprocessedDescriptions.csv"

# CSV that will receive the cluster label computed for every app
OUTPUT_PATH = "../1e_YangClusteringLabels.csv"

# Scratch folder: report whether it is already there, create it otherwise
TMP_PATH = "../TMP"
if os.path.exists(TMP_PATH):
    print("📁✅ Folder already exists:", TMP_PATH)
else:
    os.makedirs(TMP_PATH)
    print("📁🆕 Folder created       :", TMP_PATH)

In [None]:
# Seed passed to the LDA model (`random_state`) further down in this notebook.
RANDOM_SEED = 151836

### 1. Load Data

In [None]:
# Read the pre-processed descriptions; index_col=False keeps the first CSV
# column as regular data instead of using it as the index.
appsDF = pd.read_csv(INPUT_PATH, index_col=False)
print(f"#️⃣ Apps: {appsDF.shape[0]}")

# Preview the first rows
appsDF.head(3)

### 2. Vectorize using CountVectorizer

In [None]:
# Create Corpus: one pre-processed description per app, in DataFrame row order
corpus = appsDF['preprocessedDescription'].tolist()

# Vectorize: build the document-term count matrix (rows = apps, columns = terms).
# English stop words are dropped and accented characters are folded to ASCII.
vectorizer = CountVectorizer(stop_words='english', strip_accents='ascii', dtype='int32')
# NOTE: .toarray() turns the sparse result into a dense array, so memory grows
# with (number of apps x vocabulary size).
tfArray    = vectorizer.fit_transform(corpus).toarray()
# Terms of the fitted vocabulary, in the vectorizer's feature order
vocabolary = vectorizer.get_feature_names_out()

### 3. LDA 

In [None]:
def getARIscore(clusteringLabels):
    """Print the Adjusted Rand Index of a clustering against the ground truth.

    The ground truth is the `classID` column of the module-level `appsDF`,
    so this must be called while that column is still present.

    Args:
        clusteringLabels: one predicted cluster label per row of `appsDF`.

    Returns:
        None. The score is only printed, formatted with four decimals.
    """
    groundTruth = appsDF["classID"].values
    print(f"🎯 ARI: {adjusted_rand_score(groundTruth, clusteringLabels):.4f}")

In [None]:
# Parameters
# Number of topics the LDA model is fitted with. Each app is later labelled
# with the index of its most probable topic, so this also bounds the number
# of distinct cluster labels.
NUM_TOPICS  = 50

In [None]:
# Fit an LDA model with NUM_TOPICS topics on the document-term count matrix.
# The sampler runs for 100 iterations and is seeded with RANDOM_SEED.
model = lda.LDA(
    n_topics=NUM_TOPICS,
    n_iter=100,
    random_state=RANDOM_SEED,
)
model.fit(tfArray)

In [None]:
# Retrieve the labels: every app is assigned to its most probable topic.
# `doc_topic_` has one row per fitted document and one column per topic, so a
# single row-wise argmax replaces the per-row Python loop. Ties still go to the
# lowest topic index, and the elements remain NumPy integers.
docTopics = model.doc_topic_
clusteringLabels = list(docTopics.argmax(axis=1))

# Report how well the topic assignment matches the ground-truth classes
print("\n⭐ LDA")
getARIscore(clusteringLabels)

### 4. Save everything

In [None]:
# Reduce the DataFrame to the app identifier and attach the cluster labels.
# NOTE: this rebinds `appsDF` without its `classID` column, so getARIscore()
# can no longer be called after this cell without reloading the data.
appsDF = appsDF.loc[:, ['sha256']].assign(clusterID=clusteringLabels)

In [None]:
# Save the result: write the DataFrame to OUTPUT_PATH as CSV without the
# index column, then preview the first rows.
appsDF.to_csv(OUTPUT_PATH,index=False)
appsDF.head(5)

In [None]:
# Banner marking the end of the notebook run.
print("\n🔚 END \n")