## ⛏️ Chabada - Topic Modelling


Notebook for performing topic modeling using Latent Dirichlet Allocation (MALLET) to extract the topics of each app.

#### Imports

In [85]:
# IMPORT
from   tqdm     import tqdm
import pandas   as pd
import numpy    as np
import os
import subprocess

In [86]:
# Initialize TQDM library for Pandas.
# tqdm.pandas() registers tqdm's progress-bar variants (e.g. progress_apply) on pandas objects.
# NOTE(review): none of the cells visible in this notebook call those helpers — confirm it is still needed.
tqdm.pandas()

In [87]:
# Banner marking the beginning of the notebook run in the cell output
print("⚡ START ⚡")

⚡ START ⚡


#### Parameters

In [88]:
# Folder holding the intermediate files exchanged between notebooks
TMP_PATH = "../TMP"

# Input: CSV with the preprocessed app descriptions
INPUT_PATH  = "../TMP/1b_ChabadaPreprocessedDescriptions.csv"

# Output: CSV that will receive the topics computed for each app
OUTPUT_PATH = "../TMP/1b_ChabadaTopics.csv"

# Make sure the temporary folder is available and report which case applied
if os.path.exists(TMP_PATH):
    print("📁✅ Folder already exists:", TMP_PATH)
else:
    os.makedirs(TMP_PATH)
    print("📁🆕 Folder created       :", TMP_PATH)

📁✅ Folder already exists: ../TMP


### 1. Load Data

In [89]:
# Read the preprocessed descriptions; index_col=False keeps the first CSV column as data
appsDF = pd.read_csv(INPUT_PATH, index_col=False)

# Report how many apps (rows) were loaded
print(f"#️⃣ Apps: {len(appsDF)}")

# Preview the first rows
appsDF.head(n=3)

#️⃣ Apps: 50


Unnamed: 0,sha256,pkgName,classID,googlePlayCategoryID,googlePlayDescription,preprocessedDescription
0,9B30837BD2474AC3623A43D052F7ADC4C63E4AA9981F0F...,my.android.calc,Calculator,TOOLS,Handiness universal percentage calculator for ...,handi univers percentag calcul simpl engin cal...
1,686DE8D8A0D08992CB135BC7A0500D0109D9697A1140B8...,com.vpn.basiccalculator,Calculator,TOOLS,CITIZEN CALCULATOR by ANGEL NX is best Mobile ...,citizen calcul angel best mobil app world mill...
2,A49864DCC90F6730569455BDFA39B4B7CF70AE0C34D656...,com.ba.fractioncalculator,Calculator,EDUCATION,"<b>Free offline fraction calculator</b> ✌, sup...",free offlin fraction calcul support also decim...


### 2. LDA Mallet

In [90]:
# --- LDA hyper-parameters (forwarded to "mallet train-topics" in the training cell) ---
NUM_TOPICS      = 50      # passed as --num-topics
TOPICS_MAX      = 4       # passed as --doc-topics-max
TOPICS_TRESHOLD = 0.05    # passed as --doc-topics-threshold (name keeps its original spelling: other cells reference it)

# --- MALLET installation and working folders ---
MALLET_PATH          = "/app/Mallet/bin"
MALLET_INPUT_FOLDER  = "./inputMallet"
MALLET_OUTPUT_FOLDER = "./outputMallet"
# NOTE(review): not referenced by any cell visible in this notebook — confirm whether it is still needed
MALLET_OUTPUT_FILE   = "appsWithTopics.json"

# --- File names used inside the MALLET folders ---
binary_out_file  = "topics_inf.mallet"   # result of import-file, consumed by train-topics
inferencer_file  = "inferencer"          # target of --inferencer-filename
composition_file = "composition.txt"     # target of --output-doc-topics
keywords_file    = "keywords.txt"        # target of --output-topic-keys

In [91]:
 # Create the MALLET output and input folders.
# exist_ok=True makes the call a no-op when the folder is already there, which avoids the
# check-then-create race of "if not os.path.exists(...): os.makedirs(...)" and keeps the
# cell safe to re-run.
for folder in (MALLET_OUTPUT_FOLDER, MALLET_INPUT_FOLDER):
    os.makedirs(folder, exist_ok=True)

#### Prepare the input

In [92]:
# Prepare the input: one app per line, in MALLET's "<name> <label> <text>" layout.
mallet_input_txt = os.path.join(MALLET_INPUT_FOLDER, "input.txt")

# MALLET's default import-file format treats the FIRST token of a line as the instance name,
# the SECOND token as the label and only the remainder as document text. Writing just
# "<sha256>\t<description>" therefore makes MALLET consume the first word of every
# description as a label. A constant placeholder label keeps the whole description as text.
PLACEHOLDER_LABEL = "_"

doc_names = appsDF["sha256"].astype(str)
# A missing description is read by pandas as NaN (a float) and would raise TypeError on
# string concatenation; write it as empty text instead so the row is still emitted.
# NOTE(review): confirm how MALLET reports documents with no tokens in the composition file.
doc_texts = appsDF["preprocessedDescription"].fillna("").astype(str)

# Explicit encoding so the file content does not depend on the platform locale
with open(mallet_input_txt, 'w', encoding="utf-8") as f:
    for doc_name, doc_text in zip(doc_names, doc_texts):
        f.write(doc_name + "\t" + PLACEHOLDER_LABEL + "\t" + doc_text + "\n")

# check_call raises CalledProcessError when MALLET exits with a non-zero status, so a failed
# import stops the notebook here; on success it returns 0, which is shown as the cell output.
subprocess.check_call(["{}/mallet".format(MALLET_PATH),
                       "import-file",
                       "--input", mallet_input_txt,
                       "--output", os.path.join(MALLET_INPUT_FOLDER, binary_out_file),
                       "--encoding", "UTF-8",   # must match the encoding used to write input.txt
                       "--keep-sequence"])

0

#### Train LDA Model

In [93]:
# Topic Modelling: train the LDA model with MALLET and write the topic keywords,
# the per-document topic composition and the inferencer into MALLET_OUTPUT_FOLDER.

# Fixed seed for MALLET's sampler so that re-running the notebook yields the same topics
# (without --random-seed MALLET seeds itself from the clock).
MALLET_RANDOM_SEED = 42

train_topics_cmd = [
    "{}/mallet".format(MALLET_PATH),
    "train-topics",
    "--input",                 os.path.join(MALLET_INPUT_FOLDER, binary_out_file),
    "--num-topics",            str(NUM_TOPICS),
    "--output-topic-keys",     os.path.join(MALLET_OUTPUT_FOLDER, keywords_file),
    "--num-top-words",         "200",
    "--output-doc-topics",     os.path.join(MALLET_OUTPUT_FOLDER, composition_file),
    "--optimize-interval",     "10",
    "--doc-topics-max",        str(TOPICS_MAX),
    "--doc-topics-threshold",  str(TOPICS_TRESHOLD),
    "--inferencer-filename",   os.path.join(MALLET_OUTPUT_FOLDER, inferencer_file),
    "--random-seed",           str(MALLET_RANDOM_SEED),
]

# MALLET's console output (stdout and stderr) is captured in a log file next to the notebook.
with open('train-topics-output.txt', 'w') as out_file:
    # check_call raises CalledProcessError on a non-zero exit status. With plain call() a
    # failed training run went unnoticed and the next cell could read a stale composition
    # file left over from a previous run.
    subprocess.check_call(train_topics_cmd, stdout=out_file, stderr=subprocess.STDOUT)

### 3. Load and Save the Composition File

In [94]:
# Prepare the column names: one (topic, probability) pair per topic slot.
# The number of slots follows TOPICS_MAX (the value given to --doc-topics-max) instead of a
# hard-coded 4, so changing the parameter keeps this cell consistent with the training cell.
topicSlots = range(1, TOPICS_MAX + 1)

columnsNames = ['id', 'sha256']
for i in topicSlots:
    columnsNames.extend(['topic{}'.format(i), 'probability{}'.format(i)])
# Catch-all for the column produced after the last pair; it is dropped right below
columnsNames.append('extra')

# Load the composition file written by MALLET (tab-separated, first line skipped)
compositionDF = pd.read_csv(os.path.join(MALLET_OUTPUT_FOLDER, composition_file),
                            names=columnsNames, sep='\t', skiprows=1, header=None)
compositionDF = compositionDF.drop(['id', 'extra'], axis=1)

# Slots that MALLET left empty are read as NaN: mark them (and any infinite value) with -1
compositionDF = compositionDF.fillna(-1)
compositionDF = compositionDF.replace([np.inf, -np.inf], -1)

# Cast topics to int (they are parsed as float whenever a column contained NaN)
compositionDF = compositionDF.astype({'topic{}'.format(i): int for i in topicSlots})

In [95]:
# Insert the "classID" column as the second column in compositionDF.
# The class is looked up by sha256 rather than by row position: positional alignment silently
# assigns wrong classes if MALLET skips or reorders a document.
classBySha = appsDF.drop_duplicates(subset='sha256').set_index('sha256')['classID']

# DataFrame.insert raises if the column already exists; dropping it first keeps the cell re-runnable
if 'classID' in compositionDF.columns:
    compositionDF = compositionDF.drop(columns='classID')

compositionDF.insert(1, 'classID', compositionDF['sha256'].map(classBySha))

In [96]:
# Persist the per-app topics as CSV (the DataFrame index is not written)
compositionDF.to_csv(path_or_buf=OUTPUT_PATH, index=False)

# Preview the first rows of what was saved
compositionDF.head(n=3)

Unnamed: 0,sha256,classID,topic1,probability1,topic2,probability2,topic3,probability3,topic4,probability4
0,9B30837BD2474AC3623A43D052F7ADC4C63E4AA9981F0F...,Calculator,1,0.794802,4,0.203712,-1,-1.0,-1,-1.0
1,686DE8D8A0D08992CB135BC7A0500D0109D9697A1140B8...,Calculator,1,0.930943,2,0.06799,-1,-1.0,-1,-1.0
2,A49864DCC90F6730569455BDFA39B4B7CF70AE0C34D656...,Calculator,1,0.996025,-1,-1.0,-1,-1.0,-1,-1.0


In [97]:
# Banner marking the end of the notebook run in the cell output
print("\n🔚 END \n")


🔚 END 

