## ⛏️ REACT - Topic Modelling

Notebook that performs topic modelling with Latent Dirichlet Allocation (via MALLET) to extract the topics of each app.

#### Imports

In [None]:
# IMPORT
from   pandas.core.common   import flatten
from   tqdm                 import tqdm
import pandas               as pd
import numpy                as np
import subprocess
import ast
import os

In [None]:
# Register tqdm with pandas so that `progress_apply` (used in the cells below)
# displays a progress bar
tqdm.pandas()

In [None]:
# Mark the beginning of the notebook run in the output
print("⚡ START ⚡")

#### Parameters

In [None]:
# Folder holding the input and output files of this notebook
TMP_PATH = "../TMP"

# Ground-truth dataset (preprocessed) read by this notebook
INPUT_PATH = "../TMP/1a_ReactDataPreprocessed.csv"

# CSV file written at the end of this notebook
OUTPUT_PATH = "../TMP/1a_ReactTopics.csv"

# Make sure the folder exists and report whether it had to be created
if os.path.exists(TMP_PATH):
    print("📁✅ Folder already exists:", TMP_PATH)
else:
    os.makedirs(TMP_PATH)
    print("📁🆕 Folder created       :", TMP_PATH)

### 1. Load Data

In [None]:
# Load the preprocessed dataset from INPUT_PATH
appsDF = pd.read_csv(INPUT_PATH, index_col=False)

# Report how many rows were loaded
print("#️⃣   APPS: {}".format(len(appsDF.index)))

# Preview the first rows
appsDF.head(3)

### 2. Merge the features into a single column

In [None]:
# Feature columns whose cells are stored as text in the CSV and must be
# turned back into Python objects (presumably lists of tokens — verify
# against the preprocessing notebook)
COLUMNS = ['xmlValues', 'guiText', 'methodNamesText']

print("\n🔨 0. Loading data as lists")
for featureColumn in COLUMNS:
    # literal_eval only evaluates Python literals, it does not execute code
    parsedValues = appsDF[featureColumn].progress_apply(ast.literal_eval)
    appsDF[featureColumn] = parsedValues

def mergeColumns(sha256):
    """Collect the feature columns of the row(s) identified by ``sha256``.

    Reads the module-level ``appsDF`` and ``COLUMNS``.

    Args:
        sha256: value compared against the ``sha256`` column of ``appsDF``.

    Returns:
        A list with one pandas Series per entry of ``COLUMNS`` (in the same
        order), each holding that column's values for the matching rows.
    """
    # The row selection does not depend on the column, so the boolean mask
    # is built once instead of once per column.
    rowMask = appsDF['sha256'] == sha256
    return [appsDF.loc[rowMask, column] for column in COLUMNS]

# Step 1: gather the content of every feature column for each app, then
# drop the original columns, which are no longer needed
print("\n🔨 1. Merge Columns")
appsDF['features'] = appsDF['sha256'].progress_apply(mergeColumns)
appsDF = appsDF.drop(columns=COLUMNS)

# Step 2: turn the nested structure into one flat list per app
print("\n🔨 2. Flatten Columns")
appsDF['features'] = appsDF['features'].progress_apply(
    lambda nested: [*flatten(nested)]
)

# Step 3: join the items of each list into a single space-separated string
print("\n🔨 3. To string")
appsDF['features'] = appsDF['features'].progress_apply(
    lambda tokens: ' '.join(tokens)
)

In [None]:
# Preview the dataset after the feature columns were merged into 'features'
appsDF.head(5)

### 3. LDA Mallet

In [None]:
# PARAMETERS
# Number of topics of the LDA model (passed to --num-topics)
NUM_TOPICS = 50
# Maximum number of topics reported per document (passed to --doc-topics-max)
TOPICS_MAX = 4
# Minimum proportion for a topic to be reported (passed to --doc-topics-threshold)
TOPICS_TRESHOLD = 0.01

# Folder containing the `mallet` executable. The default is specific to one
# machine, so it can be overridden with the MALLET_PATH environment variable
# without editing the notebook.
MALLET_PATH          = os.environ.get("MALLET_PATH", "/home/marco/Mallet/bin")
# Folders for the files given to / produced by MALLET
MALLET_INPUT_FOLDER  = "./inputMallet"
MALLET_OUTPUT_FOLDER = "./outputMallet"
# NOTE(review): not referenced in the visible cells — confirm it is still needed
MALLET_OUTPUT_FILE   = "appsWithTopics.json"

# File names used inside the MALLET folders
binary_out_file  = "topics_inf.mallet"   # output of `mallet import-file`
inferencer_file  = "inferencer"          # --inferencer-filename
composition_file = "composition.txt"     # --output-doc-topics
keywords_file    = "keywords.txt"        # --output-topic-keys

In [None]:
 # Create output folder
# Create both MALLET working folders (output and input).
# exist_ok=True keeps the cell re-runnable and avoids the gap between
# checking for the folder and creating it.
for folder in (MALLET_OUTPUT_FOLDER, MALLET_INPUT_FOLDER):
    os.makedirs(folder, exist_ok=True)

#### Prepare the input

In [None]:
# Prepare the input
# One document per line. With its default line regex, `mallet import-file`
# takes the first token as the instance name, the SECOND token as the label
# and only the rest of the line as the document text. A placeholder label is
# therefore written between the hash and the features; without it the first
# feature word of every app would be consumed as the label.
PLACEHOLDER_LABEL = "X"
inputFile = os.path.join(MALLET_INPUT_FOLDER, "input.txt")

# Written explicitly as UTF-8 (and read as UTF-8 by MALLET below) so that the
# result does not depend on the platform's default encoding.
with open(inputFile, 'w', encoding='utf-8') as f:
    f.writelines(
        "{}\t{}\t{}\n".format(sha256, PLACEHOLDER_LABEL, features)
        for sha256, features in zip(appsDF["sha256"], appsDF["features"])
    )

# check_call returns 0 on success and raises CalledProcessError when MALLET
# fails, instead of letting the following cells run on a missing import file.
subprocess.check_call([os.path.join(MALLET_PATH, "mallet"),
                "import-file",
                "--input", inputFile,
                "--output", os.path.join(MALLET_INPUT_FOLDER, binary_out_file),
                "--encoding", "UTF-8",
                "--keep-sequence"])

#### Train LDA Model

In [None]:
# Topic Modelling
# Train the LDA model on the imported corpus. MALLET's console output
# (stdout and stderr) is redirected to 'train-topics-output.txt'.
# check_call raises CalledProcessError when MALLET exits with a non-zero
# status, so a failed training is not silently followed by cells reading a
# missing or stale composition file.
with open('train-topics-output.txt', 'w') as out_file:
    subprocess.check_call(["{}/mallet".format(MALLET_PATH),
                 "train-topics",
                 "--input",                 os.path.join(MALLET_INPUT_FOLDER, binary_out_file),
                 "--num-topics",            str(NUM_TOPICS),
                 "--output-topic-keys",     os.path.join(MALLET_OUTPUT_FOLDER, keywords_file),
                 "--num-top-words",         "200",
                 "--output-doc-topics",     os.path.join(MALLET_OUTPUT_FOLDER, composition_file),
                 "--optimize-interval",     "10",
                 "--doc-topics-max",        str(TOPICS_MAX),
                 "--doc-topics-threshold",  str(TOPICS_TRESHOLD),
                 "--inferencer-filename",   os.path.join(MALLET_OUTPUT_FOLDER, inferencer_file)],
                stdout=out_file, stderr=subprocess.STDOUT
    )

### 4. Save the composition file

In [None]:
# Prepare the column names: document id, app hash, then one
# (topic, probability) pair per reported topic. The number of pairs follows
# TOPICS_MAX (the value given to --doc-topics-max) instead of a hard-coded 4,
# so the two cannot drift apart.
columnsNames = ['id', 'sha256']
for i in range(1, TOPICS_MAX + 1):
    columnsNames.extend(['topic{}'.format(i), 'probability{}'.format(i)])

# Read the composition file written by MALLET. The first line is skipped and
# an 'extra' column is declared after the expected ones (presumably to absorb
# a trailing separator — verify against the actual file).
compositionDF = pd.read_csv(
    os.path.join(MALLET_OUTPUT_FOLDER, composition_file),
    sep='\t',
    skiprows=1,
    header=None,
    names=columnsNames + ['extra'],
)

# Drop the first ('id') and last ('extra') columns
compositionDF = compositionDF.iloc[:, 1:-1]

# Cast topics to int: missing and infinite values become -1 first, because
# NaN cannot be converted to an integer
compositionDF = compositionDF.fillna(-1)
compositionDF = compositionDF.replace([np.inf, -np.inf], -1)

for i in range(1, TOPICS_MAX + 1):
    compositionDF['topic{}'.format(i)] = compositionDF['topic{}'.format(i)].astype(int)

In [None]:
# Place the "classID" of each app right after the first column of compositionDF.
# NOTE(review): the Series is matched to compositionDF by index, so this relies
# on both DataFrames listing the apps in the same order — confirm.
compositionDF.insert(loc=1, column='classID', value=appsDF['classID'])

In [None]:
# Save the result to OUTPUT_PATH as CSV, without the DataFrame index,
# then preview the first rows of what was written
compositionDF.to_csv(OUTPUT_PATH,index=False)
compositionDF.head(3)

In [None]:
# Mark the end of the notebook run in the output
print("\n🔚 END \n")