## ⛏️ Ebrahimi - Embedding

Use a pre-trained GloVe model to embed the apps' Google Play descriptions into feature vectors.

#### Imports

In [None]:
# IMPORT
from  gensim.models                  import KeyedVectors
from  gensim.scripts.glove2word2vec  import glove2word2vec
from   tqdm                          import tqdm
import pandas   as pd
import os

import Ebrahimi

In [None]:
# Initialize TQDM library for Pandas
# NOTE(review): per the tqdm docs this registers the `progress_apply`-style
# methods on pandas objects. No such method is called in this visible section,
# so presumably it is needed by code inside the `Ebrahimi` module — verify.
tqdm.pandas()

In [None]:
# Mark the beginning of the run in the output
print("⚡ START ⚡")

#### Parameters

In [None]:
# Directory used to temporarily store APK files.
# NOTE(review): not referenced anywhere else in this visible section, and it
# climbs three levels ("../../../") while INPUT_PATH climbs four — confirm
# which depth is correct.
APK_PATH    = "../../../0_Data/APKS/"

# Ground-truth dataset (CSV) to load. The full-dataset path is kept commented
# out; the active path points at the "MiniTEST" file.
#INPUT_PATH  = "../../../../0_Data/CSV/0_AndroCatSet.csv"
INPUT_PATH  = "../../../../0_Data/CSV/1_AndroCatSet_MiniTEST.csv"

# CSV file the extracted features are written to at the end of the notebook.
OUTPUT_PATH = "../TMP/1c_EbrahimiFeatures.csv"

In [None]:
# Create the folder that will receive the output CSV.
# The creation is attempted directly and the "already there" case is handled
# afterwards (EAFP), so there is no window between an existence check and the
# actual creation in which another process could create the folder.
TMP_PATH = "../TMP"
try:
    os.makedirs(TMP_PATH)
except FileExistsError:
    # Raised when the path already exists; nothing to do.
    print("Folder already exists:", TMP_PATH)
else:
    print("Folder created:", TMP_PATH)

### 1. Load Data

In [None]:
# Read the ground-truth CSV; index_col=False tells pandas not to use the
# first column as the index.
appsDF = pd.read_csv(INPUT_PATH, index_col=False)

# Report how many apps (rows) were loaded
print("#️⃣ Apps: {}".format(len(appsDF)))

# Preview the first rows
appsDF.head(3)

In [None]:
# Keep only sha256, classID and the Google Play description, and rename the
# description column to 'Description'
appsDF = (
    appsDF
    .loc[:, ['sha256', 'classID', 'googlePlayDescription']]
    .rename(columns={'googlePlayDescription': 'Description'})
)

### 2. Vectorize Descriptions using GloVe Model.

In [None]:
# Convert the GloVe text vectors to word2vec text format, then load the
# converted file as KeyedVectors
glove2word2vec(
    glove_input_file='glove.6B.300d.txt',
    word2vec_output_file="gloveW2V.txt",
)
glove300 = KeyedVectors.load_word2vec_format("gloveW2V.txt", binary=False)

In [None]:
# Preprocess Descriptions
# NOTE(review): the return value is not assigned and the next cell reads
# `appsDF.lemm`, so this presumably adds a `lemm` column to appsDF in place —
# confirm against the Ebrahimi module.
Ebrahimi.preprocessing(appsDF)

In [None]:
# Vectorize using GloVe: embed each entry of the `lemm` column and store the
# result as a plain Python list (one list per app)
appsDF['gloveFeatures'] = [
    Ebrahimi.getEmbeddingFeatures(glove300, sen_group).tolist()
    for sen_group in appsDF.lemm
]

### 3. Save everything

In [None]:
# Keep only the sha256, classID and gloveFeatures columns
kept_columns = ['sha256', 'classID', 'gloveFeatures']
appsDF = appsDF.loc[:, kept_columns]

# Write the result to OUTPUT_PATH without the DataFrame index, then preview it
appsDF.to_csv(OUTPUT_PATH, index=False)
appsDF.head(3)

In [None]:
# Mark the end of the run in the output
print("\n🔚 END \n")