## ⛏️ App Name - Embedding

Generate Feature Vectors starting from App Names using models from OpenAI.

#### Imports

In [None]:
# IMPORT
from   tqdm     import tqdm
import pandas   as pd
import numpy    as np
import os

In [None]:
# Register tqdm with pandas so that Series/DataFrame objects expose
# `progress_apply`, which the embedding step below relies on
tqdm.pandas()

In [None]:
# Start-of-run marker
print("⚡ START ⚡")

#### Parameters

In [None]:
# Input: preprocessed app-name dataset (ground truth)
INPUT_PATH  = "../TMP/4a_AppNameDataPreprocessed.csv"

# Output: CSV that will hold the embedding features
OUTPUT_PATH = "../TMP/4a_AppNameDataFeatures.csv"

# Working folder for intermediate files; created when it is missing
TMP_PATH = "../TMP"
if os.path.exists(TMP_PATH):
    print("📁✅ Folder already exists:", TMP_PATH)
else:
    os.makedirs(TMP_PATH)
    print("📁🆕 Folder created       :", TMP_PATH)

### 1. Load Data

In [None]:
# Read the preprocessed dataset; index_col=False keeps the first column
# as regular data instead of using it as the index
appsDF = pd.read_csv(INPUT_PATH, index_col=False)
print("#️⃣ Apps: {}".format(len(appsDF.index)))

# Preview the first rows (rendered as the cell output)
appsDF.head(3)

### 2. App Names Embedding using OpenAI Models

In [None]:
### API KEYS ###
# Load variables from a local .env file (python-dotenv), then read the
# keys from the environment; a key is None when its variable is not set
from dotenv import load_dotenv

load_dotenv()
OPENAI_API_KEY   = os.environ.get('OPENAI_API_KEY')
ANDROZOO_API_KEY = os.environ.get('ANDROZOO_API_KEY')

In [None]:
# Set the API key on the openai module so that later calls made through
# it use the key read from the environment above
import openai
openai.api_key = OPENAI_API_KEY

In [None]:
def getGptEmbedding(text, model="text-embedding-ada-002"):
    """Return the OpenAI embedding of a single text.

    Args:
        text: String to embed. Must be a ``str`` (``.replace`` is called on
            it); new-line characters are replaced with spaces before the
            request is sent.
        model: Name of the OpenAI embedding model to query. Defaults to
            "text-embedding-ada-002", the value that used to be hard-coded
            here, so existing single-argument callers are unaffected.

    Returns:
        The ``embedding`` entry of the first item in the ``data`` field of
        the API response.
    """
    # Remove new line chars
    cleanText = text.replace("\n", " ")

    # The input is sent as a one-element batch, so the only result sits
    # at index 0 of the response's 'data' list
    response = openai.Embedding.create(input=[cleanText], model=model)
    return response['data'][0]['embedding']

In [None]:
print("\n⛏️ GPT-Based Embedding")

# First pass: one embedding request per app name
appsDF['appNameFeatures'] = appsDF['appName'].progress_apply(getGptEmbedding)
# Second pass: store every embedding as a plain Python list
appsDF['appNameFeatures'] = appsDF['appNameFeatures'].progress_apply(list)

In [None]:
# Report the embedding size, measured on the row with index label 0
firstEmbedding = appsDF['appNameFeatures'][0]
print("📐 Len features: {}".format(len(firstEmbedding)))

### 3. Save

In [None]:
# Keep only the app hash, the class label and the embedding column,
# then write them to the output CSV without the index
appsDF = appsDF[['sha256', 'classID', 'appNameFeatures']]
appsDF.to_csv(OUTPUT_PATH, index=False)

# Preview what was saved (rendered as the cell output)
appsDF.head(3)

In [None]:
# End-of-run marker
print("\n🔚 END")