## ⛏️ RQ2 New Approach - Embedding

This notebook can be used to embed app descriptions using OpenAI models.

#### Imports

In [None]:
# IMPORT
import os

from   tqdm     import tqdm
import pandas   as pd
import numpy    as np

In [None]:
# Register tqdm with pandas so that Series/DataFrame objects expose
# `progress_apply` (used later to show a progress bar while embedding).
tqdm.pandas()

In [None]:
# Visual marker in the output: the notebook run begins here.
print("⚡ START ⚡")

#### Parameters

In [None]:
# Ground-Truth Dataset (input CSV read in the "Load Data" section)
INPUT_PATH  = "../TMP/2a_GcataPreprocessedDescriptions.csv"

# Output Path (CSV written in the "Save everything" section)
OUTPUT_PATH = "../TMP/2a_GcataFeatures.csv"

# Working folder that contains both files above; make sure it exists.
# NOTE: `os` must already be imported when this cell runs (imports cell at the
# top of the notebook), otherwise a fresh "Restart & Run All" stops here.
TMP_PATH = "../TMP"
if os.path.isdir(TMP_PATH):
    print("📁✅ Folder already exists:", TMP_PATH)
else:
    # exist_ok=True: no failure if the folder appears between the check and the call.
    os.makedirs(TMP_PATH, exist_ok=True)
    print("📁🆕 Folder created       :", TMP_PATH)

### 1. Load Data

In [None]:
# Read the input CSV; index_col=False keeps the first column as data instead of
# using it as the index.
appsDF = pd.read_csv(INPUT_PATH, index_col=False)
print("#️⃣ Apps: {}".format(appsDF.shape[0]))

# Fail fast: these columns are all used further down ('description' for the
# embedding, 'sha256' and 'classID' when saving). Without this check a missing
# 'sha256'/'classID' would only be noticed after every embedding request ran.
REQUIRED_COLUMNS = ['sha256', 'classID', 'description']
missingColumns = [col for col in REQUIRED_COLUMNS if col not in appsDF.columns]
if missingColumns:
    raise KeyError("Missing column(s) in {}: {}".format(INPUT_PATH, missingColumns))

appsDF.head(3)

### 2. Embedding 

In [None]:
################## API KEYS ########################
from   dotenv import load_dotenv
import os
import sys

# Load the API keys from the config.env file. The path is relative to the
# working directory (three directory levels up), not a .env in the current folder.
CONFIG_PATH = "../../../config.env"
if not os.path.exists(CONFIG_PATH):
    print(f"⚠️ Error: File not found at path '{CONFIG_PATH}'.\n- Make sure the config.env file exists.\n- Ensure the CONFIG_PATH is correctly set.")
    sys.exit(1)
else:
    load_dotenv(CONFIG_PATH)

# os.getenv returns None when the variable is not defined.
ANDROZOO_API_KEY = os.getenv('ANDROZOO_API_KEY')
OPENAI_API_KEY   = os.getenv('OPENAI_API_KEY')

# Stop here, with an explicit message, instead of failing at the first
# embedding request when the OpenAI key is missing or empty.
if not OPENAI_API_KEY:
    print(f"⚠️ Error: OPENAI_API_KEY is not set.\n- Add it to '{CONFIG_PATH}' or define it in the environment.")
    sys.exit(1)
#######################################################

In [None]:
# Configure the openai client library with the key read from the environment
# (OPENAI_API_KEY is assigned in the API KEYS cell).
import openai
openai.api_key = OPENAI_API_KEY

Get the embedding of a text from an OpenAI text-embedding model.

In [None]:
def getGptEmbedding(text, model="text-embedding-ada-002"):
    """Return the OpenAI embedding of a single text.

    Parameters
    ----------
    text : str
        Text to embed. Newline characters are replaced by spaces before the
        request is sent.
    model : str, optional
        Name of the embedding model to use (it determines the price of the
        request). Defaults to "text-embedding-ada-002".

    Returns
    -------
    The 'embedding' entry of the first item in the API response.

    Raises
    ------
    TypeError
        If `text` is not a string (for example a NaN coming from a missing
        description in the DataFrame).
    """
    # A non-string would otherwise fail with an unrelated-looking
    # AttributeError on `.replace`; report the real problem instead.
    if not isinstance(text, str):
        raise TypeError(
            "getGptEmbedding expects a string, got {}: {!r}".format(type(text).__name__, text)
        )

    # Remove new line chars
    text = text.replace("\n", " ")

    # One request with a single input, so the embedding is the first item of 'data'.
    response = openai.Embedding.create(input=[text], model=model)
    return response['data'][0]['embedding']

In [None]:
print("\n⛏️ GPT-Based Embedding")

# First pass: one embedding request per description (progress bar via tqdm).
appsDF['features'] = appsDF['description'].progress_apply(getGptEmbedding)
# Second pass: turn every returned embedding into a plain Python list.
appsDF['features'] = appsDF['features'].progress_apply(list)

In [None]:
# Sanity check: length of the embedding stored for the row labelled 0.
print("📐 Len features: {}".format(len(appsDF['features'][0])))

### 3. Save everything

In [None]:
# Keep only the sha256, classID and features columns, then write them to
# OUTPUT_PATH without the DataFrame index.
appsDF = appsDF[['sha256', 'classID', 'features']]
appsDF.to_csv(OUTPUT_PATH, index=False)

# Preview of what was saved
appsDF.head(3)

In [None]:
# Visual marker in the output: the notebook ran to completion.
print("\n🔚 END")