## ⛏️ App Strings - Preprocessing

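Preprocess the app strings: load them from the CSV, run the shared text preprocessing, drop words that are too short, too frequent, or too infrequent, and save the result to a new CSV.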

#### Imports

In [None]:
# IMPORT
from collections            import Counter
from   tqdm                 import tqdm
import pandas               as pd
import numpy                as np
import itertools
import ast
import os

import preprocessing

In [None]:
# Initialize TQDM library for Pandas.
# tqdm.pandas() registers the `progress_apply` method that the cells below
# call on pandas objects, so this cell has to run before them.
tqdm.pandas()

In [None]:
# Banner marking the beginning of the notebook run in the output.
print("⚡ START ⚡\n")

#### Parameters

In [None]:
# Input: ground-truth dataset with the raw app strings
INPUT_PATH  = "../TMP/4d_AppStringsData.csv"

# Output: where the preprocessed app strings are written
OUTPUT_PATH = "../TMP/4d_AppStringsDataPreprocessed.csv"

# Working folder used by both paths above; create it only when it is missing
TMP_PATH = "../TMP"
if os.path.exists(TMP_PATH):
    print("📁✅ Folder already exists:", TMP_PATH)
else:
    os.makedirs(TMP_PATH)
    print("📁🆕 Folder created       :", TMP_PATH)

### 1. Load Data

In [None]:
# Read the input CSV; index_col=False tells pandas not to use the first column as the index
appsDF = pd.read_csv(INPUT_PATH, index_col=False)
print("#️⃣ Apps: {}".format(len(appsDF)))

# Preview the first rows (rendered as the cell output)
appsDF.head(3)

In [None]:
print("\n🔨 Reading data as lists")
# Each cell read from the CSV is text; ast.literal_eval parses it back into a
# Python literal without evaluating arbitrary expressions.
# Presumably every entry is the repr of a list of strings — TODO confirm
# against the notebook that writes INPUT_PATH.
appsDF['appStrings'] = appsDF['appStrings'].progress_apply(ast.literal_eval)

### 2. Preprocessing

In [None]:
def getAvgLen(appsDF, column):
    """Return the average length of the entries in one column.

    ``len`` is applied to every entry of the column, the lengths are summed,
    and the sum is divided by the number of non-null entries as reported by
    ``Series.count()``.

    Parameters
    ----------
    appsDF : pandas.DataFrame
        Frame that holds the column to measure.
    column : str
        Label of a column whose entries all support ``len``.

    Returns
    -------
    The summed entry length divided by the non-null entry count.
    """
    values = appsDF[column]
    lengthSum = values.apply(len).sum()
    entryCount = values.count()
    return lengthSum / entryCount

In [None]:
# Step 1: collapse each app's list of strings into one space-separated text
print("\n🔨 Merge to a single string")
appsDF['appStrings'] = appsDF['appStrings'].progress_apply(' '.join)

# Step 2: run the shared text preprocessing on every merged text
# NOTE(review): the message says "Descriptions" although the column holds app strings — confirm wording
print("\n🔨 Preprocess Descriptions")
appsDF['appStrings'] = appsDF['appStrings'].progress_apply(preprocessing.preprocessText)

# Step 3: split on whitespace again and keep the words of each app sorted
print("\n🔨 Back to list")
appsDF['appStrings'] = appsDF['appStrings'].progress_apply(lambda text: sorted(text.split()))

# Report the average number of words per app after this stage
print("\n📐 AVG App Strings Len: {}".format(getAvgLen(appsDF, 'appStrings')))

#### Removing too short words

In [None]:
# Words with fewer characters than this are discarded.
MIN_WORD_LENGTH = 3

# The message names the step this cell actually performs: filtering by word
# length, not by frequency.
print("\n🔨 Remove too short words")
appsDF['appStrings'] = appsDF['appStrings'].progress_apply(
    lambda words: [word for word in words if len(word) >= MIN_WORD_LENGTH]
)

# Report the average number of words per app after the filter
print("\n📐 AVG App Strings Len: {}".format(getAvgLen(appsDF,'appStrings')))

#### Removing too frequent words

In [None]:
# # Get all the Words in the dataset
# wordsCounter = Counter(list(itertools.chain(*appsDF['appStrings'].to_list())))

# preprocessing.printMostFrequentWords(wordsCounter)
# preprocessing.printMostFrequentWordsPercentage(wordsCounter)

In [None]:
# Get the most frequent words according to a threshold and remove them from the df.
# NOTE(review): how the threshold is interpreted is decided inside
# preprocessing.getMostFrequentWordsByPercentage — confirm there.
FREQUENCY_THRESHOLD = 0.05

# Count every word occurrence in the dataset; the per-app lists are chained
# lazily instead of being unpacked and copied into one big list first.
wordsCounter = Counter(itertools.chain.from_iterable(appsDF['appStrings']))

# Get most frequent words
mostFrequentWords = preprocessing.getMostFrequentWordsByPercentage(wordsCounter, FREQUENCY_THRESHOLD)

# Build a set once so every membership test in the filter below is a hash
# lookup rather than a scan of the helper's return value.
# Assumes the helper returns an iterable of word strings — TODO confirm.
mostFrequentWordSet = set(mostFrequentWords)

print("\n🔨 Remove most Frequent words")
appsDF['appStrings'] = appsDF['appStrings'].progress_apply(
    lambda words: [word for word in words if word not in mostFrequentWordSet]
)

# Report the average number of words per app after the filter
print("\n📐 AVG App Strings Len: {}".format(getAvgLen(appsDF,'appStrings')))

#### Removing too infrequent words

In [None]:
# Threshold passed to the less-frequent-words filter in the next cell.
# This rebinds the same FREQUENCY_THRESHOLD name that the frequent-words
# filter set to 0.05, so the cells have to be run in order.
FREQUENCY_THRESHOLD = 0.0001

In [None]:
# Recount every word occurrence now that earlier filters have changed the lists;
# the per-app lists are chained lazily instead of being unpacked and copied
# into one big list first.
wordsCounter = Counter(itertools.chain.from_iterable(appsDF['appStrings']))

# Get less frequent words
lessFrequentWords = preprocessing.getLessFrequentWordsByPercentage(wordsCounter, FREQUENCY_THRESHOLD)

# Build a set once so every membership test in the filter below is a hash
# lookup rather than a scan of the helper's return value.
# Assumes the helper returns an iterable of word strings — TODO confirm.
lessFrequentWordSet = set(lessFrequentWords)

print("\n🔨 Remove less Frequent words")
appsDF['appStrings'] = appsDF['appStrings'].progress_apply(
    lambda words: [word for word in words if word not in lessFrequentWordSet]
)

# Report the average number of words per app after the filter
print("\n📐 AVG App Strings Len: {}".format(getAvgLen(appsDF,'appStrings')))

### 3. Save everything

In [None]:
# Turn each app's word list back into one space-separated string before saving
print("\n🔨 Merge to a single string")
appsDF['appStrings'] = appsDF['appStrings'].progress_apply(' '.join)

In [None]:
# Keep only the columns that are written to the output file
appsDF = appsDF.loc[:, ['sha256', 'classID', 'appStrings']]

# Write the result without the DataFrame index, then preview the first rows
appsDF.to_csv(OUTPUT_PATH, index=False)
appsDF.head(3)

In [None]:
# Banner marking the end of the notebook run in the output.
print("\n🔚 END \n")