In [1]:
import json, requests, time
from sqlalchemy import create_engine
import pandas as pd

# SQLAlchemy engine shared by every query cell in this notebook.
# NOTE(review): the connection string embeds the database password; presumably
# this targets a local/containerised Postgres (host `pg`) — confirm, and prefer
# reading the URL from an environment variable before sharing the notebook.
engine = create_engine('postgresql://postgres:argmax@pg:5432/postgres')


ModuleNotFoundError: No module named 'requests'

# Data

Every time a user opens a mobile app, an auction takes place behind the scenes. The highest bidder gets to show their ad to the user.

## Auctions Table


In [None]:
# Pull the complete auctions table into memory; `df` is the frame every
# later analysis cell works on.
sql_query = 'SELECT * FROM auctions;'
with engine.connect() as db_con:
    df = pd.read_sql(sql=sql_query, con=db_con)

# Last expression of the cell: rich-display the loaded frame.
df

## App Vectors table

We've gathered the first few sentences of each app's store description and embedded them with a [model](https://huggingface.co/mixedbread-ai/mxbai-embed-large-v1).


In [None]:
# Poll the app_vectors table until every row carries an embedding.
# (Plain string: the query has no placeholders, so no f-string is needed.)
sql_query = '''
SELECT
    *
FROM app_vectors
'''
has_embedding = False
while not has_embedding:
    with engine.connect() as db_con:
        df_of_embeddings = pd.read_sql(sql_query, con=db_con)
    # `.all()` on an empty Series is vacuously True, so an empty table has to
    # be treated as "not ready" explicitly; otherwise the loop would exit with
    # no rows and the following cell would fail on `embedding[0]`.
    has_embedding = (not df_of_embeddings.empty) and bool(
        df_of_embeddings["embedding"].notna().all()
    )
    if not has_embedding:
        print("Waiting for embeddings...")
        time.sleep(15)

df_of_embeddings


We can use the `<=>` operator to run vector search within the database


In [None]:

# Parse the first stored embedding (kept as a JSON string in the frame).
first_embedding_json = df_of_embeddings.embedding[0]
vec = json.loads(first_embedding_json)
print(f"Embedding size: {len(vec)}")

# Serialise the vector back to JSON and let the database order all apps by
# the `<=>` operator relative to it.
vec_literal = json.dumps(vec)
sql_query = f'''
SELECT
    "bundleId"
FROM app_vectors
ORDER BY embedding<=>'{vec_literal}'
'''
with engine.connect() as db_con:
    similar_embeddings = pd.read_sql(sql=sql_query, con=db_con)

similar_embeddings

# What you need to do

## The hypothesis

We assume that apps with similar descriptions would have a similar asking price in the auctions (`sentPrice` column).

Use cosine similarity (`<=>`) on the embeddings to find similar apps, and any statistical tools you find suitable to prove or disprove this hypothesis.

## Is it consistent?

There are several other features in the auctions table (such as `CountryCode` and `OS`).
Do your findings hold for those as well?


In [None]:
# let's install some additional libraries in order to visualize the data and compare different models
!pip install scikit-learn
!pip install seaborn

# import the necessary libraries
import numpy as np 
import itertools
import seaborn as sns # for data visualization
import matplotlib.pyplot as plt 
from sklearn.metrics.pairwise import cosine_similarity # for computing cosine similarity
from sklearn.model_selection import train_test_split # for splitting the data into training, validation and test sets

# used to select random apps to put in the validation and test sets
import random


## embeddings evaluation ##
First, let's evaluate the quality of the embeddings. I would assume that similar apps have similar embeddings.

For that, we can create a similarity matrix that contains the pairwise cosine similarity values between all apps.


In [None]:
# Compute the pairwise cosine similarity between all app embeddings.
def create_cos_sim_df(df_of_embeddings):
    """Build the pairwise cosine-similarity matrix of the app embeddings.

    Parameters
    ----------
    df_of_embeddings : pandas.DataFrame
        Frame with a ``bundleId`` column and an ``embedding`` column. Each
        embedding may be a JSON-encoded list (string) or an already-parsed
        sequence of numbers; all embeddings must have the same length.

    Returns
    -------
    pandas.DataFrame
        Square frame whose index and columns are the ``bundleId`` values.
        Entry ``[i, j]`` is the cosine similarity between embedding ``i`` and
        embedding ``j``. The diagonal is set to 0 so that an app is never
        reported as its own most similar app.

    Notes
    -----
    The input frame is left untouched. Parsing into a local array (instead of
    overwriting the ``embedding`` column) keeps this function safe to call
    repeatedly and keeps cells that expect JSON strings re-runnable.
    """
    bundle_ids = df_of_embeddings['bundleId']
    num_embeddings = len(df_of_embeddings)

    # Nothing to compare: return an empty square frame.
    if num_embeddings == 0:
        return pd.DataFrame(np.zeros((0, 0)), columns=bundle_ids, index=bundle_ids)

    # (1) Parse the embeddings into one (n_apps, dim) float array. Iterating
    #     over the column is positional, so any DataFrame index works.
    vectors = np.array(
        [
            json.loads(embedding) if isinstance(embedding, str) else embedding
            for embedding in df_of_embeddings['embedding']
        ],
        dtype=float,
    )

    # (2) Normalise every row to unit length; zero vectors are left as zeros
    #     (their similarity to everything is then 0) instead of dividing by 0.
    norms = np.linalg.norm(vectors, axis=1, keepdims=True)
    norms[norms == 0] = 1.0
    unit_vectors = vectors / norms

    # (3) One matrix product yields every pairwise cosine similarity.
    cos_sim_matrix = unit_vectors @ unit_vectors.T

    # (4) Ignore the diagonal elements (self-similarity).
    np.fill_diagonal(cos_sim_matrix, 0.0)

    # (5) Create the DataFrame for the cosine similarity matrix.
    return pd.DataFrame(cos_sim_matrix, columns=bundle_ids, index=bundle_ids)

# Build the app-by-app similarity matrix once; the heatmap and clustering
# cells below read `cos_sim_df`.
cos_sim_df = create_cos_sim_df(df_of_embeddings)

## Visualize the similarity matrix as a heatmap of the cosine similarity values:

In [None]:
# Draw the cosine-similarity matrix as an annotated heatmap on an explicit
# figure/axes pair.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(cos_sim_df, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Cosine Similarity Heatmap')
ax.set_xlabel('bundleId')
ax.set_ylabel('bundleId')
plt.show()

## Conducting a subjective assessment of similar apps based on their cosine similarity.
For example, com.volt.dresstoimpress and 1569586264 have a cosine similarity of 1.

The com.volt.dresstoimpress app content is "Choose the appropriate outfit to make it through different social events!"

The 1569586264 app content is "Choose the appropriate outfit to make it through different social events!"

This analysis suggests that these might be the same app (possibly published under two different bundle IDs)!

## ----------------------------------------------------------------------------------------------------------------

# Let's also analyse 2 embeddings which are not identical but whose cosine similarity is high (0.92)
The com.loop.match3d app content is "Get ready for a new, challenging and original matching pairs brain game"

The 1502447854 app content is "Get ready for a new, challenging and original matching pairs game."

These descriptions are really close!

## ----------------------------------------------------------------------------------------------------------------

# Lastly, let's pick a couple of apps with medium similarity (0.5)
The com.tintash.nailsalon app content is:

"It is manicure madness over here and it’s your time to become the greatest Nail Salon of 2021! All you need to do is scrape, clip, paint, polish and perfect your client’s nails and you will be raking in the money in no time!

Just don’t mess up! People don’t like when you accidentally pull their nails off. Ouch!"

the com.volt.dresstoimpress app content is:

"Choose the appropriate outfit to make it through different social events!"

Conclusion - not so similar 

## ----------------------------------------------------------------------------------------------------------------

Final conclusion - the embeddings seem to reflect the semantic similarity between apps

# Next - Let's analyze the sentPrice values with respect to each app

In [None]:
# Let's learn more about our data: per-app descriptive statistics of
# 'sentPrice', plus each app's share of all auction records (in percent).
# Note: `count` only counts non-null 'sentPrice' values, while the
# denominator is the total number of rows in `df`.
grouped_stats = (
    df.groupby('bundleId')['sentPrice']
    .describe()
    .assign(percentage=lambda stats: stats['count'] / len(df) * 100)
)

print(grouped_stats)

# Notes from this analysis:
It seems that some apps account for less than 5% of the samples.
We should therefore place a few of these apps only in the validation and test data, to make sure the embeddings of these apps do not leak into the training data.

In [None]:
# Apps whose share of all auction records is below this percentage are
# candidates for being held out of the training data.
low_appearance_apps_threshold = 5
low_representation_apps = grouped_stats[grouped_stats["percentage"] < low_appearance_apps_threshold]

# Pick (up to) 4 of those apps to hold out.
# - A dedicated, seeded generator makes the selection identical on every run
#   (the same seed, 42, is used as `random_state` for the data split).
# - Capping the sample size avoids the ValueError `random.sample` raises when
#   fewer than 4 apps fall below the threshold.
n_held_out_apps = min(4, len(low_representation_apps))
random_apps = random.Random(42).sample(list(low_representation_apps.index), n_held_out_apps)
random_apps

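# Feature Engineering
Let's derive calendar features (year, month, day, hour, minute, second and day of week) from the `eventTimestamp` column.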

In [None]:
# feature engineering: turn the epoch-millisecond 'eventTimestamp' into a
# datetime column and derive calendar features from it.
#
# The conversion is guarded so the cell can be re-run safely: converting an
# already-converted datetime column a second time would not reproduce the
# same timestamps.
if not pd.api.types.is_datetime64_any_dtype(df['eventTimestamp']):
    # Unparseable values become NaN (and then NaT) instead of raising.
    # `unit='ms'` reads the epoch milliseconds directly, avoiding the float
    # division by 1000.
    df['eventTimestamp'] = pd.to_datetime(
        pd.to_numeric(df['eventTimestamp'], errors='coerce'), unit='ms'
    )

df['year'] = df['eventTimestamp'].dt.year
df['month'] = df['eventTimestamp'].dt.month
df['day'] = df['eventTimestamp'].dt.day
df['hour'] = df['eventTimestamp'].dt.hour
df['minute'] = df['eventTimestamp'].dt.minute
df['second'] = df['eventTimestamp'].dt.second
df['day_of_week'] = df['eventTimestamp'].dt.dayofweek  # Monday=0 ... Sunday=6

print(df)

In [None]:

# Split the dataset into 85% training and 15% testing sets.
train_data, test_data = train_test_split(df, test_size=0.15, random_state=42)

# We want to test the model on apps which are not in the training set.
# `random_apps` holds the apps selected for that purpose (chosen among the
# apps whose share of the data is below 5%): every training row that belongs
# to one of them is moved from the training set to the testing set.
is_new_app = train_data['bundleId'].isin(random_apps)

# Records associated with the new (held-out) apps, taken from the training set
train_data_new_apps = train_data[is_new_app]

# Remove those records from the training set
train_data = train_data[~is_new_app]

# Concatenate the existing testing set with the records of the new apps
test_data = pd.concat([test_data, train_data_new_apps])

# Shuffle the testing set so the moved records are not grouped at the end
test_data = test_data.sample(frac=1, random_state=42)

# Verify the distribution of apps in the training set
print("apps in training set:", train_data['bundleId'].unique())

# Verify the distribution of apps in the testing set
print("apps in testing set:", test_data['bundleId'].unique())

# Check the share of the data in each set after this modification
train_proportion = len(train_data) / len(df)
test_proportion = len(test_data) / len(df)

# Round after scaling to percent; rounding first and multiplying afterwards
# prints floating-point artifacts such as 84.99999999999999.
print("Proportion of training data:", round(train_proportion * 100, 2))
print("Proportion of testing data:", round(test_proportion * 100, 2))


In [None]:
# train an XGBOOST for this task
!pip install xgboost
import xgboost as xgb
from sklearn.metrics import mean_squared_error

# Select specific columns for training
columns_to_use = ['bidFloorPrice', 'year', 'month', 'day', 'hour', 'minute', 'second', 'day_of_week']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df[columns_to_use], df['sentPrice'], test_size=0.2, random_state=42)





# Initialize XGBoost Regressor
model = xgb.XGBRegressor()

# Train the model
model.fit(X_train, y_train)

# Make predictions on the testing set
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

# Evaluate the model accuary on this data:
train_data_new_apps[columns_to_use],train_data_new_apps['sentPrice']

# Make predictions on the testing set
y_pred_new_apps = model.predict(train_data_new_apps[columns_to_use])
mse_new_apps = mean_squared_error(train_data_new_apps['sentPrice'], y_pred_new_apps)
print("new apps mse is: ",mse_new_apps)

# Optionally, you can also print feature importances
feature_importances = model.feature_importances_
print("Feature Importances:", feature_importances)

In [None]:
# Now, let's create clusters of the embeddings
def cluster_apps_by_similarity(cos_sim_df):
    """Greedily group apps with their most similar neighbour.

    Apps are visited in row order. An app that is not yet clustered joins the
    cluster of its most similar *other* app, or opens a new cluster together
    with that app when the neighbour is not clustered yet.

    Parameters
    ----------
    cos_sim_df : pandas.DataFrame
        Square similarity matrix whose index and columns are app identifiers.
        The diagonal value is irrelevant: an app is never matched with itself.

    Returns
    -------
    dict[int, list]
        Cluster number (starting at 1) mapped to the list of apps in it. In a
        new cluster the neighbour comes first, followed by the app that
        triggered the cluster. An app with no other app to compare against
        forms a single-element cluster.
    """
    clusters = {}        # cluster number -> list of apps
    cluster_of_app = {}  # app -> cluster number (O(1) membership lookups)
    cluster_counter = 0  # last cluster number handed out

    for current_app, row in cos_sim_df.iterrows():
        # Already placed as somebody else's nearest neighbour.
        if current_app in cluster_of_app:
            continue

        # Exclude the app itself, so the result does not depend on how the
        # diagonal was filled (e.g. 1.0, or 0 while every other similarity is
        # negative), which would otherwise yield clusters like [a, a].
        candidates = row.drop(labels=current_app, errors='ignore')

        if candidates.empty or candidates.isna().all():
            # No neighbour available: the app forms its own cluster.
            cluster_counter += 1
            clusters[cluster_counter] = [current_app]
            cluster_of_app[current_app] = cluster_counter
            continue

        # App with the highest similarity to the current app
        max_similarity_app = candidates.idxmax()

        # If that app is already in a cluster, the current app joins it;
        # otherwise both apps start a new cluster.
        assigned_cluster = cluster_of_app.get(max_similarity_app)
        if assigned_cluster is not None:
            clusters[assigned_cluster].append(current_app)
            cluster_of_app[current_app] = assigned_cluster
        else:
            cluster_counter += 1
            clusters[cluster_counter] = [max_similarity_app, current_app]
            cluster_of_app[max_similarity_app] = cluster_counter
            cluster_of_app[current_app] = cluster_counter

    return clusters
        

# Cluster the apps using the similarity matrix; as the last expression of the
# cell, the resulting {cluster number: [apps]} dict is displayed.
cluster_apps_by_similarity(cos_sim_df)