In [None]:
import json
import os
import time

import pandas as pd
import requests
from sqlalchemy import create_engine

# Connection string for the Postgres instance queried throughout this notebook
# (the `auctions` and `app_vectors` tables). Setting the AUCTIONS_DB_URL
# environment variable overrides the default, so credentials for another
# environment never have to be edited into the notebook itself.
DEFAULT_DB_URL = 'postgresql://postgres:argmax@pg:5432/postgres'
engine = create_engine(os.environ.get('AUCTIONS_DB_URL', DEFAULT_DB_URL))


# Data

Every time a user opens a mobile app, an auction takes place behind the scenes. The highest bidder gets to show their ad to the user.

## Auctions Table


In [None]:
# Load every row of the auctions table into a DataFrame and display it.
sql_query = 'SELECT * FROM auctions;'
with engine.connect() as conn:
    df = pd.read_sql(sql=sql_query, con=conn)

df

## App Vectors table

We've gathered the first few sentences of each app's store description and embedded them with a [model](https://huggingface.co/mixedbread-ai/mxbai-embed-large-v1).


In [None]:
# Poll the app_vectors table until every row has an embedding, then display it.
sql_query = '''
SELECT
    *
FROM app_vectors
'''
POLL_INTERVAL_SECONDS = 15  # pause between two consecutive polls

while True:
    with engine.connect() as db_con:
        df = pd.read_sql(sql_query, con=db_con)
    # An empty table counts as "not ready": `.all()` over an empty column is
    # vacuously True, which would otherwise end the wait with no data at all.
    has_embedding = len(df) > 0 and bool(df["embedding"].notna().all())
    if has_embedding:
        break
    print("Waiting for embeddings...")
    time.sleep(POLL_INTERVAL_SECONDS)

df


We can use the `<=>` operator to run vector search within the database


In [None]:

# Decode the first app's embedding and use it as the query vector.
vec = json.loads(df["embedding"][0])
print("Embedding size: {l}".format(l=len(vec)))

# Return every bundleId in app_vectors, ordered by the `<=>` value between its
# embedding and the query vector.
sql_query = f'''
SELECT
    "bundleId"
FROM app_vectors
ORDER BY embedding<=>'{json.dumps(vec)}'
'''
with engine.connect() as conn:
    df = pd.read_sql(sql=sql_query, con=conn)

df

# What you need to do

## The hypothesis

We assume that apps with similar descriptions have a similar asking price in the auctions (`sentPrice` column).

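Use cosine similarity on the embeddings to find similar apps (note that pgvector's `<=>` operator returns cosine *distance*, i.e. 1 − cosine similarity, so smaller values mean more similar), and any statistical tools you find suitable to prove or disprove this hypothesis.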

## Is it consistent?

There are several other features in the auctions table (such as `CountryCode` and `OS`).
Do your findings hold for those as well?


In [None]:
!pip install scikit-learn
!pip install seaborn


In [None]:
# First - let's evaluate the quality of the embeddings: similar apps should have similar embeddings.
# Build a matrix holding the cosine similarity of every pair of apps.
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import json
from sklearn.metrics.pairwise import cosine_similarity

# Load the app vectors inside this cell instead of relying on a variable left
# over from another cell: the cell then runs on a fresh kernel, and re-running
# it always starts from the raw (still JSON-encoded) embeddings.
with engine.connect() as db_con:
    df_of_embeddings = pd.read_sql('SELECT * FROM app_vectors', con=db_con)

# Embeddings are stored as JSON strings; decode them into Python lists.
df_of_embeddings['embedding'] = df_of_embeddings['embedding'].apply(json.loads)

num_embeddings = len(df_of_embeddings)

if num_embeddings:
    # A single vectorised call computes all pairwise similarities at once,
    # replacing one scikit-learn call per (i, j) pair.
    embedding_matrix = np.array(df_of_embeddings['embedding'].tolist(), dtype=float)
    cos_sim_matrix = cosine_similarity(embedding_matrix)
    # Self-similarity is left at 0 so that the trivial diagonal does not
    # dominate the heatmap; only distinct pairs carry a similarity value.
    np.fill_diagonal(cos_sim_matrix, 0.0)
else:
    cos_sim_matrix = np.zeros((0, 0))

# Label rows and columns with the bundleId of the corresponding app.
cos_sim_df = pd.DataFrame(cos_sim_matrix, columns=df_of_embeddings['bundleId'], index=df_of_embeddings['bundleId'])

In [None]:
# Render the pairwise cosine-similarity matrix as an annotated heatmap.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(cos_sim_df, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Cosine Similarity Heatmap')
ax.set_xlabel('bundleId')
ax.set_ylabel('bundleId')
plt.show()

In [None]:
# Conducting a subjective assessment of similar apps based on their cosine similarity.
# For example, com.volt.dresstoimpress and 1569586264 have a cosine similarity of 1.

# The com.volt.dresstoimpress app content is "Choose the appropriate outfit to make it through different social events!"
# The 1569586264 app content is "Choose the appropriate outfit to make it through different social events!"

# This analysis suggests that these might be the same app!

# ----------------------------------------------------------------------------------------------------------------

# Let's also analyse two embeddings which are not identical but whose cosine similarity is high (0.92).
# For example: com.loop.match3d & 1502447854
# The com.loop.match3d app content is "Get ready for a new, challenging and original matching pairs brain game"
# The 1502447854 app content is "Get ready for a new, challenging and original matching pairs game."
# These descriptions are really close!

# ----------------------------------------------------------------------------------------------------------------

# Lastly, let's pick a couple of apps with medium similarity (0.5).
# The com.tintash.nailsalon app content is:
# "It is manicure madness over here and it's your time to become the greatest Nail Salon of 2021! All you need to do is scrape, clip, paint, polish and perfect your client's nails and you will be raking in the money in no time! Just don't mess up! People don't like when you accidentally pull their nails off. Ouch!"

# the com.volt.dresstoimpress app content is:
#"Choose the appropriate outfit to make it through different social events!"

# conclusion - not so similar 

# ----------------------------------------------------------------------------------------------------------------

# Final conclusion - the embeddings work fine :)

In [None]:
# Let's learn more about our data: per-app descriptive statistics of `sentPrice`
# and each app's share of all auction records.

# Reload the auctions table here: earlier cells reuse the name `df` for the
# app_vectors query results, which do not contain a `sentPrice` column.
# NOTE(review): assumes auctions has `bundleId` and `sentPrice` columns - confirm.
with engine.connect() as db_con:
    df = pd.read_sql('SELECT * FROM auctions;', con=db_con)

# Group by 'bundleId' and calculate descriptive statistics for 'sentPrice'
grouped_stats = df.groupby('bundleId')['sentPrice'].describe()

# Calculate the percentage of all records that belongs to each app
grouped_stats['percentage'] = grouped_stats['count'] / len(df) * 100

print(grouped_stats)


# The descriptive statistics of each app show that some apps have a lot of
# records and some have only a few.
# The embeddings might help us.

In [None]:
# Plot one histogram of `sentPrice` per app, side by side.
grouped = df.groupby('bundleId')
# One subplot per app
num_plots = len(grouped)

# squeeze=False always yields a 2-D array of Axes, so indexing also works when
# there is a single app (plt.subplots would otherwise return a bare Axes object).
fig, axes = plt.subplots(1, num_plots, figsize=(num_plots * 8, 6), squeeze=False)
axes = axes.ravel()

# Plot histogram for each group
for ax, (name, group) in zip(axes, grouped):
    ax.hist(group['sentPrice'], bins=20, alpha=0.7)
    ax.set_title(f'Distribution of sentPrice for {name}')
    ax.set_xlabel('sentPrice')
    ax.set_ylabel('Frequency')
    ax.grid(True)

plt.tight_layout()
plt.show()
# There is some similarity between the apps

In [None]:
# Feature engineering: derive calendar features from the raw event timestamp.

# Convert only once. Re-running the conversion on a column that is already
# datetime would reinterpret its values as a number and produce wrong dates,
# so the guard keeps this cell safe to execute repeatedly.
if not pd.api.types.is_datetime64_any_dtype(df['eventTimestamp']):
    # Values are interpreted as epoch milliseconds; anything that cannot be
    # parsed as a number becomes NaT. Converting with unit='ms' avoids the
    # float division to seconds.
    epoch_ms = pd.to_numeric(df['eventTimestamp'], errors='coerce')
    df['eventTimestamp'] = pd.to_datetime(epoch_ms, unit='ms')

event_time = df['eventTimestamp'].dt
df['year'] = event_time.year
df['month'] = event_time.month
df['day'] = event_time.day
df['hour'] = event_time.hour
df['minute'] = event_time.minute
df['second'] = event_time.second
df['day_of_week'] = event_time.dayofweek

print(df)