Initializes the Spark session and imports required libraries for data processing and distance calculations

In [1]:
from spark.starter import start_spark
from pyspark.sql import SparkSession
from scipy.spatial.distance import pdist, squareform

Configures the Spark session, sets a configuration property, and loads a CSV file into a Spark DataFrame for content-based filtering

In [2]:
spark = start_spark("Content_based Filtering", memory="16g")
spark.conf.set("spark.sql.analyzer.failAmbiguousSelfJoin", "false")

# Initialize SparkSession
spark = SparkSession.builder.appName("Load Content_based Data").getOrCreate()

# Path to the CSV file
file_path = "/root/games.csv"

# Load the CSV into a Spark DataFrame
df_games = spark.read.csv(file_path, header=True)


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/02/19 08:51:56 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/02/19 08:51:59 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


Converts the Spark DataFrame to Pandas, processes operator tags into binary features, and displays the resulting tag matrix

In [3]:
df_games_shaped = df_games.toPandas()

operator_tags = set(g for G in df_games_shaped['operator_tags'] for g in G)
for g in operator_tags:
    df_games_shaped[g] = df_games_shaped.operator_tags.transform(lambda x: int(g in x))
  
df_games_shaped_operator_tags = df_games_shaped.iloc[:, 76:]
display(df_games_shaped_operator_tags)

25/02/19 08:52:07 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

Unnamed: 0,e,a,A,G,j,i,f,"""",g,I,...,W,v,{,m,E,D,s,w,y,t
0,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1044,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1045,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1046,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1047,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


Computes the Euclidean distances between rows of the matrix and prints the resulting distance matrix

In [4]:
# Berechnung der euklidischen Distanzen zwischen den Zeilen der Matrix
euclidean_distances = squareform(pdist(df_games_shaped_operator_tags, metric='euclidean'))

print(f"Dimensions of our genres Euclidean distance matrix: {euclidean_distances.shape}")
print(euclidean_distances)

Dimensions of our genres Euclidean distance matrix: (1049, 1049)
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


Maps operator tags to indices, creates a lookup dictionary, and retrieves the index for a specific tag

In [5]:
df_games_shaped['operator_tags'] = df_games_shaped['operator_tags'].astype(str)
game_index_mapping = {tag: index for index, tag in enumerate(df_games_shaped['operator_tags'])}
print(game_index_mapping)

# Beispiel: Abrufen des Indexes für ein spezifisches Tag
specific_tag = '"{""iom"":true}"'
if specific_tag in game_index_mapping:
    index_value = game_index_mapping[specific_tag]
    print(index_value)
else:
    print(f"Tag {specific_tag} not found.")

{'{}': 1048, '"{""iom"":true}"': 791, '"{""Dragons"":true}"': 1000, '"{""Football"":true}"': 60, '"{""Diamonds"":true}"': 883, '"{""Adventure"":true}"': 131, '"{"" Adventure"":true}"': 146, '"{"" Buy feature "":true}"': 560, '"{""Dice"":true}"': 212, '"{""Book"":true}"': 489, '"{""Easter"":true}"': 222, '"{""Sweet"":true}"': 466, '"{""Book"":true': 762, '"{""Buy Feature"":true': 857, '"{""CrashGame"":true}"': 530, '"{""Bonus buy feature"":true}"': 397, '"{""Bonus buy feature"":true': 340, '"{""Hold and Win"":true}"': 847, '"{""Hold&Win"":true}"': 880, '"{"" Buy feature "":true': 786, '"{""SpeedGames"":true}"': 404, '"{""Poker"":true}"': 405, '"{""Poker"":true': 408, '"{""Spring"":true}"': 1005, '"{""iom"":false}"': 800, '"{""Irish"":true}"': 1026, '"{""rename by provider"":true}"': 501, '"{""Fruits"":false': 1028, '"{""SF"":true}"': 528, '"{""Hold&win"":true}"': 564, '"{""God"":true': 562, '"{""RNG Blackjack"":true}"': 596, '"{""RNG Blackjack"":true': 597, '"{""RNG Roulette"":true': 60

Generates game recommendations based on similarity scores derived from Euclidean distances for a given tag

In [6]:
def generate_game_recommendations(base_tag, num_recommendations=10):
    base_index = game_index_mapping[base_tag]  # Index des Spiels mit dem gegebenen Tag abrufen
    similarity_scores = [(i, euclidean_distances[base_index, i]) for i in range(len(euclidean_distances))]

    # Ergebnisse nach Ähnlichkeit sortieren (aufsteigend, da geringere Distanz höhere Ähnlichkeit bedeutet)
    similarity_scores.sort(key=lambda pair: pair[1])

    # Top-N ähnliche Spiele auswählen (ohne das Spiel selbst)
    top_similar_games = [score[0] for score in similarity_scores if score[0] != base_index][:num_recommendations]

    print(f"Da Sie {base_tag} gespielt hast, könnten dir diese Spiele auch gefallen:")
    for game_id in df_games_shaped.loc[top_similar_games, 'game_id']:
        print(game_id)

tag = '"{""iom"":true}"'
generate_game_recommendations(tag, 10)

Da Sie "{""iom"":true}" gespielt hast, könnten dir diese Spiele auch gefallen:
35663
35666
39305
39307
39309
39310
39311
39312
39313
39314


Stop the Spark session to release cluster resources

In [7]:
spark.stop()