In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_selection import mutual_info_regression

# Visualizing the BGG dataset
## Loading in the dataset

In [None]:
# Load the cleaned BGG dataset from the Excel workbook into `df`,
# the DataFrame that the cells below read from.
df = pd.read_excel(io='./data/BGG_Complete_Cleaned_Dataset.xlsx')

## Visualizations
#### Number of owned users by primary game domain

In [None]:
# Bar chart of the summed 'Owned Users' per primary domain.
# A game's primary domain is the first entry of its comma-separated 'Domains'
# string; it is stored on `df` because later cells reuse the column.
df['Primary Domain'] = df['Domains'].str.split(', ').str.get(0)

owned_users = (
    df.groupby('Primary Domain')['Owned Users']
    .sum()
    .sort_values(ascending=False)
)

fig, ax = plt.subplots(figsize=(10, 6))
owned_users.plot(kind='bar', color='skyblue', edgecolor='black', ax=ax)
ax.set_title("Number of Owned Users Across Game Domains", fontsize=16)
ax.set_ylabel("Total Owned Users", fontsize=12)
ax.set_xlabel("Game Domains", fontsize=12)
# Tilt the domain names so neighbouring tick labels do not overlap.
plt.setp(ax.get_xticklabels(), rotation=45)
fig.tight_layout()
plt.show()

#### The relationship between complexity average and user ratings (rating average)

In [None]:
# Scatter of complexity against rating, one point per row of `df`,
# coloured by the game's primary domain.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(
    data=df,
    x='Complexity Average',
    y='Rating Average',
    hue='Primary Domain',
    alpha=1.0,
    ax=ax,
)
ax.set_title("Complexity vs Rating Average by Domain", fontsize=16)
ax.set_xlabel("Complexity Average", fontsize=12)
ax.set_ylabel("Rating Average", fontsize=12)
# Anchor the legend outside the axes so it does not cover the points.
ax.legend(title='Primary Domain', bbox_to_anchor=(1.05, 1), loc='upper left')
fig.tight_layout()
plt.show()

#### The relationship between the minimum number of players required to play a game and its duration

In [None]:
# One box per 'Min Players' value showing the spread of 'Play Time'.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df, x='Min Players', y='Play Time', ax=ax)
ax.set_title("Play Time Distribution by Minimum Players", fontsize=16)
ax.set_xlabel("Minimum Players", fontsize=12)
ax.set_ylabel("Play Time (minutes)", fontsize=12)
fig.tight_layout()
plt.show()

#### The distribution of average ratings across all games

In [None]:
# Histogram (20 bins) of 'Rating Average' with a KDE curve overlaid.
fig, ax = plt.subplots(figsize=(8, 6))
rating_values = df['Rating Average']
sns.histplot(rating_values, kde=True, bins=20, color='blue', ax=ax)
ax.set_title("Distribution of Average Ratings", fontsize=16)
ax.set_xlabel("Rating Average", fontsize=12)
ax.set_ylabel("Frequency", fontsize=12)
fig.tight_layout()
plt.show()

#### The number of games by minimum age requirement

In [None]:
# Count the games at each 'Min Age' value and order the bars by age
# (sort_index) rather than by how common each age is.
min_age_counts = (
    df['Min Age']
    .value_counts()
    .sort_index()
)

fig, ax = plt.subplots(figsize=(8, 6))
min_age_counts.plot(kind='bar', color='orange', edgecolor='black', ax=ax)
ax.set_title("Number of Games by Minimum Age", fontsize=16)
ax.set_xlabel("Minimum Age", fontsize=12)
ax.set_ylabel("Number of Games", fontsize=12)
fig.tight_layout()
plt.show()

#### Complexity across game domains

In [None]:
# One box per primary domain showing the spread of 'Complexity Average'.
# Relies on the 'Primary Domain' column added to `df` in an earlier cell.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df, x='Primary Domain', y='Complexity Average', ax=ax)
ax.set_title("Complexity by Game Domain", fontsize=16)
ax.set_xlabel("Game Domain", fontsize=12)
ax.set_ylabel("Complexity Average", fontsize=12)
# Tilt the domain names so neighbouring tick labels do not overlap.
plt.setp(ax.get_xticklabels(), rotation=45)
fig.tight_layout()
plt.show()

#### The distribution of game duration across all games

In [None]:
# Histogram (20 bins) of 'Play Time' with a KDE curve overlaid.
fig, ax = plt.subplots(figsize=(8, 6))
play_time_values = df['Play Time']
sns.histplot(play_time_values, kde=True, bins=20, color='blue', ax=ax)
ax.set_title("Distribution of Play Time", fontsize=16)
ax.set_xlabel("Play Time (minutes)", fontsize=12)
ax.set_ylabel("Frequency", fontsize=12)
fig.tight_layout()
plt.show()

## Visualizing the features needed to predict the targets
### Correlations

In [None]:
# Build the frame used for the correlation matrix: drop the derived
# 'Primary Domain' column and replace the two text columns with their
# integer category codes so they take part in the correlation.
df_corr = df.drop(columns=['Primary Domain']).assign(
    Domains=lambda frame: frame['Domains'].astype('category').cat.codes,
    Mechanics_Categories=lambda frame: frame['Mechanics_Categories'].astype('category').cat.codes,
)
# Pairwise correlation between all columns (pandas' default method).
correlation_matrix = df_corr.corr()

#### Correlation scores for Complexity Average

In [None]:
# Rank every column by its correlation with 'Complexity Average',
# highest value first, and display the ranking.
corr_matrix_complexity = (
    correlation_matrix.loc[:, 'Complexity Average']
    .sort_values(ascending=False)
)
corr_matrix_complexity

#### Correlation scores for Rating Average

In [None]:
# Rank every column by its correlation with 'Rating Average', highest value
# first, and display the ranking. Stored under its own name so the
# 'Complexity Average' ranking held in `corr_matrix_complexity` is not
# overwritten by a different target's scores.
corr_matrix_rating = correlation_matrix['Rating Average'].sort_values(ascending=False)
corr_matrix_rating

#### Correlation scores for Owned Users

In [None]:
# Rank every column by its correlation with 'Owned Users', highest value
# first, and display the ranking. Stored under its own name so the
# 'Complexity Average' ranking held in `corr_matrix_complexity` is not
# overwritten by a different target's scores.
corr_matrix_owned_users = correlation_matrix['Owned Users'].sort_values(ascending=False)
corr_matrix_owned_users

### Mutual information

In [None]:
# Build the frame used for the mutual-information scores.
# Presumably the two groups below are per-domain and per-mechanic columns
# derived from 'Domains' / 'Mechanics_Categories' — confirm against the dataset.
domain_columns = [
    'Strategy Games', 'Abstract Games', 'Thematic Games', 'Party Games',
    'Wargames', 'Customizable Games', "Children's Games", 'Family Games',
    'Other',
]
# NOTE(review): 'Other' appears in both groups, so the drop list names it
# twice; if the second one was meant to be a differently named column,
# that column is still present in the result — verify.
mechanic_columns = [
    'Action and Turn Management', 'Resource Management',
    'Interaction and Conflict', 'Game Progression and Mechanics',
    'Auxiliary Mechanics', 'Narrative and Thematic',
    'Movement and Positioning', 'Other', 'Specialized Mechanics',
    'Strategic Elements',
]
excluded_columns = domain_columns + mechanic_columns + ['Primary Domain']

# Rows with any missing value are removed after the columns are dropped.
df_mutual_information = df.drop(columns=excluded_columns).dropna()

# Replace the two text columns with their integer category codes.
for text_column in ('Domains', 'Mechanics_Categories'):
    df_mutual_information[text_column] = (
        df_mutual_information[text_column].astype('category').cat.codes
    )

#### Complexity Average

In [None]:
# Score every remaining column of `df_mutual_information` by its mutual
# information with 'Complexity Average' and plot the scores as a ranked
# horizontal bar chart.
target = df_mutual_information['Complexity Average'].values
features = df_mutual_information.drop(columns=['Complexity Average'])
# random_state is fixed so repeated runs produce the same scores.
# NOTE(review): per the scikit-learn docs, discrete_features='auto' treats all
# features of a dense input as continuous, including the integer-coded
# 'Domains' / 'Mechanics_Categories' columns — confirm this is intended.
mutual_info = mutual_info_regression(features, target, random_state=42, n_neighbors=5, discrete_features='auto')

mutual_info_df = pd.DataFrame({
    'Feature': features.columns,
    'Mutual Information': mutual_info
})

mutual_info_df = mutual_info_df.sort_values(by='Mutual Information', ascending=False)
plt.figure(figsize=(10, 6))
plt.barh(mutual_info_df['Feature'], mutual_info_df['Mutual Information'])
plt.xlabel('Mutual Information')
# The y-axis lists the feature names (the target is named in the title),
# so it is labelled 'Feature' rather than with the target column's name.
plt.ylabel('Feature')
plt.title('Mutual Information for the Complexity Average Column')
# barh draws the first row at the bottom; invert so the top score is on top.
plt.gca().invert_yaxis()
plt.show()

#### Rating Average

In [None]:
# Score every remaining column of `df_mutual_information` by its mutual
# information with 'Rating Average' and plot the scores as a ranked
# horizontal bar chart.
# Define target and features
target = df_mutual_information['Rating Average'].values
features = df_mutual_information.drop(columns=['Rating Average'])
# random_state is fixed so repeated runs produce the same scores.
# NOTE(review): per the scikit-learn docs, discrete_features='auto' treats all
# features of a dense input as continuous, including the integer-coded
# 'Domains' / 'Mechanics_Categories' columns — confirm this is intended.
mutual_info = mutual_info_regression(features, target, random_state=42, n_neighbors=5, discrete_features='auto')

mutual_info_df = pd.DataFrame({
    'Feature': features.columns,
    'Mutual Information': mutual_info
})

mutual_info_df = mutual_info_df.sort_values(by='Mutual Information', ascending=False)
plt.figure(figsize=(10, 6))
plt.barh(mutual_info_df['Feature'], mutual_info_df['Mutual Information'])
plt.xlabel('Mutual Information')
# The y-axis lists the feature names (the target is named in the title),
# so it is labelled 'Feature' rather than with the target column's name.
plt.ylabel('Feature')
plt.title('Mutual Information for the Rating Average Column')
# barh draws the first row at the bottom; invert so the top score is on top.
plt.gca().invert_yaxis()
plt.show()

#### Owned Users

In [None]:
# Score every remaining column of `df_mutual_information` by its mutual
# information with 'Owned Users' and plot the scores as a ranked
# horizontal bar chart.
# Define target and features
target = df_mutual_information['Owned Users'].values
features = df_mutual_information.drop(columns=['Owned Users'])
# random_state is fixed so repeated runs produce the same scores.
# NOTE(review): per the scikit-learn docs, discrete_features='auto' treats all
# features of a dense input as continuous, including the integer-coded
# 'Domains' / 'Mechanics_Categories' columns — confirm this is intended.
mutual_info = mutual_info_regression(features, target, random_state=42, n_neighbors=5, discrete_features='auto')

mutual_info_df = pd.DataFrame({
    'Feature': features.columns,
    'Mutual Information': mutual_info
})

mutual_info_df = mutual_info_df.sort_values(by='Mutual Information', ascending=False)
plt.figure(figsize=(10, 6))
plt.barh(mutual_info_df['Feature'], mutual_info_df['Mutual Information'])
plt.xlabel('Mutual Information')
# The y-axis lists the feature names (the target is named in the title),
# so it is labelled 'Feature' rather than with the target column's name.
plt.ylabel('Feature')
plt.title('Mutual Information for the Owned Users Column')
# barh draws the first row at the bottom; invert so the top score is on top.
plt.gca().invert_yaxis()
plt.show()