In [58]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [59]:
# Load the CSV file
file_path = 'resources/board_games.csv'  
df = pd.read_csv(file_path)

# Display the first few rows and the summary of the DataFrame
print(df.head())
print(df.columns)

   game_id                                        description  \
0        1  Die Macher is a game about seven sequential po...   
1        2  Dragonmaster is a trick-taking card game based...   
2        3  Part of the Knizia tile-laying trilogy, Samura...   
3        4  When you see the triangular box and the luxuri...   
4        5  In Acquire, each player strategically invests ...   

                                          image  max_players  max_playtime  \
0   //cf.geekdo-images.com/images/pic159509.jpg            5           240   
1   //cf.geekdo-images.com/images/pic184174.jpg            4            30   
2  //cf.geekdo-images.com/images/pic3211873.jpg            4            60   
3   //cf.geekdo-images.com/images/pic285299.jpg            4            60   
4   //cf.geekdo-images.com/images/pic342163.jpg            6            90   

   min_age  min_players  min_playtime            name  playing_time  ...  \
0       14            3           240      Die Macher           

In [60]:
# Check for missing values
missing_values = df.isnull().sum()
print(missing_values[missing_values > 0])


image              1
thumbnail          1
artist          2773
category          94
compilation    10122
designer         126
expansion       7780
family          2808
mechanic         950
publisher          3
dtype: int64


In [61]:
len(df)

10532

In [62]:
# Remove duplicates
df = df.drop_duplicates()

# Check the number of rows before and after
print("Number of rows after removing duplicates:", len(df))


Number of rows after removing duplicates: 10532


In [63]:
# Example: Standardize categorical variables (e.g., lowercase the 'category' column)
df['category'] = df['category'].str.lower()

# Display unique categories to confirm changes
print(df['category'].unique())


['economic,negotiation,political' 'card game,fantasy'
 'abstract strategy,medieval' ... 'bluffing,deduction,dice'
 'fantasy,fighting,miniatures,sports'
 'bluffing,horror,maze,movies / tv / radio theme,science fiction']


In [64]:
# Example: Standardize categorical variables (e.g., lowercase the 'category' column)
df['mechanic'] = df['mechanic'].str.lower()

# Display unique categories to confirm changes
print(df['category'].unique())

['economic,negotiation,political' 'card game,fantasy'
 'abstract strategy,medieval' ... 'bluffing,deduction,dice'
 'fantasy,fighting,miniatures,sports'
 'bluffing,horror,maze,movies / tv / radio theme,science fiction']


In [65]:
# Check data types
print(df.dtypes)


game_id             int64
description        object
image              object
max_players         int64
max_playtime        int64
min_age             int64
min_players         int64
min_playtime        int64
name               object
playing_time        int64
thumbnail          object
year_published      int64
artist             object
category           object
compilation        object
designer           object
expansion          object
family             object
mechanic           object
publisher          object
average_rating    float64
users_rated         int64
dtype: object


In [66]:
# Drop columns that are not needed or have excessive missing values
columns_to_drop = ['image', 'thumbnail', 'compilation', 'year_published', 'artist', 'designer', 'publisher', 'family']
df.drop(columns=columns_to_drop, inplace=True)

# Check the DataFrame to confirm the columns have been dropped
print(df.info())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10532 entries, 0 to 10531
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   game_id         10532 non-null  int64  
 1   description     10532 non-null  object 
 2   max_players     10532 non-null  int64  
 3   max_playtime    10532 non-null  int64  
 4   min_age         10532 non-null  int64  
 5   min_players     10532 non-null  int64  
 6   min_playtime    10532 non-null  int64  
 7   name            10532 non-null  object 
 8   playing_time    10532 non-null  int64  
 9   category        10438 non-null  object 
 10  expansion       2752 non-null   object 
 11  mechanic        9582 non-null   object 
 12  average_rating  10532 non-null  float64
 13  users_rated     10532 non-null  int64  
dtypes: float64(1), int64(8), object(5)
memory usage: 1.1+ MB
None


In [67]:
# Check for missing values before filling
remaining_missing_values = df.isnull().sum()
print(remaining_missing_values[remaining_missing_values > 0])


category       94
expansion    7780
mechanic      950
dtype: int64


In [68]:
# Category counts
df['category_count'] = df['category'].apply(lambda x: len(str(x).split(',')))

df

Unnamed: 0,game_id,description,max_players,max_playtime,min_age,min_players,min_playtime,name,playing_time,category,expansion,mechanic,average_rating,users_rated,category_count
0,1,Die Macher is a game about seven sequential po...,5,240,14,3,240,Die Macher,240,"economic,negotiation,political",,"area control / area influence,auction/bidding,...",7.66508,4498,3
1,2,Dragonmaster is a trick-taking card game based...,4,30,12,3,30,Dragonmaster,30,"card game,fantasy",,trick-taking,6.60815,478,2
2,3,"Part of the Knizia tile-laying trilogy, Samura...",4,60,10,2,30,Samurai,60,"abstract strategy,medieval",,"area control / area influence,hand management,...",7.44119,12019,2
3,4,When you see the triangular box and the luxuri...,4,60,12,2,60,Tal der Könige,60,ancient,,"action point allowance system,area control / a...",6.60675,314,1
4,5,"In Acquire, each player strategically invests ...",6,90,12,3,90,Acquire,90,economic,,"hand management,stock holding,tile placement",7.35830,15195,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10527,214996,Description from the publisher:&#10;&#10;Silve...,2,480,12,1,30,"Silver Bayonet: The First Team in Vietnam, 196...",480,"vietnam war,wargame",,hex-and-counter,8.35333,75,2
10528,215437,"Codex: Card-Time Strategy is a customizable, n...",5,45,13,2,45,Codex: Card-Time Strategy – Core Set,45,"card game,fantasy,fighting,video game theme",Codex: Card-Time Strategy – Flagstone Dominion...,"deck / pool building,hand management,variable ...",8.08780,82,4
10529,215471,Time to walk about town and take some pictures...,4,20,12,2,20,Wind the Film!,20,card game,,"hand management,set collection",7.28016,63,1
10530,216201,The race is on for the robots of the Robo Rall...,6,120,12,2,20,Robo Rally (2016),120,"miniatures,racing,science fiction",,"action / movement programming,grid movement,mo...",7.45871,341,3


In [69]:
# Mechanic counts
df['mechanic_count'] = df['mechanic'].apply(lambda x: len(str(x).split(',')))

df

Unnamed: 0,game_id,description,max_players,max_playtime,min_age,min_players,min_playtime,name,playing_time,category,expansion,mechanic,average_rating,users_rated,category_count,mechanic_count
0,1,Die Macher is a game about seven sequential po...,5,240,14,3,240,Die Macher,240,"economic,negotiation,political",,"area control / area influence,auction/bidding,...",7.66508,4498,3,5
1,2,Dragonmaster is a trick-taking card game based...,4,30,12,3,30,Dragonmaster,30,"card game,fantasy",,trick-taking,6.60815,478,2,1
2,3,"Part of the Knizia tile-laying trilogy, Samura...",4,60,10,2,30,Samurai,60,"abstract strategy,medieval",,"area control / area influence,hand management,...",7.44119,12019,2,4
3,4,When you see the triangular box and the luxuri...,4,60,12,2,60,Tal der Könige,60,ancient,,"action point allowance system,area control / a...",6.60675,314,1,4
4,5,"In Acquire, each player strategically invests ...",6,90,12,3,90,Acquire,90,economic,,"hand management,stock holding,tile placement",7.35830,15195,1,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10527,214996,Description from the publisher:&#10;&#10;Silve...,2,480,12,1,30,"Silver Bayonet: The First Team in Vietnam, 196...",480,"vietnam war,wargame",,hex-and-counter,8.35333,75,2,1
10528,215437,"Codex: Card-Time Strategy is a customizable, n...",5,45,13,2,45,Codex: Card-Time Strategy – Core Set,45,"card game,fantasy,fighting,video game theme",Codex: Card-Time Strategy – Flagstone Dominion...,"deck / pool building,hand management,variable ...",8.08780,82,4,3
10529,215471,Time to walk about town and take some pictures...,4,20,12,2,20,Wind the Film!,20,card game,,"hand management,set collection",7.28016,63,1,2
10530,216201,The race is on for the robots of the Robo Rall...,6,120,12,2,20,Robo Rally (2016),120,"miniatures,racing,science fiction",,"action / movement programming,grid movement,mo...",7.45871,341,3,4


In [70]:
# Replace NaN with None in the 'expansion' column
df['expansion'] = df['expansion'].replace({np.nan: None})

# Create a new column 'has_expansion' that says 'Yes' if 'expansion' has a value, otherwise 'No'
df['has_expansion'] = df['expansion'].apply(lambda x: 'Yes' if x is not None else 'No')

# Display the dataframe to check the changes
print(df[['expansion', 'has_expansion']].head())

  expansion has_expansion
0      None            No
1      None            No
2      None            No
3      None            No
4      None            No


In [73]:
df.has_expansion.value_counts()

has_expansion
No     7780
Yes    2752
Name: count, dtype: int64

In [74]:
# Check the unique values in the 'category' column
unique_categories = df['category'].unique()

# To print each unique category
for category in unique_categories:
    print(category)


economic,negotiation,political
card game,fantasy
abstract strategy,medieval
ancient
economic
civilization,nautical
abstract strategy
civilization,fantasy
exploration
fantasy,travel
card game,farming,negotiation
ancient,mythology
negotiation
bluffing,negotiation,science fiction
collectible components,dice,fighting,print & play
miniatures,racing,science fiction
american west,bluffing,city building
science fiction
adventure,exploration,fantasy
fantasy,political,wargame
civilization,negotiation,political,science fiction,space exploration,wargame
exploration,fantasy,wargame
civilization,economic,medieval,renaissance
economic,political,wargame
card game,humor,negotiation,political
fighting,humor
adventure,electronic,exploration,fantasy,fighting
adventure,exploration,fantasy,fighting,miniatures
abstract strategy,american west
adventure,horror,novel-based
science fiction,wargame
fantasy,wargame
deduction,word game
bluffing,fantasy,novel-based
bluffing,civilization,fantasy,negotiation,political

In [75]:
# Define the genre dictionary
genre = {
    'board game': [
        'Math', 'City Building', 'Music', 'Ancient', 'Party Game', 
        'Spies/Secret Agents', 'Religious', 'Video Game Theme', 
        'World War I', 'World War II', 'Medical', 'Farming', 
        'Bluffing', 'Territory Building', 'Movies / TV / Radio theme', 
        'Animals', 'Modern Warfare', 'Trivia', 'Collectible Components', 
        'American Revolutionary War', 'Transportation', 'Memory', 
        'Murder/Mystery', 'Science Fiction', 'Fighting', 'Zombies', 
        'Word Game', 'Exploration', 'Miniatures', 'Wargame', 
        'Political', 'Adventure', 'Negotiation', 'Mafia', 
        'Educational', 'Renaissance', 'Vietnam War', 'Horror', 
        'Racing', 'Aviation / Flight', 'Unknown', 'Abstract Strategy', 
        'Environmental', 'Industry / Manufacturing', 'Prehistoric', 
        'American Civil War', 'Comic Book / Strip', 'Novel-based', 
        'Age of Reason', 'Sports', 'Pike and Shot', 'Card Game', 
        'Civilization', 'Print & Play', 'Deduction', 'Maze', 
        'Real-time', 'American Indian Wars', 'Number', 
        'American West', 'Korean War', 'Puzzle', 'Pirates', 
        'Civil War', 'Action / Dexterity', 'Trains', 'Post-Napoleonic', 
        'Space Exploration', 'Children\'s Game', 'Expansion for Base-game', 
        'Dice', 'Arabian', 'Electronic', 'Fantasy', 
        'Napoleonic', 'Humor', 'Medieval', 'Travel', 
        'Mythology', 'Mature / Adult', 'Book', 'Nautical', 
        'Game System', 'Economic'
    ]
}

# Updated get_genres function
def get_genres(row):
    genres = {}
    # Check if 'category' is not NaN
    if pd.notna(row['category']):
        raw_genres = row['category'].lower().split(', ')  # Using 'category' for raw genre data
        for canonical_genre in genre['board game']:
            # Check if the canonical genre is in the raw genres
            genres[canonical_genre] = int(canonical_genre in raw_genres)
    return genres



In [77]:
# Check the unique values in the 'category' column
unique_categories = df['mechanic'].unique()

# To print each unique category
for mechanic in unique_categories:
    print(mechanic)

area control / area influence,auction/bidding,dice rolling,hand management,simultaneous action selection
trick-taking
area control / area influence,hand management,set collection,tile placement
action point allowance system,area control / area influence,auction/bidding,set collection
hand management,stock holding,tile placement
dice rolling
area enclosure,pattern building,pattern recognition,tile placement
modular board
area control / area influence,tile placement
card drafting,hand management,point to point movement,route/network building
hand management,set collection,trading
auction/bidding,press your luck,set collection
dice rolling,hand management,modular board,route/network building,trading
roll / spin and move,set collection,simultaneous action selection
hand management,variable player powers
auction/bidding
dice rolling,press your luck
action / movement programming,grid movement,modular board,simultaneous action selection
tile placement,voting
action point allowance system
acti

In [80]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10532 entries, 0 to 10531
Data columns (total 17 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   game_id         10532 non-null  int64  
 1   description     10532 non-null  object 
 2   max_players     10532 non-null  int64  
 3   max_playtime    10532 non-null  int64  
 4   min_age         10532 non-null  int64  
 5   min_players     10532 non-null  int64  
 6   min_playtime    10532 non-null  int64  
 7   name            10532 non-null  object 
 8   playing_time    10532 non-null  int64  
 9   category        10438 non-null  object 
 10  expansion       2752 non-null   object 
 11  mechanic        9582 non-null   object 
 12  average_rating  10532 non-null  float64
 13  users_rated     10532 non-null  int64  
 14  category_count  10532 non-null  int64  
 15  mechanic_count  10532 non-null  int64  
 16  has_expansion   10532 non-null  object 
dtypes: float64(1), int64(10), objec

In [81]:
df

Unnamed: 0,game_id,description,max_players,max_playtime,min_age,min_players,min_playtime,name,playing_time,category,expansion,mechanic,average_rating,users_rated,category_count,mechanic_count,has_expansion
0,1,Die Macher is a game about seven sequential po...,5,240,14,3,240,Die Macher,240,"economic,negotiation,political",,"area control / area influence,auction/bidding,...",7.66508,4498,3,5,No
1,2,Dragonmaster is a trick-taking card game based...,4,30,12,3,30,Dragonmaster,30,"card game,fantasy",,trick-taking,6.60815,478,2,1,No
2,3,"Part of the Knizia tile-laying trilogy, Samura...",4,60,10,2,30,Samurai,60,"abstract strategy,medieval",,"area control / area influence,hand management,...",7.44119,12019,2,4,No
3,4,When you see the triangular box and the luxuri...,4,60,12,2,60,Tal der Könige,60,ancient,,"action point allowance system,area control / a...",6.60675,314,1,4,No
4,5,"In Acquire, each player strategically invests ...",6,90,12,3,90,Acquire,90,economic,,"hand management,stock holding,tile placement",7.35830,15195,1,3,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10527,214996,Description from the publisher:&#10;&#10;Silve...,2,480,12,1,30,"Silver Bayonet: The First Team in Vietnam, 196...",480,"vietnam war,wargame",,hex-and-counter,8.35333,75,2,1,No
10528,215437,"Codex: Card-Time Strategy is a customizable, n...",5,45,13,2,45,Codex: Card-Time Strategy – Core Set,45,"card game,fantasy,fighting,video game theme",Codex: Card-Time Strategy – Flagstone Dominion...,"deck / pool building,hand management,variable ...",8.08780,82,4,3,Yes
10529,215471,Time to walk about town and take some pictures...,4,20,12,2,20,Wind the Film!,20,card game,,"hand management,set collection",7.28016,63,1,2,No
10530,216201,The race is on for the robots of the Robo Rall...,6,120,12,2,20,Robo Rally (2016),120,"miniatures,racing,science fiction",,"action / movement programming,grid movement,mo...",7.45871,341,3,4,No


In [82]:
# Save the cleaned DataFrame to a CSV file in the resources folder
df.to_csv('resources/clean.csv', index=False)


# Summary Statistics
Numerical Columns:
Count: All columns have 10,532 entries.
Mean Values:
max_players: 5.66
max_playtime: 91.34 minutes
min_age: 9.71 years
min_players: 2.07
min_playtime: 80.88 minutes
playing_time: 91.34 minutes
average_rating: 6.37
users_rated: 870.08
Standard Deviations: Varying values across columns, indicating diverse data distribution.
Min/Max Values:
Players: 0 to 999
Playtime: 0 to 60,000 minutes
Rating: 1.38 to 9.00
Users Rated: 50 to 67,655
Unique Values
Description: Various detailed game descriptions, with some highlighting gameplay mechanics, history, and components.
Name: A diverse list of game titles including "Die Macher", "Dragonmaster", and "Robo Rally (2016)".
Category: Unique categories like "economic, negotiation, political" and "card game, fantasy".
Expansion: Listings of expansions with many games having multiple expansions or none.