In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [2]:
INPUT_PATH = 'resources/board_games.csv'
CATEGORY_SEP = ','
GENRES = {
    "History/War": ["Ancient", "World War I", "World War II", "American Revolutionary War", "Vietnam War", 
                    "American Civil War", "American Indian Wars", "Napoleonic", "Post-Napoleonic", "Korean War", 
                    "American West", "Age of Reason", "Pike and Shot", "Prehistoric"],
    
    "Fantasy/Adventure": ["Fantasy", "Mythology", "Adventure", "Exploration", "Pirates", "Medieval"],
    
    "Science Fiction/Futuristic": ["Science Fiction", "Space Exploration", "Zombies", "Video Game Theme"],
    
    "Strategy": ["Wargame", "Abstract Strategy", "Political", "Civilization", "Territory Building", 
                 "Transportation", "Environmental", "Industry / Manufacturing", "Economic", "Trains"],
    
    "Educational/Intellectual": ["Math", "Educational", "Number", "Puzzle", "Trivia"],
    
    "Social/Party": ["Party Game", "Bluffing", "Negotiation", "Murder/Mystery", "Mafia", "Word Game"],
    
    "Entertainment/Pop Culture": ["Movies / TV / Radio theme", "Comic Book / Strip", "Novel-based", "Book", 
                                  "Humor", "Music"],
    
    "Children's/Family": ["Children's Game", "Memory", "Dice", "Expansion for Base-game", "Collectible Components"],
    
    "Real-life/Simulation": ["Farming", "Racing", "City Building", "Modern Warfare", "Aviation / Flight", "Medical", 
                             "Transportation", "Nautical", "Travel"],
    
    "Mystery/Crime": ["Spies/Secret Agents", "Murder/Mystery", "Deduction"],
    
    "Horror": ["Horror", "Zombies", "Murder/Mystery"],
    
    "Religion/Mythology": ["Religious", "Mythology"],
    
    "Unknown/Miscellaneous": ["Unknown", "Action / Dexterity", "Maze", "Electronic", "Game System", 
                              "Real-time", "Print & Play", "Mature / Adult", "Card Game", "Arabian"]
}

In [3]:
# Load the CSV file 
df = pd.read_csv(INPUT_PATH)

# Display the first few rows and the summary of the DataFrame
print(df.columns)
print(df.shape)
df.head()

Index(['game_id', 'description', 'image', 'max_players', 'max_playtime',
       'min_age', 'min_players', 'min_playtime', 'name', 'playing_time',
       'thumbnail', 'year_published', 'artist', 'category', 'compilation',
       'designer', 'expansion', 'family', 'mechanic', 'publisher',
       'average_rating', 'users_rated'],
      dtype='object')
(10532, 22)


Unnamed: 0,game_id,description,image,max_players,max_playtime,min_age,min_players,min_playtime,name,playing_time,...,artist,category,compilation,designer,expansion,family,mechanic,publisher,average_rating,users_rated
0,1,Die Macher is a game about seven sequential po...,//cf.geekdo-images.com/images/pic159509.jpg,5,240,14,3,240,Die Macher,240,...,Marcus Gschwendtner,"Economic,Negotiation,Political",,Karl-Heinz Schmiel,,"Country: Germany,Valley Games Classic Line","Area Control / Area Influence,Auction/Bidding,...","Hans im Glück Verlags-GmbH,Moskito Spiele,Vall...",7.66508,4498
1,2,Dragonmaster is a trick-taking card game based...,//cf.geekdo-images.com/images/pic184174.jpg,4,30,12,3,30,Dragonmaster,30,...,Bob Pepper,"Card Game,Fantasy",,"G. W. ""Jerry"" D'Arcey",,Animals: Dragons,Trick-taking,"E.S. Lowe,Milton Bradley",6.60815,478
2,3,"Part of the Knizia tile-laying trilogy, Samura...",//cf.geekdo-images.com/images/pic3211873.jpg,4,60,10,2,30,Samurai,60,...,Franz Vohwinkel,"Abstract Strategy,Medieval",,Reiner Knizia,,"Asian Theme,Country: Japan,Knizia tile-laying ...","Area Control / Area Influence,Hand Management,...","999 Games,ABACUSSPIELE,Astrel Games,Ceilikan J...",7.44119,12019
3,4,When you see the triangular box and the luxuri...,//cf.geekdo-images.com/images/pic285299.jpg,4,60,12,2,60,Tal der Könige,60,...,,Ancient,,Christian Beierer,,"Country: Egypt,Promotional Board Games","Action Point Allowance System,Area Control / A...",KOSMOS,6.60675,314
4,5,"In Acquire, each player strategically invests ...",//cf.geekdo-images.com/images/pic342163.jpg,6,90,12,3,90,Acquire,90,...,"Scott Okumura,Peter Whitley",Economic,,Sid Sackson,,3M Bookshelf Series,"Hand Management,Stock Holding,Tile Placement","3M,Avalon Hill,Avalon Hill (Hasbro),Dujardin,G...",7.3583,15195


In [4]:
# Check for missing values
missing_values = df.isnull().sum()
print(missing_values[missing_values > 0])

image              1
thumbnail          1
artist          2773
category          94
compilation    10122
designer         126
expansion       7780
family          2808
mechanic         950
publisher          3
dtype: int64


In [5]:
# Drop columns that are not needed or have excessive missing values
columns_to_drop = ['image', 'thumbnail', 'compilation', 'year_published', 'artist', 'designer', 'publisher', 'family' ,'expansion']
df.drop(columns=columns_to_drop, inplace=True)

# Check the DataFrame to confirm the columns have been dropped
print(df.info())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10532 entries, 0 to 10531
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   game_id         10532 non-null  int64  
 1   description     10532 non-null  object 
 2   max_players     10532 non-null  int64  
 3   max_playtime    10532 non-null  int64  
 4   min_age         10532 non-null  int64  
 5   min_players     10532 non-null  int64  
 6   min_playtime    10532 non-null  int64  
 7   name            10532 non-null  object 
 8   playing_time    10532 non-null  int64  
 9   category        10438 non-null  object 
 10  mechanic        9582 non-null   object 
 11  average_rating  10532 non-null  float64
 12  users_rated     10532 non-null  int64  
dtypes: float64(1), int64(8), object(4)
memory usage: 1.0+ MB
None


In [6]:
# Remove duplicates
df = df.drop_duplicates()

# Check the number of rows before and after
print("Number of rows after removing duplicates:", len(df))

Number of rows after removing duplicates: 10532


In [7]:
df.head()

Unnamed: 0,game_id,description,max_players,max_playtime,min_age,min_players,min_playtime,name,playing_time,category,mechanic,average_rating,users_rated
0,1,Die Macher is a game about seven sequential po...,5,240,14,3,240,Die Macher,240,"Economic,Negotiation,Political","Area Control / Area Influence,Auction/Bidding,...",7.66508,4498
1,2,Dragonmaster is a trick-taking card game based...,4,30,12,3,30,Dragonmaster,30,"Card Game,Fantasy",Trick-taking,6.60815,478
2,3,"Part of the Knizia tile-laying trilogy, Samura...",4,60,10,2,30,Samurai,60,"Abstract Strategy,Medieval","Area Control / Area Influence,Hand Management,...",7.44119,12019
3,4,When you see the triangular box and the luxuri...,4,60,12,2,60,Tal der Könige,60,Ancient,"Action Point Allowance System,Area Control / A...",6.60675,314
4,5,"In Acquire, each player strategically invests ...",6,90,12,3,90,Acquire,90,Economic,"Hand Management,Stock Holding,Tile Placement",7.3583,15195


In [8]:
# get all orig  cat for to make GENRES dictionary
orignal_categories = list(set(df.category.str.cat(sep=CATEGORY_SEP).split(CATEGORY_SEP)))
# orignal_categories

In [9]:
def get_genres(row):
    raw_categories = set(row['category'].split(CATEGORY_SEP))
    # Now use GENRES and raw_genres to produce a dictionary genres
    genres = {}
    for good_genre, original_genre_strings in GENRES.items():       
        genres[good_genre] = not original_genre_strings.isdisjoint(raw_categories)
       
    return genres.values()

In [10]:
df[list(GENRES.keys())] = df.apply(get_genres, axis='columns', result_type='expand')
print(df.columns)
print(df.shape)
df.head()

AttributeError: 'list' object has no attribute 'isdisjoint'

In [16]:
# for cat in orignal_categories:
#     df.category.str.contains(cat).sum()

In [None]:
{cat: df.category.str.contains(cat).sum() for cat in orignal_categories}