In [120]:
import pandas as pd


In [121]:
# Load the CSV file
file_path = 'resources/board_games.csv'  # Replace with your file path
df = pd.read_csv(file_path)

# Display the first few rows and the summary of the DataFrame
print(df.head())
print(df.info())

   game_id                                        description  \
0        1  Die Macher is a game about seven sequential po...   
1        2  Dragonmaster is a trick-taking card game based...   
2        3  Part of the Knizia tile-laying trilogy, Samura...   
3        4  When you see the triangular box and the luxuri...   
4        5  In Acquire, each player strategically invests ...   

                                          image  max_players  max_playtime  \
0   //cf.geekdo-images.com/images/pic159509.jpg            5           240   
1   //cf.geekdo-images.com/images/pic184174.jpg            4            30   
2  //cf.geekdo-images.com/images/pic3211873.jpg            4            60   
3   //cf.geekdo-images.com/images/pic285299.jpg            4            60   
4   //cf.geekdo-images.com/images/pic342163.jpg            6            90   

   min_age  min_players  min_playtime            name  playing_time  ...  \
0       14            3           240      Die Macher           

In [122]:
# Check for missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10532 entries, 0 to 10531
Data columns (total 22 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   game_id         10532 non-null  int64  
 1   description     10532 non-null  object 
 2   image           10531 non-null  object 
 3   max_players     10532 non-null  int64  
 4   max_playtime    10532 non-null  int64  
 5   min_age         10532 non-null  int64  
 6   min_players     10532 non-null  int64  
 7   min_playtime    10532 non-null  int64  
 8   name            10532 non-null  object 
 9   playing_time    10532 non-null  int64  
 10  thumbnail       10531 non-null  object 
 11  year_published  10532 non-null  int64  
 12  artist          7759 non-null   object 
 13  category        10438 non-null  object 
 14  compilation     410 non-null    object 
 15  designer        10406 non-null  object 
 16  expansion       2752 non-null   object 
 17  family          7724 non-null  

In [123]:
# Check the shape of the DataFrame
df.shape

(10532, 22)

In [124]:
# Check for missing values
missing_values = df.isnull().sum()
print(missing_values[missing_values > 0])

image              1
thumbnail          1
artist          2773
category          94
compilation    10122
designer         126
expansion       7780
family          2808
mechanic         950
publisher          3
dtype: int64


In [125]:
# Drop columns that are not needed or have excessive missing values
columns_to_drop = ['image', 'thumbnail', 'compilation', 'year_published', 'artist', 'designer']
df.drop(columns=columns_to_drop, inplace=True)

# Check the DataFrame to confirm the columns have been dropped
print(df.info())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10532 entries, 0 to 10531
Data columns (total 16 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   game_id         10532 non-null  int64  
 1   description     10532 non-null  object 
 2   max_players     10532 non-null  int64  
 3   max_playtime    10532 non-null  int64  
 4   min_age         10532 non-null  int64  
 5   min_players     10532 non-null  int64  
 6   min_playtime    10532 non-null  int64  
 7   name            10532 non-null  object 
 8   playing_time    10532 non-null  int64  
 9   category        10438 non-null  object 
 10  expansion       2752 non-null   object 
 11  family          7724 non-null   object 
 12  mechanic        9582 non-null   object 
 13  publisher       10529 non-null  object 
 14  average_rating  10532 non-null  float64
 15  users_rated     10532 non-null  int64  
dtypes: float64(1), int64(8), object(7)
memory usage: 1.3+ MB
None


In [126]:
# Fill missing values in specific columns without using inplace
df['category'] = df['category'].fillna('Unknown')
df['expansion'] = df['expansion'].fillna('None')
df['family'] = df['family'].fillna('None')
df['mechanic'] = df['mechanic'].fillna('Unknown')
df['publisher'] = df['publisher'].fillna('Unknown')

# Verify if any missing values remain
remaining_missing_values = df.isnull().sum()
print(remaining_missing_values[remaining_missing_values > 0])

Series([], dtype: int64)


In [127]:
# Check the DataFrame again
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10532 entries, 0 to 10531
Data columns (total 16 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   game_id         10532 non-null  int64  
 1   description     10532 non-null  object 
 2   max_players     10532 non-null  int64  
 3   max_playtime    10532 non-null  int64  
 4   min_age         10532 non-null  int64  
 5   min_players     10532 non-null  int64  
 6   min_playtime    10532 non-null  int64  
 7   name            10532 non-null  object 
 8   playing_time    10532 non-null  int64  
 9   category        10532 non-null  object 
 10  expansion       10532 non-null  object 
 11  family          10532 non-null  object 
 12  mechanic        10532 non-null  object 
 13  publisher       10532 non-null  object 
 14  average_rating  10532 non-null  float64
 15  users_rated     10532 non-null  int64  
dtypes: float64(1), int64(8), object(7)
memory usage: 1.3+ MB


In [128]:
df.shape

(10532, 16)

In [129]:
# Assuming the DataFrame is named df and the column is 'mechanic'

# First, we'll extract all unique values in the mechanic column
unique_mechanics = df['mechanic'].dropna().unique()

# Then, we can join them into a single string and find unique delimiters
mechanic_string = ', '.join(unique_mechanics)

# Display the unique delimiters
delimiters = set(mechanic_string)  # Using a set to find unique characters
print("Unique delimiters found in the mechanic column:", delimiters)


Unique delimiters found in the mechanic column: {'O', 'N', 'E', 'g', 'k', ',', 'S', 'I', 'u', 'd', 'V', 'v', 'r', 'b', 'a', 'x', 'p', 'W', '/', 'e', 't', 'y', 'P', 'C', 'B', 'w', 'U', 'L', 'c', 'h', 'R', 'n', 'D', 'i', ' ', 'l', 'M', 'Y', 'f', 's', 'T', 'H', 'G', 'A', '-', 'o', 'm'}


In [130]:
# Display a few rows from the 'category' column
print(df['category'].head(10))  # Change the number to see more or fewer rows


0    Economic,Negotiation,Political
1                 Card Game,Fantasy
2        Abstract Strategy,Medieval
3                           Ancient
4                          Economic
5             Civilization,Nautical
6                 Abstract Strategy
7              Civilization,Fantasy
8                       Exploration
9                    Fantasy,Travel
Name: category, dtype: object


In [133]:
# Split the 'category' column by comma and expand into separate columns
df['first_value'] = df['category'].str.split(',').str[0]

# Display the first few rows of the split categories    
df.columns


Index(['game_id', 'description', 'max_players', 'max_playtime', 'min_age',
       'min_players', 'min_playtime', 'name', 'playing_time', 'category',
       'expansion', 'family', 'mechanic', 'publisher', 'average_rating',
       'users_rated', 'first_value'],
      dtype='object')

In [117]:
category_split.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,Economic,Negotiation,Political,,,,,,,,,,,
1,Card Game,Fantasy,,,,,,,,,,,,
2,Abstract Strategy,Medieval,,,,,,,,,,,,
3,Ancient,,,,,,,,,,,,,
4,Economic,,,,,,,,,,,,,


In [118]:
# Display a few rows from the 'category' column
print(df['category'].head(10))  # Change the number to see more or fewer rows

0    Economic,Negotiation,Political
1                 Card Game,Fantasy
2        Abstract Strategy,Medieval
3                           Ancient
4                          Economic
5             Civilization,Nautical
6                 Abstract Strategy
7              Civilization,Fantasy
8                       Exploration
9                    Fantasy,Travel
Name: category, dtype: object
