In [1]:
import pandas as pd

# Load the dataset
df = pd.read_csv('/content/video_game.csv')

# Display the first few rows and basic info to understand the data
print("DataFrame head:")
display(df.head())
print("\nDataFrame info:")
df.info()

DataFrame head:


Unnamed: 0.1,Unnamed: 0,Name,Platform,Year_of_Release,Genre,Publisher,NA_players,EU_players,JP_players,Other_players,Global_players,Critic_Score,Critic_Count,User_Score,User_Count,Developer,Rating
0,1,Wii Sports,Wii,2006.0,Sports,Nintendo,41.36,28.96,3.77,8.45,82.53,76.0,51.0,8.0,322.0,Nintendo,E
1,2,Super Mario Bros.,NES,1985.0,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24,,,,,,
2,3,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.68,12.76,3.79,3.29,35.52,82.0,73.0,8.3,709.0,Nintendo,E
3,4,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,15.61,10.93,3.28,2.95,32.77,80.0,73.0,8.0,192.0,Nintendo,E
4,5,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo,11.27,8.89,10.22,1.0,31.37,,,,,,



DataFrame info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16719 entries, 0 to 16718
Data columns (total 17 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Unnamed: 0       16719 non-null  int64  
 1   Name             16717 non-null  object 
 2   Platform         16719 non-null  object 
 3   Year_of_Release  16450 non-null  float64
 4   Genre            16717 non-null  object 
 5   Publisher        16665 non-null  object 
 6   NA_players       16719 non-null  float64
 7   EU_players       16719 non-null  float64
 8   JP_players       16719 non-null  float64
 9   Other_players    16719 non-null  float64
 10  Global_players   16719 non-null  float64
 11  Critic_Score     8137 non-null   float64
 12  Critic_Count     8137 non-null   float64
 13  User_Score       10015 non-null  object 
 14  User_Count       7590 non-null   float64
 15  Developer        10096 non-null  object 
 16  Rating           9950 non-null   object 


First, let's inspect the `Year_of_Release` column. If it's not already numeric, we'll convert it, handling any non-numeric values gracefully.

In [2]:
# Convert 'Year_of_Release' to numeric, coercing errors to NaN
df['Year_of_Release'] = pd.to_numeric(df['Year_of_Release'], errors='coerce')

# Drop rows where 'Year_of_Release' is NaN after conversion, as they are uninterpretable
df.dropna(subset=['Year_of_Release'], inplace=True)

# Convert 'Year_of_Release' to integer type after dropping NaNs
df['Year_of_Release'] = df['Year_of_Release'].astype(int)

# Filter for the 2008-2018 timeframe
timeframe_df = df[(df['Year_of_Release'] >= 2008) & (df['Year_of_Release'] <= 2018)]

print("Filtered DataFrame head (2008-2018):")
display(timeframe_df.head())

Filtered DataFrame head (2008-2018):


Unnamed: 0.1,Unnamed: 0,Name,Platform,Year_of_Release,Genre,Publisher,NA_players,EU_players,JP_players,Other_players,Global_players,Critic_Score,Critic_Count,User_Score,User_Count,Developer,Rating
2,3,Mario Kart Wii,Wii,2008,Racing,Nintendo,15.68,12.76,3.79,3.29,35.52,82.0,73.0,8.3,709.0,Nintendo,E
3,4,Wii Sports Resort,Wii,2009,Sports,Nintendo,15.61,10.93,3.28,2.95,32.77,80.0,73.0,8.0,192.0,Nintendo,E
8,9,New Super Mario Bros. Wii,Wii,2009,Platform,Nintendo,14.44,6.94,4.7,2.24,28.32,87.0,80.0,8.4,594.0,Nintendo,E
14,15,Kinect Adventures!,X360,2010,Misc,Microsoft Game Studios,15.0,4.89,0.24,1.69,21.81,61.0,45.0,6.3,106.0,Good Science Studio,E
15,16,Wii Fit Plus,Wii,2009,Sports,Nintendo,9.01,8.49,2.53,1.77,21.79,80.0,33.0,7.4,52.0,Nintendo,E


Now, let's aggregate the global sales by year, for each specified region (NA, EU, JP, Other). This is analogous to a `GROUP BY` clause in SQL, followed by `SUM` aggregations for each sales column.

In [4]:
# Define the sales columns for aggregation
sales_columns = ['NA_players', 'EU_players', 'JP_players', 'Other_players']

# Aggregate sales by Year_of_Release
aggregated_sales = timeframe_df.groupby('Year_of_Release')[sales_columns].sum().reset_index()

# Display the aggregated sales
print("Aggregated Global Sales by Year (2008-2018):")
display(aggregated_sales)

Aggregated Global Sales by Year (2008-2018):


Unnamed: 0,Year_of_Release,NA_players,EU_players,JP_players,Other_players
0,2008,348.69,181.14,60.25,81.42
1,2009,335.55,187.94,61.89,73.44
2,2010,300.65,171.42,59.49,58.57
3,2011,238.79,162.97,53.07,52.75
4,2012,153.26,114.59,51.8,36.19
5,2013,153.65,121.55,47.69,38.35
6,2014,132.27,122.74,39.69,36.83
7,2015,106.86,96.72,34.09,30.31
8,2016,44.93,51.22,19.31,14.48
9,2017,0.0,0.0,0.06,0.0


This table shows the total sales for North America, Europe, Japan, and other regions, aggregated by year from 2008 to 2018. This structure is similar to what you would get from an SQL query like:

```sql
SELECT
    Year_of_Release,
    SUM(NA_Sales) AS NA_Sales,
    SUM(EU_Sales) AS EU_Sales,
    SUM(JP_Sales) AS JP_Sales,
    SUM(Other_Sales) AS Other_Sales
FROM
    video_game_table
WHERE
    Year_of_Release >= 2008 AND Year_of_Release <= 2018
GROUP BY
    Year_of_Release
ORDER BY
    Year_of_Release;
```

### 1. Handling Null Values in Legacy Metadata

Many datasets contain missing information, often represented as `NaN` (Not a Number) or `None`. For 'legacy metadata' columns, a common strategy is to fill numerical missing values with 0 (implying 'no score' or 'no count') and categorical missing values with 'Unknown' to maintain data integrity without dropping potentially valuable rows. We also need to specifically handle the `User_Score` column, which is currently an object type and might contain 'tbd' values before conversion to numeric.

This is similar to `UPDATE` statements in SQL to set default values for `NULL` entries.

In [5]:
# Make a copy of the original DataFrame to work on for validation, if `df` was modified previously
# If df is already the original, this step is just for clarity.
df_validation = df.copy()

# --- Handle User_Score column first ---
# Convert 'User_Score' to numeric, coercing errors (e.g., 'tbd') to NaN
df_validation['User_Score'] = pd.to_numeric(df_validation['User_Score'], errors='coerce')
# Fill NaN User_Score values with 0 (or a more appropriate placeholder if 'no score' is distinct from '0 score')
df_validation['User_Score'].fillna(0, inplace=True)

# --- Handle other numerical null values by filling with 0 ---
numeric_cols_with_nulls = ['Critic_Score', 'Critic_Count', 'User_Count']
for col in numeric_cols_with_nulls:
    if col in df_validation.columns:
        df_validation[col].fillna(0, inplace=True)

# --- Handle categorical null values by filling with 'Unknown' ---
categorical_cols_with_nulls = ['Name', 'Genre', 'Publisher', 'Developer', 'Rating']
for col in categorical_cols_with_nulls:
    if col in df_validation.columns:
        df_validation[col].fillna('Unknown', inplace=True)

# Display null counts after handling
print("Null counts after handling:")
display(df_validation.isnull().sum()[df_validation.isnull().sum() > 0])

print("\nFirst 5 rows with User_Score and other filled values:")
display(df_validation[['Name', 'User_Score', 'Critic_Score', 'Developer', 'Rating']].head())

Null counts after handling:


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_validation['User_Score'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_validation[col].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting value

Unnamed: 0,0



First 5 rows with User_Score and other filled values:


Unnamed: 0,Name,User_Score,Critic_Score,Developer,Rating
0,Wii Sports,8.0,76.0,Nintendo,E
1,Super Mario Bros.,0.0,0.0,Unknown,Unknown
2,Mario Kart Wii,8.3,82.0,Nintendo,E
3,Wii Sports Resort,8.0,80.0,Nintendo,E
4,Pokemon Red/Pokemon Blue,0.0,0.0,Unknown,Unknown


### 2. Ensuring Consistent Naming Conventions for Cross-Platform Titles

Ensuring consistent naming conventions is crucial for accurate analysis, especially for cross-platform titles. Minor variations (e.g., capitalization, extra spaces, special characters, or platform suffixes) can make the same game appear as different entries.

We'll create a `cleaned_name` column by applying several string cleaning operations. Then, we'll group by this `cleaned_name` and identify cases where a single cleaned name corresponds to multiple original `Name` entries, indicating an inconsistency.

In SQL, this would involve using string functions like `LOWER()`, `TRIM()`, `REPLACE()`, and then `GROUP BY` and `HAVING COUNT(DISTINCT Name) > 1`.

In [6]:
import re

# Function to clean game names
def clean_game_name(name):
    if pd.isna(name) or name == 'Unknown': # Handle NaN or 'Unknown' already filled
        return name
    name = str(name).lower() # Convert to lowercase
    name = re.sub(r'[^a-z0-9 ]', '', name) # Remove special characters, keep letters, numbers, spaces
    name = re.sub(r'\s+', ' ', name).strip() # Replace multiple spaces with single, strip whitespace

    # Optionally, remove common suffixes like platform names or years if they are not part of the core title
    # This part is highly dependent on desired level of standardization
    # For example, removing 'wii', 'ps3', 'xbox 360', 'ps4', 'xbox one' etc.
    # For this exercise, we'll keep it general to avoid over-cleaning without specific instructions.

    return name

# Apply the cleaning function to create a new 'cleaned_name' column
df_validation['cleaned_name'] = df_validation['Name'].apply(clean_game_name)

# Identify games with inconsistent naming
inconsistent_names = df_validation.groupby('cleaned_name')['Name'].nunique()
inconsistent_names = inconsistent_names[inconsistent_names > 1]

print("\nGames with potential naming inconsistencies (cleaned_name -> original unique names count):")
display(inconsistent_names.head(10))

# Display examples of actual inconsistent names for a few cases
if not inconsistent_names.empty:
    print("\nExamples of inconsistent names:")
    for cleaned_name_example in inconsistent_names.head(3).index:
        print(f"\nCleaned Name: '{cleaned_name_example}'")
        original_names = df_validation[df_validation['cleaned_name'] == cleaned_name_example]['Name'].unique()
        for original_name in original_names:
            print(f"  - {original_name}")
else:
    print("No major naming inconsistencies found based on current cleaning rules.")


Games with potential naming inconsistencies (cleaned_name -> original unique names count):


Unnamed: 0_level_0,Name
cleaned_name,Unnamed: 1_level_1
cars maternational championship,2
dragon warrior iii,2
dynasty warriors gundam,2
f1 race,2
fifa 2001 major league soccer,2
fifa road to world cup 98,2
fifa world cup germany 2006,2
formula 1 championship edition,2
go diego go great dinosaur rescue,2
go diego go safari rescue,3



Examples of inconsistent names:

Cleaned Name: 'cars maternational championship'
  - Cars: Mater-National Championship
  - Cars Mater-National Championship

Cleaned Name: 'dragon warrior iii'
  - Dragon Warrior III
  - Dragon Warrior I&II

Cleaned Name: 'dynasty warriors gundam'
  - Dynasty Warriors Gundam
  - Dynasty Warriors: Gundam
