# 1. Packages

In [3]:
# --- Core data handling
import pandas as pd 
import numpy as np 
import pyarrow

# --- Visualizations
import matplotlib.pyplot as plt
import seaborn as sns 
import plotly.express as px
import plotly.graph_objects as go 

# --- Statistics and Modeling
from scipy import stats
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score
import category_encoders as ce

# --- File I/O & utilities
import openpyxl
from datetime import datetime

# 2. Data information

In [4]:
# Load the raw games dataset from the working directory, then print a
# structural summary (column names, non-null counts, dtypes) before cleaning.
df = pd.read_csv("games.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16715 entries, 0 to 16714
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Name             16713 non-null  object 
 1   Platform         16715 non-null  object 
 2   Year_of_Release  16446 non-null  float64
 3   Genre            16713 non-null  object 
 4   NA_sales         16715 non-null  float64
 5   EU_sales         16715 non-null  float64
 6   JP_sales         16715 non-null  float64
 7   Other_sales      16715 non-null  float64
 8   Critic_Score     8137 non-null   float64
 9   User_Score       10014 non-null  object 
 10  Rating           9949 non-null   object 
dtypes: float64(6), object(5)
memory usage: 1.4+ MB


# 3. Data cleaning and preparation

### 🧹 Data Preparation and Cleaning

During this stage, the dataset was cleaned and standardized to ensure that all columns had the proper format for analysis and that only relevant records remained.

First, all column names were cleaned by removing extra spaces and converting them to lowercase. This provided a consistent and easy-to-use naming convention throughout the project.

Next, the **year_of_release** column was converted from *float* to *integer* (`Int64`). Since years should not appear with decimal values, this step ensured that each year was represented correctly while still allowing missing entries to remain as null values.

The **user_score** column contained some non-numeric entries such as “TBD” (To Be Determined, matched case-insensitively), which indicate that a user score had not yet been assigned. These values were replaced with missing values (`NaN`), and the column was then converted to a numeric format. This allows mathematical and statistical operations to be performed safely on user scores.

Similarly, the **rating** column, which represents the ESRB classification of each game, had missing values that were replaced with the placeholder **`unknown`**. This ensures that games without an official rating can still be included in summaries and visualizations without producing errors.

After standardizing column formats, rows missing critical information such as the **game name** or **genre** were removed. Records without these attributes cannot contribute meaningful insights to the analysis, as they cannot be identified or categorized by type.

To maintain data integrity, duplicate entries were also removed based on the combination of **name** and **platform**, keeping only the first occurrence of each unique pair. Additionally, games missing the **year_of_release** were dropped since the release year is essential for studying temporal trends and forecasting.

Finally, a new column called **total_sales** was created by summing the regional sales across **North America, Europe, Japan, and Other regions.** This provides a unified measure of each game's overall commercial performance and will be used extensively in the exploratory and comparative analyses.

Through these transformations, the dataset is now clean, consistent, and fully ready for exploratory data analysis, ensuring that all future findings are based on reliable and well-structured information.


In [12]:
# Column standardization and cleaning: strip stray whitespace and lowercase
# every column name so the rest of the notebook can use one naming convention.
df.columns = df.columns.str.strip().str.lower()

# Convert release year to a nullable integer ("Int64"), so missing years stay
# missing (<NA>) instead of forcing the whole column to float.
df["year_of_release"] = (
    pd.to_numeric(df["year_of_release"], errors="coerce")
    .round()
    .astype("Int64")
)

# Convert user_score to numeric. The "tbd" placeholder (any casing, with or
# without surrounding spaces) is mapped to NaN explicitly; errors="coerce"
# then turns any remaining non-numeric text into NaN as well.
user_score_text = df["user_score"].astype(str).str.strip().str.lower()
df["user_score"] = pd.to_numeric(
    user_score_text.replace({"tbd": np.nan}), errors="coerce"
)

# Give unrated games an explicit "unknown" label so they are kept in
# group-bys and plots instead of being silently excluded as NaN.
df["rating"] = df["rating"].fillna("unknown")

# Drop rows that cannot be identified (no name), categorized (no genre) or
# placed in time (no release year). This runs BEFORE de-duplication on
# purpose: with keep="first", removing duplicates first could retain a copy
# whose year is missing and discard a complete one, and the retained copy
# would then be dropped here, losing the game entirely.
df = df.dropna(subset=["name", "genre", "year_of_release"])

# Remove duplicates based on name and platform, keeping the first occurrence.
# NOTE(review): two releases that share a title and platform but differ in
# year are also collapsed by this key; confirm that is intended.
df = df.drop_duplicates(subset=["name", "platform"], keep="first")

# Total sales = sum of the four regional sales columns
# (sum() skips NaN by default, so a missing region counts as 0).
df["total_sales"] = df[["na_sales", "eu_sales", "jp_sales", "other_sales"]].sum(axis=1)

# Sanity checks on the cleaned frame.
assert df[["name", "genre", "year_of_release"]].notna().all().all()
assert not df.duplicated(subset=["name", "platform"]).any()

In [13]:
# Print the structural summary of `df` again (column names, non-null counts,
# dtypes) so the effect of the cleaning steps can be compared with the raw load.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 16441 entries, 0 to 16714
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   name             16441 non-null  object 
 1   platform         16441 non-null  object 
 2   year_of_release  16441 non-null  Int64  
 3   genre            16441 non-null  object 
 4   na_sales         16441 non-null  float64
 5   eu_sales         16441 non-null  float64
 6   jp_sales         16441 non-null  float64
 7   other_sales      16441 non-null  float64
 8   critic_score     7980 non-null   float64
 9   user_score       7460 non-null   float64
 10  rating           16441 non-null  object 
 11  total_sales      16441 non-null  float64
dtypes: Int64(1), float64(7), object(4)
memory usage: 1.6+ MB
