# 1. Packages

In [3]:
# --- Core data handling
import pandas as pd 
import numpy as np 
import pyarrow

# --- Visualizations
import matplotlib.pyplot as plt
import seaborn as sns 
import plotly.express as px
import plotly.graph_objects as go 

# --- Statistics and Modeling
from scipy import stats
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score
import category_encoders as ce

# --- File I/O & utilities
import openpyxl
from datetime import datetime

# 2. Data information

In [6]:
# Load the raw games dataset from the CSV file (path is relative to the
# notebook's working directory).
df = pd.read_csv("games.csv")

# Print a structural summary: column names, non-null counts, dtypes and
# memory usage. Used here to spot missing values and mistyped columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16715 entries, 0 to 16714
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Name             16713 non-null  object 
 1   Platform         16715 non-null  object 
 2   Year_of_Release  16446 non-null  float64
 3   Genre            16713 non-null  object 
 4   NA_sales         16715 non-null  float64
 5   EU_sales         16715 non-null  float64
 6   JP_sales         16715 non-null  float64
 7   Other_sales      16715 non-null  float64
 8   Critic_Score     8137 non-null   float64
 9   User_Score       10014 non-null  object 
 10  Rating           9949 non-null   object 
dtypes: float64(6), object(5)
memory usage: 1.4+ MB


# 3. Data cleaning and preparation

### 🧹 Data Preparation and Cleaning

During this stage, the dataset was prepared and standardized to ensure that all columns had the correct format for analysis.  

First, all column names were cleaned by removing extra spaces and converting them to lowercase. This provided a consistent naming style, making it easier to reference and manipulate columns throughout the project.  

Next, the **year_of_release** column was converted from *float* to *integer* (`Int64`). Since years should not appear as decimal numbers, this change ensured a proper numeric format while allowing missing values to remain without causing errors during analysis.  

For the **user_score** column, some records contained the abbreviation “TBD” (To Be Determined), meaning that a score had not yet been assigned. These entries were replaced with missing values (`NaN`) and the column was converted to a *float* type. This transformation allows mathematical operations and statistical analyses to be performed safely on user ratings.  

Similarly, the **critic_score** column was converted to a numeric format to keep consistency between user and critic evaluations, enabling direct comparison and correlation in later stages of the project.  

Finally, the **rating** column, which represents the ESRB classification of each game, contained some missing values. These were replaced with the placeholder value **"unknown"** to prevent issues when grouping or visualizing data, while still keeping those records in the dataset.  

Together, these transformations ensure that the dataset is clean, consistent, and fully ready for the exploratory data analysis phase.

In [None]:
# --- Column headers: trim surrounding whitespace, then lowercase, so every
# column can be referenced with one consistent naming style.
trimmed_headers = df.columns.str.strip()
df.columns = trimmed_headers.str.lower()

# --- Release year: coerce to numeric (unparseable values become NaN), round,
# and store as pandas' nullable integer dtype so missing years survive the
# conversion instead of raising.
year_numeric = pd.to_numeric(df["year_of_release"], errors="coerce")
df["year_of_release"] = year_numeric.round().astype("Int64")

# --- User score: normalise the text form first so the "tbd" placeholder is
# matched regardless of case or padding, turn it into NaN, then parse the
# remaining values as floats. Anything else non-numeric is coerced to NaN.
us = (
    df["user_score"]
    .astype(str)
    .str.strip()
    .str.lower()
    .replace({"tbd": np.nan})
)
df["user_score"] = pd.to_numeric(us, errors="coerce")

# --- Critic score: force a numeric dtype; unparseable values become NaN.
df["critic_score"] = pd.to_numeric(df["critic_score"], errors="coerce")

# --- Rating: label missing values with a placeholder so those rows are kept.
df["rating"] = df["rating"].replace(np.nan, "unknown")
