## Loading the necessary libraries and the data

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Mount Google Drive at /content/drive so that the dataset stored there can be
# read like a local file. `google.colab` is Colab-specific, so this cell is
# only expected to work when the notebook runs on Google Colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# CSV with the video-game sales data, stored on the mounted Google Drive.
file_path = '/content/drive/MyDrive/MidProject/Video_Games.csv'

# Parse the file into the DataFrame that the rest of the notebook works on.
df = pd.read_csv(filepath_or_buffer=file_path)

## Data overview

In [4]:
# Preview the first five rows to check that the CSV was parsed as expected.
df.head(n=5)

Unnamed: 0,Name,Platform,Year_of_Release,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales,Critic_Score,Critic_Count,User_Score,User_Count,Developer,Rating
0,Wii Sports,Wii,2006.0,Sports,Nintendo,41.36,28.96,3.77,8.45,82.53,76.0,51.0,8.0,322.0,Nintendo,E
1,Super Mario Bros.,NES,1985.0,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24,,,,,,
2,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.68,12.76,3.79,3.29,35.52,82.0,73.0,8.3,709.0,Nintendo,E
3,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,15.61,10.93,3.28,2.95,32.77,80.0,73.0,8.0,192.0,Nintendo,E
4,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo,11.27,8.89,10.22,1.0,31.37,,,,,,


In [5]:
# Print the column names, non-null counts and dtypes of df, plus its memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16719 entries, 0 to 16718
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Name             16717 non-null  object 
 1   Platform         16719 non-null  object 
 2   Year_of_Release  16450 non-null  float64
 3   Genre            16717 non-null  object 
 4   Publisher        16665 non-null  object 
 5   NA_Sales         16719 non-null  float64
 6   EU_Sales         16719 non-null  float64
 7   JP_Sales         16719 non-null  float64
 8   Other_Sales      16719 non-null  float64
 9   Global_Sales     16719 non-null  float64
 10  Critic_Score     8137 non-null   float64
 11  Critic_Count     8137 non-null   float64
 12  User_Score       10015 non-null  object 
 13  User_Count       7590 non-null   float64
 14  Developer        10096 non-null  object 
 15  Rating           9950 non-null   object 
dtypes: float64(9), object(7)
memory usage: 2.0+ MB


In [6]:
# Number of missing values per column (`isnull` is pandas' alias for `isna`).
df.isnull().sum()

Unnamed: 0,0
Name,2
Platform,0
Year_of_Release,269
Genre,2
Publisher,54
NA_Sales,0
EU_Sales,0
JP_Sales,0
Other_Sales,0
Global_Sales,0


## EDA through Plotly Visualisations

In [7]:
# Initialise plotly's notebook integration so that figures render inline.
# NOTE(review): per the plotly docs, connected=True loads plotly.js from a CDN,
# so an internet connection is still needed to view the plots; pass
# connected=False if truly offline rendering is the goal — confirm intent.
from plotly.offline import init_notebook_mode
init_notebook_mode(connected=True)

In [34]:
import plotly.express as px

# One point per game: global sales on x, critic score on y, coloured by genre.
# Hovering over a point shows the game's name.
fig = px.scatter(
    df,
    x='Global_Sales',
    y='Critic_Score',
    color='Genre',
    hover_name='Name',
    title='Global Sales vs. Critic Scores by Genre',
)
fig.show()

In [9]:
# Box plot of critic scores for each publisher, split into one colour per
# platform; the hover label carries the game's name.
fig = px.box(
    df,
    x='Publisher',
    y='Critic_Score',
    color='Platform',
    hover_name='Name',
    title='Distribution of Critic Scores by Publisher and Platform',
)
fig.show()

In [10]:
# Sum global sales per (genre, platform) before plotting. Passing the raw frame
# makes plotly draw one bar segment per game (one mark for each of the
# thousands of rows), which is slow to render and hides the totals the title
# promises. Rows with a missing Genre are left out by the groupby.
genre_platform_sales = (
    df.groupby(['Genre', 'Platform'], as_index=False)['Global_Sales'].sum()
)

# hover_name='Name' is dropped on purpose: an aggregated bar no longer
# corresponds to a single game.
fig = px.bar(data_frame=genre_platform_sales, x='Genre', y='Global_Sales', color='Platform',
             title='Total Sales by Genre and Platform')
fig.show()

In [18]:
# Correlation is only defined for numeric columns, so drop the text columns
# (Name, Platform, ...) before computing the pairwise correlation matrix.
numeric_df = df.select_dtypes(include=np.number)
z = numeric_df.corr()

# RdBu is a diverging palette, so pin the colour range to the full correlation
# interval [-1, 1]. Without this the neutral mid-colour sits at the midpoint of
# whatever values happen to be present, not at zero correlation.
fig = px.imshow(z, color_continuous_scale='RdBu', zmin=-1, zmax=1,
                title='Correlation Heatmap for Video Game Sales Data')
fig.show()

In [19]:
# Violin plot of the critic-score distribution for each genre, with one
# colour per platform; the hover label carries the game's name.
fig = px.violin(
    df,
    x='Genre',
    y='Critic_Score',
    color='Platform',
    hover_name='Name',
    title='Distribution of Critic Scores by Genre and Platform',
)
fig.show()

In [20]:
# Pie chart with one slice per platform, sized by that platform's global sales.
fig = px.pie(
    df,
    names='Platform',
    values='Global_Sales',
    title='Proportion of Total Sales by Platform',
)
fig.show()

In [21]:
# User_Score is read from the CSV as text (object dtype), so plotly would treat
# it as a categorical axis instead of a numeric one. Convert it on a derived
# frame (df itself is left untouched); entries that are not numbers become NaN.
scores_df = df.assign(User_Score=pd.to_numeric(df['User_Score'], errors='coerce'))

# Pairwise scatter plots of sales and the two score columns, coloured by platform.
fig = px.scatter_matrix(data_frame=scores_df, dimensions=['Global_Sales', 'Critic_Score', 'User_Score'],
                        color='Platform', title='Scatter Matrix of Sales and Scores by Platform')
fig.show()

In [22]:
# Bubble chart: critic score on x, global sales on y, marker size taken from
# EU sales and colour from genre; the hover label carries the game's name.
fig = px.scatter(
    df,
    x='Critic_Score',
    y='Global_Sales',
    size='EU_Sales',
    color='Genre',
    hover_name='Name',
    title='Relationship between Critic Scores, Global Sales, and Genre',
)
fig.show()

In [23]:
# Regional sales columns that are stacked on top of each other in the chart.
region_cols = ['NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales']

# Sum each region per release year before plotting. With the raw frame plotly
# draws a separate bar segment for every game and every region, which is slow
# to render and hard to read. Rows without a release year are left out by the
# groupby; they have no position on the x axis anyway.
yearly_sales = df.groupby('Year_of_Release', as_index=False)[region_cols].sum()

fig = px.bar(data_frame=yearly_sales, x='Year_of_Release', y=region_cols,
             title='Sales Comparison across Different Regions', barmode='stack')

fig.show()

## Generating Report

In [25]:
!pip install ydata-profiling
import ydata_profiling as pp # Changed import to ydata_profiling

# Generate the EDA report
report = pp.ProfileReport(df)

# Save the report to an HTML file
report.to_file(output_file='report.html')

Collecting ydata-profiling
  Downloading ydata_profiling-4.12.2-py2.py3-none-any.whl.metadata (20 kB)
Collecting visions<0.8.0,>=0.7.5 (from visions[type_image_path]<0.8.0,>=0.7.5->ydata-profiling)
  Downloading visions-0.7.6-py3-none-any.whl.metadata (11 kB)
Collecting htmlmin==0.1.12 (from ydata-profiling)
  Downloading htmlmin-0.1.12.tar.gz (19 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting phik<0.13,>=0.11.1 (from ydata-profiling)
  Downloading phik-0.12.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.6 kB)
Collecting multimethod<2,>=1.4 (from ydata-profiling)
  Downloading multimethod-1.12-py3-none-any.whl.metadata (9.6 kB)
Collecting imagehash==4.3.1 (from ydata-profiling)
  Downloading ImageHash-4.3.1-py2.py3-none-any.whl.metadata (8.0 kB)
Collecting dacite>=1.8 (from ydata-profiling)
  Downloading dacite-1.8.1-py3-none-any.whl.metadata (15 kB)
Collecting PyWavelets (from imagehash==4.3.1->ydata-profiling)
  Downloading pywavelets-1.

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [26]:
# Render the saved report.html inline as the cell's output. The file name
# matches the one passed to report.to_file in the report-generation cell.
from IPython.display import HTML
HTML(filename='report.html')

Output hidden; open in https://colab.research.google.com to view.

# The End