In [178]:
import numpy as np
import pandas as pd
from tabulate import tabulate
import seaborn as sns
import matplotlib.ticker as mtick
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn import metrics
from sklearn.model_selection import train_test_split

df = pd.read_csv("best-selling game consoles.csv")
df.head(20)

Unnamed: 0,Console Name,Type,Company,Released Year,Discontinuation Year,Units sold (million),Remarks
0,PlayStation 2,Home,Sony,2000,2013,155.0,Final sales are greater than 155 million
1,Nintendo DS,Handheld,Nintendo,2004,2013,154.02,
2,Nintendo Switch,Hybrid,Nintendo,2017,0,122.55,
3,Game Boy,Handheld,Nintendo,1989,2003,64.42,The Game Boy (1989) and the Game Boy Color (19...
4,Game Boy Color,Handheld,Nintendo,1998,2003,44.06,
5,PlayStation 4,Home,Sony,2013,0,117.2,
6,PlayStation,Home,Sony,1994,2006,102.49,
7,Wii,Home,Nintendo,2006,2013,101.63,
8,PlayStation 3,Home,Sony,2006,2017,87.4,Final sales are greater than 87.4 million
9,Xbox 360,Home,Microsoft,2005,2016,84.0,Final sales are greater than 84 million


<h3>This dataset provides information on the best-selling game consoles of all time. It includes essential
details such as the console name, manufacturer, release year, total units sold, and additional relevant
information.<h3>

<h2>Data info<h2>

In [179]:
df.info()

df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51 entries, 0 to 50
Data columns (total 7 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Console Name          51 non-null     object 
 1   Type                  51 non-null     object 
 2   Company               51 non-null     object 
 3   Released Year         51 non-null     int64  
 4   Discontinuation Year  51 non-null     int64  
 5   Units sold (million)  51 non-null     float64
 6   Remarks               16 non-null     object 
dtypes: float64(1), int64(2), object(4)
memory usage: 2.9+ KB


Unnamed: 0,Released Year,Discontinuation Year,Units sold (million)
count,51.0,51.0,51.0
mean,1996.058824,1844.803922,35.549412
std,12.378064,543.657319,42.608844
min,1976.0,0.0,1.0
25%,1987.5,1992.0,3.2
50%,1994.0,1998.0,13.56
75%,2004.5,2011.5,60.205
max,2020.0,2020.0,155.0


<h4>Here I basic information about the dataset. For example; there is 51 gaming consoles, and I can see what dtype all columns are. In "describe" I see the descriptive statistics on the numerical variables<h4>

<h4>Description of dataset variables

Console Name: The name of the game console.

Type: The type of console (e.g., home console, handheld console).

Company: The company that manufactured the console.

Released Year: The year in which the console was first released.

Discontinuation Year: The year in which the console was discontinued (if applicable).

Units Sold (million): The total number of units sold worldwide, in millions.

Remarks: Any additional remarks or comments about the console (if applicable).<h4>

<h2>Data cleaning<h2>

In [180]:
df.dtypes

Console Name             object
Type                     object
Company                  object
Released Year             int64
Discontinuation Year      int64
Units sold (million)    float64
Remarks                  object
dtype: object

<h4>Checking for duplicates<h4>

In [181]:
df.duplicated()

0     False
1     False
2     False
3     False
4     False
5     False
6     False
7     False
8     False
9     False
10    False
11    False
12    False
13    False
14    False
15    False
16    False
17    False
18    False
19    False
20    False
21    False
22    False
23    False
24    False
25    False
26    False
27    False
28    False
29    False
30    False
31    False
32    False
33    False
34    False
35    False
36    False
37    False
38    False
39    False
40    False
41    False
42    False
43    False
44    False
45    False
46    False
47    False
48    False
49    False
50    False
dtype: bool

<h4>
Here im looking for missing values.
The result show that there are 35 missing vaules in "Remarks"<h4>

In [182]:
df.isnull().sum()

Console Name             0
Type                     0
Company                  0
Released Year            0
Discontinuation Year     0
Units sold (million)     0
Remarks                 35
dtype: int64

In [183]:
df = df.drop('Remarks', axis=1)

<h4>
Descriptive statistics on categorical variables
<h4>

In [189]:
df.mode()

Unnamed: 0,Console Name,Type,Company,Released Year,Discontinuation Year,Units sold (million)
0,Atari 2600,Home,Nintendo,1990.0,0.0,1.0
1,Atari 5200,,,,1996.0,
2,Atari 7800,,,,1998.0,
3,Atari Lynx,,,,2003.0,
4,ColecoVision,,,,,
5,Color TV-Game,,,,,
6,Dendy(Famiclone),,,,,
7,Dreamcast,,,,,
8,Famicom Disk System,,,,,
9,Game & Watch,,,,,


In [190]:
count = df.value_counts()
print(count)

Console Name                       Type                 Company                  Released Year  Discontinuation Year  Units sold (million)
Atari 2600                         Home                 Atari                    1977           1992                  30.00                   1
Sega Genesis/Mega Drive            Home                 Sega                     1988           1997                  30.75                   1
PlayStation                        Home                 Sony                     1994           2006                  102.49                  1
PlayStation 2                      Home                 Sony                     2000           2013                  155.00                  1
PlayStation 3                      Home                 Sony                     2006           2017                  87.40                   1
PlayStation 4                      Home                 Sony                     2013           0                     117.20                 