In [None]:
import numpy as np
import pandas as pd

In [None]:
# Read the video-game sales CSV (no encoding argument is passed, so pandas uses its default)
VG = pd.read_csv("vgsales.csv")

# Display the first 20 rows
VG.head(20)

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
0,1,Wii Sports,Wii,2006.0,Sports,Nintendo,41.49,29.02,3.77,8.46,82.74
1,2,Super Mario Bros.,NES,1985.0,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24
2,3,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.85,12.88,3.79,3.31,35.82
3,4,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,15.75,11.01,3.28,2.96,33.0
4,5,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo,11.27,8.89,10.22,1.0,31.37
5,6,Tetris,GB,1989.0,Puzzle,Nintendo,23.2,2.26,4.22,0.58,30.26
6,7,New Super Mario Bros.,DS,2006.0,Platform,Nintendo,11.38,9.23,6.5,2.9,30.01
7,8,Wii Play,Wii,2006.0,Misc,Nintendo,14.03,9.2,2.93,2.85,29.02
8,9,New Super Mario Bros. Wii,Wii,2009.0,Platform,Nintendo,14.59,7.06,4.7,2.26,28.62
9,10,Duck Hunt,NES,1984.0,Shooter,Nintendo,26.93,0.63,0.28,0.47,28.31


In [None]:
# (rows, columns) of the loaded frame
VG.shape

(16598, 11)

In [None]:
# Number of missing values in each column of VG
columns = VG.isna().sum(axis=0)
print(columns)

Rank              0
Name              0
Platform          0
Year            271
Genre             0
Publisher        58
NA_Sales          0
EU_Sales          0
JP_Sales          0
Other_Sales       0
Global_Sales      0
dtype: int64


In [None]:
# Encoding: replace every text (object-dtype) column with integer codes so the
# frame is fully numeric for the clustering and kNN cells below.
from sklearn.preprocessing import LabelEncoder

# One fitted encoder per column, keyed by column name. Re-fitting a single
# shared encoder would keep only the last column's classes, making it
# impossible to map the codes back to the original text, e.g.
#   label_encoders['Genre'].inverse_transform(VG['Genre'])
# Re-running this cell after the columns are already encoded finds no object
# columns and leaves this mapping empty; restart and run all to rebuild it.
label_encoders = {}

# Decide which columns to encode before any of them is overwritten.
object_cols = [col for col in VG.columns if VG[col].dtype == 'object']

# NOTE(review): Publisher contains missing values (see the null counts above);
# how LabelEncoder treats NaN mixed with strings depends on the installed
# scikit-learn version - confirm they end up as their own code rather than
# raising.
for col in object_cols:
    le = LabelEncoder()
    VG[col] = le.fit_transform(VG[col])
    label_encoders[col] = le

VG.dtypes

Rank              int64
Name              int64
Platform          int64
Year            float64
Genre             int64
Publisher         int64
NA_Sales        float64
EU_Sales        float64
JP_Sales        float64
Other_Sales     float64
Global_Sales    float64
cluster           int32
dtype: object

In [None]:
# Remove rows with a missing Year.
# DataFrame.dropna returns a new frame, so the result has to be assigned back;
# calling it without assignment leaves VG unchanged.
VG = VG.dropna(subset=['Year'])

# Show the remaining (rows, columns) to confirm the rows were dropped.
VG.shape

In [None]:
# Clusters: group the rows into 4 clusters with k-means and store each row's
# cluster id in a new `cluster` column of VG.
from sklearn.cluster import KMeans
from sklearn.impute import SimpleImputer

# Leave out a `cluster` column left over from an earlier run of this cell;
# otherwise re-running would feed the previous labels back in as a feature.
# errors='ignore' makes this a no-op on the first run.
features = VG.drop(columns='cluster', errors='ignore')

# KMeans rejects NaN input, so fill any remaining gaps with the column mean.
imputer = SimpleImputer(strategy='mean')
VG_imputed = pd.DataFrame(imputer.fit_transform(features))

# random_state is fixed so the cluster assignment is reproducible.
kmeans = KMeans(n_clusters = 4, random_state = 0)
kmeans.fit(VG_imputed)

# Create a new column and assign cluster labels to each row
labels = kmeans.labels_
print(labels)
print(len(labels))

VG['cluster'] = labels
VG.head(5)



[0 0 2 ... 3 1 3]
16598


Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales,cluster
0,1,11007,26,2006.0,10,359,41.49,29.02,3.77,8.46,82.74,0
1,2,9327,11,1985.0,4,359,29.08,3.58,6.81,0.77,40.24,0
2,3,5573,26,2008.0,6,359,15.85,12.88,3.79,3.31,35.82,2
3,4,11009,26,2009.0,10,359,15.75,11.01,3.28,2.96,33.0,0
4,5,7346,5,1996.0,7,359,11.27,8.89,10.22,1.0,31.37,0


In [None]:
# Training a kNN model that predicts each row's k-means cluster.
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split as tts
from sklearn.neighbors import KNeighborsClassifier

# Features (X) are every column except `cluster`; the target (y) is the
# `cluster` column produced by the clustering step.
# X is kept as the un-imputed feature frame; imputation happens after the split.
X = VG.drop('cluster', axis = 1)
y = VG['cluster']

# Split BEFORE imputing so the column means used to fill gaps are learned
# from the training rows only and no test-set statistics leak into training.
Xtr, Xts, ytr, yts = tts(X, y, test_size = 0.2, random_state = 0)

imputer = SimpleImputer(strategy="mean")
Xtr = imputer.fit_transform(Xtr)   # learn the means on the training rows
Xts = imputer.transform(Xts)       # reuse those means for the test rows

# Train / fit features and targets to the KNeighborsClassifier model
KNN = KNeighborsClassifier(n_neighbors = 3, metric = 'euclidean')
KNN.fit(Xtr, ytr)


In [None]:
# Final dataframe: first 10 rows of VG after the steps above (text columns
# label-encoded, `cluster` column added by the k-means cell)
VG.head(10)

Clustered data points plotted by Year (x-axis) and Genre (y-axis)


In [None]:
from matplotlib import pyplot as plt

# Scatter of Genre (label-encoded) against Year.
# The plot is drawn from VG because `_df_7`, the name used previously, is not
# defined by any cell, so the cell failed with NameError on a fresh kernel.
ax = VG.plot(kind='scatter', x='Year', y='Genre', s=32, alpha=.8)

# Hide the top and right borders of the axes.
ax.spines[['top', 'right',]].set_visible(False)

In [None]:
# Classification of data: predict the cluster of every held-out row with the fitted kNN model
# NOTE(review): `Classification` is not referenced again in the cells visible here, and this
# read passes encoding='cp1252' while the first load of the same file passes none - confirm
# whether this second read is still needed.
Classification = pd.read_csv('vgsales.csv', encoding='cp1252')

y_pred = KNN.predict(Xts)

# Compare the first five predictions with the first five true labels
print(y_pred[:5])
print(yts[:5])

In [None]:
# Confusion matrix of the true test labels against the kNN predictions
from sklearn.metrics import classification_report, confusion_matrix

cm = confusion_matrix(y_true=yts, y_pred=y_pred)

print(cm)

In [None]:
# Visualisation: annotated heatmap of the confusion matrix `cm`
import matplotlib.pyplot as plt
import seaborn as sns

# fmt='d' prints the cell counts as integers.
sns.heatmap(cm, annot = True, fmt = 'd')
plt.xlabel('Predicted')
plt.ylabel('Actual')
# plt.show must be called; the bare name `plt.show` only evaluates to the
# function object and is echoed as the cell output.
plt.show()

In [None]:
# Accuracy of the model: print scikit-learn's classification report for the
# true test labels (yts) against the kNN predictions (y_pred)
print(classification_report(yts, y_pred))