# `Predicting Video Game Sales`

In [195]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## 1) Importing the Dataset

In [196]:
df = pd.read_csv("./best-selling-gameboy.csv")

In [197]:
df.head()

Unnamed: 0,Game,Developer(s),Publisher,Platform,Release date,Sales
0,Pokémon Red / Green / Blue / Yellow,Game Freak,Nintendo,Game Boy,1996-02-27,46020000
1,Tetris,Nintendo R&D1,Nintendo,Game Boy,1989-06-14,35000000
2,Pokémon Gold / Silver / Crystal,Game Freak,Nintendo,Game Boy Color,1999-11-21,29490000
3,Super Mario Land,Nintendo R&D1,Nintendo,Game Boy,1989-04-21,18140000
4,Super Mario Land 2: 6 Golden Coins,Nintendo R&D1,Nintendo,Game Boy,1992-10-21,11180000


In [198]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 66 entries, 0 to 65
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Game          66 non-null     object
 1   Developer(s)  66 non-null     object
 2   Publisher     66 non-null     object
 3   Platform      66 non-null     object
 4   Release date  66 non-null     object
 5   Sales         66 non-null     int64 
dtypes: int64(1), object(5)
memory usage: 3.2+ KB


## 2) Data Preprocessing

### Label Encoding

In [199]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

df["Game"] = le.fit_transform(df["Game"])
df["Developer(s)"] = le.fit_transform(df["Developer(s)"])
df["Publisher"] = le.fit_transform(df["Publisher"])
df["Platform"] = le.fit_transform(df["Platform"])
df["Release date"] = le.fit_transform(df["Release date"])

In [200]:
df.head()

Unnamed: 0,Game,Developer(s),Publisher,Platform,Release date,Sales
0,31,6,6,0,35,46020000
1,44,20,6,0,2,35000000
2,29,6,6,1,52,29490000
3,39,20,6,0,0,18140000
4,40,20,6,0,21,11180000


### Checking Column Datatypes

In [201]:
df.dtypes

Game            int32
Developer(s)    int32
Publisher       int32
Platform        int32
Release date    int32
Sales           int64
dtype: object

## 3) Building the Model

### Splitting into X and y

In [202]:
X = df.drop("Sales", axis=1)
y = df["Sales"]

### Splitting into Train and Test

In [203]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.2,
                                                    random_state=42)

In [204]:
X_train.shape, X_test.shape

((52, 5), (14, 5))

In [205]:
y_train.shape, y_test.shape

((52,), (14,))

### Trying different models

In [206]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, mean_squared_error

models = {
    "Logistic Regression": LogisticRegression(max_iter=100000),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "Support Vector Machine": svm.SVC(),
    "K Nearest Neighbors": KNeighborsClassifier(n_neighbors=3),
}

### Training & Evaluating Models

In [207]:

for model_name, model in models.items():

    # Training the model
    model.fit(X_train, y_train)

    # Predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Test set performance
    model_test_accuracy = accuracy_score(y_test, y_test_pred)
    model_test_f1 = f1_score(y_test, y_test_pred, average='weighted')
    model_test_precision = precision_score(
        y_test, y_test_pred, average='weighted')
    model_test_recall = recall_score(y_test, y_test_pred, average='weighted')

    model_test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))

    # Training set performance
    model_train_accuracy = accuracy_score(y_train, y_train_pred)
    model_train_f1 = f1_score(y_train, y_train_pred, average='weighted')
    model_train_precision = precision_score(
        y_train, y_train_pred, average='weighted')
    model_train_recall = recall_score(
        y_train, y_train_pred, average='weighted')

    model_train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))

    print(model_name)

    print('Model performance for Training set')
    print("- Accuracy: {:.4f}".format(model_train_accuracy))
    print('- F1 score: {:4f}'.format(model_train_f1))
    print('- Precision: {:4f}'.format(model_train_precision))
    print('- Recall: {:4f}'.format(model_train_recall))
    print("- RMSE: {:.4f}".format(model_train_rmse))

    print('----------------------------------')

    print('Model performance for Test set')
    print('- Accuracy: {:.4f}'.format(model_test_accuracy))
    print('- Fl score: {:.4f}'.format(model_test_f1))
    print('- Precision: {:.4f}'.format(model_test_precision))
    print('- Recall: {:.4f}'.format(model_test_recall))
    print("- RMSE: {:.4f}".format(model_test_rmse))

    print('='*35)
    print('\n')

Logistic Regression
Model performance for Training set
- Accuracy: 0.8846
- F1 score: 0.863434
- Precision: 0.849359
- Recall: 0.884615
- RMSE: 1412844.2399
----------------------------------
Model performance for Test set
- Accuracy: 0.2143
- Fl score: 0.2109
- Precision: 0.2262
- Recall: 0.2143
- RMSE: 12003422.4286


Decision Tree
Model performance for Training set
- Accuracy: 1.0000
- F1 score: 1.000000
- Precision: 1.000000
- Recall: 1.000000
- RMSE: 0.0000
----------------------------------
Model performance for Test set
- Accuracy: 0.0714
- Fl score: 0.0714
- Precision: 0.0714
- Recall: 0.0714
- RMSE: 13440654.4802


Random Forest
Model performance for Training set
- Accuracy: 1.0000
- F1 score: 1.000000
- Precision: 1.000000
- Recall: 1.000000
- RMSE: 0.0000
----------------------------------
Model performance for Test set
- Accuracy: 0.2857
- Fl score: 0.2857
- Precision: 0.2857
- Recall: 0.2857
- RMSE: 12574598.2611


Support Vector Machine
Model performance for Training set


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
