In [50]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

# Name of the column used as the prediction target further down.
analyzed_sales = 'Global_Sales'
# Raw sales table; the following cells filter and split this frame.
df = pd.read_csv('vgsales.csv')

In [72]:
# Keep rows released before 2015 (the comparison is False for a missing
# Year, so those rows go too), then drop rows with any other missing value.
df = df[df.Year < 2015].dropna()

# Drop sparsely represented categories: a platform, and afterwards a
# publisher, is kept only if it has more than this many rows.
# Publisher counts are taken after the platform filter has been applied.
min_rows_per_category = 20
for column in ('Platform', 'Publisher'):
    # Size of each row's own group, aligned on the frame's index, so the
    # whole filter is one vectorised comparison instead of a scan per category.
    rows_in_group = df.groupby(column)[column].transform('size')
    df = df[rows_in_group > min_rows_per_category]

In [74]:
# Summarise the filtered frame: row count plus non-null count and dtype per column.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13263 entries, 0 to 16595
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Rank          13263 non-null  int64  
 1   Name          13263 non-null  object 
 2   Platform      13263 non-null  object 
 3   Year          13263 non-null  float64
 4   Genre         13263 non-null  object 
 5   Publisher     13263 non-null  object 
 6   NA_Sales      13263 non-null  float64
 7   EU_Sales      13263 non-null  float64
 8   JP_Sales      13263 non-null  float64
 9   Other_Sales   13263 non-null  float64
 10  Global_Sales  13263 non-null  float64
dtypes: float64(6), int64(1), object(4)
memory usage: 1.2+ MB


In [75]:
from sklearn.model_selection import train_test_split

# Every sales column (the target and the regional figures) is removed from
# the feature frames after the target has been extracted.
sales_columns = ['NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales']

# Split whole rows so features and target keep the same index.
train_rows, test_rows = train_test_split(df, test_size=0.3, random_state=12)

y_train = train_rows[analyzed_sales]
y_test = test_rows[analyzed_sales]
X_train = train_rows.drop(columns=sales_columns)
X_test = test_rows.drop(columns=sales_columns)

In [76]:
from sklearn.base import BaseEstimator, TransformerMixin

class ClearAttributes(BaseEstimator, TransformerMixin):
    """Stateless transformer that removes columns from a DataFrame.

    Parameters
    ----------
    columns : sequence of str, default=('Rank', 'Name')
        Labels of the columns that ``transform`` drops. The default keeps
        the behaviour of calling ``ClearAttributes()`` with no arguments.
    """

    def __init__(self, columns=('Rank', 'Name')):
        # Stored unmodified under the parameter's own name so that
        # BaseEstimator.get_params / set_params (and cloning) can see it.
        self.columns = columns

    def fit(self, X, y=None):
        """Learn nothing and return ``self`` so calls can be chained."""
        return self

    def transform(self, X):
        """Return a new frame equal to ``X`` without the configured columns.

        ``X`` must support ``DataFrame.drop``; a configured column that is
        missing from ``X`` raises ``KeyError`` (pandas' default behaviour).
        """
        return X.drop(columns=list(self.columns))

In [77]:
# Try the transformer on its own and inspect which columns survive.
tmp = ClearAttributes()
clean_data = tmp.fit(X_train).transform(X_train)
clean_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9284 entries, 16040 to 6497
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Platform   9284 non-null   object 
 1   Year       9284 non-null   float64
 2   Genre      9284 non-null   object 
 3   Publisher  9284 non-null   object 
dtypes: float64(1), object(3)
memory usage: 362.7+ KB


In [78]:
from sklearn.preprocessing import OneHotEncoder

# Categorical feature columns that get one-hot encoded.
cat_attribs = ['Platform', 'Genre', 'Publisher']

# Fit a stand-alone encoder on the training categories to inspect its result.
cat_encoder = OneHotEncoder()
jv_cat = X_train.loc[:, cat_attribs]
jv_cat_1hot = cat_encoder.fit_transform(jv_cat)

In [79]:
# Category values the fitted encoder found, one array per column of cat_attribs.
cat_encoder.categories_

[array(['2600', '3DS', 'DC', 'DS', 'GB', 'GBA', 'GC', 'GEN', 'N64', 'NES',
        'PC', 'PS', 'PS2', 'PS3', 'PS4', 'PSP', 'PSV', 'SAT', 'SNES',
        'Wii', 'WiiU', 'X360', 'XB', 'XOne'], dtype=object),
 array(['Action', 'Adventure', 'Fighting', 'Misc', 'Platform', 'Puzzle',
        'Racing', 'Role-Playing', 'Shooter', 'Simulation', 'Sports',
        'Strategy'], dtype=object),
 array(['3DO', '505 Games', '5pb', 'Acclaim Entertainment', 'Activision',
        'Activision Value', 'Alchemist', 'Arc System Works', 'Atari',
        'Atlus', 'Avanquest', 'BAM! Entertainment', 'Banpresto',
        'Bethesda Softworks', 'Black Bean Games', 'Capcom', 'Codemasters',
        'Crave Entertainment', 'D3Publisher', 'DTP Entertainment',
        'Deep Silver', 'Destineer', 'Disney Interactive Studios',
        'Eidos Interactive', 'Electronic Arts', 'Empire Interactive',
        'Enix Corporation', 'Focus Home Interactive', 'GT Interactive',
        'Game Factory', 'Global Star', 'Hudson Soft', 'Id

In [80]:
from sklearn.compose import ColumnTransformer

# Columns routed through ClearAttributes; by default it drops 'Rank' and
# 'Name', so of these three only 'Year' reaches the model as a feature.
clean_attributes = ['Rank', 'Year', 'Name']

pipeline = ColumnTransformer([
    # A fresh instance keeps this cell independent of earlier scratch variables.
    ('clean', ClearAttributes(), clean_attributes),
    # handle_unknown='ignore': a category absent from the training split is
    # encoded as all zeros at transform time instead of raising, so held-out
    # rows with an unseen platform, genre or publisher can still be scored.
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_attribs),
])

jv_prepared = pipeline.fit_transform(X_train)

In [81]:
# Display the prepared training feature matrix (its repr shows shape and storage format).
jv_prepared

<9284x116 sparse matrix of type '<class 'numpy.float64'>'
	with 37136 stored elements in Compressed Sparse Row format>

In [82]:
from sklearn.linear_model import LinearRegression

# Linear-regression baseline fitted on the prepared training features.
lin_reg = LinearRegression()
lin_reg.fit(X=jv_prepared, y=y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [83]:
# Spot-check the fitted linear model on the first five training rows.
some_data = X_train.iloc[:5]
some_labels = y_train.iloc[:5]
some_data_prepared = pipeline.transform(some_data)
print("Predictions: ", lin_reg.predict(some_data_prepared))
# Show the true targets too, so the predictions have something to be compared with.
print("Labels: ", list(some_labels))

Predicions:  [1.04380005 0.78124899 0.79057787 1.18685535 0.95088462]


In [84]:
from sklearn.metrics import mean_squared_error

# Training-set error of the linear model: RMSE is the square root of the MSE.
jv_predictions = lin_reg.predict(jv_prepared)
lin_mse = mean_squared_error(y_true=y_train, y_pred=jv_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

1.7302284975780098

In [85]:
from sklearn.tree import DecisionTreeRegressor

# Fixing random_state makes the fitted tree, and every number derived from
# it, reproducible across reruns; the seed matches the train/test split.
dtr = DecisionTreeRegressor(random_state=12)
dtr.fit(jv_prepared, y_train)

# Same five training rows that were used to spot-check the linear model.
print("Predictions: ", dtr.predict(some_data_prepared))

Predicions:  [0.05       0.16       1.625      0.98       1.01666667]


In [86]:
# Training-set error of the decision tree: RMSE is the square root of the MSE.
jv_predictions = dtr.predict(jv_prepared)
dtr_mse = mean_squared_error(y_true=y_train, y_pred=jv_predictions)
dtr_rmse = np.sqrt(dtr_mse)
dtr_rmse

0.911888872484564

In [88]:
# Rows on the '3DO' platform are excluded from the evaluation. The same mask
# is applied to features and targets: filtering only X_test would leave
# y_test longer than the predictions whenever a row is actually removed.
keep_rows = X_test.Platform != '3DO'
X_test_prepared = pipeline.transform(X_test[keep_rows])
final_predictions = lin_reg.predict(X_test_prepared)

# Held-out RMSE of the linear model.
final_mse = mean_squared_error(y_test[keep_rows], final_predictions)
final_rmse = np.sqrt(final_mse)
final_rmse

1.2040585491162552