# Import necessary libraries

In [None]:
# Basic Libraries
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt # we only need pyplot
sb.set() # set the default Seaborn style for graphics
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, confusion_matrix, r2_score, mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
# target encoding for all categorical variables
import category_encoders as ce
import xgboost



# Import Datasets + Merging + Cleaning

Importing First Dataset - Video Game sales

In [None]:
# Load the raw video game sales table from the local CSV and preview its first rows.
vgsales=pd.read_csv('vgsales.csv')
vgsales.head()

In [None]:
# Summary statistics for the sales table.
vgsales.describe()

Checking for Missing Values in Each Column

In [None]:
# Count the missing values in each column.
vgsales.isnull().sum()

Removing Rows with Missing Values

In [None]:
# Flag every row with at least one missing field, then pull those rows out
has_missing_field = vgsales.isna().any(axis="columns")
missing_rows = vgsales.loc[has_missing_field]

# Show the affected rows before they are dropped
print("Rows with missing values:")
print(missing_rows)

In [None]:
# Keep only fully populated rows: a row is discarded if any field is missing
cleaned_vg = vgsales.dropna(axis=0, how='any')

# Sanity check: the per-column missing counts of the cleaned frame
cleaned_vg.isna().sum()

In [None]:
# Column dtypes and non-null counts after dropping incomplete rows.
cleaned_vg.info()

In [None]:
# Number of rows recorded for each value of Year.
cleaned_vg['Year'].value_counts()

Removing Irrelevant Years

In [None]:
# Restrict the sales data to rows whose Year is before 2017
released_before_2017 = cleaned_vg['Year'].lt(2017)
cleaned_vg = cleaned_vg.loc[released_before_2017]

# Re-check how many rows remain per year
cleaned_vg['Year'].value_counts()

Importing Second Dataset - Video Game Ratings

In [None]:
# Read the ratings table and parse its release dates once
ratings_raw = pd.read_csv('all_games.csv')
release_dates = pd.to_datetime(ratings_raw['release_date'])

# Replace the text dates with parsed ones and add the release year as a new column
vg_rate = ratings_raw.assign(
    release_date=release_dates,
    Release_Year=release_dates.dt.year,
)

# Keep the same year window as the sales data (releases before 2017)
vg_rate = vg_rate[vg_rate['Release_Year'] < 2017]


Changing the Platforms of vg_rate to Short-forms, to Match the Columns of Both Datasets

In [None]:
# Compare the platform vocabularies of the two datasets before harmonising them
print(vg_rate['platform'].unique())
print(cleaned_vg['Platform'].unique())

# Long platform names used by the ratings data -> short codes used by the sales data.
# The keys keep the leading space exactly as written in the original mapping.
word_replacement = {
    ' Nintendo 64' : 'N64',
    ' PlayStation' : 'PS',
    ' PlayStation 3' : 'PS3',
    ' Dreamcast' : 'DC',
    ' Xbox 360' : 'X360',
    ' Wii' : 'Wii',
    ' Xbox One' : 'XOne',
    ' PlayStation 2' : 'PS2',
    ' PlayStation 4' : 'PS4',
    ' GameCube' : 'GC',
    ' Xbox' : 'XB',
    ' PC' : 'PC',
    ' Game Boy Advance' : 'GBA',
    ' 3DS' : '3DS',
    ' DS' : 'DS',
    ' Wii U' : 'WiiU',
    ' PlayStation Vita' : 'PSV',
    ' PSP' : 'PSP'
}

# Build a new frame instead of calling replace(..., inplace=True) on the column:
# vg_rate is a filtered slice, so the chained in-place call warns and, under
# pandas copy-on-write, leaves vg_rate unchanged (the later merge would then
# match nothing on platform).
vg_rate = vg_rate.assign(platform=vg_rate['platform'].replace(word_replacement))

vg_rate.head()

Merging Both Datasets Based on Name, Year and Platform

In [None]:
# Key columns that identify the same game in each dataset
sales_keys = ['Name', 'Year', 'Platform']
rating_keys = ['name', 'Release_Year', 'platform']

# Columns carried forward into the analysis
kept_columns = ['Rank', 'Name', 'Platform', 'Year', 'Genre', 'Publisher', 'NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales', 'meta_score', 'user_review']

# Inner join: only games present in both the sales and the ratings data survive
combined_vg = cleaned_vg.merge(vg_rate, how='inner', left_on=sales_keys, right_on=rating_keys)
combined_vg = combined_vg.loc[:, kept_columns]

Adding Missing Games

In [None]:
# Mainline pokemon games
# Hand-entered rows appended to the merged data; each dict uses the same
# columns that combined_vg carries at this point.
# NOTE(review): Rank 50 appears twice below (FireRed/LeafGreen and
# Omega Ruby/Alpha Sapphire) - verify both ranks against vgsales.csv.
mainline_pokemon = [
                    {'Rank': 26, 'Name': 'Pokemon Ruby/Pokemon Sapphire', 'Platform': 'GBA', 'Year': 2002, 'Genre': 'Role-Playing', 'Publisher': 'Nintendo', 'NA_Sales': 6.06, 'EU_Sales': 3.9, 'JP_Sales': 5.38, 'Other_Sales': 0.5, 'Global_Sales': 15.85, 'meta_score': 82, 'user_review': 8.6},
                    {'Rank': 50, 'Name': 'Pokemon FireRed/Pokemon LeafGreen', 'Platform': 'GBA', 'Year': 2004, 'Genre': 'Role-Playing', 'Publisher': 'Nintendo', 'NA_Sales': 4.34, 'EU_Sales': 2.65, 'JP_Sales': 3.15, 'Other_Sales': 0.35, 'Global_Sales': 10.49, 'meta_score': 81, 'user_review': 8.5},
                    {'Rank': 131, 'Name': 'Pokemon Emerald', 'Platform': 'GBA', 'Year': 2004, 'Genre': 'Role-Playing', 'Publisher': 'Nintendo', 'NA_Sales': 2.57, 'EU_Sales': 1.58, 'JP_Sales': 2.06, 'Other_Sales': 0.21, 'Global_Sales': 6.41, 'meta_score': 76, 'user_review': 8.9},
                    {'Rank': 21, 'Name': 'Pokemon Diamond/Pokemon Pearl', 'Platform': 'DS', 'Year': 2006, 'Genre': 'Role-Playing', 'Publisher': 'Nintendo', 'NA_Sales': 6.42, 'EU_Sales': 4.52, 'JP_Sales': 6.04, 'Other_Sales': 1.37, 'Global_Sales': 18.36, 'meta_score': 85, 'user_review': 8.2},
                    {'Rank': 89, 'Name': 'Pokemon Platinum', 'Platform': 'DS', 'Year': 2008, 'Genre': 'Role-Playing', 'Publisher': 'Nintendo', 'NA_Sales': 2.82, 'EU_Sales': 1.78, 'JP_Sales': 2.69, 'Other_Sales': 0.55, 'Global_Sales': 7.84, 'meta_score': 83, 'user_review': 8.9},
                    {'Rank': 46, 'Name': 'Pokemon HeartGold/Pokemon SoulSilver', 'Platform': 'DS', 'Year': 2009, 'Genre': 'Role-Playing', 'Publisher': 'Nintendo', 'NA_Sales': 4.4, 'EU_Sales': 2.77, 'JP_Sales': 3.96, 'Other_Sales': 0.77, 'Global_Sales': 11.9, 'meta_score': 87, 'user_review': 9.1},
                    {'Rank': 27, 'Name': 'Pokemon Black/Pokemon White', 'Platform': 'DS', 'Year': 2010, 'Genre': 'Role-Playing', 'Publisher': 'Nintendo', 'NA_Sales': 5.57, 'EU_Sales': 3.28, 'JP_Sales': 5.65, 'Other_Sales': 0.82, 'Global_Sales': 15.32, 'meta_score': 87, 'user_review': 7.7},
                    {'Rank': 82, 'Name': 'Pokemon Black 2/Pokemon White 2', 'Platform': 'DS', 'Year': 2012, 'Genre': 'Role-Playing', 'Publisher': 'Nintendo', 'NA_Sales': 2.91, 'EU_Sales': 1.86, 'JP_Sales': 3.14, 'Other_Sales': 0.43, 'Global_Sales': 8.33, 'meta_score': 80, 'user_review': 7.9},
                    {'Rank': 33, 'Name': 'Pokemon X/Pokemon Y', 'Platform': '3DS', 'Year': 2013, 'Genre': 'Role-Playing', 'Publisher': 'Nintendo', 'NA_Sales': 5.17, 'EU_Sales': 4.05, 'JP_Sales': 4.34, 'Other_Sales': 0.79, 'Global_Sales': 14.35, 'meta_score': 87, 'user_review': 7.5},
                    {'Rank': 50, 'Name': 'Pokemon Omega Ruby/Pokemon Alpha Sapphire', 'Platform': '3DS', 'Year': 2014, 'Genre': 'Role-Playing', 'Publisher': 'Nintendo', 'NA_Sales': 4.23, 'EU_Sales': 3.37, 'JP_Sales': 3.08, 'Other_Sales': 0.65, 'Global_Sales': 11.33, 'meta_score': 82, 'user_review': 7.5},
                    ]

# Turn the list of records into a frame so it can be concatenated
mainline_pokemon = pd.DataFrame(mainline_pokemon)

# Append the rows and renumber the index from 0.
# NOTE(review): re-running this cell appends the same rows a second time.
combined_vg = pd.concat([combined_vg, mainline_pokemon], ignore_index=True)

In [None]:
# Non-mainline pokemon games
# Hand-entered rows with the same columns as combined_vg.
# NOTE(review): `missing_rows` was used earlier in the notebook for the rows of
# vgsales with missing values; this cell rebinds the name to something unrelated.
missing_rows = [{'Rank':605, 'Name':'Pokemon Colosseum', 'Platform':'GC', 'Year':2003, 'Genre':'Role-Playing','Publisher':'Nintendo','NA_Sales':1.21,'EU_Sales':0.57,'JP_Sales':0.7,'Other_Sales':0.07,'Global_Sales':2.54,'meta_score':81,'user_review':8.6},
                             {'Rank':826, 'Name':'Pokemon Mystery Dungeon: Blue Rescue Team', 'Platform':'DS', 'Year':2005, 'Genre':'Role-Playing','Publisher':'Nintendo','NA_Sales':1.16,'EU_Sales':0.06, 'JP_Sales':0.83,'Other_Sales':0,'Global_Sales':2.05,'meta_score':62,'user_review':8},
                             {'Rank':1816, 'Name':'Pokemon Mystery Dungeon: Red Rescue Team', 'Platform':'GBA', 'Year':2005, 'Genre':'Role-Playing','Publisher':'Nintendo','NA_Sales':0.81,'EU_Sales':0.3,'JP_Sales':0,'Other_Sales':0.02,'Global_Sales':1.13,'meta_score':67,'user_review':8.3},
                             {'Rank':548, 'Name':'Pokemon Stadium 2', 'Platform':'N64', 'Year':2000, 'Genre':'Strategy','Publisher':'Nintendo','NA_Sales':1.02,'EU_Sales':0.36,'JP_Sales':1.13,'Other_Sales':0.23,'Global_Sales':2.73,'meta_score':78,'user_review':8.3},
                             {'Rank':1684, 'Name':'PokePark Wii: Pikachu\'s Adventure', 'Platform':'Wii', 'Year':2009, 'Genre':'Adventure','Publisher':'Nintendo','NA_Sales':0.55,'EU_Sales':0.17,'JP_Sales':0.42,'Other_Sales':0.06,'Global_Sales':1.2,'meta_score':62,'user_review':7.5}]

# Convert the records to a frame and append them, renumbering the index from 0.
# NOTE(review): re-running this cell appends the same rows a second time.
missing_rows=pd.DataFrame(missing_rows)
combined_vg = pd.concat([combined_vg, missing_rows], ignore_index=True)

Further Cleaning of Data to Remove Games with "tbd"

In [None]:
# Rows whose user review is the placeholder text 'tbd' carry no usable score.
# The column mixes strings (from the CSV) and floats (from the hand-entered rows),
# so compare on a string view; regex=False makes 'tbd' a literal substring match.
is_tbd = combined_vg['user_review'].astype(str).str.contains('tbd', regex=False)

# .copy() gives an independent frame, so the column assignment below does not
# raise SettingWithCopyWarning the way assigning into a filtered slice does.
combined_vg = combined_vg.loc[~is_tbd].copy()

# With the placeholders gone, every remaining review parses as a number
combined_vg['user_review'] = combined_vg['user_review'].astype(float)

combined_vg.shape

Removing Other_Sales and Global_Sales

In [None]:
# Drop Other_Sales and Global_Sales.
# Rebinding the result (instead of inplace=True) together with errors='ignore'
# keeps the cell idempotent: re-running it no longer raises KeyError once the
# columns are already gone.
combined_vg = combined_vg.drop(columns=['Other_Sales', 'Global_Sales'], errors='ignore')
combined_vg.head()

# Exploratory Data Analysis

In [None]:
# Regional sales columns that the EDA plots loop over
sales_vars = ['NA_Sales', 'EU_Sales', 'JP_Sales']

# Convenience handles for the individual columns
na_sales, eu_sales, jp_sales = (combined_vg[col] for col in sales_vars)
user_review = combined_vg['user_review']
year = combined_vg['Year']

Box Plots of Sales

In [None]:
# One horizontal boxplot per regional sales column.
# A 1x3 grid is used (the previous 2x3 grid left the whole bottom row empty),
# and the 'Sales' label sits on the x axis, which is the axis that actually
# carries the sales values for a horizontal boxplot.
box_vars = ['NA_Sales', 'EU_Sales', 'JP_Sales']
box_fig, box_axes = plt.subplots(1, len(box_vars), figsize=(16, 5))

for ax, var in zip(box_axes, box_vars):
    sb.boxplot(x=var, data=combined_vg, ax=ax)
    ax.set_title(f'Boxplot of {var}')
    ax.set_xlabel('Sales')
    ax.set_ylabel('')

plt.tight_layout()
plt.show()

# Numeric summary to read alongside the plots
combined_vg[box_vars].describe()

Box Plot of Sales, Focusing on Median

In [None]:
# Boxplots without outliers, zoomed in around the median of each sales column.
# The boxes are drawn vertically (y=var) so that the y-limits and the
# 'Sales (in millions)' y-label apply to the sales axis; previously the boxes
# were horizontal and the limits were applied to the categorical axis instead.
median_vars = ['NA_Sales', 'EU_Sales', 'JP_Sales']
median_fig, median_axes = plt.subplots(1, len(median_vars), figsize=(16, 5))

for ax, var in zip(median_axes, median_vars):
    sb.boxplot(y=var, data=combined_vg, showfliers=False, ax=ax)  # outliers hidden

    # Window of +/- 1.5 standard deviations centred on the median
    median_value = combined_vg[var].median()
    half_window = 1.5 * combined_vg[var].std()
    ax.set_ylim(median_value - half_window, median_value + half_window)

    ax.set_title(f'Boxplot of {var}')
    ax.set_xlabel('')
    ax.set_ylabel('Sales (in millions)')

plt.tight_layout()
plt.show()



Pie Chart Showing the Distribution of Regional Sales

In [None]:
# Long-format the three regional columns and total the sales per region
area = (
    combined_vg[['NA_Sales', 'EU_Sales', 'JP_Sales']]
    .melt(var_name='Area', value_name='Total_Sales')
    .groupby('Area', as_index=False)['Total_Sales']
    .sum()
)

# Share of each region in the combined regional sales
plt.pie(area['Total_Sales'], labels=area['Area'], autopct='%.2f%%')
plt.show()

Horizontal Bar Plot of Publishers and Regional Sales

In [None]:
# Number of publishers shown per panel. Defined once so the selection and the
# panel title cannot drift apart (the old comments said "top 10" while the code
# selected 20).
n_top_publishers = 20

f, axes = plt.subplots(1, 3, figsize=(24, 12))

# Flatten the axes array for easier indexing
axes = axes.flatten()

for i, var in enumerate(sales_vars):
    # Total sales per publisher for this region, largest first
    publisher_sales = combined_vg.groupby('Publisher')[var].sum().sort_values(ascending=False)

    # Keep only the best-selling publishers
    top_publishers = publisher_sales.head(n_top_publishers)

    # Horizontal bars, ordered from highest to lowest sales
    sb.barplot(x=top_publishers.values, y=top_publishers.index, ax=axes[i], order=top_publishers.index)

    axes[i].set_xlabel('Sales')
    axes[i].set_ylabel('Publisher')
    axes[i].set_title(f'Top {n_top_publishers} Publishers by Sales for {var}')

plt.tight_layout()
plt.show()

Horizontal Bar Plot of Different Genres and Regional Sales 

In [None]:
fig, ax = plt.subplots(1, 3, figsize=(24, 12))

# Total sales per genre for each region, sorted from best- to worst-selling
genre_na, genre_eu, genre_jp = (
    combined_vg.groupby('Genre')[sales_col].sum().sort_values(ascending=False)
    for sales_col in ('NA_Sales', 'EU_Sales', 'JP_Sales')
)

# One panel per region; the bars follow the sorted order of that region's totals
genre_panels = (
    (ax[0], genre_na, 'NA Sales by Genre', 'NA Sales'),
    (ax[1], genre_eu, 'EU Sales by Genre', 'EU Sales'),
    (ax[2], genre_jp, 'JP Sales by Genre', 'JP Sales'),
)
for panel, genre_totals, panel_title, x_label in genre_panels:
    sb.barplot(x=genre_totals, y=genre_totals.index, ax=panel, order=genre_totals.index)
    panel.set_title(panel_title)
    panel.set_xlabel(x_label)
    panel.set_ylabel('Genre')

plt.tight_layout()
plt.show()


Horizontal Bar Plot of Different Platforms and Regional Sales

In [None]:
f, axes = plt.subplots(1, 3, figsize=(24, 12))

# (panel, sales column, x-axis label, title) for each region
platform_panels = (
    (axes[0], 'NA_Sales', 'NA', 'NA Sales by Platform'),
    (axes[1], 'EU_Sales', 'EU', 'EU Sales by Platform'),
    (axes[2], 'JP_Sales', 'JP', 'JP Sales by Platform'),
)

for panel, sales_col, x_label, panel_title in platform_panels:
    # Total sales per platform in this region, largest first
    platform_sales = combined_vg.groupby('Platform')[sales_col].sum().sort_values(ascending=False)
    sb.barplot(x=platform_sales.values, y=platform_sales.index, ax=panel)
    panel.set_xlabel(x_label)
    panel.set_ylabel('Platform')
    panel.set_title(panel_title)

plt.show()

Line plot of Sales Over the Years

In [None]:
# Total sales per release year, one line per region
nasale = combined_vg.groupby('Year')['NA_Sales'].sum().reset_index()
plt.plot(nasale['Year'], nasale['NA_Sales'], color='orange', label='NA Sales')

eusale = combined_vg.groupby('Year')['EU_Sales'].sum().reset_index()
plt.plot(eusale['Year'], eusale['EU_Sales'], label='EU Sales')

jpsale = combined_vg.groupby('Year')['JP_Sales'].sum().reset_index()
plt.plot(jpsale['Year'], jpsale['JP_Sales'], label='JP Sales')

# Label the figure so it can be read on its own (it previously had no axis
# labels or title)
plt.xlabel('Year')
plt.ylabel('Total sales (in millions)')
plt.title('Regional Sales by Release Year')

# Rotate x-axis labels
plt.xticks(rotation=45, ha='right')
# Rotate y-axis labels
plt.yticks(rotation=90, ha='right')

# Add legends
plt.legend()

plt.show()  # display

Line Plot of Sales Differences

In [None]:
f, axes = plt.subplots(1, 3, figsize=(24, 12))
axes = axes.ravel()

for panel, var in zip(axes, sales_vars):
    # Year-over-year change in the region's total sales
    sales_by_year = combined_vg.groupby('Year')[var].sum()
    sales_diff = sales_by_year.diff()

    panel.plot(sales_diff.index, sales_diff, marker='o', linestyle='-')
    panel.set(
        xlabel='Year',
        ylabel=f'Difference in {var}',
        title=f'Difference in {var} Over Consecutive Years',
    )

    # One tick for every year in the covered range, rotated so they do not overlap
    first_year = int(sales_diff.index.min())
    last_year = int(sales_diff.index.max())
    panel.set_xticks(range(first_year, last_year + 1))
    panel.tick_params(axis='x', rotation=90)

    panel.grid(True)

plt.tight_layout()

plt.show()


Scatter Plot of Meta Score and Regional Sales

In [None]:
f, axes = plt.subplots(1, 3, figsize=(24, 12))
axes = axes.ravel()

# Meta score on the x axis against each region's sales
for panel, var in zip(axes, sales_vars):
    sb.scatterplot(data=combined_vg, x='meta_score', y=var, ax=panel)
    panel.set(xlabel='Meta Score', ylabel=var)

plt.tight_layout()
plt.show()

Scatter Plot of User Review and Regional Sales

In [None]:
f, axes = plt.subplots(1, 3, figsize=(24, 12))
axes = axes.ravel()

# User review score on the x axis against each region's sales
for panel, var in zip(axes, sales_vars):
    sb.scatterplot(data=combined_vg, x='user_review', y=var, ax=panel)
    panel.set(xlabel='User Review', ylabel=var)

plt.tight_layout()
plt.show()

Heat Map of All the Numeric Variables

In [None]:
# Keep every numeric column. include='number' matches all integer and float
# widths, whereas the previous ['float64', 'int64'] list would silently drop
# columns stored as e.g. int32 or float32.
numeric_columns = combined_vg.select_dtypes(include='number')

# Pairwise correlations between the numeric columns
corr_matrix = numeric_columns.corr()

# Plot the heatmap
plt.figure(figsize=(10, 8))
sb.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f", linewidths=0.5)
plt.title('Correlation Heatmap of Numeric Variables')
plt.show()


# Target Encoding

Changing the Datatype of Year to String

In [None]:
# Year is a categorical variable, but it is numeric in the dataset, so it is
# converted to a string before target encoding.
# Going through int first yields labels like '2006' rather than '2006.0' when
# the column is stored as float; pd.to_numeric also accepts values that are
# already strings, so the cell can be re-run safely. assign() returns a new
# frame instead of writing into a filtered slice.
combined_vg = combined_vg.assign(
    Year=pd.to_numeric(combined_vg['Year']).astype(int).astype(str)
)

Target Encoding based on NA_Sales, EU_Sales and JP_Sales

In [None]:
# One encoder object, refitted for each regional target in turn
encoder = ce.TargetEncoder(cols=['Year', 'Genre', 'Publisher', 'Platform'])


def _encode_against(target_col):
    """Target-encode the categorical columns of combined_vg against `target_col`.

    The target column is removed from the feature frame before fitting, and the
    shared `encoder` is refitted on every call.
    """
    features = combined_vg.drop(columns=target_col)
    return encoder.fit_transform(features, combined_vg[target_col])


# NOTE(review): the encoder is fitted on the full dataset, before any train/test
# split, so the encoded features of later test rows already contain target
# information (leakage). Fixing it means splitting first and fitting on the
# training rows only, which would change every modelling cell downstream.
na_encoded = _encode_against('NA_Sales')
eu_encoded = _encode_against('EU_Sales')
jp_encoded = _encode_against('JP_Sales')

# Re-attach each target as the last column of its encoded frame
vg_na = pd.concat([na_encoded, combined_vg['NA_Sales']], axis="columns")
vg_eu = pd.concat([eu_encoded, combined_vg['EU_Sales']], axis="columns")
vg_jp = pd.concat([jp_encoded, combined_vg['JP_Sales']], axis="columns")


Display Encoded Dataset Based on NA Sales

In [None]:
# NA Encoded Data: preview of the frame encoded against NA_Sales
vg_na.head()

Display Encoded Dataset Based on EU Sales

In [None]:
# EU Encoded Data: preview of the frame encoded against EU_Sales
vg_eu.head()

Display Encoded Dataset Based on JP Sales

In [None]:
# JP Encoded Data: preview of the frame encoded against JP_Sales
vg_jp.head()

# Machine Learning

# Linear Regression

Linear Regression for NA

In [None]:
# Everything except the three sales columns and the Name/Rank identifiers is a predictor
non_predictor_cols = ['NA_Sales', 'EU_Sales', 'JP_Sales', 'Name', 'Rank']
x = vg_na.drop(columns=non_predictor_cols)  # Predictor
y = vg_na[['NA_Sales']]  # Response, kept as a one-column frame
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=42)

print("Shape of train set:", X_train.shape, y_train.shape)
print("Shape of test set:", X_test.shape, y_test.shape)

# Fit the linear model on the training split
linreg = LinearRegression().fit(X_train, y_train)

print('Intercept \t: b = ', linreg.intercept_)
print('Coefficients \t: a = ', linreg.coef_)

# Predictions for both splits
y_train_pred = linreg.predict(X_train)
y_test_pred = linreg.predict(X_test)

# Predicted vs. true values; the red line marks a perfect prediction
f, axes = plt.subplots(1, 2, figsize=(24, 12))
panels = (
    (axes[0], y_train, y_train_pred, "blue",
     "True values of NA Sales (Train)", "Predicted values of NA Sales (Train)"),
    (axes[1], y_test, y_test_pred, "green",
     "True values of NA Sales (Test)", "Predicted values of the NA Sales (Test)"),
)
for panel, actual, predicted, colour, x_label, y_label in panels:
    panel.scatter(actual, predicted, color=colour)
    panel.plot(actual, actual, 'r-', linewidth=1)
    panel.set_xlabel(x_label)
    panel.set_ylabel(y_label)
plt.show()

# Goodness of fit on the training split
train_mse = mean_squared_error(y_train, y_train_pred)
print("Goodness of Fit of Model \tTrain Dataset")
print("Explained Variance (R^2) \t:", linreg.score(X_train, y_train))
print("Mean Squared Error (MSE) \t:", train_mse)
print("Root Mean Squared Error (RMSE) \t:", np.sqrt(train_mse))
print()

# Prediction accuracy on the held-out test split
test_mse = mean_squared_error(y_test, y_test_pred)
print("Prediction Accuracy of Model \tTest Dataset")
print("Explained Variance (R^2) \t:", linreg.score(X_test, y_test))
print("Mean Squared Error (MSE) \t:", test_mse)
print("Root Mean Squared Error (RMSE) \t:", np.sqrt(test_mse))
print()

Predicting NA Sales for specific games using the Linear Regression model derived above

In [None]:
# Hand-picked titles used to spot-check the NA model
spot_check_titles = ["Grand Theft Auto: Vice City Stories", "Destiny", "FIFA 15"]
is_spot_check = vg_na["Name"].isin(spot_check_titles)
NA_Sales_pred = vg_na.loc[is_spot_check]
NA_Sales_pred

In [None]:
# Feature columns passed to the model
predictors = ["Platform", "Year", "Genre", "Publisher", "meta_score", "user_review"]

# Predictor values of the selected games
X_pred = NA_Sales_pred.loc[:, predictors]

# Model output for those games
y_pred = linreg.predict(X_pred)
y_pred

In [None]:
# Put the predictions next to the actual NA sales of the selected games
y_pred = pd.DataFrame(y_pred, index=NA_Sales_pred.index, columns=["PredTotal"])
actual_vs_pred = pd.concat([NA_Sales_pred[["Name", "NA_Sales", 'Platform']], y_pred], axis="columns")

# Absolute error as a percentage of the actual sales
pct_error = 100 * (actual_vs_pred["NA_Sales"] - actual_vs_pred["PredTotal"]).abs() / actual_vs_pred["NA_Sales"]
y_errs = pct_error.to_frame(name="Error Percentage")

NA_Sales_acc = pd.concat([actual_vs_pred, y_errs], axis="columns")
NA_Sales_acc

Not a very good prediction as the errors are quite high

Linear Regression for EU

In [None]:
# Everything except the three sales columns and the Name/Rank identifiers is a predictor
non_predictor_cols = ['NA_Sales', 'EU_Sales', 'JP_Sales', 'Name', 'Rank']
x = vg_eu.drop(columns=non_predictor_cols)  # predictor
y = vg_eu[['EU_Sales']]  # response, kept as a one-column frame
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=42)

print("Shape of train set:", X_train.shape, y_train.shape)
print("Shape of test set:", X_test.shape, y_test.shape)

# Fit the linear model on the training split
linreg = LinearRegression().fit(X_train, y_train)

print('Intercept \t: b = ', linreg.intercept_)
print('Coefficients \t: a = ', linreg.coef_)

# Predictions for both splits
y_train_pred = linreg.predict(X_train)
y_test_pred = linreg.predict(X_test)

# Predicted vs. true values; the red line marks a perfect prediction
f, axes = plt.subplots(1, 2, figsize=(24, 12))
panels = (
    (axes[0], y_train, y_train_pred, "blue",
     "True values of EU Sales (Train)", "Predicted values of EU Sales (Train)"),
    (axes[1], y_test, y_test_pred, "green",
     "True values of EU Sales (Test)", "Predicted values of the EU Sales (Test)"),
)
for panel, actual, predicted, colour, x_label, y_label in panels:
    panel.scatter(actual, predicted, color=colour)
    panel.plot(actual, actual, 'r-', linewidth=1)
    panel.set_xlabel(x_label)
    panel.set_ylabel(y_label)
plt.show()

# Goodness of fit on the training split
train_mse = mean_squared_error(y_train, y_train_pred)
print("Goodness of Fit of Model \tTrain Dataset")
print("Explained Variance (R^2) \t:", linreg.score(X_train, y_train))
print("Mean Squared Error (MSE) \t:", train_mse)
print("Root Mean Squared Error (RMSE) \t:", np.sqrt(train_mse))
print()

# Prediction accuracy on the held-out test split
test_mse = mean_squared_error(y_test, y_test_pred)
print("Prediction Accuracy of Model \tTest Dataset")
print("Explained Variance (R^2) \t:", linreg.score(X_test, y_test))
print("Mean Squared Error (MSE) \t:", test_mse)
print("Root Mean Squared Error (RMSE) \t:", np.sqrt(test_mse))
print()

Predicting EU Sales for specific games using the Linear Regression model derived above

In [None]:
# Hand-picked titles used to spot-check the EU model
spot_check_titles = ["Grand Theft Auto: Vice City Stories", "Destiny", "FIFA 15"]
is_spot_check = vg_eu["Name"].isin(spot_check_titles)
EU_Sales_pred = vg_eu.loc[is_spot_check]
EU_Sales_pred

In [None]:
# Feature columns passed to the model
predictors = ["Platform", "Year", "Genre", "Publisher", "meta_score", "user_review"]

# Predictor values of the selected games
X_pred = EU_Sales_pred.loc[:, predictors]

# Model output for those games
y_pred = linreg.predict(X_pred)
y_pred

In [None]:
# Put the predictions next to the actual EU sales of the selected games
y_pred = pd.DataFrame(y_pred, index=EU_Sales_pred.index, columns=["PredTotal"])
actual_vs_pred = pd.concat([EU_Sales_pred[["Name", "EU_Sales"]], y_pred], axis="columns")

# Absolute error as a percentage of the actual sales
pct_error = 100 * (actual_vs_pred["EU_Sales"] - actual_vs_pred["PredTotal"]).abs() / actual_vs_pred["EU_Sales"]
y_errs = pct_error.to_frame(name="Error percentage")

EU_Sales_acc = pd.concat([actual_vs_pred, y_errs], axis="columns")
EU_Sales_acc

Linear Regression for JP

In [None]:
# Everything except the three sales columns and the Name/Rank identifiers is a predictor
non_predictor_cols = ['NA_Sales', 'EU_Sales', 'JP_Sales', 'Name', 'Rank']
x = vg_jp.drop(columns=non_predictor_cols)  # predictor
y = vg_jp[['JP_Sales']]  # response, kept as a one-column frame
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=42)

print("Shape of train set:", X_train.shape, y_train.shape)
print("Shape of test set:", X_test.shape, y_test.shape)

# Fit the linear model on the training split
linreg = LinearRegression().fit(X_train, y_train)

print('Intercept \t: b = ', linreg.intercept_)
print('Coefficients \t: a = ', linreg.coef_)

# Predictions for both splits
y_train_pred = linreg.predict(X_train)
y_test_pred = linreg.predict(X_test)

# Predicted vs. true values; the red line marks a perfect prediction
f, axes = plt.subplots(1, 2, figsize=(24, 12))
panels = (
    (axes[0], y_train, y_train_pred, "blue",
     "True values of JP Sales (Train)", "Predicted values of JP Sales (Train)"),
    (axes[1], y_test, y_test_pred, "green",
     "True values of JP Sales (Test)", "Predicted values of the JP Sales (Test)"),
)
for panel, actual, predicted, colour, x_label, y_label in panels:
    panel.scatter(actual, predicted, color=colour)
    panel.plot(actual, actual, 'r-', linewidth=1)
    panel.set_xlabel(x_label)
    panel.set_ylabel(y_label)
plt.show()

# Goodness of fit on the training split
train_mse = mean_squared_error(y_train, y_train_pred)
print("Goodness of Fit of Model \tTrain Dataset")
print("Explained Variance (R^2) \t:", linreg.score(X_train, y_train))
print("Mean Squared Error (MSE) \t:", train_mse)
print("Root Mean Squared Error (RMSE) \t:", np.sqrt(train_mse))
print()

# Prediction accuracy on the held-out test split
test_mse = mean_squared_error(y_test, y_test_pred)
print("Prediction Accuracy of Model \tTest Dataset")
print("Explained Variance (R^2) \t:", linreg.score(X_test, y_test))
print("Mean Squared Error (MSE) \t:", test_mse)
print("Root Mean Squared Error (RMSE) \t:", np.sqrt(test_mse))
print()

Predicting JP Sales for specific games using the Linear Regression model derived above

In [None]:
# Hand-picked titles used to spot-check the JP model
spot_check_titles = ["Grand Theft Auto: Vice City Stories", "Destiny", "FIFA 15"]
is_spot_check = vg_jp["Name"].isin(spot_check_titles)
JP_Sales_pred = vg_jp.loc[is_spot_check]
JP_Sales_pred

In [None]:
# Feature columns passed to the model
predictors = ["Platform", "Year", "Genre", "Publisher", "meta_score", "user_review"]

# Predictor values of the selected games
X_pred = JP_Sales_pred.loc[:, predictors]

# Model output for those games
y_pred = linreg.predict(X_pred)
y_pred

In [None]:
# Put the predictions next to the actual JP sales of the selected games
y_pred = pd.DataFrame(y_pred, index=JP_Sales_pred.index, columns=["PredTotal"])
actual_vs_pred = pd.concat([JP_Sales_pred[["Name", "JP_Sales"]], y_pred], axis="columns")

# Absolute error as a percentage of the actual sales.
# NOTE(review): a game with JP_Sales of 0 yields an infinite percentage here.
pct_error = 100 * (actual_vs_pred["JP_Sales"] - actual_vs_pred["PredTotal"]).abs() / actual_vs_pred["JP_Sales"]
y_errs = pct_error.to_frame(name="Error percentage")

JP_Sales_acc = pd.concat([actual_vs_pred, y_errs], axis="columns")
JP_Sales_acc

# Random Forest 


# Random Forest Regression for NA

In [None]:
# Predictors: all columns except the sales targets and the Name/Rank identifiers
non_predictor_cols = ['NA_Sales', 'EU_Sales', 'JP_Sales', 'Name', 'Rank']
x = vg_na.drop(columns=non_predictor_cols)  # predictor

# Response as a flat 1-D array
y = vg_na['NA_Sales'].to_numpy()  # response

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=42)

## Using Grid Search to find the best parameters for our data
Find the best parameters to use for the Random Forest Regression by Gridsearch

In [None]:
# Base estimator whose hyperparameters are tuned
rf_tuning = RandomForestRegressor(random_state=20)

# Hyperparameters to search over.
# max_features=None (use all features) replaces 'auto': for a forest regressor
# 'auto' meant "all features", and the value was removed in scikit-learn 1.3,
# where it makes every candidate that uses it fail to fit.
param_grid = {
    'n_estimators': [100, 150, 200],
    'max_features': [None, 'sqrt', 'log2'],
    'max_depth': [3, 4, 5, 6, 7],
}

# Exhaustive search over the grid with 5-fold cross-validation
gs = GridSearchCV(estimator=rf_tuning, param_grid=param_grid, cv=5)
gs.fit(X_train, y_train)
gs.best_params_

In [None]:

# Fit the forest with the tuned hyperparameters.
# NOTE(review): these values are hard-coded from an earlier grid-search run;
# re-check them against gs.best_params_ if the data or the grid changes.
rf = RandomForestRegressor(n_estimators=200, max_features='sqrt', max_depth=7, random_state=18, oob_score=True)
rf.fit(X_train, y_train)

# Predictions for the held-out and the training rows
y_pred = rf.predict(X_test)
y_pred_train = rf.predict(X_train)

# Check the Goodness of Fit (on Train Data)
print('Goodness of fit of Model \tTrain Dataset')
print('R^2 Score: ', r2_score(y_train, y_pred_train))
print('Mean Absolute Error (MAE): ', mean_absolute_error(y_train, y_pred_train))
print('Mean Squared Error (MSE): ', mean_squared_error(y_train, y_pred_train))
print("Root Mean Squared Error (RMSE) \t:", np.sqrt(mean_squared_error(y_train, y_pred_train)))

# Check the Prediction Accuracy (on Test Data)
print('\nPrediction Accuracy of Model \tTest Dataset')
print('R^2 Score: ', r2_score(y_test, y_pred))
print('Mean Absolute Error (MAE): ', mean_absolute_error(y_test, y_pred))
print('Mean Squared Error (MSE): ', mean_squared_error(y_test, y_pred))
print("Root Mean Squared Error (RMSE) \t:", np.sqrt(mean_squared_error(y_test, y_pred)))

# Out-of-bag score of the tuned model evaluated above. Previously only the OOB
# score of a separate, default-parameter forest was printed, which says nothing
# about `rf`.
print('\nOut of Bag Score (tuned model): ', rf.oob_score_)

# Default-parameter forest, kept as a baseline for comparison
random_forest_out_of_bag = RandomForestRegressor(oob_score=True, random_state=42)
random_forest_out_of_bag.fit(X_train, y_train)
print('Out of Bag Score (default forest): ', random_forest_out_of_bag.oob_score_)

Plotting the True Values against the predicted Values

In [None]:
# Predicted vs. true values for both splits; the black line marks a perfect prediction
f, axes = plt.subplots(1, 2, figsize=(24, 12))
scatter_specs = (
    (axes[0], y_train, y_pred_train, "green", "True values (Train)", "Predicted values (Train)"),
    (axes[1], y_test, y_pred, "blue", "True values (Test)", "Predicted values (Test)"),
)
for panel, actual, predicted, colour, x_label, y_label in scatter_specs:
    panel.scatter(actual, predicted, color=colour)
    panel.plot(actual, actual, 'black', linewidth=3)
    panel.set_xlabel(x_label)
    panel.set_ylabel(y_label)
plt.show()

The out-of-bag (OOB) score is a validation technique used mainly with bagging algorithms such as random forests. Each tree is trained on a bootstrap sample of the training data, so some rows are left out of that tree's sample. Each row is then predicted using only the trees that did not see it, and those predictions are compared with the true values.

The main advantage of the OOB score is that the rows used for validation were not seen by the trees that predict them, so it indicates how the bagging algorithm performs on unseen data without needing a separate validation set.


Increasing the number of estimators increases the MSE and RMSE slightly, but the random forest model seems accurate enough.
This manual comparison is no longer needed, because the model has since been tuned to find the best parameters.

## Prediction of NA Sales

In [None]:
# Predictor values of the spot-check games selected earlier
X_pred = NA_Sales_pred.loc[:, predictors]

# Random forest output for those games
y_pred = rf.predict(X_pred)
y_pred

In [None]:
# Put the predictions next to the actual NA sales of the selected games
y_pred = pd.DataFrame(y_pred, index=NA_Sales_pred.index, columns=["PredTotal"])
actual_vs_pred = pd.concat([NA_Sales_pred[["Name", "NA_Sales"]], y_pred], axis="columns")

# Absolute error as a percentage of the actual sales
pct_error = 100 * (actual_vs_pred["NA_Sales"] - actual_vs_pred["PredTotal"]).abs() / actual_vs_pred["NA_Sales"]
y_errs = pct_error.to_frame(name="Error Percentage")

NA_Sales_acc = pd.concat([actual_vs_pred, y_errs], axis="columns")
NA_Sales_acc

## Feature Importance

In [None]:
# Rebuild the NA train/test split with the full predictor set
non_predictor_cols = ['NA_Sales', 'EU_Sales', 'JP_Sales', 'Name', 'Rank']
x = vg_na.drop(columns=non_predictor_cols)  # predictor
y = vg_na['NA_Sales'].to_numpy()  # response, flat 1-D array
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=42)

# Random forest regressor with 200 trees and the tuned depth / feature settings
rf = RandomForestRegressor(oob_score=True, random_state=18, max_depth=7, max_features='sqrt', n_estimators=200)

# Fit on the training split only
rf.fit(X_train, y_train)

# Importance of each predictor, most important first
importances = pd.Series(rf.feature_importances_, index=X_train.columns)
feature_scores = importances.sort_values(ascending=False)
feature_scores

Meta score is the most important feature, while Genre is the least significant one.

## Remove least significant feature from dataset
Since Genre has the least significance, we plan to remove Genre from this dataset.

In [None]:
# Same split as before, but with Genre removed from the predictors
x = pd.DataFrame(vg_na.drop(columns=['NA_Sales', 'EU_Sales', 'JP_Sales', 'Name', 'Rank', 'Genre'])) #predictor
y = pd.DataFrame(vg_na['NA_Sales']) #response
y = np.ravel(y)
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.20, random_state = 42)

# Fit the forest with the tuned hyperparameters.
# NOTE(review): these values were tuned with Genre still included; confirm they
# are still appropriate for the reduced feature set.
rf = RandomForestRegressor(n_estimators = 200, max_features = 'sqrt', max_depth = 7, random_state = 18, oob_score=True)
rf.fit(X_train,y_train)

# Predictions for the held-out and the training rows
y_pred = rf.predict(X_test)
y_pred_train = rf.predict(X_train)

# Check the Goodness of Fit (on Train Data)
print('Goodness of fit of Model \tTrain Dataset')
print('R^2 Score: ', r2_score(y_train, y_pred_train))
print('Mean Absolute Error (MAE): ', mean_absolute_error(y_train, y_pred_train))
print('Mean Squared Error (MSE): ', mean_squared_error(y_train, y_pred_train))
print("Root Mean Squared Error (RMSE) \t:", np.sqrt(mean_squared_error(y_train, y_pred_train)))

# Check the Prediction Accuracy (on Test Data)
print('\nPrediction Accuracy of Model \tTest Dataset')
print('R^2 Score: ', r2_score(y_test, y_pred))
print('Mean Absolute Error (MAE): ', mean_absolute_error(y_test, y_pred))
print('Mean Squared Error (MSE): ', mean_squared_error(y_test, y_pred))
print("Root Mean Squared Error (RMSE) \t:", np.sqrt(mean_squared_error(y_test, y_pred)))

# Out-of-bag score of the tuned model evaluated above. Previously only the OOB
# score of a separate, default-parameter forest was printed, which says nothing
# about `rf`.
print('\nOut of Bag Score (tuned model): ', rf.oob_score_)

# Default-parameter forest, kept as a baseline for comparison
random_forest_out_of_bag = RandomForestRegressor(oob_score = True, random_state = 42)
random_forest_out_of_bag.fit(X_train, y_train)
print('Out of Bag Score (default forest): ', random_forest_out_of_bag.oob_score_)

In [None]:
# Predicted vs. true values, train split on the left and test split on the right.
# The black diagonal (true plotted against true) marks where perfect predictions would fall.
f, axes = plt.subplots(1, 2, figsize=(24, 12))
for panel_ax, panel_true, panel_pred, panel_colour, panel_xlabel, panel_ylabel in (
    (axes[0], y_train, y_pred_train, "green", "True values (Train)", "Predicted values (Train)"),
    (axes[1], y_test, y_pred, "blue", "True values (Test)", "Predicted values (Test)"),
):
    panel_ax.scatter(panel_true, panel_pred, color = panel_colour)
    panel_ax.plot(panel_true, panel_true, 'black', linewidth = 3)
    panel_ax.set_xlabel(panel_xlabel)
    panel_ax.set_ylabel(panel_ylabel)
plt.show()

## Prediction after removal of Genre

In [None]:
# Predictor columns for the model fitted without Genre
predictors_2 = ["Platform", "Year", "Publisher", "meta_score", "user_review"]

# Feature matrix for the games in NA_Sales_pred, restricted to those columns
X_pred = pd.DataFrame(data = NA_Sales_pred.loc[:, predictors_2])

# Predicted NA sales from the current `rf`; the bare name shows the array as the cell output
y_pred = rf.predict(X_pred)
y_pred

In [None]:
# Side-by-side table of actual NA sales, predicted NA sales and the percentage error.
# The raw predictions are wrapped in a frame aligned to the rows of NA_Sales_pred.
y_pred = pd.DataFrame(y_pred, index = NA_Sales_pred.index, columns = ["PredTotal"])
NA_Sales_acc = pd.concat([NA_Sales_pred.loc[:, ["Name", "NA_Sales"]], y_pred], axis = "columns")

# Absolute error as a percentage of the actual figure
# (rows whose actual sales are 0 come out as inf/NaN because of the division)
y_errs = (NA_Sales_acc["NA_Sales"] - NA_Sales_acc["PredTotal"]).abs()
y_errs = (100 * y_errs / NA_Sales_acc["NA_Sales"]).to_frame(name = "Error Percentage")
NA_Sales_acc = pd.concat([NA_Sales_acc, y_errs], axis = "columns")

NA_Sales_acc

# Random Forest Regression for EU

In [None]:
# EU data: predictors are all columns except the regional sales figures, Name and Rank
x = pd.DataFrame(vg_eu.drop(['NA_Sales', 'EU_Sales', 'JP_Sales', 'Name', 'Rank'], axis = 1))
# Response: EU sales flattened to a 1-D array
y = np.ravel(vg_eu[['EU_Sales']])
# Hold out 20% of the rows; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.20, random_state = 42)

## Using Grid Search to find the best parameters for our data
Use grid search to find the best hyperparameters for the Random Forest regressor.

In [None]:
# Exhaustive 5-fold cross-validated search over forest size, features per split and tree depth
rf_tuning = RandomForestRegressor(random_state = 20)
param_grid = {
   'n_estimators': [100, 150, 200],
   # 1.0 = consider every feature at each split; the string alias 'auto' for
   # this setting is not accepted by scikit-learn >= 1.3
   'max_features': [1.0, 'sqrt', 'log2'],
   'max_depth' : [3, 4, 5, 6, 7]
}
gs = GridSearchCV(estimator = rf_tuning, param_grid = param_grid, cv = 5)
gs.fit(X_train, y_train)
# Best combination found by the search, shown as the cell output
gs.best_params_

In [None]:
# Fit the EU random forest and evaluate it on both splits.
# Predictors: every column except the regional sales figures, Name and Rank.
x = pd.DataFrame(vg_eu.drop(columns=['NA_Sales', 'EU_Sales', 'JP_Sales', 'Name', 'Rank'])) #predictor
y = np.ravel(vg_eu[['EU_Sales']]) #response, flattened to 1-D
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.20, random_state = 42)

# Hyperparameters taken from the tuning step; oob_score=True makes the fitted
# forest expose its out-of-bag R^2 as rf.oob_score_
rf = RandomForestRegressor(n_estimators = 200, max_features = 'sqrt', max_depth = 7, random_state = 18, oob_score=True)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
y_pred_train = rf.predict(X_train)

# Same metrics for the train split (goodness of fit) and the test split (prediction accuracy)
for split_title, split_actual, split_fitted in (
    ('Goodness of fit of Model \tTrain Dataset', y_train, y_pred_train),
    ('\nPrediction Accuracy of Model \tTest Dataset', y_test, y_pred),
):
    split_mse = mean_squared_error(split_actual, split_fitted)
    print(split_title)
    print('R^2 Score: ', r2_score(split_actual, split_fitted))
    print('Mean Absolute Error (MAE): ', mean_absolute_error(split_actual, split_fitted))
    print('Mean Squared Error (MSE): ', split_mse)
    print("Root Mean Squared Error (RMSE) \t:", np.sqrt(split_mse))

# Out-of-bag score of the forest evaluated above, so this figure describes the
# same model as the metrics and no second forest has to be trained for it
print('\nOut of Bag Score: ', rf.oob_score_)

Plotting the True Values against the predicted Values

In [None]:
# Predicted vs. true values, train split on the left and test split on the right.
# The black diagonal (true plotted against true) marks where perfect predictions would fall.
f, axes = plt.subplots(1, 2, figsize=(24, 12))
for panel_ax, panel_true, panel_pred, panel_colour, panel_xlabel, panel_ylabel in (
    (axes[0], y_train, y_pred_train, "green", "True values (Train)", "Predicted values (Train)"),
    (axes[1], y_test, y_pred, "blue", "True values (Test)", "Predicted values (Test)"),
):
    panel_ax.scatter(panel_true, panel_pred, color = panel_colour)
    panel_ax.plot(panel_true, panel_true, 'black', linewidth = 3)
    panel_ax.set_xlabel(panel_xlabel)
    panel_ax.set_ylabel(panel_ylabel)
plt.show()

## Feature Importance

In [None]:
# Rank the EU predictors by random-forest feature importance.
# Predictors: every column except the regional sales figures, Name and Rank.
x = pd.DataFrame(vg_eu.drop(columns=['NA_Sales', 'EU_Sales', 'JP_Sales', 'Name', 'Rank'])) #predictor
y = np.ravel(vg_eu[['EU_Sales']]) #response, flattened to 1-D
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.20, random_state = 42)

# Same hyperparameters as the tuned EU forest, so the importances - and the EU
# predictions made from `rf` in the following cells - describe the tuned model,
# in line with the NA and JP feature-importance cells
rf = RandomForestRegressor(n_estimators = 200, max_features = 'sqrt', max_depth = 7, random_state = 18, oob_score=True)

# fit the model to the training set
rf.fit(X_train, y_train)

# Importance of each training column, largest first
feature_scores = pd.Series(rf.feature_importances_, index=X_train.columns).sort_values(ascending=False)
feature_scores

### Prediction of EU Sales

In [None]:
# Feature matrix for the games in EU_Sales_pred: only the columns named in `predictors`
X_pred = pd.DataFrame(data = EU_Sales_pred.loc[:, predictors])

# Predicted EU sales from the current `rf`; the bare name shows the array as the cell output
y_pred = rf.predict(X_pred)
y_pred

In [None]:
# Side-by-side table of actual EU sales, predicted EU sales and the percentage error.
# The raw predictions are wrapped in a frame aligned to the rows of EU_Sales_pred.
y_pred = pd.DataFrame(y_pred, index = EU_Sales_pred.index, columns = ["PredTotal"])
EU_Sales_acc = pd.concat([EU_Sales_pred.loc[:, ["Name", "EU_Sales"]], y_pred], axis = "columns")

# Absolute error as a percentage of the actual figure
# (rows whose actual sales are 0 come out as inf/NaN because of the division)
y_errs = (EU_Sales_acc["EU_Sales"] - EU_Sales_acc["PredTotal"]).abs()
y_errs = (100 * y_errs / EU_Sales_acc["EU_Sales"]).to_frame(name = "Error Percentage")
EU_Sales_acc = pd.concat([EU_Sales_acc, y_errs], axis = "columns")

EU_Sales_acc

## Remove least significant feature from our dataset 
Remove Genre from dataset

In [None]:
# Refit the EU random forest without Genre and evaluate it on both splits.
# Predictors: every column except the regional sales figures, Name, Rank and Genre.
x = pd.DataFrame(vg_eu.drop(columns=['NA_Sales', 'EU_Sales', 'JP_Sales', 'Name', 'Rank', 'Genre'])) #predictor
y = np.ravel(vg_eu[['EU_Sales']]) #response, flattened to 1-D
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.20, random_state = 42)

# Hyperparameters taken from the tuning step; oob_score=True makes the fitted
# forest expose its out-of-bag R^2 as rf.oob_score_
rf = RandomForestRegressor(n_estimators = 200, max_features = 'sqrt', max_depth = 7, random_state = 18, oob_score=True)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
y_pred_train = rf.predict(X_train)

# Same metrics for the train split (goodness of fit) and the test split (prediction accuracy)
for split_title, split_actual, split_fitted in (
    ('Goodness of fit of Model \tTrain Dataset', y_train, y_pred_train),
    ('\nPrediction Accuracy of Model \tTest Dataset', y_test, y_pred),
):
    split_mse = mean_squared_error(split_actual, split_fitted)
    print(split_title)
    print('R^2 Score: ', r2_score(split_actual, split_fitted))
    print('Mean Absolute Error (MAE): ', mean_absolute_error(split_actual, split_fitted))
    print('Mean Squared Error (MSE): ', split_mse)
    print("Root Mean Squared Error (RMSE) \t:", np.sqrt(split_mse))

# Out-of-bag score of the forest evaluated above, so this figure describes the
# same model as the metrics and no second forest has to be trained for it
print('\nOut of Bag Score: ', rf.oob_score_)

In [None]:
# Predicted vs. true values, train split on the left and test split on the right.
# The black diagonal (true plotted against true) marks where perfect predictions would fall.
f, axes = plt.subplots(1, 2, figsize=(24, 12))
for panel_ax, panel_true, panel_pred, panel_colour, panel_xlabel, panel_ylabel in (
    (axes[0], y_train, y_pred_train, "green", "True values (Train)", "Predicted values (Train)"),
    (axes[1], y_test, y_pred, "blue", "True values (Test)", "Predicted values (Test)"),
):
    panel_ax.scatter(panel_true, panel_pred, color = panel_colour)
    panel_ax.plot(panel_true, panel_true, 'black', linewidth = 3)
    panel_ax.set_xlabel(panel_xlabel)
    panel_ax.set_ylabel(panel_ylabel)
plt.show()

## Prediction after removal of Genre

In [None]:
# Predictor columns for the model fitted without Genre
predictors_2 = ["Platform", "Year", "Publisher", "meta_score", "user_review"]

# Feature matrix for the games in EU_Sales_pred, restricted to those columns
X_pred = pd.DataFrame(data = EU_Sales_pred.loc[:, predictors_2])

# Predicted EU sales from the current `rf`; the bare name shows the array as the cell output
y_pred = rf.predict(X_pred)
y_pred

In [None]:
# Side-by-side table of actual EU sales, predicted EU sales and the percentage error.
# The raw predictions are wrapped in a frame aligned to the rows of EU_Sales_pred.
y_pred = pd.DataFrame(y_pred, index = EU_Sales_pred.index, columns = ["PredTotal"])
EU_Sales_acc = pd.concat([EU_Sales_pred.loc[:, ["Name", "EU_Sales"]], y_pred], axis = "columns")

# Absolute error as a percentage of the actual figure
# (rows whose actual sales are 0 come out as inf/NaN because of the division)
y_errs = (EU_Sales_acc["EU_Sales"] - EU_Sales_acc["PredTotal"]).abs()
y_errs = (100 * y_errs / EU_Sales_acc["EU_Sales"]).to_frame(name = "Error Percentage")
EU_Sales_acc = pd.concat([EU_Sales_acc, y_errs], axis = "columns")

EU_Sales_acc

# Random Forest Regression for JP

In [None]:
# JP data: predictors are all columns except the regional sales figures, Name and Rank
x = pd.DataFrame(vg_jp.drop(['NA_Sales', 'EU_Sales', 'JP_Sales', 'Name', 'Rank'], axis = 1))
# Response: JP sales flattened to a 1-D array
y = np.ravel(vg_jp[['JP_Sales']])
# Hold out 20% of the rows; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.20, random_state = 42)

## Using Grid Search to find the best parameters for our data
Use grid search to find the best hyperparameters for the Random Forest regressor.

In [None]:
# Exhaustive 5-fold cross-validated search over forest size, features per split and tree depth
rf_tuning = RandomForestRegressor(random_state = 20)
param_grid = {
   'n_estimators': [100, 150, 200],
   # 1.0 = consider every feature at each split; the string alias 'auto' for
   # this setting is not accepted by scikit-learn >= 1.3
   'max_features': [1.0, 'sqrt', 'log2'],
   'max_depth' : [3, 4, 5, 6, 7]
}
gs = GridSearchCV(estimator = rf_tuning, param_grid = param_grid, cv = 5)
gs.fit(X_train, y_train)
# Best combination found by the search, shown as the cell output
gs.best_params_

In [None]:
# Fit the JP random forest on the current train split and evaluate it on both splits.
# Hyperparameters taken from the tuning step; oob_score=True makes the fitted
# forest expose its out-of-bag R^2 as rf.oob_score_
rf = RandomForestRegressor(n_estimators = 100, max_features = 'sqrt', max_depth = 6, random_state = 18, oob_score=True)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
y_pred_train = rf.predict(X_train)

# Same metrics for the train split (goodness of fit) and the test split (prediction accuracy)
for split_title, split_actual, split_fitted in (
    ('Goodness of fit of Model \tTrain Dataset', y_train, y_pred_train),
    ('\nPrediction Accuracy of Model \tTest Dataset', y_test, y_pred),
):
    split_mse = mean_squared_error(split_actual, split_fitted)
    print(split_title)
    print('R^2 Score: ', r2_score(split_actual, split_fitted))
    print('Mean Absolute Error (MAE): ', mean_absolute_error(split_actual, split_fitted))
    print('Mean Squared Error (MSE): ', split_mse)
    print("Root Mean Squared Error (RMSE) \t:", np.sqrt(split_mse))

# Out-of-bag score of the forest evaluated above, so this figure describes the
# same model as the metrics and no second forest has to be trained for it
print('\nOut of Bag Score: ', rf.oob_score_)

Plotting the True Values against the predicted Values

In [None]:
# Predicted vs. true values, train split on the left and test split on the right.
# The black diagonal (true plotted against true) marks where perfect predictions would fall.
f, axes = plt.subplots(1, 2, figsize=(24, 12))
for panel_ax, panel_true, panel_pred, panel_colour, panel_xlabel, panel_ylabel in (
    (axes[0], y_train, y_pred_train, "green", "True values (Train)", "Predicted values (Train)"),
    (axes[1], y_test, y_pred, "blue", "True values (Test)", "Predicted values (Test)"),
):
    panel_ax.scatter(panel_true, panel_pred, color = panel_colour)
    panel_ax.plot(panel_true, panel_true, 'black', linewidth = 3)
    panel_ax.set_xlabel(panel_xlabel)
    panel_ax.set_ylabel(panel_ylabel)
plt.show()

## Predicting Sales for JP

In [None]:
# Feature matrix for the games in JP_Sales_pred: only the columns named in `predictors`
X_pred = pd.DataFrame(data = JP_Sales_pred.loc[:, predictors])

# Predicted JP sales from the current `rf`; the bare name shows the array as the cell output
y_pred = rf.predict(X_pred)
y_pred

In [None]:
# Side-by-side table of actual JP sales, predicted JP sales and the percentage error.
# The raw predictions are wrapped in a frame aligned to the rows of JP_Sales_pred.
y_pred = pd.DataFrame(y_pred, index = JP_Sales_pred.index, columns = ["PredTotal"])
JP_Sales_acc = pd.concat([JP_Sales_pred.loc[:, ["Name", "JP_Sales"]], y_pred], axis = "columns")

# Absolute error as a percentage of the actual figure
# (rows whose actual sales are 0 come out as inf/NaN because of the division)
y_errs = (JP_Sales_acc["JP_Sales"] - JP_Sales_acc["PredTotal"]).abs()
y_errs = (100 * y_errs / JP_Sales_acc["JP_Sales"]).to_frame(name = "Error Percentage")
JP_Sales_acc = pd.concat([JP_Sales_acc, y_errs], axis = "columns")

JP_Sales_acc

## Feature Importance

In [None]:
# Rank the JP predictors by random-forest feature importance.
# Predictors: every column except the regional sales figures, Name and Rank.
x = pd.DataFrame(vg_jp.drop(['NA_Sales', 'EU_Sales', 'JP_Sales', 'Name', 'Rank'], axis = 1))
# Response: JP sales flattened to a 1-D array
y = np.ravel(vg_jp[['JP_Sales']])
# Hold out 20% of the rows; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.20, random_state = 42)

# Random forest regressor with 100 trees of depth <= 6
rf = RandomForestRegressor(
    n_estimators = 100,
    max_features = 'sqrt',
    max_depth = 6,
    random_state = 18,
    oob_score = True,
)
rf.fit(X_train, y_train)

# Importance of each training column, largest first
feature_scores = pd.Series(rf.feature_importances_, index = X_train.columns)
feature_scores = feature_scores.sort_values(ascending = False)
feature_scores

Publisher is the most important feature, while Platform is the least important one.

### Removing Platform from the predictors

In [None]:
# Refit the JP random forest without Platform and evaluate it on both splits.
# Predictors: every column except the regional sales figures, Name, Rank and Platform.
x = pd.DataFrame(vg_jp.drop(columns=['NA_Sales', 'EU_Sales', 'JP_Sales', 'Name', 'Rank', 'Platform'])) #predictor
y = np.ravel(vg_jp[['JP_Sales']]) #response, flattened to 1-D
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.20, random_state = 42)

# Hyperparameters taken from the tuning step; oob_score=True makes the fitted
# forest expose its out-of-bag R^2 as rf.oob_score_
rf = RandomForestRegressor(n_estimators = 100, max_features = 'sqrt', max_depth = 6, random_state = 18, oob_score=True)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
y_pred_train = rf.predict(X_train)

# Same metrics for the train split (goodness of fit) and the test split (prediction accuracy)
for split_title, split_actual, split_fitted in (
    ('Goodness of fit of Model \tTrain Dataset', y_train, y_pred_train),
    ('\nPrediction Accuracy of Model \tTest Dataset', y_test, y_pred),
):
    split_mse = mean_squared_error(split_actual, split_fitted)
    print(split_title)
    print('R^2 Score: ', r2_score(split_actual, split_fitted))
    print('Mean Absolute Error (MAE): ', mean_absolute_error(split_actual, split_fitted))
    print('Mean Squared Error (MSE): ', split_mse)
    print("Root Mean Squared Error (RMSE) \t:", np.sqrt(split_mse))

# Out-of-bag score of the forest evaluated above, so this figure describes the
# same model as the metrics and no second forest has to be trained for it
print('\nOut of Bag Score: ', rf.oob_score_)

In [None]:
# Predicted vs. true values, train split on the left and test split on the right.
# The black diagonal (true plotted against true) marks where perfect predictions would fall.
f, axes = plt.subplots(1, 2, figsize=(24, 12))
for panel_ax, panel_true, panel_pred, panel_colour, panel_xlabel, panel_ylabel in (
    (axes[0], y_train, y_pred_train, "green", "True values (Train)", "Predicted values (Train)"),
    (axes[1], y_test, y_pred, "blue", "True values (Test)", "Predicted values (Test)"),
):
    panel_ax.scatter(panel_true, panel_pred, color = panel_colour)
    panel_ax.plot(panel_true, panel_true, 'black', linewidth = 3)
    panel_ax.set_xlabel(panel_xlabel)
    panel_ax.set_ylabel(panel_ylabel)
plt.show()

## Prediction of JP Sales after removing Platform

In [None]:
# Predictor columns for the model fitted without Platform
predictors_2 = ["Year", "Genre", "Publisher", "meta_score", "user_review"]

# Feature matrix for the games in JP_Sales_pred, restricted to those columns
X_pred = pd.DataFrame(data = JP_Sales_pred.loc[:, predictors_2])

# Predicted JP sales from the current `rf`; the bare name shows the array as the cell output
y_pred = rf.predict(X_pred)
y_pred

In [None]:
# Side-by-side table of actual JP sales, predicted JP sales and the percentage error.
# The raw predictions are wrapped in a frame aligned to the rows of JP_Sales_pred.
y_pred = pd.DataFrame(y_pred, index = JP_Sales_pred.index, columns = ["PredTotal"])
JP_Sales_acc = pd.concat([JP_Sales_pred.loc[:, ["Name", "JP_Sales"]], y_pred], axis = "columns")

# Absolute error as a percentage of the actual figure
# (rows whose actual sales are 0 come out as inf/NaN because of the division)
y_errs = (JP_Sales_acc["JP_Sales"] - JP_Sales_acc["PredTotal"]).abs()
y_errs = (100 * y_errs / JP_Sales_acc["JP_Sales"]).to_frame(name = "Error Percentage")
JP_Sales_acc = pd.concat([JP_Sales_acc, y_errs], axis = "columns")

JP_Sales_acc

# XGBoost

## NA

Tuning the xgboost model

In [None]:
# Tune the number of trees and the learning rate of an XGBoost regressor for NA sales.
# Base estimator; n_estimators and learning_rate are supplied by the search below.
xgb = xgboost.XGBRegressor(
    gamma = 0,
    subsample = 0.75,
    colsample_bytree = 1,
    random_state = 42,
)

# Predictors: every column except the regional sales figures, Name and Rank
x = pd.DataFrame(vg_na.drop(['NA_Sales', 'EU_Sales', 'JP_Sales', 'Name', 'Rank'], axis = 1))
# Response: NA sales flattened to a 1-D array
y = np.ravel(vg_na[['NA_Sales']])
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.20, random_state = 42)

param_grid = {
    "n_estimators": [200, 300, 400],
    "learning_rate": [0.02, 0.04, 0.05, 0.07, 0.08],
}

# 5-fold cross-validated search over every combination in the grid
search = GridSearchCV(estimator = xgb, param_grid = param_grid, cv = 5)
search.fit(X_train, y_train)

print("The best hyperparameters are ",search.best_params_)

In [None]:
# Fit the NA XGBoost regressor with fixed hyperparameters, then plot and score it on both splits
xgb = xgboost.XGBRegressor(
    n_estimators = 200,
    learning_rate = 0.02,
    max_depth = 7,
    gamma = 0,
    subsample = 0.75,
    colsample_bytree = 1,
    random_state = 42,
)
xgb.fit(X_train, y_train)

y_train_pred = xgb.predict(X_train)
y_test_pred = xgb.predict(X_test)

# Predicted vs. true sales; the red diagonal marks where perfect predictions would fall
f, axes = plt.subplots(1, 2, figsize=(24, 12))
for panel_ax, panel_true, panel_pred, panel_colour, panel_xlabel, panel_ylabel in (
    (axes[0], y_train, y_train_pred, "blue", "True values of NA Sales (Train)", "Predicted values NA Sales (Train)"),
    (axes[1], y_test, y_test_pred, "green", "True values of NA Sales (Test)", "Predicted values of NA Sales (Test)"),
):
    panel_ax.scatter(panel_true, panel_pred, color = panel_colour)
    panel_ax.plot(panel_true, panel_true, 'r-', linewidth = 4)
    panel_ax.set_xlabel(panel_xlabel)
    panel_ax.set_ylabel(panel_ylabel)
plt.show()

# R^2, MSE and RMSE: train split = goodness of fit, test split = prediction accuracy
for split_title, split_X, split_actual, split_fitted in (
    ("Goodness of Fit of Model \tTrain Dataset", X_train, y_train, y_train_pred),
    ("Prediction Accuracy of Model \tTest Dataset", X_test, y_test, y_test_pred),
):
    split_mse = mean_squared_error(split_actual, split_fitted)
    print(split_title)
    print("Explained Variance (R^2) \t:", xgb.score(split_X, split_actual))
    print("Mean Squared Error (MSE) \t:", split_mse)
    print("Root Mean Squared Error (RMSE) \t:", np.sqrt(split_mse))
    print()

Prediction

In [None]:
# Feature matrix for the games in NA_Sales_pred: only the columns named in `predictors`
X_pred = pd.DataFrame(data = NA_Sales_pred.loc[:, predictors])

# Predicted NA sales from the current `xgb` model; the bare name shows the array as the cell output
y_pred = xgb.predict(X_pred)
y_pred

In [None]:
# Side-by-side table of actual NA sales, predicted NA sales and the percentage error.
# The raw predictions are wrapped in a frame aligned to the rows of NA_Sales_pred.
y_pred = pd.DataFrame(y_pred, index = NA_Sales_pred.index, columns = ["PredTotal"])
NA_Sales_acc = pd.concat([NA_Sales_pred.loc[:, ["Name", "NA_Sales"]], y_pred], axis = "columns")

# Absolute error as a percentage of the actual figure
# (rows whose actual sales are 0 come out as inf/NaN because of the division)
y_errs = (NA_Sales_acc["NA_Sales"] - NA_Sales_acc["PredTotal"]).abs()
y_errs = (100 * y_errs / NA_Sales_acc["NA_Sales"]).to_frame(name = "Error Percentage")
NA_Sales_acc = pd.concat([NA_Sales_acc, y_errs], axis = "columns")

NA_Sales_acc

## EU

In [None]:
# Tune the number of trees and the learning rate of an XGBoost regressor for EU sales.
# Base estimator; n_estimators and learning_rate are supplied by the search below.
xgb = xgboost.XGBRegressor(
    gamma = 0,
    subsample = 0.75,
    colsample_bytree = 1,
    random_state = 42,
)

# Predictors: every column except the regional sales figures, Name and Rank
x = pd.DataFrame(vg_eu.drop(['NA_Sales', 'EU_Sales', 'JP_Sales', 'Name', 'Rank'], axis = 1))
# Response: EU sales flattened to a 1-D array
y = np.ravel(vg_eu[['EU_Sales']])
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.20, random_state = 42)

param_grid = {
    "n_estimators": [200, 300, 400],
    "learning_rate": [0.02, 0.04, 0.05, 0.07, 0.08],
}

# 5-fold cross-validated search over every combination in the grid
search = GridSearchCV(estimator = xgb, param_grid = param_grid, cv = 5)
search.fit(X_train, y_train)

print("The best hyperparameters are ",search.best_params_)

In [None]:
# Fit the EU XGBoost regressor with fixed hyperparameters, then plot and score it on both splits
xgb = xgboost.XGBRegressor(
    n_estimators = 200,
    learning_rate = 0.02,
    max_depth = 7,
    gamma = 0,
    subsample = 0.75,
    colsample_bytree = 1,
    random_state = 42,
)
xgb.fit(X_train, y_train)

y_train_pred = xgb.predict(X_train)
y_test_pred = xgb.predict(X_test)

# Predicted vs. true sales; the red diagonal marks where perfect predictions would fall
f, axes = plt.subplots(1, 2, figsize=(24, 12))
for panel_ax, panel_true, panel_pred, panel_colour, panel_xlabel, panel_ylabel in (
    (axes[0], y_train, y_train_pred, "blue", "True values of EU Sales (Train)", "Predicted values EU Sales (Train)"),
    (axes[1], y_test, y_test_pred, "green", "True values of EU Sales (Test)", "Predicted values of EU Sales (Test)"),
):
    panel_ax.scatter(panel_true, panel_pred, color = panel_colour)
    panel_ax.plot(panel_true, panel_true, 'r-', linewidth = 4)
    panel_ax.set_xlabel(panel_xlabel)
    panel_ax.set_ylabel(panel_ylabel)
plt.show()

# R^2, MSE and RMSE: train split = goodness of fit, test split = prediction accuracy
for split_title, split_X, split_actual, split_fitted in (
    ("Goodness of Fit of Model \tTrain Dataset", X_train, y_train, y_train_pred),
    ("Prediction Accuracy of Model \tTest Dataset", X_test, y_test, y_test_pred),
):
    split_mse = mean_squared_error(split_actual, split_fitted)
    print(split_title)
    print("Explained Variance (R^2) \t:", xgb.score(split_X, split_actual))
    print("Mean Squared Error (MSE) \t:", split_mse)
    print("Root Mean Squared Error (RMSE) \t:", np.sqrt(split_mse))
    print()

In [None]:
# Feature matrix for the games in EU_Sales_pred: only the columns named in `predictors`
X_pred = pd.DataFrame(data = EU_Sales_pred.loc[:, predictors])

# Predicted EU sales from the current `xgb` model; the bare name shows the array as the cell output
y_pred = xgb.predict(X_pred)
y_pred

In [None]:
# Side-by-side table of actual EU sales, predicted EU sales and the percentage error.
# The raw predictions are wrapped in a frame aligned to the rows of EU_Sales_pred.
y_pred = pd.DataFrame(y_pred, index = EU_Sales_pred.index, columns = ["PredTotal"])
EU_Sales_acc = pd.concat([EU_Sales_pred.loc[:, ["Name", "EU_Sales"]], y_pred], axis = "columns")

# Absolute error as a percentage of the actual figure
# (rows whose actual sales are 0 come out as inf/NaN because of the division)
y_errs = (EU_Sales_acc["EU_Sales"] - EU_Sales_acc["PredTotal"]).abs()
y_errs = (100 * y_errs / EU_Sales_acc["EU_Sales"]).to_frame(name = "Error Percentage")
EU_Sales_acc = pd.concat([EU_Sales_acc, y_errs], axis = "columns")

EU_Sales_acc

## JP

In [None]:
# Tune the number of trees and the learning rate of an XGBoost regressor for JP sales.
# Base estimator; n_estimators and learning_rate are supplied by the search below.
xgb = xgboost.XGBRegressor(
    gamma = 0,
    subsample = 0.75,
    colsample_bytree = 1,
    random_state = 42,
)

# Predictors: every column except the regional sales figures, Name and Rank
x = pd.DataFrame(vg_jp.drop(['NA_Sales', 'EU_Sales', 'JP_Sales', 'Name', 'Rank'], axis = 1))
# Response: JP sales flattened to a 1-D array
y = np.ravel(vg_jp[['JP_Sales']])
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.20, random_state = 42)

param_grid = {
    "n_estimators": [200, 300, 400],
    "learning_rate": [0.02, 0.04, 0.05, 0.07, 0.08],
}

# 5-fold cross-validated search over every combination in the grid
search = GridSearchCV(estimator = xgb, param_grid = param_grid, cv = 5)
search.fit(X_train, y_train)

print("The best hyperparameters are ",search.best_params_)

In [None]:
# Fit the JP XGBoost regressor with fixed hyperparameters, then plot and score it on both splits
xgb = xgboost.XGBRegressor(
    n_estimators = 200,
    learning_rate = 0.02,
    max_depth = 7,
    gamma = 0,
    subsample = 0.75,
    colsample_bytree = 1,
    random_state = 42,
)
xgb.fit(X_train, y_train)

y_train_pred = xgb.predict(X_train)
y_test_pred = xgb.predict(X_test)

# Predicted vs. true sales; the red diagonal marks where perfect predictions would fall
f, axes = plt.subplots(1, 2, figsize=(24, 12))
for panel_ax, panel_true, panel_pred, panel_colour, panel_xlabel, panel_ylabel in (
    (axes[0], y_train, y_train_pred, "blue", "True values of JP Sales (Train)", "Predicted values JP Sales (Train)"),
    (axes[1], y_test, y_test_pred, "green", "True values of JP Sales (Test)", "Predicted values of JP Sales (Test)"),
):
    panel_ax.scatter(panel_true, panel_pred, color = panel_colour)
    panel_ax.plot(panel_true, panel_true, 'r-', linewidth = 4)
    panel_ax.set_xlabel(panel_xlabel)
    panel_ax.set_ylabel(panel_ylabel)
plt.show()

# R^2, MSE and RMSE: train split = goodness of fit, test split = prediction accuracy
for split_title, split_X, split_actual, split_fitted in (
    ("Goodness of Fit of Model \tTrain Dataset", X_train, y_train, y_train_pred),
    ("Prediction Accuracy of Model \tTest Dataset", X_test, y_test, y_test_pred),
):
    split_mse = mean_squared_error(split_actual, split_fitted)
    print(split_title)
    print("Explained Variance (R^2) \t:", xgb.score(split_X, split_actual))
    print("Mean Squared Error (MSE) \t:", split_mse)
    print("Root Mean Squared Error (RMSE) \t:", np.sqrt(split_mse))
    print()

In [None]:
# Feature matrix for the games in JP_Sales_pred: only the columns named in `predictors`
X_pred = pd.DataFrame(data = JP_Sales_pred.loc[:, predictors])

# Predicted JP sales from the current `xgb` model; the bare name shows the array as the cell output
y_pred = xgb.predict(X_pred)
y_pred

In [None]:
# Side-by-side table of actual JP sales, predicted JP sales and the percentage error.
# The raw predictions are wrapped in a frame aligned to the rows of JP_Sales_pred.
y_pred = pd.DataFrame(y_pred, index = JP_Sales_pred.index, columns = ["PredTotal"])
JP_Sales_acc = pd.concat([JP_Sales_pred.loc[:, ["Name", "JP_Sales"]], y_pred], axis = "columns")

# Absolute error as a percentage of the actual figure
# (rows whose actual sales are 0 come out as inf/NaN because of the division)
y_errs = (JP_Sales_acc["JP_Sales"] - JP_Sales_acc["PredTotal"]).abs()
y_errs = (100 * y_errs / JP_Sales_acc["JP_Sales"]).to_frame(name = "Error Percentage")
JP_Sales_acc = pd.concat([JP_Sales_acc, y_errs], axis = "columns")

JP_Sales_acc