In [134]:
import pandas as pd
import numpy
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

In [2]:
vgsales_df = pd.read_csv("vgsales.csv")
vgsales_df.head()

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
0,1,Wii Sports,Wii,2006.0,Sports,Nintendo,41.49,29.02,3.77,8.46,82.74
1,2,Super Mario Bros.,NES,1985.0,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24
2,3,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.85,12.88,3.79,3.31,35.82
3,4,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,15.75,11.01,3.28,2.96,33.0
4,5,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo,11.27,8.89,10.22,1.0,31.37


In [3]:
vgsales_df = vgsales_df.dropna()

In [4]:
vgsales_df["Year"] = vgsales_df["Year"].astype(int)
vgsales_df.head()

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
0,1,Wii Sports,Wii,2006,Sports,Nintendo,41.49,29.02,3.77,8.46,82.74
1,2,Super Mario Bros.,NES,1985,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24
2,3,Mario Kart Wii,Wii,2008,Racing,Nintendo,15.85,12.88,3.79,3.31,35.82
3,4,Wii Sports Resort,Wii,2009,Sports,Nintendo,15.75,11.01,3.28,2.96,33.0
4,5,Pokemon Red/Pokemon Blue,GB,1996,Role-Playing,Nintendo,11.27,8.89,10.22,1.0,31.37


In [5]:
vgsales_df.shape
vgsales_df.info()
vgsales_df.describe()

<class 'pandas.core.frame.DataFrame'>
Index: 16291 entries, 0 to 16597
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Rank          16291 non-null  int64  
 1   Name          16291 non-null  object 
 2   Platform      16291 non-null  object 
 3   Year          16291 non-null  int32  
 4   Genre         16291 non-null  object 
 5   Publisher     16291 non-null  object 
 6   NA_Sales      16291 non-null  float64
 7   EU_Sales      16291 non-null  float64
 8   JP_Sales      16291 non-null  float64
 9   Other_Sales   16291 non-null  float64
 10  Global_Sales  16291 non-null  float64
dtypes: float64(5), int32(1), int64(1), object(4)
memory usage: 1.4+ MB


Unnamed: 0,Rank,Year,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
count,16291.0,16291.0,16291.0,16291.0,16291.0,16291.0,16291.0
mean,8290.190228,2006.405561,0.265647,0.147731,0.078833,0.048426,0.54091
std,4792.65445,5.832412,0.822432,0.509303,0.311879,0.190083,1.567345
min,1.0,1980.0,0.0,0.0,0.0,0.0,0.01
25%,4132.5,2003.0,0.0,0.0,0.0,0.0,0.06
50%,8292.0,2007.0,0.08,0.02,0.0,0.01,0.17
75%,12439.5,2010.0,0.24,0.11,0.04,0.04,0.48
max,16600.0,2020.0,41.49,29.02,10.22,10.57,82.74


In [120]:
vgsales_copy_df = vgsales_df.copy()

In [121]:
vgsales_copy_df = vgsales_copy_df.drop(columns=['Publisher', 'Platform', 'Rank', 'Name', 
                                              'EU_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales'])
vgsales_copy_df.head()

Unnamed: 0,Year,Genre,NA_Sales
0,2006,Sports,41.49
1,1985,Platform,29.08
2,2008,Racing,15.85
3,2009,Sports,15.75
4,1996,Role-Playing,11.27


In [122]:
vgsales_copy_df_encoded = pd.get_dummies(vgsales_copy_df, columns=['Genre'])
vgsales_copy_df_encoded.head()

Unnamed: 0,Year,NA_Sales,Genre_Action,Genre_Adventure,Genre_Fighting,Genre_Misc,Genre_Platform,Genre_Puzzle,Genre_Racing,Genre_Role-Playing,Genre_Shooter,Genre_Simulation,Genre_Sports,Genre_Strategy
0,2006,41.49,False,False,False,False,False,False,False,False,False,False,True,False
1,1985,29.08,False,False,False,False,True,False,False,False,False,False,False,False
2,2008,15.85,False,False,False,False,False,False,True,False,False,False,False,False
3,2009,15.75,False,False,False,False,False,False,False,False,False,False,True,False
4,1996,11.27,False,False,False,False,False,False,False,True,False,False,False,False


In [123]:
y = vgsales_copy_df_encoded['NA_Sales']
X = vgsales_copy_df_encoded.drop(columns='NA_Sales')

In [124]:
X.head()

Unnamed: 0,Year,Genre_Action,Genre_Adventure,Genre_Fighting,Genre_Misc,Genre_Platform,Genre_Puzzle,Genre_Racing,Genre_Role-Playing,Genre_Shooter,Genre_Simulation,Genre_Sports,Genre_Strategy
0,2006,False,False,False,False,False,False,False,False,False,False,True,False
1,1985,False,False,False,False,True,False,False,False,False,False,False,False
2,2008,False,False,False,False,False,False,True,False,False,False,False,False
3,2009,False,False,False,False,False,False,False,False,False,False,True,False
4,1996,False,False,False,False,False,False,False,True,False,False,False,False


In [125]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

print("train size X : ", X_train.shape)
print("train size y : ", y_train.shape)
print("test size X : ", X_test.shape)
print("test size y : ", y_test.shape)

train size X :  (12218, 13)
train size y :  (12218,)
test size X :  (4073, 13)
test size y :  (4073,)


In [126]:
scaler = StandardScaler()

X_scaler = scaler.fit(X_train)

X_train_scaled = X_scaler.transform(X_train)

X_test_scaled = X_scaler.transform(X_test)

In [127]:
model = LinearRegression()
lr_model = model.fit(X_train, y_train)
lr_model.score(X_train, y_train)

0.021365517104776077

In [128]:
y_pred = lr_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

Mean Squared Error: 0.5225070293288024


In [104]:
genres = []
for i in vgsales_genre['Genre']:
    if i not in genres:
        genres.append(i)
        
genres

['Sports',
 'Platform',
 'Racing',
 'Role-Playing',
 'Puzzle',
 'Misc',
 'Shooter',
 'Simulation',
 'Action',
 'Fighting',
 'Adventure',
 'Strategy']

In [105]:
sales_by_genre = {}
for genre, prediction in zip(genres, y_pred):
    sales_by_genre[genre] = prediction
    
best_genre = max(sales_by_genre, key=sales_by_genre.get)
print("Predicted sales by genre:", sales_by_genre)
print("Genre predicted to sell better in the future:", best_genre)

Predicted sales by genre: {'Sports': 0.24729620082435133, 'Platform': 0.39295651766527584, 'Racing': 0.48946156853992306, 'Role-Playing': 0.4001102188495196, 'Puzzle': 0.4408903300354794, 'Misc': 0.35423068971542904, 'Shooter': 0.04234939942223548, 'Simulation': 0.12795661842136496, 'Action': 0.05893374064968526, 'Fighting': 0.19842251812093537, 'Adventure': -0.00530951350386033, 'Strategy': 0.10279564331310809}
Genre predicted to sell better in the future: Racing


In [59]:
len(vgsales_df['Platform'].unique())

31

In [60]:
len(vgsales_df['Publisher'].unique())

576

In [None]:
Low R^2 Low Test MSE Your code is wrong
Low r^2 Hight test mse = underfit needs more featurs
High r^2 High test MSE = Over fit needs less features