# Importing required libraries

In [None]:
import numpy as np
import pandas as pd
import math
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import  accuracy_score
from sklearn.model_selection import cross_val_score, KFold
from sklearn.preprocessing import StandardScaler
import plotly.graph_objects as go


# **Importing Data**

In [None]:
original_data = pd.read_csv("Video_Games_Sales_as_at_22_Dec_2016.csv")


In [None]:
original_data.head(5)

Unnamed: 0,Name,Platform,Year_of_Release,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales,Critic_Score,Critic_Count,User_Score,User_Count,Developer,Rating
0,Wii Sports,Wii,2006.0,Sports,Nintendo,41.36,28.96,3.77,8.45,82.53,76.0,51.0,8.0,322.0,Nintendo,E
1,Super Mario Bros.,NES,1985.0,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24,,,,,,
2,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.68,12.76,3.79,3.29,35.52,82.0,73.0,8.3,709.0,Nintendo,E
3,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,15.61,10.93,3.28,2.95,32.77,80.0,73.0,8.0,192.0,Nintendo,E
4,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo,11.27,8.89,10.22,1.0,31.37,,,,,,


# **Data Preprocessing**

In [None]:
vgs_data = original_data.copy()

## **Dropping unwanted columns**

In [None]:
#remove name and Sales columns otherwise it would be very easy , it'll just add them up
columns_to_drop = ["Name","NA_Sales","EU_Sales","JP_Sales","Other_Sales"]
vgs_data.drop(columns_to_drop,axis =1 ,inplace = True)

In [None]:
vgs_data.head(5)

Unnamed: 0,Platform,Year_of_Release,Genre,Publisher,Global_Sales,Critic_Score,Critic_Count,User_Score,User_Count,Developer,Rating
0,Wii,2006.0,Sports,Nintendo,82.53,76.0,51.0,8.0,322.0,Nintendo,E
1,NES,1985.0,Platform,Nintendo,40.24,,,,,,
2,Wii,2008.0,Racing,Nintendo,35.52,82.0,73.0,8.3,709.0,Nintendo,E
3,Wii,2009.0,Sports,Nintendo,32.77,80.0,73.0,8.0,192.0,Nintendo,E
4,GB,1996.0,Role-Playing,Nintendo,31.37,,,,,,


In [None]:
#Checking for missing values
vgs_data.isnull().sum()

Platform              0
Year_of_Release     269
Genre                 2
Publisher            54
Global_Sales          0
Critic_Score       8582
Critic_Count       8582
User_Score         6704
User_Count         9129
Developer          6623
Rating             6769
dtype: int64

In [None]:
null_counts = vgs_data.isnull().sum()

fig = go.Figure(data=[go.Bar(x=null_counts.index, y=null_counts, marker_color='skyblue')])

fig.update_layout(title="Nombre de valeurs nulles par colonne",
                  xaxis_title="Colonne",
                  yaxis_title="Nombre de valeurs nulles"
                  )
fig.show()

## **Feature Engineering**

In [None]:
vgs_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16719 entries, 0 to 16718
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Platform         16719 non-null  object 
 1   Year_of_Release  16450 non-null  float64
 2   Genre            16717 non-null  object 
 3   Publisher        16665 non-null  object 
 4   Global_Sales     16719 non-null  float64
 5   Critic_Score     8137 non-null   float64
 6   Critic_Count     8137 non-null   float64
 7   User_Score       10015 non-null  object 
 8   User_Count       7590 non-null   float64
 9   Developer        10096 non-null  object 
 10  Rating           9950 non-null   object 
dtypes: float64(5), object(6)
memory usage: 1.4+ MB


### User_Score to numerical

In [None]:
vgs_data["User_Score"] = pd.to_numeric(vgs_data["User_Score"], errors='coerce')

In [None]:
vgs_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16719 entries, 0 to 16718
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Platform         16719 non-null  object 
 1   Year_of_Release  16450 non-null  float64
 2   Genre            16717 non-null  object 
 3   Publisher        16665 non-null  object 
 4   Global_Sales     16719 non-null  float64
 5   Critic_Score     8137 non-null   float64
 6   Critic_Count     8137 non-null   float64
 7   User_Score       7590 non-null   float64
 8   User_Count       7590 non-null   float64
 9   Developer        10096 non-null  object 
 10  Rating           9950 non-null   object 
dtypes: float64(6), object(5)
memory usage: 1.4+ MB


### **HANDLING Small MISSING VALUES**

In [None]:
#replacing year with its mean
vgs_data["Year_of_Release"]=vgs_data["Year_of_Release"].fillna(vgs_data["Year_of_Release"].mean())

In [None]:
#dropping Na publisher
vgs_data.dropna(subset=['Publisher'], inplace=True)


In [None]:
#dropping Na Genre
vgs_data.dropna(subset=['Genre'], inplace=True)

### **HANDLING BIG MISSING VALUES: For now replacing  numerical  values with median and object values with "unknown"**

In [None]:
#replacing  with its medians
vgs_data["Critic_Score"]=vgs_data["Critic_Score"].fillna(vgs_data["Critic_Score"].median())
vgs_data["Critic_Count"]=vgs_data["Critic_Count"].fillna(vgs_data["Critic_Count"].median())
vgs_data["User_Score"]=vgs_data["User_Score"].fillna(vgs_data["User_Score"].median())
vgs_data["User_Count"]=vgs_data["User_Count"].fillna(vgs_data["User_Count"].median())


In [None]:
#replacing missing Strings
vgs_data["Developer"]=vgs_data["Developer"].fillna("Unknown")
vgs_data["Rating"]=vgs_data["Rating"].fillna("Unknown")

In [None]:
#Checking for missing values
vgs_data.isnull().sum()

Platform           0
Year_of_Release    0
Genre              0
Publisher          0
Global_Sales       0
                  ..
Critic_Count       0
User_Score         0
User_Count         0
Developer          0
Rating             0
Length: 11, dtype: int64

## **Object values Transformation**





In [None]:
vgs_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16663 entries, 0 to 16718
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Platform         16663 non-null  object 
 1   Year_of_Release  16663 non-null  float64
 2   Genre            16663 non-null  object 
 3   Publisher        16663 non-null  object 
 4   Global_Sales     16663 non-null  float64
 5   Critic_Score     16663 non-null  float64
 6   Critic_Count     16663 non-null  float64
 7   User_Score       16663 non-null  float64
 8   User_Count       16663 non-null  float64
 9   Developer        16663 non-null  object 
 10  Rating           16663 non-null  object 
dtypes: float64(6), object(5)
memory usage: 1.5+ MB


In [None]:
len(vgs_data["Platform"].unique())

31

In [None]:
len(vgs_data["Genre"].unique())

12

In [None]:
len(vgs_data["Publisher"].unique()) #TOO MUCH

581

In [None]:
len(vgs_data["Developer"].unique()) #TOO MUCH

1696

In [None]:
len(vgs_data["Rating"].unique())

9

### Distribution of unique values

In [None]:
x=["Platform","Genre","Publisher","Developer","Rating"]
y=[len(vgs_data["Platform"].unique()),len(vgs_data["Genre"].unique()),len(vgs_data["Publisher"].unique()),len(vgs_data["Developer"].unique()),len(vgs_data["Rating"].unique())]
fig = go.Figure(data=[go.Bar(x=x, y=y)])

fig.update_layout(title="Object unique values",
                  xaxis_title="Columns",
                  yaxis_title="Unique values")
fig.show()

### **Transformation of  Big Object VALUES**

In [None]:
pd.set_option('display.max_rows', 99999)
pd.set_option('max_colwidth', 400)

#### **Handling small Developers**

In [None]:
vgs_data["Developer"].value_counts()

In [None]:
counts = vgs_data["Developer"].value_counts()

In [None]:
filtered_counts = counts[counts.index != 'Unknown']

fig = go.Figure(data=[go.Bar(x=filtered_counts.index, y=filtered_counts.values)])

fig.update_layout(title="Nombre de jeux par développeur (sans Unknown)",
                  xaxis_title="Développeur",
                  yaxis_title="Nombre de jeux")
fig.show()

In [None]:
# Collapse every developer with fewer than 10 games into a single "Small Developer" bucket.
# `counts` holds the per-developer game counts computed in the cell above.
# NOTE: `counts` is rebound to publisher counts further down, so this cell must run before that one.
developer_game_counts = vgs_data["Developer"].map(counts)
vgs_data["Developer"] = vgs_data["Developer"].where(developer_game_counts >= 10, "Small Developer")

In [None]:
len(vgs_data["Developer"].unique())

243

#### Handling small Publishers

In [None]:
vgs_data["Publisher"].value_counts()

In [None]:
counts = vgs_data["Publisher"].value_counts()

In [None]:
fig = go.Figure(data=[go.Bar(x=counts.index, y=counts.values)])

fig.update_layout(title="Nombre de jeux par publisher",
                  xaxis_title="Publisher",
                  yaxis_title="Nombre de jeux")

fig.show()

In [None]:
# Collapse every publisher with fewer than 50 games into a single "Small Publisher" bucket.
# `counts` holds the per-publisher game counts computed in the cell above.
publisher_game_counts = vgs_data["Publisher"].map(counts)
vgs_data["Publisher"] = vgs_data["Publisher"].where(publisher_game_counts >= 50, "Small Publisher")

In [None]:
len(vgs_data["Publisher"].unique())

47

In [None]:
pd.set_option('display.max_rows', 10)
pd.set_option('max_colwidth', 400)

### **Target column**

In [None]:
# Target column: take Global_Sales out of the frame and re-attach it as the
# last column under the name "Hit" (it is binarised in a later cell).
vgs_data['Hit'] = vgs_data.pop('Global_Sales')

In [None]:
vgs_data.head(5)

Unnamed: 0,Platform,Year_of_Release,Genre,Publisher,Critic_Score,Critic_Count,User_Score,User_Count,Developer,Rating,Hit
0,Wii,2006.0,Sports,Nintendo,76.0,51.0,8.0,322.0,Nintendo,E,82.53
1,NES,1985.0,Platform,Nintendo,71.0,21.0,7.5,24.0,Unknown,Unknown,40.24
2,Wii,2008.0,Racing,Nintendo,82.0,73.0,8.3,709.0,Nintendo,E,35.52
3,Wii,2009.0,Sports,Nintendo,80.0,73.0,8.0,192.0,Nintendo,E,32.77
4,GB,1996.0,Role-Playing,Nintendo,71.0,21.0,7.5,24.0,Unknown,Unknown,31.37


**Global_Sales >= 1 is marked as Hit**

In [None]:
def hit(sales):
    """Binarise a sales figure: 1 when ``sales`` is at least 1, otherwise 0.

    Any value for which ``sales >= 1`` is false (including NaN) maps to 0.
    """
    return 1 if sales >= 1 else 0

vgs_data['Hit'] = vgs_data['Hit'].apply(lambda x: hit(x))

In [None]:
vgs_data.head(5)

Unnamed: 0,Platform,Year_of_Release,Genre,Publisher,Critic_Score,Critic_Count,User_Score,User_Count,Developer,Rating,Hit
0,Wii,2006.0,Sports,Nintendo,76.0,51.0,8.0,322.0,Nintendo,E,1
1,NES,1985.0,Platform,Nintendo,71.0,21.0,7.5,24.0,Unknown,Unknown,1
2,Wii,2008.0,Racing,Nintendo,82.0,73.0,8.3,709.0,Nintendo,E,1
3,Wii,2009.0,Sports,Nintendo,80.0,73.0,8.0,192.0,Nintendo,E,1
4,GB,1996.0,Role-Playing,Nintendo,71.0,21.0,7.5,24.0,Unknown,Unknown,1


### **ONE HOT ENCODING**

In [None]:
ohe = pd.get_dummies(data=vgs_data, columns=['Platform', 'Genre','Publisher','Developer','Rating'])

In [None]:
ohe

Unnamed: 0,Year_of_Release,Critic_Score,Critic_Count,User_Score,User_Count,Hit,Platform_2600,Platform_3DO,Platform_3DS,Platform_DC,...,Developer_n-Space,Rating_AO,Rating_E,Rating_E10+,Rating_EC,Rating_K-A,Rating_M,Rating_RP,Rating_T,Rating_Unknown
0,2006.0,76.0,51.0,8.0,322.0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1,1985.0,71.0,21.0,7.5,24.0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,2008.0,82.0,73.0,8.3,709.0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
3,2009.0,80.0,73.0,8.0,192.0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,1996.0,71.0,21.0,7.5,24.0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16714,2016.0,71.0,21.0,7.5,24.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
16715,2006.0,71.0,21.0,7.5,24.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
16716,2016.0,71.0,21.0,7.5,24.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
16717,2003.0,71.0,21.0,7.5,24.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


# **MACHINE LEARNING**

In [None]:
data = ohe.copy()

## viewing Correlations with the target column

In [None]:
corrMatrix = data.corrwith(data["Hit"]).abs()

In [None]:
corrMatrix

Year_of_Release    0.113824
Critic_Score       0.246015
Critic_Count       0.312965
User_Score         0.073054
User_Count         0.216928
                     ...   
Rating_K-A         0.022054
Rating_M           0.099069
Rating_RP          0.005057
Rating_T           0.008593
Rating_Unknown     0.072152
Length: 348, dtype: float64

In [None]:
# Sort the correlations in descending order
corrMatrix = corrMatrix.sort_values(ascending=False)

In [None]:
corrMatrix

Hit                                   1.000000
Critic_Count                          0.312965
Critic_Score                          0.246015
Publisher_Nintendo                    0.226817
User_Count                            0.216928
                                        ...   
Developer_SCE Japan Studio            0.000649
Developer_Pipeworks Software, Inc.    0.000649
Developer_Papaya Studios              0.000649
Developer_Crystal Dynamics            0.000511
Developer_Raven Software              0.000061
Length: 348, dtype: float64

## Keeping only the columns whose absolute correlation with the target is > 0.001

In [None]:
# Keep only the columns whose absolute correlation with "Hit" exceeds the threshold.
# The target itself has correlation 1.0, so df_selected still contains "Hit".
# Iterating over data.columns (corrMatrix has been re-sorted by value) preserves
# the original column order of the feature matrix.
MIN_ABS_CORRELATION = 0.001

# Report columns that have no correlation entry once, instead of hiding every
# possible error behind a blanket `except Exception`.
missing_columns = [col for col in data.columns if col not in corrMatrix.index]
if missing_columns:
    print(f"Columns without a correlation value (skipped): {missing_columns}")

selected_columns = [
    col
    for col in data.columns
    if col in corrMatrix.index and abs(corrMatrix[col]) > MIN_ABS_CORRELATION
]

df_selected = data[selected_columns]

In [None]:
df_selected

Unnamed: 0,Year_of_Release,Critic_Score,Critic_Count,User_Score,User_Count,Hit,Platform_2600,Platform_3DO,Platform_3DS,Platform_DC,...,Developer_n-Space,Rating_AO,Rating_E,Rating_E10+,Rating_EC,Rating_K-A,Rating_M,Rating_RP,Rating_T,Rating_Unknown
0,2006.0,76.0,51.0,8.0,322.0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1,1985.0,71.0,21.0,7.5,24.0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,2008.0,82.0,73.0,8.3,709.0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
3,2009.0,80.0,73.0,8.0,192.0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,1996.0,71.0,21.0,7.5,24.0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16714,2016.0,71.0,21.0,7.5,24.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
16715,2006.0,71.0,21.0,7.5,24.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
16716,2016.0,71.0,21.0,7.5,24.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
16717,2003.0,71.0,21.0,7.5,24.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


## Splitting to Train / Test

In [None]:
y= df_selected["Hit"]
x= df_selected.drop("Hit",axis=1)

**70% training / 30% test**

In [None]:
Xtrain, Xtest, ytrain, ytest = train_test_split(x, y, test_size=0.3, random_state=0)


In [None]:
Model_CrossValidation_Scores = dict()


## Random Forest Classifier

### optimising hyper parameters

In [None]:
n_features = x.shape[1]
sqrt = int(math.sqrt(n_features))  # truncated square root of the feature count
log = int(math.log(n_features))  # truncated natural log of the feature count
criterions = ["gini", "entropy", "log_loss"]  # values tried for the forest's `criterion`
n_estimators = [1, 10, 100, 200]  # values tried for the number of trees
# Values tried for the forest's `max_features`, in the order [log, sqrt, 2*log, 2*sqrt].
max_features = [factor * base for factor in (1, 2) for base in (log, sqrt)]
# NOTE(review): the recorded runs show identical scores for "entropy" and "log_loss";
# one of the two may be redundant in the grid -- TODO confirm.

In [None]:
max_features

[5, 18, 10, 36]

In [None]:
def best_parameters_RFC (model,Xtrain,Ytrain,criterions,n_estimators,max_features):
  """Exhaustively search (n_estimators, max_features, criterion) by 5-fold cross-validation.

  Parameters:
    model: callable ``model(n, f, c)`` returning the estimator to evaluate.
    Xtrain: feature matrix handed to ``cross_val_score``.
    Ytrain: target values handed to ``cross_val_score``.
    criterions: iterable of criterion values to try.
    n_estimators: iterable of tree counts to try.
    max_features: iterable of ``max_features`` values to try.

  Returns:
    Tuple ``(best_n_estimator, best_max_features, best_criterion)`` for the
    combination with the highest mean accuracy; all three are ``None`` when no
    combination scores above 0. On ties the first combination evaluated wins.
  """
  kfold = KFold(n_splits=5, shuffle=True, random_state=42)
  best_so_far = 0

  best_n_estimator = None
  best_max_features = None
  best_criterion = None

  for n in n_estimators:
    for f in max_features:
      for c in criterions:
        # Score against the Ytrain argument; the previous version silently read
        # the notebook-level `ytrain` and ignored this parameter.
        scores = cross_val_score(model(n,f,c), Xtrain, Ytrain, cv=kfold, scoring='accuracy', n_jobs=-1)
        print("Scores de validation croisée pour n,f,c:",n,f,c,"est: ", scores)
        average_score = scores.mean()
        print("average Score de validation croisée pour n,f,c:",n,f,c,"est: ", average_score)
        if average_score > best_so_far:
          best_so_far = average_score
          best_n_estimator = n
          best_max_features = f
          best_criterion = c
  return best_n_estimator,best_max_features,best_criterion

In [None]:
def rfc_without_parameters(n, f, c):
    """Build a RandomForestClassifier and fit it on the notebook-level Xtrain / ytrain.

    Parameters:
        n: value for ``n_estimators``.
        f: value for ``max_features``.
        c: value for ``criterion``.

    Returns:
        The fitted classifier (``random_state`` is fixed to 0).
    """
    forest = RandomForestClassifier(n_estimators=n, max_features=f, criterion=c, random_state=0)
    # fit() hands back the estimator itself, so callers receive a fitted model.
    return forest.fit(Xtrain, ytrain)

In [None]:
best_n_estimator,best_max_features,best_criterion = best_parameters_RFC(rfc_without_parameters,Xtrain,ytrain,criterions,n_estimators,max_features)

Scores de validation croisée pour n,f,c: 1 5 gini est:  [0.86540934 0.85597943 0.85426489 0.86369481 0.8567753 ]
average Score de validation croisée pour n,f,c: 1 5 gini est:  0.8592247557051154
Scores de validation croisée pour n,f,c: 1 5 entropy est:  [0.84869267 0.84783541 0.85812259 0.86155165 0.86363636]
average Score de validation croisée pour n,f,c: 1 5 entropy est:  0.8559677356505475
Scores de validation croisée pour n,f,c: 1 5 log_loss est:  [0.84869267 0.84783541 0.85812259 0.86155165 0.86363636]
average Score de validation croisée pour n,f,c: 1 5 log_loss est:  0.8559677356505475
Scores de validation croisée pour n,f,c: 1 18 gini est:  [0.84997857 0.85555079 0.85469353 0.86026575 0.86149228]
average Score de validation croisée pour n,f,c: 1 18 gini est:  0.8563961845076127
Scores de validation croisée pour n,f,c: 1 18 entropy est:  [0.85469353 0.85426489 0.86112302 0.86798114 0.8542024 ]
average Score de validation croisée pour n,f,c: 1 18 entropy est:  0.8584529963481675
S

optimising hyper-parameters for Random Forest

In [None]:
best_n_estimator,best_max_features,best_criterion

(200, 36, 'entropy')

### validating RFC

In [None]:
rfc = rfc_without_parameters(best_n_estimator,best_max_features,best_criterion)

In [None]:
#5 plis
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(rfc, Xtrain, ytrain, cv=kfold, scoring='accuracy',n_jobs=-1)

print("Scores de validation croisée :", scores)
print("Score moyen de validation croisée :", scores.mean())


Scores de validation croisée : [0.90655808 0.90270039 0.91084441 0.90741535 0.91080617]
Score moyen de validation croisée : 0.9076648783690491


In [None]:
Model_CrossValidation_Scores["RFC"] = scores.mean()

## Logistic Regression

### trying with standard scaler

In [None]:
scaler = StandardScaler()
scaler.fit(Xtrain)
scaled_Xtrain=scaler.transform(Xtrain)

In [None]:
log_reg_scalled = LogisticRegression(random_state=0,max_iter=10000).fit(scaled_Xtrain, ytrain)

In [None]:
#5 plis
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(log_reg_scalled, scaled_Xtrain, ytrain, cv=kfold, scoring='accuracy',n_jobs=-1)

print("Scores de validation croisée :", scores)
print("Score moyen de validation croisée :", scores.mean())

Scores de validation croisée : [0.90184312 0.89327047 0.90270039 0.90270039 0.90909091]
Score moyen de validation croisée : 0.9019210536570158


### without scaling

In [None]:
log_reg = LogisticRegression(random_state=0,max_iter=10000).fit(Xtrain, ytrain)

### validating log_reg

In [None]:
#5 plis
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(log_reg, Xtrain, ytrain, cv=kfold, scoring='accuracy',n_jobs=-1)

print("Scores de validation croisée :", scores)
print("Score moyen de validation croisée :", scores.mean())

Scores de validation croisée : [0.90055722 0.89284183 0.90270039 0.89798543 0.91252144]
Score moyen de validation croisée : 0.901321262018073


In [None]:
Model_CrossValidation_Scores["LRC"] = scores.mean()

## K Nearest Neighbors

### Finding the best k

In [None]:
def find_best_k (model,Xtrain,Ytrain,k_values):
  """Pick the neighbour count with the best mean 5-fold cross-validated accuracy.

  Parameters:
    model: callable ``model(k)`` returning the estimator to evaluate for ``k``.
    Xtrain: feature matrix handed to ``cross_val_score``.
    Ytrain: target values handed to ``cross_val_score``.
    k_values: iterable of candidate neighbour counts.

  Returns:
    The best ``k``, or ``-1`` when no candidate was evaluated or none scored
    above 0. On ties the first candidate evaluated wins.
  """
  n_splits = 5
  kfold = KFold(n_splits=n_splits, shuffle=True, random_state=42)
  # Each split fits on the samples left after holding one fold out, and the
  # largest held-out fold has ceil(n / n_splits) samples. A k larger than that
  # training fold cannot be served, so such candidates are skipped.
  smallest_train_fold = len(Ytrain) - math.ceil(len(Ytrain) / n_splits)
  best_so_far = 0
  best_k = -1
  for k in k_values:
    if k > smallest_train_fold:
      continue
    # Score against the Ytrain argument; the previous version silently read
    # the notebook-level `ytrain` and ignored this parameter.
    scores = cross_val_score(model(k), Xtrain, Ytrain, cv=kfold, scoring='accuracy', n_jobs=-1)
    print("Scores de validation croisée pour k=:",k,"est: ", scores)
    average_score = scores.mean()
    print("average Score de validation croisée pour k=:",k,"est: ", average_score)
    if average_score > best_so_far:
      best_so_far = average_score
      best_k = k
  return best_k


**getting a range for K**

In [None]:
import math
def sampled_range(mini, maxi, num):
  """Return up to ``num`` distinct integers spaced logarithmically over [mini, maxi].

  Samples are taken at equal steps in log-space, truncated to integers,
  de-duplicated and sorted, so fewer than ``num`` values may come back.

  Parameters:
    mini: lower bound; must be > 0 (``math.log`` raises ``ValueError`` otherwise).
    maxi: upper bound; must be > 0.
    num: number of sample points; a falsy value yields ``[]``.

  Returns:
    Sorted list of unique integers.
  """
  if not num:
    return []
  if num == 1:
    # A single sample has no step to compute (this case used to divide by zero).
    return [int(mini)]
  lmini = math.log(mini)
  lmaxi = math.log(maxi)
  ldelta = (lmaxi - lmini) / (num - 1)
  # exp(log(v)) can land a hair below an exact integer (e.g. 999.9999999999998
  # for 1000), which plain truncation turns into 999. Nudge by a tiny relative
  # tolerance before truncating so exact points such as `maxi` are kept.
  samples = {int(math.exp(lmini + i * ldelta) * (1 + 1e-9)) for i in range(num)}
  return sorted(samples)

In [None]:
k_values = sampled_range(1, 1000, 10)

**Cross Validation to find the best K**

In [None]:
missing_k_model = lambda x: KNeighborsClassifier(n_neighbors=x)
best_k=find_best_k(missing_k_model,Xtrain,ytrain,k_values)

Scores de validation croisée pour k=: 1 est:  [0.84954994 0.84054865 0.85083583 0.85469353 0.8464837 ]
average Score de validation croisée pour k=: 1 est:  0.8484223303647642
Scores de validation croisée pour k=: 2 est:  [0.87869696 0.8714102  0.88084012 0.88855551 0.88722127]
average Score de validation croisée pour k=: 2 est:  0.8813448110818085
Scores de validation croisée pour k=: 4 est:  [0.88084012 0.87355336 0.88169739 0.88941277 0.89150943]
average Score de validation croisée pour k=: 4 est:  0.8834026154679779
Scores de validation croisée pour k=: 9 est:  [0.88255465 0.87569653 0.88641234 0.88812688 0.88722127]
average Score de validation croisée pour k=: 9 est:  0.8840023335850231
Scores de validation croisée pour k=: 21 est:  [0.88469781 0.87869696 0.88512645 0.88898414 0.88893654]
average Score de validation croisée pour k=: 21 est:  0.8852883786142446
Scores de validation croisée pour k=: 46 est:  [0.87741106 0.88169739 0.88084012 0.88641234 0.89365352]
average Score de va

In [None]:
best_k

21

In [None]:
knn_classifier = missing_k_model(best_k).fit(Xtrain, ytrain)

### Validating Knn with best K

In [None]:
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(knn_classifier, Xtrain, ytrain, cv=kfold, scoring='accuracy',n_jobs=-1)

print("Scores de validation croisée :", scores)
print("Score moyen de validation croisée :", scores.mean())

Scores de validation croisée : [0.88469781 0.87869696 0.88512645 0.88898414 0.88893654]
Score moyen de validation croisée : 0.8852883786142446


In [None]:
Model_CrossValidation_Scores["KNN"] = scores.mean()

In [None]:
Model_CrossValidation_Scores

{'RFC': 0.9076648783690491,
 'LRC': 0.901321262018073,
 'KNN': 0.8852883786142446}

## SVM

reference : https://scikit-learn.org/stable/auto_examples/svm/plot_rbf_parameters.html

### Finding the Best kernel, C and Gamma

In [None]:
def find_best_C_Gamma (model,Xtrain,Ytrain,kernels,C_values,Gamma_values):
  """Search kernel / C / gamma by mean 5-fold cross-validated accuracy.

  The "rbf" kernel is evaluated for every (C, gamma) pair; every other kernel
  is evaluated once per C with gamma fixed to "scale".

  Parameters:
    model: callable ``model(kernel, C, gamma)`` returning the estimator to evaluate.
    Xtrain: feature matrix handed to ``cross_val_score``.
    Ytrain: target values handed to ``cross_val_score``.
    kernels: iterable of kernel names.
    C_values: iterable of C values.
    Gamma_values: iterable of gamma values (used for "rbf" only).

  Returns:
    Tuple ``(best_kernel, best_C, best_Gamma)``; all ``None`` when no
    combination scores above 0. On ties the first combination evaluated wins.
  """
  kfold = KFold(n_splits=5, shuffle=True, random_state=42)
  best_so_far = 0
  best_C = None
  best_Gamma = None
  best_kernel = None
  for k in kernels:
    is_rbf = (k == "rbf")
    gammas = Gamma_values if is_rbf else ["scale"]
    for C in C_values:
      for gamma in gammas:
        # Score against the Ytrain argument; the previous version silently read
        # the notebook-level `ytrain` and ignored this parameter.
        scores = cross_val_score(model(k,C,gamma), Xtrain, Ytrain, cv=kfold, scoring='accuracy', n_jobs=-1)
        average_score = scores.mean()
        if is_rbf:
          # Values are printed in the same order as the label (kernel, C, gamma).
          print("Scores de validation croisée pour kernel,C,Gamma=:",k,C,gamma,"est: ", scores)
          print("average Score de validation croisée pour kernel,C,Gamma=:",k,C,gamma,"est: ", average_score)
        else:
          print("Scores de validation croisée pour kernel,C=:",k,C,"est: ", scores)
          print("average Score de validation croisée pour kernel,C=:",k,C,"est: ", average_score)
        if average_score > best_so_far:
          best_so_far = average_score
          best_C = C
          best_Gamma = gamma
          best_kernel = k
  return best_kernel,best_C,best_Gamma

In [None]:
C_range=[0.001, 0.01, 0.1, 1, 10, 100]
gamma_range=[0.0001, 0.001, 0.01, 0.1]
kernels= ["linear", "rbf"]

SVM_missing_kernel_c_and_gamma = lambda k,c,gamma: SVC(kernel=k,C=c,gamma=gamma)

best_kernel,best_C,best_Gamma = find_best_C_Gamma(SVM_missing_kernel_c_and_gamma,Xtrain,ytrain,kernels,C_range,gamma_range)

Scores de validation croisée pour kernel,C=: linear 0.001 est:  [0.86840977 0.87098157 0.87183883 0.88126875 0.89150943]
average Score de validation croisée pour kernel,C=: linear 0.001 est:  0.876801672476122
Scores de validation croisée pour kernel,C=: linear 0.01 est:  [0.87312473 0.87655379 0.87826832 0.88512645 0.89622642]
average Score de validation croisée pour kernel,C=: linear 0.01 est:  0.8818599422559018
Scores de validation croisée pour kernel,C=: linear 0.1 est:  [0.90098586 0.89755679 0.90355765 0.90227175 0.90951973]
average Score de validation croisée pour kernel,C=: linear 0.1 est:  0.9027783557415823
Scores de validation croisée pour kernel,C=: linear 1 est:  [0.89027004 0.87998285 0.89455637 0.88812688 0.89708405]
average Score de validation croisée pour kernel,C=: linear 1 est:  0.8900040363521668
Scores de validation croisée pour kernel,C=: linear 10 est:  [0.87912559 0.86026575 0.88255465 0.87355336 0.87692967]
average Score de validation croisée pour kernel,C=: l

In [None]:
best_kernel,best_C,best_Gamma

('linear', 0.1, 'scale')

best parameters : ("linear", 0.1, 'scale')

In [None]:
best_kernel,best_C,best_Gamma=("linear", 0.1, 'scale')

In [None]:
SVM_classifier = SVM_missing_kernel_c_and_gamma(best_kernel,best_C,best_Gamma).fit (Xtrain,ytrain)

### Validating SVM

In [None]:
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(SVM_classifier, Xtrain, ytrain, cv=kfold, scoring='accuracy',n_jobs=-1,verbose=2)

print("Scores de validation croisée :", scores)
print("Score moyen de validation croisée :", scores.mean())

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


Scores de validation croisée : [0.90098586 0.89755679 0.90355765 0.90227175 0.90951973]
Score moyen de validation croisée : 0.9027783557415823


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  5.2min finished


In [None]:
Model_CrossValidation_Scores["SVM"] = scores.mean()

In [None]:
Model_CrossValidation_Scores

{'RFC': 0.9076648783690491,
 'LRC': 0.901321262018073,
 'KNN': 0.8852883786142446,
 'SVM': 0.9027783557415823}

# Results: Taking the best classifier and testing it on Test

## Ranking feature performance

In [None]:
# Feature names must come from the matrix the forest was fitted on (Xtrain).
# df_selected still contains the "Hit" target column, so looking names up there
# shifts every feature located after "Hit" by one position.
feature_names = Xtrain.columns
importances = rfc.feature_importances_

# Indices of the features, ordered by decreasing importance.
sorted_indices = np.argsort(importances)[::-1]

# Ranking of the top 10 features (fewer when the model has fewer than 10).
print('Ranking of the top 10 features:')

for rank, feature_index in enumerate(sorted_indices[:10]):
    feature_name = feature_names[feature_index]
    feature_importance = importances[feature_index]
    print(f'{rank+1}. Feature {feature_index} {feature_name} ({feature_importance})')

Ranking of the top 10 features:
1. Feature 0 Year_of_Release (0.13579157976974013)
2. Feature 4 User_Count (0.11267440107326987)
3. Feature 1 Critic_Score (0.08839105403162598)
4. Feature 2 Critic_Count (0.07919479482016975)
5. Feature 3 User_Score (0.05377389731039293)
6. Feature 78 Publisher_Namco Bandai Games (0.03323644015847033)
7. Feature 18 Platform_NG (0.01663577908603426)
8. Feature 82 Publisher_Sega (0.016109387194387415)
9. Feature 36 Platform_XOne (0.013672472263433854)
10. Feature 63 Publisher_Eidos Interactive (0.012972234285175315)


## Accuracy on test

In [None]:
y_pred = rfc.predict(Xtest)
accuracy = accuracy_score(ytest, y_pred)
print("RFC_Accuracy:", accuracy)

RFC_Accuracy: 0.9129825965193039


# Interesting Results from Test: the 10 games most likely to be hits, for each year from 2009 to 2016

In [None]:
Test_tops= Xtest.copy()

In [None]:
preds = rfc.predict_proba(Test_tops) #predicting the probability

In [None]:
Test_tops ["hit_probability"] = preds[:,1]

In [None]:
original_top=original_data.copy()

In [None]:
# Index alignment attaches each predicted probability to its game; rows that
# are not part of the test split receive NaN.
original_top["hit_probability"] = Test_tops["hit_probability"]
# Keep only the scored (test-split) games so unscored rows can never pad a
# per-year top-10, then sort by decreasing hit probability.
original_top = (
    original_top
    .dropna(subset=["hit_probability"])
    .sort_values("hit_probability", ascending=False)
)

In [None]:
#show top 10 for each year
for year in range (2009,2017):
    print ("Year: ",year)
    best_10_year = original_top[original_top['Year_of_Release'] == year].copy()
    best_10_year = best_10_year[['Name',"Platform", 'hit_probability']]
    print (best_10_year.head(10).reset_index(drop=True) ,"\n")

Year:  2009
                               Name Platform  hit_probability
0         New Super Mario Bros. Wii      Wii            0.975
1        Uncharted 2: Among Thieves      PS3            0.975
2               Assassin's Creed II      PS3            0.965
3                      Halo 3: ODST     X360            0.950
4             Batman: Arkham Asylum     X360            0.950
5               Assassin's Creed II     X360            0.935
6                    FIFA Soccer 10      PS3            0.890
7                 Street Fighter IV      PS3            0.800
8                 Street Fighter IV     X360            0.790
9  Grand Theft Auto: Chinatown Wars       DS            0.745 

Year:  2010
                           Name Platform  hit_probability
0    Battlefield: Bad Company 2     X360         0.975000
1                God of War III      PS3         0.970000
2           Red Dead Redemption      PS3         0.950000
3                FIFA Soccer 11     X360         0.805000
4 