In [111]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats

from sklearn.svm import SVC
from sklearn.metrics import accuracy_score as ac
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

In [112]:
# Cut-off between the two revenue classes. Per the original author's note, this is
# the sum of the min and max points of the two revenue clusters, divided by 2.
REVENUE_THRESHOLD = 20909226

# Read the dataset and discard every row that contains a missing value.
df = pd.read_excel("Genshin Impact Revenue.xlsx").dropna()

# Revenue strictly above the cut-off is labelled "Above Average";
# everything else is labelled "Below Average".
is_above_threshold = df["Revenue"] > REVENUE_THRESHOLD
df["Revenue Class"] = np.where(is_above_threshold, "Above Average", "Below Average")

In [144]:
# Show the leading rows of df (at most 39) to inspect the loaded data
# together with the derived "Revenue Class" column.
df.head(39)

Unnamed: 0,Version,Version Name,Start Date,End Date,5 Star Characters,Rerun,Mixed,Gender,Revenue,Banner Days,Avg Revenue/Day,Revenue Class
0,1.0,Welcome to Tevyat,25/09/2020,19/10/2020,Venti,0.0,0.0,0.0,30632752.0,25.0,1225310.0,Above Average
1,1.0,Welcome to Tevyat,20/10/2020,2020-10-11 00:00:00,Klee,0.0,0.0,1.0,22750080.0,22.0,1034095.0,Above Average
2,1.1,A New Star Approaches,2020-11-11 00:00:00,30/11/2020,Tartaglia,0.0,0.0,0.0,13443619.0,20.0,672180.9,Below Average
3,1.1,A New Star Approaches,2020-01-12 00:00:00,22/12/2020,Zhongli,0.0,0.0,0.0,16264892.0,22.0,739313.3,Below Average
4,1.2,The Chalk Prince and the Dragon,23/12/2020,2021-12-01 00:00:00,Albedo,0.0,0.0,0.0,11816107.0,21.0,562671.8,Below Average
5,1.2,The Chalk Prince and the Dragon,13/01/2021,2021-02-02 00:00:00,Ganyu,0.0,0.0,1.0,15669918.0,21.0,746186.6,Below Average
6,1.3,All That Glitters,2021-03-02 00:00:00,17/02/2021,Xiao,0.0,0.0,0.0,13145115.0,15.0,876341.0,Below Average
7,1.3,All That Glitters,18/02/2021,2021-02-03 00:00:00,Keqing,0.0,0.0,1.0,9505798.0,13.0,731215.2,Below Average
8,1.3,All That Glitters,2021-03-03 00:00:00,16/03/2021,Hu Tao,0.0,0.0,1.0,12481634.0,14.0,891545.3,Below Average
9,1.4,Invitation to Windblume,17/03/2021,2021-06-04 00:00:00,Venti (Rerun),1.0,0.0,0.0,16614209.0,21.0,791152.8,Below Average


In [114]:
# Preview the first 10 rows of the feature columns: rerun, mixed, gender, banner days.
# The columns are taken straight from df (same positions the notebook later uses to
# build X) because X is only assigned in a later cell; referencing X here would raise
# NameError when the notebook is run top to bottom on a fresh kernel.
df.iloc[:, [5, 6, 7, 9]].head(10)

Unnamed: 0,Rerun,Mixed,Gender,Banner Days
0,0.0,0.0,0.0,25.0
1,0.0,0.0,1.0,22.0
2,0.0,0.0,0.0,20.0
3,0.0,0.0,0.0,22.0
4,0.0,0.0,0.0,21.0
5,0.0,0.0,1.0,21.0
6,0.0,0.0,0.0,15.0
7,0.0,0.0,1.0,13.0
8,0.0,0.0,1.0,14.0
9,1.0,0.0,0.0,21.0


In [145]:
# Assign the model inputs.
# Features are selected by column label instead of by position, so the selection
# stays correct if columns are added to or reordered in the spreadsheet
# (a wrong label fails loudly with KeyError instead of silently picking another column).
FEATURE_COLUMNS = ["Rerun", "Mixed", "Gender", "Banner Days"]
X = df[FEATURE_COLUMNS]

# Target: the revenue class label of each row.
y = df["Revenue Class"]

print(X)
print(y)

    Rerun  Mixed Gender  Banner Days
0     0.0    0.0    0.0         25.0
1     0.0    0.0    1.0         22.0
2     0.0    0.0    0.0         20.0
3     0.0    0.0    0.0         22.0
4     0.0    0.0    0.0         21.0
5     0.0    0.0    1.0         21.0
6     0.0    0.0    0.0         15.0
7     0.0    0.0    1.0         13.0
8     0.0    0.0    1.0         14.0
9     1.0    0.0    0.0         21.0
10    1.0    0.0    0.0         21.0
11    1.0    0.0    0.0         21.0
12    0.0    0.0    1.0         21.0
13    1.0    0.0    1.0         21.0
14    0.0    0.0    0.0         21.0
15    0.0    0.0    1.0         21.0
16    0.0    0.0    1.0         21.0
17    0.0    0.0    1.0         20.0
18    0.0    0.0    1.0         22.0
19    1.0    0.0    0.0         21.0
20    1.0    0.0    1.0         22.0
21    1.0    1.0    2.0         21.0
22    0.0    0.0    0.0         22.0
23    1.0    1.0    2.0         21.0
24    1.0    1.0    2.0         22.0
25    0.0    0.0    1.0         21.0
2

In [141]:
# --- Support vector classifier ---
# Hyperparameters are the author's tuned values (C = 20.8, gamma = 2.0).
# NOTE(review): scikit-learn documents gamma as the coefficient of the 'rbf', 'poly'
# and 'sigmoid' kernels, so it presumably has no effect while kernel='linear';
# it is kept so the kernel can be switched back to RBF without retuning - confirm.
svm = SVC(C=20.8, gamma=2.0, kernel='linear', random_state=1)
# Author's notes from earlier runs:
#   - gamma 2.0 / C 20.8 gave an accuracy score of 0.92
#   - mean CV score 0.67 with the RBF kernel
#   - mean CV score 0.78 with the linear kernel

# Fit on the complete dataset, then predict those very same rows.
y_pred = svm.fit(X, y).predict(X)

# Accuracy measured on the training data itself (not on held-out rows).
print("Accuracy =", ac(y, y_pred))

# 15-fold cross-validation of the same estimator over X and y.
# NOTE(review): the recorded fold scores (multiples of 1/3 and 1/2) suggest each
# test fold holds only 2-3 rows, so individual fold scores are very coarse.
cv_results = cross_val_score(svm, X, y, cv=15)
print("Cv Result =", cv_results)
print("Cv Result Mean =", cv_results.mean())

Accuracy = 0.8205128205128205
Cv Result = [1.         0.66666667 0.66666667 0.66666667 1.         1.
 0.66666667 1.         0.66666667 1.         0.         1.
 0.5        1.         1.        ]
Cv Result Mean = 0.7888888888888889




In [142]:
# --- Random forest classifier ---
# Tree depth is capped at 5 and the random seed is fixed to 1.
clf = RandomForestClassifier(random_state=1, max_depth=5)
# Author's note from an earlier run: accuracy 0.79 and CV mean 0.74 ("very good").
# The output recorded below this cell shows accuracy 0.90 and CV mean 0.71.

# Fit on the complete dataset, then predict those very same rows.
y_pred = clf.fit(X, y).predict(X)

# Accuracy measured on the training data itself (not on held-out rows).
print("Accuracy =", ac(y, y_pred))

# 15-fold cross-validation of the same estimator over X and y.
cv_results = cross_val_score(clf, X, y, cv=15)
print("Cv Result =", cv_results)
print("Cv Result Mean =", cv_results.mean())

Accuracy = 0.8974358974358975




Cv Result = [0.66666667 0.66666667 0.66666667 0.66666667 1.         0.66666667
 0.66666667 0.66666667 1.         0.5        0.5        1.
 0.5        1.         0.5       ]
Cv Result Mean = 0.711111111111111


In [143]:
#The features used to predict the revenue class are stored in X.
#X has 4 features: Rerun, Mixed, Gender and Banner Days.
#y is the revenue class label.
#The purpose is to see whether these 4 features alone can classify a banner's revenue as above average (good) or below average (bad). If they can, we want to measure how accurately the model assigns the revenue class from those features and whether it can be considered a successful model.

#Results:
#with k = 15 cross-validation folds
#Using the random forest classifier, the mean cross-validation score is a good 0.71
#Using SVM with an RBF kernel, gamma 2.0 and C 20.8, the mean cross-validation score is a decent 0.67
#Using SVM with a linear kernel, gamma 2.0 and C 20.8, the mean cross-validation score is a good 0.79

#Thus the best result is obtained with the linear-kernel SVM.