In [21]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score 
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler



#import minor and major regions
df = pd.concat(
    map(pd.read_csv, ['LCSA2022Summer.csv', 'LCKCL2022Summer.csv', 'LFL2022Summer.csv']), ignore_index=True)
df2 = pd.concat(
    map(pd.read_csv, ['LCS2023Spring.csv', 'LCK2023Spring.csv', 'LEC2023Winter.csv']), ignore_index=True)

#convert % columns to float 
df = df.replace('%','', regex=True)
cols = df.columns.drop(['Player', 'Team', 'Pos'])
df[cols] = df[cols].astype(float)

#remove players with less than 9 games player
df = df[df['GP'] >= 9]
df2 = df2[df2['GP'] >= 9]

#find intersection (players promoted)
int_df = pd.merge(df, df2, how = 'inner', on = ['Player'])

#create list of players promoted
players = int_df['Player'].values.tolist()

#create new column 'Promoted', set to no
df['Promoted'] = 0
#set promoted to yes if in intersection list
df['Promoted'] = np.where(df['Player'].isin(players), 1, 0)
df = df.drop(columns=['Player', 'Team', 'Pos'])

In [22]:
#split testing and training data
y_col = 'Promoted'
x_cols = df.loc[:, df.columns != y_col].columns
training_x, testing_x, training_y, testing_y = train_test_split(df[x_cols], df[y_col], test_size=0.2, random_state = 42)

In [26]:
#model without preprocessing
k = 1
knnMdl = KNeighborsClassifier(n_neighbors=k).fit(training_x, training_y)
knnScores = cross_val_score(knnMdl, training_x, training_y, cv=10, scoring='accuracy')
print("KNN average accuracy on 10-fold cross validation: " + str(np.mean(knnScores)))
knnPredictions = knnMdl.predict(testing_x)
print(knnPredictions)
print("KNN accuracy on  test data:", accuracy_score(testing_y, knnPredictions))

KNN average accuracy on 10-fold cross validation: 0.7280219780219779
[0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 1 0 1 0 0 0 0 0 0 1 0 0 0 0]
KNN accuracy on  test data: 0.7647058823529411


In [24]:
lrMdl = LogisticRegression(solver= 'liblinear').fit(training_x, training_y)
lrScores = cross_val_score(lrMdl, training_x, training_y, cv=10, scoring='accuracy')
print("LR average accuracy on 10-fold cross validation: " + str(np.mean(lrScores)))
lrPredictions = lrMdl.predict(testing_x)
print(lrPredictions)
print("LR accuracy on  test data:", accuracy_score(testing_y, lrPredictions))

LR average accuracy on 10-fold cross validation: 0.7857142857142857
[0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0]
LR accuracy on  test data: 0.8235294117647058


In [2]:
#perform smote
y_col = 'Promoted'
x_cols = df.loc[:, df.columns != y_col].columns
training_x, testing_x, training_y, testing_y = train_test_split(df[x_cols], df[y_col], test_size=0.2, random_state = 42)
oversample = SMOTE(random_state = 42)
training_x_smote, training_y_smote = oversample.fit_resample(training_x, training_y)

In [20]:
#knn model after Smote
k = 4
knnMdl = KNeighborsClassifier(n_neighbors=k).fit(training_x_smote, training_y_smote)
knnScores = cross_val_score(knnMdl, training_x_smote, training_y_smote, cv=10, scoring='accuracy')
print("KNN average accuracy on 10-fold cross validation: " + str(np.mean(knnScores)))
knnPredictions = knnMdl.predict(testing_x)
print(knnPredictions)
print("KNN accuracy on  test data:", accuracy_score(testing_y, knnPredictions))

KNN average accuracy on 10-fold cross validation: 0.7581027667984189
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 0. 1. 1. 1.]
KNN accuracy on  test data: 0.23529411764705882


In [4]:
#lr model results after SMOTE
lrMdl = LogisticRegression(solver= 'liblinear').fit(training_x_smote, training_y_smote)
lrScores = cross_val_score(lrMdl, training_x_smote, training_y_smote, cv=10, scoring='accuracy')
print("LR average accuracy on 10-fold cross validation: " + str(np.mean(lrScores)))
lrPredictions = lrMdl.predict(testing_x)
print(lrPredictions)
print("LR accuracy on  test data:", accuracy_score(testing_y, lrPredictions))

LR average accuracy on 10-fold cross validation: 0.7444664031620554
[0 0 0 1 0 0 1 0 0 0 0 0 1 0 0 0 1 0 1 0 1 0 0 1 0 0 0 1 0 0 0 0 0 0]
LR accuracy on  test data: 0.7941176470588235


In [5]:
#scale data
scaler = MinMaxScaler()
dfcolumns = list(df.columns.values)
df_scaled = scaler.fit_transform(df)
df = pd.DataFrame(df_scaled, columns = dfcolumns)
y_col = 'Promoted'
x_cols = df.loc[:, df.columns != y_col].columns

#split data
training_x, testing_x, training_y, testing_y = train_test_split(df[x_cols], df[y_col], test_size=0.2, random_state = 42)

#perform PCA
pca = PCA(n_components=5)
training_x_pca = pca.fit_transform(training_x)
testing_x_pca = pca.transform(testing_x)
oversample = SMOTE(random_state = 42)

#oversample via SMOTE
training_x_pca_smote, training_y_pca_smote = oversample.fit_resample(training_x_pca, training_y)
explained_variance = pca.explained_variance_ratio_
#component variance
print(explained_variance)

[0.53001439 0.15128679 0.06401031 0.05853438 0.04513438]


In [14]:
#knn model after PCA and Smote
k = 4
knnMdl = KNeighborsClassifier(n_neighbors=k).fit(training_x_pca_smote, training_y_smote)
knnScores = cross_val_score(knnMdl, training_x_pca_smote, training_y_smote, cv=10, scoring='accuracy')
print("KNN average accuracy on 10-fold cross validation: " + str(np.mean(knnScores)))
knnPredictions = knnMdl.predict(testing_x_pca)
print(knnPredictions)
print("KNN accuracy on  test data:", accuracy_score(testing_y, knnPredictions))

KNN average accuracy on 10-fold cross validation: 0.7899209486166008
[0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.
 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
KNN accuracy on  test data: 0.7647058823529411


In [7]:
#lr model results after oversampling
lrMdl = LogisticRegression(solver= 'liblinear').fit(training_x_pca_smote, training_y_smote)
lrScores = cross_val_score(lrMdl, training_x_pca_smote, training_y_smote, cv=10, scoring='accuracy')
print("LR average accuracy on 10-fold cross validation: " + str(np.mean(lrScores)))
lrPredictions = lrMdl.predict(testing_x_pca)
print(lrPredictions)
print("LR accuracy on  test data:", accuracy_score(testing_y, lrPredictions))

LR average accuracy on 10-fold cross validation: 0.682608695652174
[1 0 0 1 0 0 1 0 0 0 1 0 0 1 0 0 0 1 1 0 1 0 0 1 0 0 0 1 1 0 0 0 0 0]
LR accuracy on  test data: 0.7647058823529411


Algorithm Comparision

--------------------------------------------------------------------

NO PREPROCESSING

KNN average accuracy on 10-fold cross validation: 0.8076923076923077
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
KNN accuracy on  test data: 0.8529411764705882

LR average accuracy on 10-fold cross validation: 0.7857142857142857
[0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0]
LR accuracy on  test data: 0.8235294117647058

---------------------------------------------------------------------

SMOTE NO PCA

KNN average accuracy on 10-fold cross validation: 0.7948616600790512
[0 0 0 1 0 0 1 0 0 0 1 0 0 1 1 0 0 1 0 0 1 0 1 0 0 0 0 1 0 1 0 0 0 0]
KNN accuracy on  test data: 0.7352941176470589

LR average accuracy on 10-fold cross validation: 0.7444664031620554
[0 0 0 1 0 0 1 0 0 0 0 0 1 0 0 0 1 0 1 0 1 0 0 1 0 0 0 1 0 0 0 0 0 0]
LR accuracy on  test data: 0.7941176470588235

---------------------------------------------------------------------

SMOTE WITH PCA

KNN average accuracy on 10-fold cross validation: 0.7988142292490119
[1 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 1 1 0 0 0 0 0]
KNN accuracy on  test data: 0.7647058823529411

LR average accuracy on 10-fold cross validation: 0.682608695652174
[1 0 0 1 0 0 1 0 0 0 1 0 0 1 0 0 0 1 1 0 1 0 0 1 0 0 0 1 1 0 0 0 0 0]
LR accuracy on  test data: 0.7647058823529411


--------------------------------------------------------------------

conclusions:
smote required to make valid predictions due to unbalanced classes
KNN performs better with PCA, but LR performs worse
however with PCA both perform the same

PCA gives consistant predictions between the two algos

In [11]:
#testing performing smote before PCA
training_x_smote, training_y_smote = oversample.fit_resample(training_x, training_y)
pca = PCA(n_components=5)
training_x_smote_pca = pca.fit_transform(training_x_smote)
testing_x_pca = pca.transform(testing_x)

explained_variance = pca.explained_variance_ratio_
print(explained_variance)

[0.53269619 0.15998769 0.06655798 0.05930242 0.03802557]


In [13]:
#knn model after smote then PCA
k = 4
knnMdl = KNeighborsClassifier(n_neighbors=k).fit(training_x_pca_smote, training_y_smote)
knnScores = cross_val_score(knnMdl, training_x_smote_pca, training_y_smote, cv=10, scoring='accuracy')
print("KNN average accuracy on 10-fold cross validation: " + str(np.mean(knnScores)))
knnPredictions = knnMdl.predict(testing_x_pca)
print(knnPredictions)
print("KNN accuracy on  test data:", accuracy_score(testing_y, knnPredictions))

KNN average accuracy on 10-fold cross validation: 0.7899209486166008
[0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.
 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
KNN accuracy on  test data: 0.7647058823529411


no change in results