In [701]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder

Load and merge the datasets

In [702]:
# Per-position stats files to combine into a single player table.
file_paths = ["../../data/Stats_csv/Processed_Forward_data.csv",
              "../../data/Stats_csv/Processed_Midfielder_data.csv"]

# One DataFrame per input file, in the same order as `file_paths`.
dfs = [pd.read_csv(path) for path in file_paths]

# Keep only the columns that every file provides. The membership test uses a
# set, but the final list follows the column order of the first file so the
# merged frame's layout is identical on every run (iterating a set of strings
# is not guaranteed to give the same order between interpreter sessions).
shared_columns = set(dfs[0].columns).intersection(*(frame.columns for frame in dfs[1:]))
common_columns = [col for col in dfs[0].columns if col in shared_columns]

# Stack the files on their shared columns with a fresh 0..n-1 index.
df = pd.concat([frame[common_columns] for frame in dfs], ignore_index=True)

# Drop players with no appearances. `.copy()` makes `df` an independent frame
# so that later column assignments do not trigger SettingWithCopyWarning.
df = df.loc[df['Appearances'] != 0].copy()


Data cleaning

In [703]:
# Replace each text-valued categorical column with integer codes.
# A separate LabelEncoder is fitted per column, so the codes of one column are
# unrelated to the codes of another.
categorical_columns = ('Position', 'Nationality', 'Club')
for name in categorical_columns:
    encoder = LabelEncoder()
    df[name] = encoder.fit_transform(df[name])

# Show the available columns as the cell output.
df.columns

Index(['Goals with right foot', 'Crosses', 'Height', 'Penalties scored',
       'Shooting accuracy %', 'Big chances missed', 'Yellow cards',
       'Blocked shots', 'Assists', 'Nationality', 'Freekicks scored', 'Shots',
       'Offsides', 'Headed Clearance', 'Appearances', 'Position',
       'Interceptions', 'Headed goals', 'Goals with left foot', 'Passes',
       'Clearances', 'Hit woodwork', 'Tackles', 'Red cards', 'Date of Birth',
       'Name', 'Goals', 'Losses', 'Club', 'Big Chances Created', 'Fouls',
       'Wins', 'Goals per match', 'Passes per match', 'Shots on target'],
      dtype='object')

In [704]:
# Predictor columns for the top-scorer classifier.
#
# 'Goals' is intentionally NOT a feature: the target below is computed directly
# from 'Goals', so feeding it to the model leaks the answer and makes any
# reported score meaningless.
# NOTE(review): the per-type goal counts kept here (right foot, left foot,
# headed, free kicks, penalties) presumably still add up to something close to
# 'Goals' - confirm whether they should be excluded as well.
features = [
    'Goals with right foot',
    'Goals with left foot',
    'Headed goals',
    'Freekicks scored',
    'Penalties scored',
    'Shots on target',
    'Big Chances Created',
    'Offsides',
    'Appearances',
    'Height',
    'Wins',
    'Position',
]

# Label a player 1 only when their goal tally equals the maximum in the data,
# and 0 otherwise.
# NOTE(review): this yields very few positive rows (only players tied for the
# maximum), so a random hold-out split can easily contain no positives at all.
max_goals = df['Goals'].max()
df['TopScorer'] = (df['Goals'] == max_goals).astype(int)
target = 'TopScorer'

# Design matrix and label vector used by the cells that follow.
X = df[features]
y = df[target]

Split data

In [705]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [706]:
# Fit the scaler on the training rows only, then apply the same transformation
# to both splits so no information from the test rows influences the scaling.
# NOTE(review): no cell visible here consumes the scaled arrays - confirm
# whether another cell still needs them.
scaler = StandardScaler().fit(X_train)
X_train_scaler = scaler.transform(X_train)
X_test_scaler = scaler.transform(X_test)

In [707]:
# Train a random forest on the unscaled training features; the fixed seed makes
# the fitted forest reproducible.
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# Evaluate on the held-out rows.
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

# The label names the model that was actually fitted above (a random forest,
# not logistic regression).
print("Prediction Accuracy (Random Forest):", accuracy)
print(classification_report(y_test, y_pred))

Prediction Accuracy (Logistic Regression): 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        75

    accuracy                           1.00        75
   macro avg       1.00      1.00      1.00        75
weighted avg       1.00      1.00      1.00        75



In [709]:
# Score every row in `X` (this includes the rows the model was trained on, so
# these are in-sample probabilities for those rows).
# The probability column for label 1 is looked up through `clf.classes_`
# instead of assuming it sits at position 1: if the training split happened to
# contain a single class, `predict_proba` returns one column and `[:, 1]`
# would raise IndexError.
class_labels = list(clf.classes_)
if 1 in class_labels:
    df['TopScorer_Probability'] = clf.predict_proba(X)[:, class_labels.index(1)]
else:
    # The model never saw a positive example, so it cannot assign it any mass.
    df['TopScorer_Probability'] = 0.0

# Build the export column list without repeats: `features` may itself contain
# 'Goals', which would otherwise appear twice in the output file.
# dict.fromkeys keeps the first occurrence of each name and preserves order.
report_columns = list(dict.fromkeys(['Name', 'TopScorer_Probability', 'Goals'] + features))

# Most likely top scorers first.
result = df[report_columns].sort_values(by='TopScorer_Probability', ascending=False)
result.to_csv('../../data/Predict/players_top_scorer_probabilities.csv')