In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge, RidgeCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import *
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from tqdm.notebook import *


**I. Chargement du jeu de données train**

In [None]:
# Load the training data.
# 'train.csv' is read relative to the notebook's working directory.

df=pd.read_csv('train.csv')
df.head()

**II. Exploration du jeu de données train**

In [None]:
df.info()

In [None]:
df.columns

In [None]:
df.shape

In [None]:
df.describe()

In [None]:
df['Target'].value_counts()

In [None]:
df['Target'].hist()

II.1 Corrélation

In [None]:
df.columns

In [None]:
# Columns kept for the correlation study: a hand-picked set of features plus
# the 'Target' column.
colonnes_corr = [
    'attempted_passes',
    'big_chances_missed',
    'target_missed',
    'winning_goals',
    'influence',
    'key_passes',
    'ict_index',
    'goals_conceded',
    'completed_passes',
    'creativity',
    'bps',
    'minutes',
    'saves',
    'Target',
]
choix_corr = df[colonnes_corr]

# df2 is the frame used by every correlation / visualisation cell below.
df2 = pd.DataFrame(choix_corr)
df2

In [None]:
df2.info()

In [None]:
#df_cat = df2.select_dtypes(['category']).columns
#df2[df_cat] = df2[df_cat].apply(lambda x: x.cat.codes)
#print(df_cat)

corr_matrix = df2.corr(method='spearman', min_periods=1) # Spearman (rank) correlation: the author judged it the most
                                                                    # relevant method for this data set.
print(corr_matrix)



In [None]:
# Diverging palette between hue 220 and hue 10, returned as a matplotlib colormap.
cmap = sns.diverging_palette(220, 10, as_cmap=True)

# Heatmap of the correlation matrix computed above.
matrix = sns.heatmap(
    corr_matrix,
    cmap=cmap,
    linewidths=.5,
    cbar_kws={"shrink": .5},
)
plt.show()

II.2 Visualisation

In [None]:
# One histogram per selected feature, drawn on a shared 10x10 figure.
colonnes_hist = [
    'attempted_passes',
    'big_chances_missed',
    'target_missed',
    'winning_goals',
    'influence',
]
df2[colonnes_hist].hist(figsize=(10, 10))

In [None]:
# Side-by-side box plots of the remaining selected features and the target.
colonnes_boxplot = [
    'goals_conceded',
    'completed_passes',
    'creativity',
    'bps',
    'minutes',
    'saves',
    'Target',
]
df2[colonnes_boxplot].boxplot(figsize=(10, 10))

**III. Chargement des données Test**

In [None]:

#DATA_PATH='prdiction-des-points/'

# Load the test data; 'test.csv' is read relative to the notebook's working directory.
df_test = pd.read_csv('test.csv')

**IV. Concaténation de train et de test**

In [None]:
# Stack train rows first, then test rows, so both go through the same
# preprocessing. ignore_index=True gives the combined frame a fresh 0..n-1
# index: without it the labels of `df` and `df_test` overlap, and any later
# label-based selection (.loc) on the combined frame would match rows from
# both sets. The train rows keep the labels 0..len(df)-1 either way.
df_pour_processing=pd.concat([df,df_test], ignore_index=True)

In [None]:
df_pour_processing

**V. Préprocessing**

V.1 Suppression des features dont la corrélation >0.90

In [None]:
# Keep only the strict upper triangle of the correlation matrix (k=1 excludes
# the diagonal); every other cell becomes NaN. The mask is built with the
# builtin `bool` dtype because the `np.bool` alias no longer exists in
# current NumPy releases.
masque_triangle_sup = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
upper_tri = corr_matrix.where(masque_triangle_sup)

In [None]:
# A column is flagged when at least one entry above the diagonal exceeds 0.90.
# NOTE(review): only positive correlations are tested (no absolute value) — confirm this is intended.
to_drop = []
for nom_colonne in upper_tri.columns:
    if (upper_tri[nom_colonne] > 0.90).any():
        to_drop.append(nom_colonne)

print()
print(to_drop)

In [None]:
# Remove the flagged columns from the combined train+test frame.
# Not idempotent: running this cell twice raises KeyError because the
# columns are already gone.
df_pour_processing = df_pour_processing.drop(columns=to_drop)

print()
print(df_pour_processing.head())

In [None]:
df_pour_processing.shape

V.2 Identification des variables catégorielles & numériques

In [None]:
df_cat = df_pour_processing.select_dtypes(include=['object'])
df_cat.columns

In [None]:
df_cat.isna().sum()

In [None]:
df_num = df_pour_processing.select_dtypes(include=['int', 'float'])
df_num.columns

In [None]:
# First version of the feature lists; they are redefined further down once
# some columns have been dropped.
FEATURES = []

# Hand-written list of numeric column names.
# NOTE(review): this list is not derived from the frame, so it may still name
# columns removed by the correlation filter — verify against df_num.columns.
NUMERICALS = ['sub_id', 'assists', 'attempted_passes', 'big_chances_created',
       'big_chances_missed', 'bonus', 'bps', 'clean_sheets',
       'clearances_blocks_interceptions', 'completed_passes', 'creativity',
       'dribbles', 'ea_index', 'element', 'errors_leading_to_goal',
       'errors_leading_to_goal_attempt', 'fixture', 'fouls', 'goals_conceded',
       'goals_scored', 'ict_index', 'influence', 'key_passes',
       'loaned_in', 'loaned_out', 'minutes', 'offside', 'open_play_crosses',
       'opponent_team', 'own_goals', 'penalties_conceded', 'penalties_missed',
       'penalties_saved', 'recoveries', 'red_cards', 'round', 'saves',
       'selected', 'tackled', 'tackles', 'target_missed', 'team_a_score',
       'team_h_score', 'threat', 'transfers_balance', 'transfers_in',
       'transfers_out', 'value', 'winning_goals', 'yellow_cards', 'GW', 'xP'
       ]
# Hand-written list of text (object dtype) column names.
CATEGORICALS=['name', 'kickoff_time', 'kickoff_time_formatted', 'position', 'team']

# Label column of the combined frame, captured before imputation.
Target = df_pour_processing['Target']

# Full feature list: numeric columns followed by categorical ones.
FEATURES = NUMERICALS + CATEGORICALS

V.3 Imputation des valeurs manquantes

In [None]:
# Share of missing cells in the combined frame, as a whole-number percentage.
missing_values = df_pour_processing.isnull().sum()        # missing count per column
total_missing_values = missing_values.sum()               # missing count overall
# DataFrame.size is rows * columns; it replaces np.product(shape), which was
# removed from NumPy 2.0.
total_values = df_pour_processing.size
percent_missing = round((total_missing_values/total_values)*100)
print(f'la part de valeurs manquantes est de {percent_missing}%')

In [None]:
df_pour_processing.isna().sum()

In [None]:
## Continuous variables ------> mean / median imputation
def median_method_1(num_var_filled):
    """Fill missing values of a first group of numeric columns, in place.

    ``attempted_passes`` is filled with its column mean; every other column
    handled here is filled with its column median. The statistics are
    computed on the frame that is passed in (earlier versions silently read a
    notebook-level global instead of the argument).

    Parameters
    ----------
    num_var_filled : pandas.DataFrame
        Frame to impute. It is modified in place and must contain every
        column listed below.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, after imputation.

    Raises
    ------
    KeyError
        If one of the expected columns is absent from the frame.
    """
    # Column imputed with the mean.
    colonnes_moyenne = ['attempted_passes']
    # Columns imputed with the median.
    colonnes_mediane = [
        'big_chances_created',
        'big_chances_missed',
        'clearances_blocks_interceptions',
        'dribbles',
        'errors_leading_to_goal',
        'fouls',
        'loaned_out',
        'open_play_crosses',
        'recoveries',
        'tackles',
        'team_a_score',
        'winning_goals',
        'xP',
    ]

    for colonne in colonnes_moyenne:
        valeur = num_var_filled[colonne].mean()
        num_var_filled[colonne] = num_var_filled[colonne].fillna(valeur)

    for colonne in colonnes_mediane:
        valeur = num_var_filled[colonne].median()
        num_var_filled[colonne] = num_var_filled[colonne].fillna(valeur)

    return num_var_filled


In [None]:
median_method_1(df_pour_processing)

In [None]:
df_pour_processing.isna().sum()

In [None]:
def median_method_2(num_var_filled2):
    """Fill missing values of a second group of columns with their median, in place.

    The statistics are computed on the frame that is passed in (earlier
    versions silently read a notebook-level global instead of the argument).

    Parameters
    ----------
    num_var_filled2 : pandas.DataFrame
        Frame to impute. It is modified in place and must contain every
        column listed below.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, after imputation.

    Raises
    ------
    KeyError
        If one of the expected columns is absent from the frame.
    """
    colonnes_mediane = [
        'ea_index',
        'errors_leading_to_goal_attempt',
        'key_passes',
        'loaned_in',
        'offside',
        'penalties_conceded',
        'tackled',
        'target_missed',
        'team_h_score',
        # NOTE(review): the label itself is imputed here, so any row without a
        # 'Target' value receives the median label — confirm this is wanted
        # before training on those rows.
        'Target',
    ]

    for colonne in colonnes_mediane:
        valeur = num_var_filled2[colonne].median()
        num_var_filled2[colonne] = num_var_filled2[colonne].fillna(valeur)

    return num_var_filled2


In [None]:
median_method_2(df_pour_processing)

In [None]:
df_pour_processing.isna().sum()

In [None]:
CATEGORICALS

In [None]:
sns.boxplot(data=df_pour_processing, x='team', y='Target', palette='PuBu').set_title('BoxPlot Target/team')

In [None]:
# NOTE(review): this cell is an exact duplicate of the preceding one (same x, y and title).
# Possibly another categorical column was meant for x — confirm, or remove the cell.
sns.boxplot(data=df_pour_processing, x='team', y='Target', palette='PuBu').set_title('BoxPlot Target/team')

In [None]:
# Scatter plot of Target against the formatted kick-off time.
ax = sns.scatterplot(x="kickoff_time_formatted", y="Target", data=df_pour_processing)

In [None]:
# Remove four columns from the working frame. Not idempotent: a second run
# raises KeyError because the columns no longer exist.
df_pour_processing = df_pour_processing.drop(['id', 'kickoff_time_formatted', 'position', 'team'], axis=1)

In [None]:
df_pour_processing.head()

In [None]:
# Share of missing cells remaining after imputation, as a whole-number percentage.
missing_values = df_pour_processing.isnull().sum()        # missing count per column
total_missing_values = missing_values.sum()               # missing count overall
# DataFrame.size is rows * columns; it replaces np.product(shape), which was
# removed from NumPy 2.0.
total_values = df_pour_processing.size
percent_missing = round((total_missing_values/total_values)*100)
print(f'la part de valeurs manquantes est de {percent_missing}%')

 Imputation des valeurs aberrantes

V.4 Mise à jour des variables catégorielles & numériques

In [None]:
df_cat = df_pour_processing.select_dtypes(include=['object'])
df_cat.columns

In [None]:
df_cat.isna().sum()

In [None]:
df_num = df_pour_processing.select_dtypes(include=['int', 'float'])
df_num.columns

In [None]:
# Second version of the feature lists, replacing the ones defined earlier.
FEATURES=[]

# Hand-written list of the numeric columns used as model inputs from here on.
NUMERICALS=['sub_id', 'assists', 'attempted_passes', 'big_chances_created',
       'big_chances_missed', 'bonus', 'clean_sheets',
       'clearances_blocks_interceptions', 'dribbles', 'ea_index', 'element',
       'errors_leading_to_goal', 'errors_leading_to_goal_attempt', 'fixture',
       'fouls', 'goals_conceded', 'goals_scored', 'key_passes', 'loaned_in',
       'loaned_out', 'offside', 'open_play_crosses', 'opponent_team',
       'own_goals', 'penalties_conceded', 'penalties_missed',
       'penalties_saved', 'recoveries', 'red_cards', 'round', 'saves',
       'selected', 'tackled', 'tackles', 'target_missed', 'team_a_score',
       'team_h_score', 'threat', 'transfers_balance', 'transfers_in',
       'transfers_out', 'value', 'winning_goals', 'yellow_cards', 'GW', 'xP'
       ]
# Text columns still present after the column drop above.
CATEGORICALS=['name', 'kickoff_time']

# Label column captured at this point, i.e. after the imputation cells have run.
Target=df_pour_processing['Target']

# Full feature list: numeric columns followed by categorical ones.
FEATURES = NUMERICALS + CATEGORICALS

V.5 Standardisation

In [None]:
#-------------------- Fit the scaler on a sample --------------------#

# The scaler is fitted on the first 1000 rows only. The transformed sample is
# computed once and reused for both statistics (the earlier version
# recomputed it three times and never displayed the means).
echantillon = df_pour_processing[NUMERICALS].head(1000)
scaler = preprocessing.StandardScaler().fit(echantillon)
echantillon_standardise = scaler.transform(echantillon)

print("Moyenne de chaque variable sur 1000 premières observations")
print(echantillon_standardise.mean(axis=0))
print("Ecart-type de chaque variable sur 1000 premières observations")
print(echantillon_standardise.std(axis=0))

In [None]:
#-------------------- Apply the fitted scaler to the remaining rows --------------------#

# X1: the 1000 rows the scaler was fitted on; X2: every row after them.
X1 = scaler.transform(df_pour_processing[NUMERICALS].head(1000))
X2 = scaler.transform(df_pour_processing[NUMERICALS].iloc[1000:])
# Position of 'attempted_passes' among the scaled columns (the transformed
# arrays have no column names).
col_pos = df_pour_processing[NUMERICALS].columns.get_loc("attempted_passes")

# Top panel: fitted sample; bottom panel: remaining rows.
# sns.distplot is deprecated; histplot with a KDE overlay on a density scale
# is the replacement recommended by seaborn.
f, axes = plt.subplots(2, figsize=(10, 10))
sns.histplot(X1[:, col_pos], kde=True, stat="density", kde_kws=dict(cut=3),
             color="skyblue", ax=axes[0])
sns.histplot(X2[:, col_pos], kde=True, stat="density", kde_kws=dict(cut=3),
             color="olive", ax=axes[1])
plt.savefig('standardisation.png', bbox_inches='tight')

V.6 Normalisation

In [None]:
# First 1000 complete rows of the numeric features.
echantillon_complet = df_pour_processing[NUMERICALS].dropna(how="any").head(1000)

# Note: this rebinds `scaler` (previously the StandardScaler) and `X1`.
scaler = preprocessing.Normalizer().fit(echantillon_complet)
X1 = scaler.transform(echantillon_complet)

# Top panel: raw 'attempted_passes'; bottom panel: the same column after
# normalisation. `col_pos` comes from the standardisation cell above.
# sns.distplot is deprecated; histplot with a KDE overlay on a density scale
# is the replacement recommended by seaborn.
f, axes = plt.subplots(2, figsize=(10, 10))
sns.histplot(df_pour_processing["attempted_passes"], kde=True, stat="density",
             kde_kws=dict(cut=3), color="skyblue", ax=axes[0])
sns.histplot(X1[:, col_pos], kde=True, stat="density", kde_kws=dict(cut=3),
             color="olive", ax=axes[1])
plt.savefig('normalisation.png', bbox_inches='tight')

In [None]:
# Check that the L2 norm of each normalised row equals 1 (first five rows shown).
np.sqrt(np.sum(X1**2, axis=1))[:5] # L2-norm

V.7 Transformation des variables catégorielles

In [None]:
# One-hot encode the columns currently listed in CATEGORICALS, then show five random rows.
# NOTE(review): the encoded columns may have many distinct values, which would
# make this frame very wide — confirm the resulting column count is acceptable.
data_dummies= pd.get_dummies(df_pour_processing[CATEGORICALS].astype('category'))
data_dummies.sample(5)

In [None]:
data_dummies.columns

In [None]:
# Rebinds CATEGORICALS from the list of raw column names to the Index of
# dummy column names. After this cell, re-running the get_dummies cell would
# use the dummy names instead of the raw ones.
CATEGORICALS = data_dummies.columns
len(CATEGORICALS)

In [None]:
# Add every dummy column to the working frame. The raw text columns they were
# built from are not removed by this cell.
df_pour_processing[CATEGORICALS] = data_dummies[CATEGORICALS]

In [None]:
FEATURES = list(CATEGORICALS) + NUMERICALS

In [None]:
FEATURES

In [None]:
df_pour_processing.sample(2)

**VI. Stratégie de validation**

In [None]:
# This first expression is evaluated but not displayed (only a cell's last expression is shown).
df.shape
# df_preprocesse is a second name for the same object as df_pour_processing; no copy is made.
df_preprocesse=df_pour_processing
#Target= df_preprocesse['Target']
# Write the previously captured label column back into the frame.
df_preprocesse['Target']=Target
df_preprocesse.shape


In [None]:
df_pour_processing

In [None]:
#df_preprocesse['Target']

In [None]:
# The preprocessed frame holds the train rows first, then the test rows, so
# both parts are recovered by position from the number of train rows.
# Slicing from n_train onwards (rather than with a negative offset) stays
# correct even when the test part is empty, where `iloc[-0:]` would return
# the whole frame.
n_train = df.shape[0]

# .copy() makes df_train an independent frame, so the column added to it
# later does not write into a slice of df_preprocesse.
df_train = df_preprocesse.iloc[:n_train, :].copy()
df_test = df_preprocesse.iloc[n_train:, :]

In [None]:
df_test

In [None]:
# Remove the label column from the test part. Not idempotent: a second run
# raises KeyError because 'Target' is already gone.
df_test=df_test.drop(["Target"], axis=1)
df_test.head()

In [None]:
NFOLD = 4

# Shuffled K-fold with a fixed seed so the fold assignment is reproducible.
kf = KFold(n_splits=NFOLD,shuffle=True,random_state=2021)
# KFold only needs the number of rows, so the frame is passed as is.
split = list(kf.split(df_train))

# KFold yields row POSITIONS. They are written into a positional array
# instead of going through label-based .loc, so the result does not depend on
# the index labels of df_train.
fold_ids = np.zeros(len(df_train), dtype=int)
for numero_fold, (_, positions_validation) in enumerate(split, start=1):
    fold_ids[positions_validation] = numero_fold

# assign() returns a new frame, which avoids writing into a slice of the
# preprocessed frame and makes the cell safe to re-run.
df_train = df_train.assign(fold=fold_ids)

In [None]:
df_train['fold'].value_counts()

**VII. Modélisation**

RandomForestRegressor

In [None]:

# Cross-validated random forest: one model per fold, scored with MAE.
features = FEATURES
model_rf = []        # fitted model of each fold
MAE_scores_rf = []   # validation MAE of each fold

for fold in range(1, NFOLD + 1):
    print(f'\n ---------------- Fold {fold} ------------\n')

    # Rows tagged with the current fold are held out; all others are used for fitting.
    est_validation = df_train.fold == fold
    dval = df_train[est_validation]
    dtrain = df_train[~est_validation]

    print(f" -------------- Training on {len(dtrain)} samples-------------- ")
    print(f" -------------- Validation on {len(dval)} samples-------------- ")

    # Fit the forest on the training part and keep it.
    clf = RandomForestRegressor(n_estimators=10, random_state=1)
    clf.fit(dtrain[features], dtrain['Target'])
    model_rf.append(clf)

    # Mean absolute error on both parts, to compare fit and generalisation.
    mae_apprentissage = mean_absolute_error(dtrain['Target'], clf.predict(dtrain[features]))
    mae_validation = mean_absolute_error(dval['Target'], clf.predict(dval[features]))

    print(f"MAE rf Train: {mae_apprentissage}  ; MAE rf VAL : {mae_validation}")

    MAE_scores_rf.append(mae_validation)

In [None]:
# Pick the fold model with the lowest validation MAE.
min_scores = min(MAE_scores_rf)
best_model = RandomForestRegressor()  # placeholder, replaced below

c = 0
while c < len(MAE_scores_rf):
    mae_scores = MAE_scores_rf[c]
    if mae_scores != min_scores:
        print("pas le meilleur modèle")
    else:
        # Index of the winning fold model is printed.
        best_model = model_rf[c]
        print(c)
    c += 1


In [None]:
#---------- Model tuning ----------#

parameters = {
    'n_estimators': [5, 10],
    #'max_depth': [2,4],
}
regr = RandomForestRegressor(random_state=0)

# The search runs on the whole training frame with the notebook's own K-fold
# splitter and metric (MAE). The earlier version fitted on `dtrain`, i.e.
# whatever subset the last executed fold loop left behind, and ranked
# candidates with the default R² score.
# With 'neg_mean_absolute_error', best_score_ is the negated MAE (closer to 0 is better).
clf = GridSearchCV(regr, parameters, scoring='neg_mean_absolute_error', cv=kf)
clf.fit(df_train[features], df_train['Target'])

In [None]:
# Report the outcome of the grid search: refitted estimator, its
# cross-validated score (in the scoring the search was configured with), and
# the winning parameter combination.
print(" Résultat du Grid Search ")
for libelle, valeur in (
    ("\n Le nombre d'estimators optimal:\n", clf.best_estimator_),
    ("\n Le meilleur score obtenu:\n", clf.best_score_),
    ("\n Les meilleurs paramètres:\n", clf.best_params_),
):
    print(libelle, valeur)

In [None]:
# Train the random forest with the hyper-parameters selected by the grid search.

# Read the winning parameters from the fitted GridSearchCV (`clf`). The
# earlier version hard-coded n_estimators=10 regardless of the search result.
best_rf_params = dict(clf.best_params_)

model_rf = []        # fitted model of each fold
MAE_scores_rf = []   # validation MAE of each fold

features = FEATURES

for fold in range(1,NFOLD+1):
    print(f'\n ---------------- Fold {fold} ------------\n')

    # Rows tagged with the current fold are held out; all others are used for fitting.
    dtrain = df_train[df_train.fold!=fold]
    dval = df_train[df_train.fold==fold]

    print(f" -------------- Training on {len(dtrain)} samples-------------- ")
    print(f" -------------- Validation on {len(dval)} samples-------------- ")

    # A dedicated name is used for the fold model so that `clf` keeps pointing
    # at the grid search and this cell can be re-run.
    rf_fold = RandomForestRegressor(random_state=0, **best_rf_params)
    rf_fold.fit(dtrain[features], dtrain['Target'])

    # Keep the fitted model
    model_rf.append(rf_fold)

    # Predict on the validation and training parts
    y_pred_val_rf = rf_fold.predict(dval[features])
    y_pred_train_rf = rf_fold.predict(dtrain[features])

    # Performance metric: mean absolute error
    mae_train_rf = mean_absolute_error(dtrain['Target'],y_pred_train_rf)
    mae_val_rf = mean_absolute_error(dval['Target'],y_pred_val_rf)

    print(f"MAE rf Train: {mae_train_rf}  ; MAE rf VAL : {mae_val_rf}")

    MAE_scores_rf.append(mae_val_rf)

In [None]:
# Select the fold model with the lowest validation MAE.
min_scores = min(MAE_scores_rf)
c = MAE_scores_rf.index(min_scores)   # position of the best fold model

# Grid_rf is the name used by the following cells; best_model_rf now refers
# to the same fitted model instead of staying an unfitted placeholder.
Grid_rf = model_rf[c]
best_model_rf = Grid_rf
print(c)

Variables importance

In [None]:
features_names=Grid_rf.feature_names_in_
len(features_names)

In [None]:
importances = Grid_rf.feature_importances_
# Spread of each feature's importance across the individual trees.
std = np.std([tree.feature_importances_ for tree in Grid_rf.estimators_], axis=0)

# Keep only the features the forest actually used, most important first.
# The earlier version referenced `sort_values` without calling it, so nothing
# was sorted or displayed.
est_utilisee = importances > 0
forest_importances = pd.Series(
    importances[est_utilisee], index=features_names[est_utilisee]
).sort_values(ascending=False)
forest_importances
# Plotting code kept for reference. If re-enabled, `std[importances>0]` must be
# reordered to match the sorted series before being used as error bars.
#fig, ax = plt.subplots()
#forest_importances.plot.bar(yerr=std[importances>0], ax=ax)
#ax.set_title("Feature importances using MDI")
#ax.set_ylabel("Mean decrease in impurity")

#fig.tight_layout()

In [None]:
len(forest_importances)

**Ridge Regression**

In [None]:
MAE_scores_Ridge=[]   # validation MAE of every (fold, alpha) run, in run order

df_resultat_Ridge= [] # fitted model of every (fold, alpha) run, same order

features = FEATURES

# Penalty grid, built once: np.arange(0, 1, 0.5) gives alpha = 0 and 0.5.
# NOTE(review): the scikit-learn documentation advises against alpha=0 with
# Ridge for numerical reasons — consider a strictly positive grid.
n_alphas = 1
alphas = np.arange(0,n_alphas,0.5)

for fold in range(1,NFOLD+1):
    print(f'\n ---------------- Fold {fold} ------------\n')

    # Rows tagged with the current fold are held out; all others are used for fitting.
    dtrain = df_train[df_train.fold!=fold]
    dval = df_train[df_train.fold==fold]

    print(f" -------------- Training on {len(dtrain)} samples-------------- ")
    print(f" -------------- Validation on {len(dval)} samples-------------- ")

    for alpha in tqdm_notebook(alphas):
        # The same `features` list is used for fitting and predicting.
        reg_Ridge = Ridge(alpha=alpha).fit(dtrain[features],dtrain['Target'])

        # Predictions on the validation and training parts
        y_pred_val_Ridge = reg_Ridge.predict(dval[features])
        y_pred_train_Ridge = reg_Ridge.predict(dtrain[features])
        df_resultat_Ridge.append(reg_Ridge)

        # Performance metric: mean absolute error
        mae_train_Ridge = mean_absolute_error(dtrain['Target'],y_pred_train_Ridge)
        mae_val_Ridge= mean_absolute_error(dval['Target'],y_pred_val_Ridge)

        # Label corrected: these are Ridge scores, not random-forest ones.
        print(f"MAE Ridge Train: {mae_train_Ridge}  ; MAE Ridge VAL : {mae_val_Ridge}")

        MAE_scores_Ridge.append(mae_val_Ridge)



In [None]:
# Select the Ridge run with the lowest validation MAE.
min_scores = min(MAE_scores_Ridge)
c = MAE_scores_Ridge.index(min_scores)   # position of the best (fold, alpha) run

# best_Ridge is the name used by the following cells; best_model_Ridge now
# refers to the same fitted model instead of staying an unfitted placeholder.
best_Ridge = df_resultat_Ridge[c]
best_model_Ridge = best_Ridge
print(c)


Variable name

In [None]:
features_name_ridge =best_Ridge.feature_names_in_
features_name_ridge

In [None]:
best_Ridge.n_features_in_

**VIII. Prédiction sur Test**

RandomForestRegressor

In [None]:
# Predict the test rows with the selected random-forest model and display the predictions.
y_test_pred_rf = Grid_rf.predict(df_test[FEATURES])
y_test_pred_rf

In [None]:
df_test['target'] = y_test_pred_rf
df_test['target']

In [None]:
df_test[["sub_id","target"]].to_csv('submssion_RandomForest2.csv',index=False)

**Ridge Regression**

In [None]:
y_test_pred_Ridge = best_Ridge.predict(df_test[FEATURES])
y_test_pred_Ridge

In [None]:
best_Ridge

In [None]:
# Feature names seen by the selected Ridge model during fit.
# They are read from the fitted estimator: the prediction vector is a plain
# NumPy array and has no get_feature_names_out() method.
features_names = best_Ridge.feature_names_in_
features_names

In [None]:
df_test['target'] = y_test_pred_Ridge
df_test['target']

In [None]:
df_test[["sub_id","target"]].to_csv('submssion_Ridge-Regression2.csv',index=False)

In [None]:
#mean_absolute_error(y_test, y_test_pred)