## Creating Histograms

In [None]:
import os
import matplotlib.pyplot as plt
import numpy as np
import re
import pandas as pd

# ---- Configuration ---------------------------------------------------------
# Input CSV and the folder that receives the generated PNG files.
csv_file_path = r'C:\Users\Hungdever\Desktop\My_study\EPL\data\results.csv'
histo_output_dir = r'C:\Users\Hungdever\Desktop\My_study\EPL\Histograms'

# Statistics for which histograms are requested.
columns_to_plot = [
    'Exp_xG',
    'GnS_SCA90',
    'Shoot_G/Sh',
    'Defen_Tkl',
    'Defen_Blocks',
    'Defen_Int',
]

# ---- Load ------------------------------------------------------------------
stats_df = pd.read_csv(csv_file_path)

# Create the output folder only after the CSV was read successfully.
os.makedirs(histo_output_dir, exist_ok=True)
print(f"Histogram output directory: {histo_output_dir}")

# Numeric view of the whole table: every column goes through pd.to_numeric,
# and values that cannot be parsed are turned into NaN (errors='coerce').
stats_numeric_df = stats_df.apply(pd.to_numeric, errors='coerce')

# ---- Validate the requested columns ----------------------------------------
all_numeric_column_names = list(
    stats_numeric_df.select_dtypes(include=[np.number]).columns
)

# Keep the requested order; report every requested column that is not among
# the numeric columns of the coerced table.
valid_opted_columns = []
for col_name in columns_to_plot:
    if col_name not in all_numeric_column_names:
        print(f"WARNING: Column '{col_name}' not found.")
        continue
    valid_opted_columns.append(col_name)

def _sanitize_stat_name(stat_name):
    """Return `stat_name` with every character other than ASCII letters,
    digits and underscore replaced by '_', for use inside a file name."""
    return re.sub(r'[^a-zA-Z0-9_]', '_', stat_name)


def _sanitize_team_name(team_name):
    """Return a file-name-safe version of `team_name`.

    Whitespace, parentheses and the characters that are reserved in Windows
    file names are replaced by '_'; leading/trailing underscores are stripped
    and runs of underscores are collapsed to a single one.
    """
    cleaned = re.sub(r'[\n<>:"/\\|?*()\s]', '_', str(team_name)).strip('_')
    return re.sub(r'_+', '_', cleaned)


def _save_histogram(values, *, title, xlabel, out_path, bins, color,
                    error_context, figsize=(12, 7)):
    """Draw a histogram of `values` and write it to `out_path`.

    Parameters
    ----------
    values : pandas.Series
        Numeric, NaN-free data to plot. The caller must check it is non-empty.
    title, xlabel : str
        Figure title and x-axis label.
    out_path : str
        Destination image path.
    bins : int
        Number of histogram bins.
    color : str
        Bar fill colour.
    error_context : str
        Prefix of the message printed when writing the file fails.
    figsize : tuple
        Figure size in inches.

    Returns
    -------
    bool
        True when the file was written, False when saving failed (the error
        is printed, not raised, so one bad file does not stop the whole run).
    """
    fig, ax = plt.subplots(figsize=figsize)
    try:
        values.hist(ax=ax, bins=bins, color=color, edgecolor='black', alpha=0.75)
        ax.set_title(title, fontsize=15)
        ax.set_xlabel(xlabel, fontsize=12)
        ax.set_ylabel('Frequency', fontsize=12)
        ax.grid(axis='y', alpha=0.7)
        fig.tight_layout()
        try:
            fig.savefig(out_path)
        except Exception as save_error:
            # Best effort: report the failure and let the caller move on.
            print(f"{error_context}: {save_error}")
            return False
        return True
    finally:
        # Always release the figure, even when drawing or saving raised;
        # otherwise figures accumulate in memory across the loop.
        plt.close(fig)


if valid_opted_columns:
    # The presence of the 'team' column does not depend on the statistic,
    # so check (and warn) once instead of once per statistic.
    has_team_column = 'team' in stats_df.columns
    if has_team_column:
        team_labels = stats_df['team']
        team_count = team_labels.nunique()  # NaN team labels are not counted
    else:
        team_labels = None
        team_count = 0
        print("WARNING: Column 'team' not found")

    for stat_column_name in valid_opted_columns:
        print(f"\nProcessing statistic: {stat_column_name}")

        # Computed up front: both the all-players and the per-team file names
        # need it, regardless of whether the all-players plot is produced.
        sanitized_stat_name = _sanitize_stat_name(stat_column_name)
        stat_values = stats_numeric_df[stat_column_name]

        # Histogram for all players
        data_all_players = stat_values.dropna()
        if data_all_players.empty:
            print(f"No data for column '{stat_column_name}' (all players) after dropping NaN.")
        else:
            all_players_hist_path = os.path.join(
                histo_output_dir, f'all_players_{sanitized_stat_name}.png'
            )
            saved = _save_histogram(
                data_all_players,
                title=f'Distribution of {stat_column_name} - All Players',
                xlabel=stat_column_name,
                out_path=all_players_hist_path,
                bins=20,
                color='skyblue',
                error_context=f"Error when saving histogram for all players ({stat_column_name})",
            )
            if saved:
                print(f"Saved histogram for all players: {all_players_hist_path}")

        # Histograms for each team
        if not has_team_column:
            continue

        print(f"  Creating histograms for {team_count} teams...")

        # Group the already-coerced values by team label. sort=False keeps the
        # teams in order of first appearance; rows whose team label is NaN are
        # left out by groupby.
        for team_name, team_values in stat_values.groupby(team_labels, sort=False):
            team_stat_data_series = team_values.dropna()
            if team_stat_data_series.empty:
                continue

            sanitized_team_name = _sanitize_team_name(team_name)
            team_hist_path = os.path.join(
                histo_output_dir, f'{sanitized_team_name}_{sanitized_stat_name}.png'
            )
            _save_histogram(
                team_stat_data_series,
                title=f'Distribution of {stat_column_name} - Team: {team_name}',
                xlabel=stat_column_name,
                out_path=team_hist_path,
                bins=15,
                color='mediumseagreen',
                error_context=f"Error saving histogram for team {team_name} ({stat_column_name})",
            )

    print("\nDone!")



Successfully loaded data from: C:\Users\Hungdever\Desktop\My_study\EPL\data\results.csv
Histogram output directory: C:\Users\Hungdever\Desktop\My_study\EPL\Histograms

Processing statistic: Exp_xG
Saved histogram for all players: C:\Users\Hungdever\Desktop\My_study\EPL\Histograms\all_players_Exp_xG.png
  Creating histograms for 20 teams...

Processing statistic: GnS_SCA90
Saved histogram for all players: C:\Users\Hungdever\Desktop\My_study\EPL\Histograms\all_players_GnS_SCA90.png
  Creating histograms for 20 teams...

Processing statistic: Shoot_G/Sh
Saved histogram for all players: C:\Users\Hungdever\Desktop\My_study\EPL\Histograms\all_players_Shoot_G_Sh.png
  Creating histograms for 20 teams...

Processing statistic: Defen_Tkl
Saved histogram for all players: C:\Users\Hungdever\Desktop\My_study\EPL\Histograms\all_players_Defen_Tkl.png
  Creating histograms for 20 teams...

Processing statistic: Defen_Blocks
Saved histogram for all players: C:\Users\Hungdever\Desktop\My_study\EPL\Hist

# Made by Hung-dev-guy </Hng/>