In [None]:
import pandas as pd
from io import StringIO
# Show every column when a DataFrame is displayed (None removes the column limit)
pd.set_option('display.max_columns', None)

In [None]:
# Load the merged player dataset; the path is relative to the working directory.
# low_memory=False makes pandas process the file in one pass instead of in chunks,
# which per the pandas docs avoids mixed-type inference within a column.
# NOTE(review): the file name suggests per-100-possession stats — confirm.
df_100pos = pd.read_csv('DataFiles/merged_df_100pos.csv', low_memory=False)

In [None]:
# Converter function for height
def feet_to_meters(height):
    """Convert a '<feet>-<inches>' height string (e.g. '6-7') to meters.

    Parameters
    ----------
    height : object
        Height formatted as '<feet>-<inches>' with integer parts. Any other
        value (NaN, None, numbers, malformed strings) is treated as missing.

    Returns
    -------
    float or None
        The height in meters rounded to 2 decimals, or None when ``height``
        is not a string or cannot be parsed as two integers separated by '-'.
    """
    if not isinstance(height, str):
        return None  # missing values arrive as NaN/None rather than str

    try:
        # ValueError covers both a non-integer part and a wrong number of parts
        feet, inches = map(int, height.split('-'))
    except ValueError:
        return None

    total_feet = feet + inches / 12  # inches expressed as a fraction of a foot
    return round(total_feet * 0.3048, 2)  # 1 ft = 0.3048 m

In [None]:
# Add PLAYER_HEIGHT_METERS by running feet_to_meters over every HEIGHT value;
# entries that are not strings (e.g. missing heights) come back as None.
df_100pos['PLAYER_HEIGHT_METERS'] = df_100pos['HEIGHT'].apply(feet_to_meters)

In [None]:
# Adding additional advanced metric columns

def _safe_ratio(numerator, denominator):
    """Return numerator / denominator element-wise, with NaN where the denominator is 0.

    Plain Series division yields +/-inf for x / 0 (e.g. assists with zero
    turnovers), which distorts later mean / std computations; NaN is skipped
    by pandas aggregations instead.
    """
    return numerator / denominator.where(denominator != 0)


# Shooting efficiency
df_100pos['FG3A%'] = _safe_ratio(df_100pos['FG3A'], df_100pos['FGA'])      # share of attempts that are threes
df_100pos['PTS/FGA'] = _safe_ratio(df_100pos['PTS'], df_100pos['FGA'])     # points per field-goal attempt
df_100pos['FG3M/FGM'] = _safe_ratio(df_100pos['FG3M'], df_100pos['FGM'])   # share of made shots that are threes
df_100pos['FTA/FGA'] = _safe_ratio(df_100pos['FTA'], df_100pos['FGA'])     # free-throw rate

# True Shooting Percentage: PTS / (2 * (FGA + k * FTA))
# NOTE(review): the free-throw weight k is 0.475 here, while the commonly
# published formula uses 0.44 — confirm that 0.475 is intentional.
df_100pos['TRU%'] = _safe_ratio(0.5 * df_100pos['PTS'], df_100pos['FGA'] + 0.475 * df_100pos['FTA'])

# Assists to Turnover Ratio
df_100pos['AST_TOV'] = _safe_ratio(df_100pos['AST'], df_100pos['TOV'])

# Spot-check: display the rows of one player (PLAYER_ID 1629029) with the new columns
df_100pos[(df_100pos['PLAYER_ID'] == 1629029)]

In [None]:
# Collapse hybrid positions onto their first-listed position, leaving three values: C, F and G.
# One exact-match mapping is used for all four rules; chaining a substring
# `.str.replace` with whole-value `Series.replace` calls applied the rules in
# two different ways and was easy to misread.
df_100pos['POSITION'] = df_100pos['POSITION'].replace(
    {'F-C': 'F', 'C-F': 'C', 'F-G': 'F', 'G-F': 'G'}
)

In [None]:
# Countries counted as European for this analysis (broad basketball sense:
# the list also contains e.g. Turkey, Israel and the Caucasus states).
europe = [
    'Austria', 'Belgium', 'Bulgaria', 'Croatia', 'Cyprus',
    'Czech Republic', 'Denmark', 'Estonia', 'Finland', 'France',
    'Germany', 'Greece', 'Hungary', 'Ireland', 'Italy',
    'Latvia', 'Lithuania', 'Luxembourg', 'Malta', 'Netherlands',
    'Poland', 'Portugal', 'Romania', 'Slovakia', 'Slovenia',
    'Spain', 'Sweden', 'United Kingdom', 'Norway', 'Switzerland',
    'Belarus', 'Russia', 'Ukraine', 'Moldova', 'Georgia',
    'Armenia', 'Azerbaijan', 'Turkey', 'Israel', 'Albania',
    'North Macedonia', 'Serbia', 'Montenegro', 'Bosnia and Herzegovina', 'Iceland',
]

In [None]:
# Keep only the rows that belong to one of the listed (European) countries
def filter_europe(df, column_name, europe):
    """Return the rows of ``df`` whose ``column_name`` value appears in ``europe``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter.
    column_name : str
        Column holding the country of each row.
    europe : list-like
        Country names to keep.

    Returns
    -------
    pandas.DataFrame
        The matching rows, with their original index labels.
    """
    is_listed = df[column_name].isin(europe)
    return df[is_listed]

In [None]:
# New object for European players playing at F.
# `.copy()` makes europe_F an independent frame: a later cell adds a column to
# it, which on a bare slice of df_100pos triggers SettingWithCopyWarning.
europe_F = filter_europe(df_100pos, 'COUNTRY', europe)
europe_F = europe_F.loc[europe_F['POSITION'].isin(['F'])].copy()

In [None]:
# Define function for non-european countries
def filter_non_europe(df, column_name='COUNTRY', country_list=europe):
    """Return the rows of ``df`` whose ``column_name`` value is NOT in ``country_list``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter.
    column_name : str, default 'COUNTRY'
        Column holding the country of each row.
    country_list : list-like, default ``europe``
        Countries to exclude. This argument is now honoured; the body used to
        read the global ``europe`` regardless of what was passed.

    Returns
    -------
    pandas.DataFrame
        Rows not matching any listed country. Rows with a missing value in
        ``column_name`` are kept, because ``isin`` reports False for them.
    """
    return df[~df[column_name].isin(country_list)]

In [None]:
# New object for Non-europe players playing at F in regular season games.
# `.copy()` makes non_europe_F an independent frame: a later cell adds a column
# to it, which on a bare slice of df_100pos triggers SettingWithCopyWarning.
non_europe_F = filter_non_europe(df_100pos)
non_europe_F = non_europe_F.loc[non_europe_F['POSITION'].isin(['F'])].copy()

In [None]:
# Count the distinct PLAYER_IDs among European forwards, per POSITION
# (europe_F was restricted to 'F' above, so a single group is expected).
# NOTE(review): the original note says the data covers regular seasons since
# 1996 — that cannot be verified from this notebook.
europe_F.groupby('POSITION')['PLAYER_ID'].nunique()

In [None]:
# Count the distinct PLAYER_IDs among non-European forwards, per POSITION
# (non_europe_F was restricted to 'F' above, so a single group is expected).
# NOTE(review): the original note says the data covers regular seasons since
# 1996 — that cannot be verified from this notebook.
non_europe_F.groupby('POSITION')['PLAYER_ID'].nunique()

In [None]:
# Checking for anomalies by comparing median metrics between European and Non-European forwards:

import altair as alt
import pandas as pd
import altair_saver


# Tag every row with its group. `assign` returns a new frame, so the filtered
# frames are not written to in place (no SettingWithCopyWarning) and the cell
# can be re-run safely.
europe_F = europe_F.assign(Europe='European')
non_europe_F = non_europe_F.assign(Europe='Non-European')

# Combine the two frames
combined_df = pd.concat([europe_F, non_europe_F])

# Metrics to plot
metrics = ['FGM', 'FGA', 'FG3M', 'FG3A', 'FTM', 'FTA', 
           'OREB', 'DREB', 'REB', 'AST', 'STL', 'BLK', 'TOV', 
           'PF', 'PTS', 'FG3A%', 'PTS/FGA', 'FG3M/FGM', 'FTA/FGA', 
           'TRU%', 'AST_TOV']

# Collected charts, one per metric
charts = []

# Plot only a fraction of the combined frame (adjust `frac` if needed).
# The fixed seed makes the sample, and therefore the charts, reproducible.
sampled_df = combined_df.sample(frac=0.5, random_state=42)

# One bar chart per metric. The y encoding aggregates with `median`, so the
# axis and chart titles say "Median" (they previously claimed "Average").
for metric in metrics:
    chart = alt.Chart(sampled_df).mark_bar().encode(
        x=alt.X('Europe:N', title='Europe'),
        y=alt.Y(f'median({metric}):Q', title=f'Median {metric}'),
        color=alt.Color('Europe:N', legend=alt.Legend(title='Europe'))
    ).properties(
        title=f'Median {metric} by Europe',
        width=400,
        height=300
    )
    charts.append(chart)

# Lay the charts out in rows of four
alt_row = alt.vconcat(*[alt.hconcat(*charts[i:i+4]) for i in range(0, len(charts), 4)])

# Display the charts
alt_row


In [None]:
# Heatmap of the pairwise correlations between selected metrics
import altair as alt
import pandas as pd

# Columns that take part in the correlation analysis
selected_columns = [
    'PLAYER_HEIGHT_METERS',
    'FGM', 'FGA', 'FG3M', 'FG3A', 'FTM', 'FTA',
    'OREB', 'DREB', 'REB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS',
    'FG3M/FGM', 'TRU%',
]

# Square matrix of pairwise correlations (pandas' default method)
correlation_matrix = df_100pos.loc[:, selected_columns].corr()

# Long ("tidy") layout: one row per (variable1, variable2) pair
correlation_tidy = (
    correlation_matrix
    .stack()
    .rename_axis(['variable1', 'variable2'])
    .reset_index(name='correlation')
)

# One rectangle per pair, coloured by the correlation value
heatmap = (
    alt.Chart(correlation_tidy)
    .mark_rect()
    .encode(
        x=alt.X('variable1:O'),
        y=alt.Y('variable2:O'),
        color=alt.Color('correlation:Q'),
    )
    .properties(title='Correlation Heatmap')
)

# Last expression of the cell, so the chart is rendered
heatmap

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Metric whose per-player level and consistency are compared.
# The variable names, the plot title and the axis label of this cell all
# describe the FG3M/FGM ratio, so that column is aggregated (the raw 'FG3M'
# column was aggregated before, contradicting every label).
# NOTE(review): confirm that the ratio is the intended metric.
ratio_col = 'FG3M/FGM'

# Per-player mean and standard deviation of the ratio for European forwards
avg_fg3m_fgm_europe = europe_F.groupby('PLAYER_ID')[ratio_col].agg(['mean', 'std'])

# Same aggregation for non-European forwards
# (the variable keeps its historical "usa" name; the frame is every non-European country)
avg_fg3m_fgm_usa = non_europe_F.groupby('PLAYER_ID')[ratio_col].agg(['mean', 'std'])

# Create a scatter plot
plt.figure(figsize=(10, 6))

# European forwards; players with a single row have std = NaN and are not drawn
plt.scatter(avg_fg3m_fgm_europe['mean'], avg_fg3m_fgm_europe['std'], label='Europe Regular', color='blue')

# Non-European forwards, drawn semi-transparent
plt.scatter(avg_fg3m_fgm_usa['mean'], avg_fg3m_fgm_usa['std'], label='Non-Europe Regular', color='red', alpha=0.3)

# Add labels and title
plt.xlabel(f'Average {ratio_col}')
plt.ylabel('Standard Deviation')
plt.title('Scatter Plot of Average FG3M/FGM Ratio vs. Standard Deviation (Grouped by PLAYER_ID)')
plt.legend()

# Show plot
plt.grid(True)
plt.show()


# Mean of the per-player standard deviations in each group
std_europe = avg_fg3m_fgm_europe['std'].mean()
std_usa = avg_fg3m_fgm_usa['std'].mean()

print("Mean per-player standard deviation for Europe:", std_europe)
print("Mean per-player standard deviation for Non-Europe:", std_usa)

In [None]:
## Bar chart of forwards per country since 1996

import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.ticker as ticker
import numpy as np

# Number of distinct European forwards (by PLAYER_NAME) per country, largest first
above_avg_F = (
    europe_F
    .groupby('COUNTRY')['PLAYER_NAME']
    .nunique()
    .reset_index()
    .sort_values(by='PLAYER_NAME', ascending=False)
)

fig, ax = plt.subplots(figsize=(10, 6))

# One bar per country, drawn on the axes created above
sns.barplot(data=above_avg_F, x='COUNTRY', y="PLAYER_NAME", palette='rocket', linewidth=2, ax=ax)

# Rotate the country labels so that many countries stay readable
plt.setp(ax.get_xticklabels(), rotation=80)

ax.set_title("European Forwards in NBA since 1996", weight='bold', fontsize=16)
ax.set_xlabel("Country", rotation=0, labelpad=20, weight='bold', fontsize=12)
ax.set_ylabel("Count", rotation=0, labelpad=20, weight='bold', fontsize=12)
ax.set_ylim(bottom=0)  # start the y-axis at zero

# Player counts are whole numbers, so only integer ticks are shown
ax.yaxis.set_major_locator(ticker.MaxNLocator(integer=True))

# Dashed horizontal guide lines
ax.grid(axis='y', linestyle='--', alpha=0.7)

fig.tight_layout()  # keep the rotated labels inside the figure

plt.show()  # render the figure

In [None]:
## Bar chart of forwards with above average stats

import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.ticker as ticker
import numpy as np


# Cut-offs that define an "above average" forward row.
# NOTE(review): these values are hard-coded; confirm they match the averages
# they are meant to represent.
MIN_FG3M_PER_FGM = 0.3
MIN_FG3A_SHARE = 0.35
MIN_AST = 3.0

# Step 1: keep only the European-forward rows that reach all three cut-offs
filtered_df = europe_F[
    (europe_F['FG3M/FGM'] >= MIN_FG3M_PER_FGM)
    & (europe_F['FG3A%'] >= MIN_FG3A_SHARE)
    & (europe_F['AST'] >= MIN_AST)
]

# Number of distinct qualifying players (by PLAYER_NAME) per country
above_avg_F = filtered_df.groupby('COUNTRY')['PLAYER_NAME'].nunique().reset_index()

# Sort by that player count, largest first
above_avg_F = above_avg_F.sort_values(by='PLAYER_NAME', ascending=False)

fig, ax = plt.subplots(figsize=(10, 6))

# One bar per country, drawn on the axes created above
sns.barplot(x='COUNTRY', y="PLAYER_NAME", data=above_avg_F, palette='rocket', linewidth=2, ax=ax)

# Rotate x-axis labels for better readability with many countries
plt.setp(ax.get_xticklabels(), rotation=80)

ax.set_title("European Forwards in NBA with > AVG Stats since 1996", weight='bold', fontsize=16)
ax.set_xlabel("Country", rotation=0, labelpad=20, weight='bold', fontsize=12)
ax.set_ylabel("Count", rotation=0, labelpad=20, weight='bold', fontsize=12)

# Y-axis starts at zero and reaches at least 8; it grows when a country has
# 8 or more players, so that no bar is cut off by a fixed upper limit.
y_top = 8 if above_avg_F.empty else max(8, int(above_avg_F['PLAYER_NAME'].max()) + 1)
ax.set_ylim(0, y_top)

# Set the y-axis to show only integer ticks
ax.yaxis.set_major_locator(ticker.MaxNLocator(integer=True))


ax.grid(axis='y', linestyle='--', alpha=0.7)  # Add grid lines with style

fig.tight_layout()  # Adjust spacing for better readability

plt.show()  # Display the plot

In [None]:
import pandas as pd
import altair as alt

# Calculate mean and standard deviation for AVG AST for data_per_min_europe_regular
avg_ast_std_europe = europe_F_regular.groupby('PLAYER_NAME')['FG3M'].agg(['mean', 'std']).reset_index()

# Calculate mean and standard deviation for AVG AST for data_per_min_usa_regular
avg_ast_std_usa = non_europe_F_regular.groupby('PLAYER_NAME')['FG3M'].agg(['mean', 'std']).reset_index()

# Create scatter plot for data_per_min_europe_regular
scatter_europe = alt.Chart(avg_ast_std_europe).mark_circle(size=60).encode(
    x='mean:Q',
    y='std:Q',
    tooltip=['PLAYER_NAME', 'mean', 'std'],
    color=alt.value('blue'),
    opacity=alt.value(0.8)
).properties(
    width=400,
    height=300,
    title='Scatter Plot of Average FG3M vs. Standard Deviation (Europe Regular)'
)

# Create scatter plot for data_per_min_usa_regular
scatter_usa = alt.Chart(avg_ast_std_usa).mark_circle(size=60).encode(
    x='mean:Q',
    y='std:Q',
    tooltip=['PLAYER_NAME', 'mean', 'std'],
    color=alt.value('red'),
    opacity=alt.value(0.8)
).properties(
    width=400,
    height=300,
    title='Scatter Plot of Average FG3M vs. Standard Deviation (USA Regular)'
)

# Concatenate the plots
scatter_europe | scatter_usa

In [None]:
import pandas as pd
import altair as alt

# Calculate mean and standard deviation for AVG AST for data_per_min_europe_regular
avg_ast_std_europe = europe_F_regular.groupby('PLAYER_NAME')['FG3M'].agg(['median', 'std']).reset_index()

# Calculate mean and standard deviation for AVG AST for data_per_min_usa_regular
avg_ast_std_usa = non_europe_F_regular.groupby('PLAYER_NAME')['FG3M'].agg(['median', 'std']).reset_index()

# Create scatter plot for data_per_min_europe_regular
scatter_europe = alt.Chart(avg_ast_std_europe).mark_circle(size=60).encode(
    x='median:Q',
    y='std:Q',
    tooltip=['PLAYER_NAME', 'median', 'std'],
    color=alt.value('blue'),
    opacity=alt.value(0.8)
).properties(
    width=400,
    height=300,
    title='Scatter Plot of Average FG3M vs. Standard Deviation (Europe Regular)'
)

# Create scatter plot for data_per_min_usa_regular
scatter_usa = alt.Chart(avg_ast_std_usa).mark_circle(size=60).encode(
    x='median:Q',
    y='std:Q',
    tooltip=['PLAYER_NAME', 'median', 'std'],
    color=alt.value('red'),
    opacity=alt.value(0.8)
).properties(
    width=400,
    height=300,
    title='Scatter Plot of Average FG3M vs. Standard Deviation (USA Regular)'
)

# Concatenate the plots
scatter_europe | scatter_usa