<a href="https://colab.research.google.com/github/HUANG37W/Data-Structure-Tutorial/blob/master/module7.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly
import plotly.graph_objects as go
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Locations of the two input CSVs. These are local file paths under /content
# (despite the "url" in the variable names), so both files must already be
# present there before this cell runs.
csv_url1, csv_url2 = "/content/player_data.csv", "/content/Seasons_Stats.csv"

# Read the player table and the statistics table into DataFrames.
players = pd.read_csv(filepath_or_buffer=csv_url1)
stats = pd.read_csv(filepath_or_buffer=csv_url2)

# Bare trailing expression: render the player table as this cell's output.
players

Unnamed: 0,name,year_start,year_end,position,height,weight,birth_date,college
0,Alaa Abdelnaby,1991,1995,F-C,6-10,240.0,"June 24, 1968",Duke University
1,Zaid Abdul-Aziz,1969,1978,C-F,6-9,235.0,"April 7, 1946",Iowa State University
2,Kareem Abdul-Jabbar,1970,1989,C,7-2,225.0,"April 16, 1947","University of California, Los Angeles"
3,Mahmoud Abdul-Rauf,1991,2001,G,6-1,162.0,"March 9, 1969",Louisiana State University
4,Tariq Abdul-Wahad,1998,2003,F,6-6,223.0,"November 3, 1974",San Jose State University
...,...,...,...,...,...,...,...,...
4545,Ante Zizic,2018,2018,F-C,6-11,250.0,"January 4, 1997",
4546,Jim Zoet,1983,1983,C,7-1,240.0,"December 20, 1953",Kent State University
4547,Bill Zopf,1971,1971,G,6-1,170.0,"June 7, 1948",Duquesne University
4548,Ivica Zubac,2017,2018,C,7-1,265.0,"March 18, 1997",


In [None]:
# Clean player data.
# NOTE: this step rebinds `players` to the cleaned frame, so it cannot be
# re-run on its own -- re-run the cell that loads the CSVs first.
players = (
    players
    .drop(columns=['year_start', 'year_end', 'birth_date', 'college'])
    .rename(columns={'name': 'Name', 'position': 'Position',
                     'height': 'Height', 'weight': 'Weight'})
)

# Strip surrounding whitespace from names. The result has to be assigned back
# (.str.strip() returns a new Series), and it has to happen before the
# duplicate check so that "X" and "X " are recognised as the same name.
# 'Name' deliberately stays a regular column rather than the index: the
# duplicate filter below and the later merge both address it by column name.
players['Name'] = players['Name'].str.strip()

# A name shared by several players cannot be matched unambiguously to the
# stats table (which is joined on name alone), so every such row is removed.
# .copy() yields an independent frame, so the column assignments below do not
# raise SettingWithCopyWarning.
players = players[~players['Name'].duplicated(keep=False)].copy()

# Convert heights from feet-inches to centimeters and weight from pounds to kg.
# Rows with a missing height are skipped by dropna() and remain NaN after the
# index-aligned assignment back into the frame.
conversions = [30.48, 2.54]  # cm per foot, cm per inch
feet_inches = players['Height'].dropna().str.split('-', expand=True).astype(int)
players['Height'] = feet_inches.dot(conversions)
players['Weight'] = players['Weight'] * 0.454  # approximate kg per pound

# How each column is reduced when a player's rows are collapsed into one:
# counting stats are totalled, rate/efficiency stats are averaged (a plain,
# unweighted mean over that player's rows). The key order here is also the
# column order of the aggregated frame.
aggregation_functions = {
    'GamesPlayed': 'sum',
    'MinutesPlayed': 'sum',
    'PER': 'mean',
    'TS%': 'mean',
    'FG%': 'mean',
    '3P%': 'mean',
    '2P%': 'mean',
    'eFG%': 'mean',
    'FT%': 'mean',
    'TRB': 'sum',
    'AST': 'sum',
    'STL': 'sum',
    'BLK': 'sum',
    'TOV': 'sum',
    'PF': 'sum',
    'PTS': 'sum',
}

# Clean and aggregate the stats table in one pipeline:
#   1. drop the columns that are not used in the analysis,
#   2. rename the key/count columns to the names used in the rest of the notebook,
#   3. collapse to one row per player name.
# NOTE(review): 'blanl' and 'blank2' are kept verbatim -- presumably they match
# the headers as they appear in the CSV; confirm before "fixing" the spelling.
stats = (
    stats
    .drop(columns=[
        'Unnamed: 0', 'Year', 'Pos', 'Age', 'Tm', 'GS', '3PAr', 'FTr', 'ORB%',
        'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%', 'blanl', 'OWS',
        'DWS', 'WS', 'WS/48', 'blank2', 'OBPM', 'DBPM', 'BPM', 'VORP', 'FG', 'FGA',
        '3P', '3PA', '2P', '2PA', 'FT', 'FTA', 'ORB', 'DRB',
    ])
    .rename(columns={'Player': 'Name', 'G': 'GamesPlayed', 'MP': 'MinutesPlayed'})
    .groupby(['Name'])
    .agg(aggregation_functions)
    .reset_index()
)

# Combine player information with career stats. The outer join keeps names
# that appear in only one of the two tables; those rows carry NaNs and are
# excluded from clustering by the dropna() below.
combined = pd.merge(players, stats, how='outer', on='Name')

# Select numerical columns for clustering
numeric_columns = ['Height', 'Weight', 'GamesPlayed', 'MinutesPlayed', 'PER', 'TS%',
                   'FG%', '3P%', '2P%', 'eFG%', 'FT%', 'TRB', 'AST', 'STL', 'BLK',
                   'TOV', 'PF', 'PTS']

# Keep only rows that are complete in every selected column. The explicit
# .copy() makes `data` an independent frame, so adding the 'Cluster' column
# to it later cannot raise SettingWithCopyWarning.
data = combined[numeric_columns].dropna().copy()

# Standardize the data (zero mean, unit variance per column) so that columns
# with large magnitudes do not dominate the distance computation.
scaler = StandardScaler()
scaled_data = scaler.fit_transform(data)

# Perform K-means clustering.
# n_init is pinned explicitly: scikit-learn's default changed from 10 to
# 'auto' in version 1.4 (with a FutureWarning in 1.2-1.3), so relying on the
# default makes the resulting clusters depend on the installed version.
k = 4  # Number of clusters
kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
labels = kmeans.fit_predict(scaled_data)

# Add cluster labels to the combined data. `data` kept the row index of
# `combined`, so an index-aligned left join attaches each label to its player;
# rows that were dropped for missing values get NaN in 'Cluster'.
data['Cluster'] = labels
combined = combined.join(data['Cluster'])





In [None]:
# Plot the clusters in the weight/height plane, one scatter trace per cluster.
fig = go.Figure()

for cluster in range(k):
    cluster_data = combined[combined['Cluster'] == cluster]
    # Shade of blue per cluster. The alpha runs from 1/k up to 1, so it is
    # never 0 (a fully transparent, invisible cluster) and the expression is
    # well defined even when k == 1.
    alpha = (cluster + 1) / k
    fig.add_trace(go.Scatter(x=cluster_data['Weight'], y=cluster_data['Height'],
                             mode='markers', text=cluster_data['Name'],
                             name=f'Cluster {cluster}',  # legend label
                             marker=dict(color=f'rgba(0, 0, 255, {alpha})')))

fig.update_layout(
    title='NBA Player Clustering: Height vs Weight',
    xaxis_title='Weight (kg)',
    yaxis_title='Height (cm)',
    plot_bgcolor='rgba(0, 0, 0, 0)'  # transparent plot background
)

fig.show()

In [None]:
# Plot the clusters in the points-per-game / field-goal-percentage plane.
# The cluster labels come from the K-means fit on all selected numeric
# columns; this figure only projects them onto these two axes.
fig = go.Figure()

for cluster in range(k):
    cluster_data = combined[combined['Cluster'] == cluster]
    # Shade of blue per cluster. The alpha runs from 1/k up to 1, so it is
    # never 0 (a fully transparent, invisible cluster) and the expression is
    # well defined even when k == 1.
    alpha = (cluster + 1) / k
    # Career points per game: total points divided by total games played.
    points_per_game = cluster_data['PTS'] / cluster_data['GamesPlayed']
    fig.add_trace(go.Scatter(x=points_per_game, y=cluster_data['FG%'],
                             mode='markers', text=cluster_data['Name'],
                             name=f'Cluster {cluster}',  # legend label
                             marker=dict(color=f'rgba(0, 0, 255, {alpha})')))

fig.update_layout(
    title='NBA Player Clustering: Points per Game vs Field Goal Percentage',
    xaxis_title='Points per Game',
    yaxis_title='Field Goal Percentage',
    plot_bgcolor='rgba(0, 0, 0, 0)'  # transparent plot background
)

fig.show()