In [9]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
import numpy as np
from sklearn.decomposition import PCA
import plotly.graph_objects as go
from collections import defaultdict
import csv


In [10]:
def pca_processing(fname, n_comp=3):
    '''
    Project the players' play-type frequency columns onto principal axes.
    :param fname: the path of the pca_table file
    :param n_comp: the number of principal components to keep
    :return: projected data (one row per player, n_comp columns); player names
    '''
    table = pd.read_csv(fname)

    # PLAYER_NAME goes first, followed by every "*freq" column in the reverse
    # of the order in which it appears in the file.
    ordered_cols = ["PLAYER_NAME"] + [
        col for col in reversed(list(table.columns)) if col.endswith("freq")
    ]
    table = table[ordered_cols]

    player_name = table["PLAYER_NAME"]
    # Positions 0 (PLAYER_NAME) and 1 (the first frequency column of the
    # reordered frame) are left out of the feature matrix.
    features = table.iloc[:, 2:]

    reducer = PCA(n_components=n_comp)
    reducer.fit(features)
    # The raw features are multiplied by the component axes directly rather
    # than going through reducer.transform().
    projected = features @ reducer.components_.T
    assert len(projected) == len(player_name)

    return projected, player_name


def k_means(fname, dim=3, cluster_num=5, random_state=0):
    '''
    Function to cluster the PCA-reduced player data into cluster_num groups
    :param fname: the path of the pca_table file
    :param dim: the number of principal components we want to use
    :param cluster_num: the number of clusters we want to cluster them
    :param random_state: seed forwarded to KMeans so that repeated calls on the
        same file give the same labels; pass None for an unseeded run
    :return: (names, X, labels, distance) where names are the player names,
        X is the projected data as an array, labels is the cluster label of
        each row and distance holds each row's distance to every cluster center
    '''
    assert isinstance(fname, str)
    data, names = pca_processing(fname, dim)
    X = np.array(data)
    # Seeded on purpose: other code in this file clusters the same table more
    # than once and keys lookups on the resulting label ids.
    model = KMeans(n_clusters=cluster_num, random_state=random_state).fit(X)
    labels = model.labels_
    distance = model.transform(X)
    return names, X, labels, distance


def calculate_top5(distance, names):
    '''
    Function to find the (up to) 5 players closest to each k-means cluster center
    :param distance: 2-D array, one row per player and one column per cluster,
        holding the distance of each player to each cluster center
    :param names: names of each player, in the same row order as distance
    :return: list with one entry per cluster; each entry is the list of the
        names with the smallest distances, closest first (fewer than 5 names
        when there are fewer than 5 players)
    '''
    distance = np.asarray(distance)
    # Look names up by position: indexing a pandas Series with an integer is a
    # label lookup, which only matches the row position for a default index.
    names_by_position = np.asarray(names, dtype=object)

    top5_name = []
    for cluster in range(distance.shape[1]):
        closest = distance[:, cluster].argsort()[:5]
        top5_name.append(names_by_position[closest].tolist())
    return top5_name

def top5_img(distance, names, year, cluster_num = 7, test = False):
    '''
    Look up the play-type frequency profile of the players closest to each
    cluster center.
    :param distance: distance of player to kmeans center
    :param names: player names
    :param year: year (2015 to 2019 inclusive)
    :param cluster_num: how many clusters (taken from the front of the
        calculate_top5 result) to include
    :param test: when True the pca_table file is read relative to the parent
        directory ("../data/...") instead of the current one
    :return: dictionary mapping player name to an array of that player's 11
        play-type frequencies, in cluster order
    '''
    assert 2015 <= year <= 2019
    top5names = calculate_top5(distance, names)
    prefix = "../" if test else ""
    path = prefix + "data/data_cleaned/pca_data/" + str(year) + "_pca_table.csv"
    df = pd.read_csv(path)

    freq_cols = ["iso_freq", "tr_freq", "prb_freq", "prr_freq", "pu_freq", "su_freq", "ho_freq", "cut_freq",
                 "os_freq", "putback_freq", "misc_freq"]

    # Fill the result directly instead of growing one big array and indexing
    # back into it with i * 5 + j, which assumed exactly 5 names per cluster
    # and at least cluster_num clusters.
    result = {}
    for cluster_members in top5names[:cluster_num]:
        for player in cluster_members:
            rows = df.loc[df['PLAYER_NAME'] == player, freq_cols]
            # First matching row; raises IndexError when the player is not in
            # the table.
            result[player] = rows.to_numpy(dtype=float)[0]
    return result

def update_app1():
    """
    Updates App 1

    Builds a 3-D scatter figure with one trace of clustered players per year
    (2015-2019) and a slider that shows exactly one year at a time.

    :return: the plotly Figure
    """
    fig = go.Figure()
    years = ['2015', '2016', '2017', '2018', '2019']
    clusters_per_year = [7, 7, 7, 8, 8]

    for year, cluster_num in zip(years, clusters_per_year):
        names, X, labels, _ = k_means('data/data_cleaned/pca_data/' + year + '_pca_table.csv', dim=3,
                                      cluster_num=cluster_num)
        # Note the axis assignment: component 1 is plotted on x and component 0 on y.
        fig.add_trace(
            go.Scatter3d(x=X[:, 1], y=X[:, 0], z=X[:, 2], text=names, hoverinfo='text', mode='markers',
                         marker=dict(color=labels), visible=False, name="Player Clusters for " + year)
        )

    # Only the first year's trace is shown initially.
    fig.data[0].visible = True

    steps = []
    for i, year in enumerate(years):
        visibility = [False] * len(fig.data)
        visibility[i] = True  # Toggle i'th trace to "visible"
        steps.append(dict(
            method="restyle",
            args=["visible", visibility],
            label='Year ' + year
        ))

    sliders = [dict(
        # Index of the initially selected step; it has to match the trace made
        # visible above (the previous value 5 was past the last of the 5 steps).
        active=0,
        currentvalue={"prefix": "Year: "},
        pad={"t": 5},
        steps=steps
    )]
    fig.update_layout(
        sliders=sliders,
        title={"text": "Scoring Clusters Per Year"}
    )
    return fig

def update_app3(year):
    """
    Updates App3

    Builds a horizontal stacked bar chart of the play-type frequencies of the
    players closest to each cluster center for the given year, with an empty
    spacer bar after every group of five players.

    :param year: year as a string, one of "2015" ... "2019"
    :return: the plotly Figure
    """
    category_names = ["Iso", "Tra", "PRB", "PRR", "Pos", "Spo", "Han", "Cut", "Off", "OffR", "Misc"]
    dims = {"2015": 7, "2016": 7, "2017": 7, "2018": 8, "2019": 8}
    print(year)
    print(dims[year])
    names, _, _, distance = k_means("data/data_cleaned/pca_data/" + year + '_pca_table.csv', 3, dims[year])
    results = top5_img(distance, names, int(year), dims[year])

    # Build labels and rows together. After every fifth player a spacer is
    # inserted: an all-zero row whose label is a run of spaces that grows by
    # one each time, so every spacer label is distinct.
    labels = []
    rows = []
    spacer = " "
    for position, (player, freqs) in enumerate(results.items()):
        labels.append(player)
        rows.append(freqs)
        if position % 5 == 4:
            labels.append(spacer)
            rows.append(np.zeros(shape=np.shape(freqs)))
            spacer += " "
    print(labels)
    data = np.array(rows)

    category_colors = plt.get_cmap('RdYlGn')(
        np.linspace(0.15, 0.85, data.shape[1]))

    fig = go.Figure()
    for i, (colname, color) in enumerate(zip(category_names, category_colors)):
        widths = data[:, i]
        # Convert the channels to plain Python floats before formatting: the
        # tuple's string form uses each element's repr, and NumPy >= 2 renders
        # its scalars as "np.float64(...)", which is not a valid colour string.
        red, green, blue = (float(channel) * 255 for channel in color[:3])
        rgba = 'rgba' + str((red, green, blue, 1))
        fig.add_trace(go.Bar(
            y=labels,
            x=widths,
            name=colname,
            orientation='h',
            marker=dict(
                color=rgba,
                line=dict(color=rgba, width=1.5)
            )
        )
        )
    fig.update_layout(autosize=True, width=900, height=1200, barmode='stack')
    return fig



# Cluster the 2015 table into 7 groups using 3 principal components.
names, X, labels, distance = k_means(
    'data/data_cleaned/pca_data/2015_pca_table.csv', 3, 7
)
print(len(labels))

# update_app3 runs its own k_means call for the year it is given; it does not
# reuse `labels` / `distance` from above.
a = update_app3('2015')
a.show()

# Player name -> cluster label from the k_means call above.
cluster_index = {}
for i, label in enumerate(labels):
    cluster_index[names[i]] = label
    
    



395
2015
7
['Danilo Gallinari', 'Khris Middleton', 'Kevin Martin', 'Jeff Green', 'Gerald Henderson', ' ', 'Tyson Chandler', 'Brandan Wright', 'DeAndre Jordan', 'Ian Mahinmi', 'JaVale McGee', '  ', 'Mario Hezonja', 'Evan Fournier', 'Marcus Thornton', 'Kentavious Caldwell-Pope', 'Markel Brown', '   ', 'Al-Farouq Aminu', 'Robert Covington', 'Jae Crowder', 'Gerald Green', 'Lamar Patterson', '    ', 'Alex Len', 'Roy Hibbert', 'Brook Lopez', 'Nene', 'Greg Monroe', '     ', 'Jon Leuer', 'Terrence Jones', 'Kelly Olynyk', 'Darrell Arthur', 'Mike Scott', '      ', 'CJ McCollum', 'Monta Ellis', 'John Wall', 'Eric Bledsoe', 'Mario Chalmers', '       ']


In [16]:
# The figure returned here is neither stored nor displayed.
update_app1()

# Cluster label -> play-style name. Each cluster is identified through the
# label that one hand-picked player received in cluster_index.
cluster_names = {
    cluster_index[player]: style
    for player, style in [
        ('Jeff Green', "Versatile Wings"),
        ('Eric Bledsoe', "PnR/Ball handling Guards"),
        ('Kentavious Caldwell-Pope', "C&S Guards"),
        ('Robert Covington', "Perimeter Wings"),
        ('Brook Lopez', "Post-Up Bigs"),
        ('Mike Scott', "Stretch 4s"),
        ('DeAndre Jordan', "Rolling Bigs"),
    ]
}


In [21]:
# cluster_index
import plotly.express as px

# Possession / points-per-possession table (filtered by PLAYER_NAME further
# down; presumably one row per player -- verify against the file).
pp_data = pd.read_csv(
    "data/data_cleaned/poss_ppp_data/poss2015.csv"
)
# Running possession total per play-style name; unseen names start at 0.
cluster_poss = defaultdict(int)
def get_ppp(pp_new):
    """
    Sum, over the 11 play types, of `<play>_ppp` multiplied by `<play>_poss`.

    :param pp_new: mapping-like object (e.g. a DataFrame) exposing a
        `<play>_ppp` and a `<play>_poss` entry for every play type
    :return: the element-wise total of the 11 products
    """
    play_types = ('iso', 'tr', 'prb', 'prr', 'pu', 'su', 'ho', 'cut', 'os', 'putback', 'misc')
    total = None
    for play in play_types:
        term = pp_new[play + '_ppp'] * pp_new[play + '_poss']
        # Start from the first product so the additions happen in the listed order.
        total = term if total is None else total + term
    return total
    
    
    
    

def min_max_normalize(values):
    """
    Rescale values linearly so the smallest maps to 0 and the largest to 1.

    :param values: iterable of numbers
    :return: list of floats; empty for empty input, all zeros when every
        value is equal (the range would otherwise be a division by zero)
    """
    values = [float(v) for v in values]
    if not values:
        return []
    low, high = min(values), max(values)
    span = high - low
    if span == 0:
        return [0.0 for _ in values]
    return [(v - low) / span for v in values]


# Sum of get_ppp() per play-style name, turned into a per-possession figure below.
ppp_dict = defaultdict(float)

for each in cluster_index:
    pp_new = pp_data[pp_data['PLAYER_NAME'] == each]
    clus = cluster_index[each]
    cluster_poss[cluster_names[clus]] += pp_new['total_poss'].values[0]
    ppp_dict[cluster_names[clus]] += get_ppp(pp_new).values[0]

# Divide each cluster's accumulated total by its accumulated possessions.
for each in ppp_dict:
    ppp_dict[each] = ppp_dict[each] / float(cluster_poss[each])
print(ppp_dict)

poss_styles = list(cluster_poss.keys())
norm = min_max_normalize(cluster_poss.values())

fig = px.bar(x=poss_styles, y=norm, height=400, color=poss_styles, labels={
                     "x" : "Play-Style Cluster",
                     "y": "Normalized Possession Share",
                     "color" : "Play-Style"
                 },
            title="Normalized Player-style possession share vs Playstyle")
fig.show()

ppp_styles = list(ppp_dict.keys())
norm = min_max_normalize(ppp_dict.values())

fig = px.bar(x=ppp_styles, y=norm, height=400, color=ppp_styles, labels={
                     "x" : "Play-Style Cluster",
                     "y": "Normalized player-style efficiency",
                     "color" : "Play-Style"
                 },
            title="Normalized Player-style efficiency vs Playstyle")
fig.show()
    

defaultdict(<class 'float'>, {'PnR/Ball handling Guards': 0.9086360344481821, 'Stretch 4s': 0.9335830291559645, 'Post-Up Bigs': 0.9644462762734028, 'Perimeter Wings': 0.9603076749725512, 'Rolling Bigs': 0.9912432101503071, 'Versatile Wings': 0.972922086149388, 'C&S Guards': 0.9397766071680584})


In [23]:
df_shots = pd.read_csv('data/shot_logs_margin.csv')

# "Clutch" rows: Margin of at most 5, 4th period, at most 5 minutes remaining.
# NOTE(review): `Margin <= 5` also keeps negative margins if the column is
# signed -- confirm that is intended.
clutch = df_shots[
    (df_shots['Margin'] <= 5)
    & (df_shots['PERIOD'] == 4)
    & (df_shots['MINUTES_REMAINING'] <= 5)
]

# Cluster label -> play-style name, keyed through one hand-picked player per
# cluster.
cluster_names = {
    cluster_index[player]: style
    for player, style in [
        ('Jeff Green', "Versatile Wings"),
        ('Eric Bledsoe', "PnR/Ball handling Guards"),
        ('Kentavious Caldwell-Pope', "C&S Guards"),
        ('Robert Covington', "Perimeter Wings"),
        ('Brook Lopez', "Post-Up Bigs"),
        ('Mike Scott', "Stretch 4s"),
        ('DeAndre Jordan', "Rolling Bigs"),
    ]
}

print(clutch.shape[0])
# Filled by the loops below: key -> (effective FG%, number of clutch attempts).
clutch_dict = {}
areas = df_shots['SHOT_ZONE_BASIC'].unique()
print(areas)
area_EFG = {}
def _efg_and_attempts(shots):
    """
    Effective field-goal percentage of a set of shot rows.

    Computed as (made shots + 0.5 * made 3PT shots) / attempts.

    :param shots: DataFrame of shot rows with SHOT_MADE_FLAG and SHOT_TYPE columns
    :return: (eFG, attempts) tuple, or None when there are no attempts
    """
    attempt_count = len(shots)
    if attempt_count == 0:
        return None
    made = shots[shots['SHOT_MADE_FLAG'] == 1]
    made_threes = made[made['SHOT_TYPE'] == '3PT Field Goal']
    return (len(made_threes) * 0.5 + len(made)) / attempt_count, attempt_count


# Per player: players without any clutch attempt are skipped explicitly
# (previously a bare `except` hid the resulting ZeroDivisionError together
# with every other error).
for each in names:
    stats = _efg_and_attempts(clutch[clutch['PLAYER_NAME'] == each])
    if stats is not None:
        clutch_dict[each] = stats

# Per shot zone, same rule.
for each in areas:
    stats = _efg_and_attempts(clutch[clutch['SHOT_ZONE_BASIC'] == each])
    if stats is not None:
        area_EFG[each] = stats

print(area_EFG)

# Dump one row per player: the name and the (eFG, attempts) tuple, the latter
# written as a single cell in its string form.
# newline='' hands line-ending control to the csv module, as its documentation
# requires; without it extra blank lines appear on platforms that translate
# "\n" on write.
with open('data/clutchstats.csv', 'w', newline='') as csv_file:
    writer = csv.writer(csv_file)
    for key, value in clutch_dict.items():
        writer.writerow([key, value])
    

# Attempt-weighted effective FG% and total clutch attempts per play-style.
clust_dict = cluster_index
weighted_score = defaultdict(float)   # cluster label -> sum of eFG * attempts
weighted_att = defaultdict(float)     # cluster label -> sum of attempts
weighted_EFG = {}                     # play-style name -> weighted eFG
attempts = defaultdict(int)           # play-style name -> clutch attempts
for each in clust_dict:
    # Players with no clutch attempts have no entry in clutch_dict; skip them
    # explicitly instead of swallowing every exception.
    if each not in clutch_dict:
        continue
    efg, shot_count = clutch_dict[each]
    cluster = clust_dict[each]
    weighted_score[cluster] += efg * shot_count
    weighted_att[cluster] += shot_count

    # A cluster without a play-style name is left out of the named summaries.
    if cluster not in cluster_names:
        continue
    style = cluster_names[cluster]
    weighted_EFG[style] = weighted_score[cluster] / weighted_att[cluster]
    attempts[style] += shot_count

print(attempts)

print(weighted_EFG)


    


        

5833
['Mid-Range' 'Restricted Area' 'In The Paint (Non-RA)' 'Above the Break 3'
 'Left Corner 3' 'Right Corner 3' 'Backcourt']
{'Mid-Range': (0.3942901234567901, 1296), 'Restricted Area': (0.5787486515641855, 1854), 'In The Paint (Non-RA)': (0.39146800501882056, 797), 'Above the Break 3': (0.4713687150837989, 1432), 'Left Corner 3': (0.5341463414634147, 205), 'Right Corner 3': (0.5432098765432098, 243), 'Backcourt': (0.25, 6)}
defaultdict(<class 'int'>, {'PnR/Ball handling Guards': 1997, 'Stretch 4s': 588, 'Post-Up Bigs': 624, 'Perimeter Wings': 752, 'Rolling Bigs': 282, 'Versatile Wings': 748, 'C&S Guards': 657})
{'PnR/Ball handling Guards': 0.45693540310465697, 'Stretch 4s': 0.48554421768707484, 'Post-Up Bigs': 0.5160256410256411, 'Perimeter Wings': 0.5146276595744681, 'Rolling Bigs': 0.5425531914893617, 'Versatile Wings': 0.4505347593582888, 'C&S Guards': 0.4840182648401826}


In [19]:
import plotly.express as px
# Min-max rescale the per-style weighted eFG values before plotting.
a = weighted_EFG.values()
norm = list(map(lambda value: (float(value) - min(a)) / (max(a) - min(a)), a))

fig = px.bar(
    x=weighted_EFG.keys(),
    y=norm,
    height=400,
    color=list(weighted_EFG.keys()),
    labels={
        "x": "Play-Style Cluster",
        "y": "Normalized Scoring Efficiency",
        "color": "Play-Style",
    },
    title="Normalized Efficiency vs Playstyle",
)
fig.show()

print(len(names))

395


In [20]:
import plotly.express as px
# Min-max rescale the per-style clutch attempt counts before plotting.
a = attempts.values()
norm = [(float(i)-min(a))/(max(a)-min(a)) for i in a]

# Bars, colours and the y-axis label all describe `attempts`: the colours used
# to come from weighted_EFG's keys and the y label still read "Normalized
# Scoring Efficiency", left over from the efficiency chart.
fig = px.bar(x = attempts.keys(), y = norm,height=400, color = list(attempts.keys()),labels={
                     "x" : "Play-Style Cluster",
                     "y": "Normalized Attempt Volume",
                     "color" : "Play-Style"
                 } ,
            title="Normalized Attempt volume vs Playstyle")
fig.show()

print(len(names))

395
