In [None]:
import pandas as pd
import numpy as np
import math
from pyecharts.charts import (Pie
                              ,Bar
                              ,Map)
from pyecharts import options as opts

from factor_analyzer import (factor_analyzer
                             ,Rotator
                             ,FactorAnalyzer)

In [None]:
# Read the NBA player CSV into a DataFrame and preview its first rows.
# NOTE(review): this is an absolute, machine-specific Windows path; the
# notebook only runs where this exact file exists.
csv_path = r'C:\Users\Administrator\Desktop\桌面文件\nba_player.csv'
player_df = pd.read_csv(csv_path)
player_df.head()

In [None]:
# Print pandas' summary of the raw frame before any cleaning is applied.
player_df.info()

### 数据处理

In [None]:
# Build the cleaned player table in one chain so that the result is an
# independent frame (no SettingWithCopyWarning, no in-place mutation of a
# slice of player_df) and the cell can be re-run safely.
player_df1 = (
    # Drop players who never appeared in a game (changshu == 0).
    player_df.loc[player_df.changshu != 0]
    # Remove the player detail-page URL column.
    .drop(columns=['url'])
    # Keep only the first whitespace-separated token of the weight string.
    .assign(weight=lambda df: df.weight.str.split().str[0])
    # Convert the columns used numerically below to numeric dtypes.
    .astype({'height': 'float64', 'weight': 'float64', 'year': 'int64'})
    # Renumber the rows 0..n-1 after filtering.
    .reset_index(drop=True)
)

### 数据探索

1.球员身高分布（饼图）

2.球员体重分布（玫瑰图）

3.国籍分布（环形图）

4.球龄分布（条形图）

5.球员数量分布图（地图）

In [None]:
# Player height distribution (pie chart)

# Bin edges and one label per interval between consecutive edges.
bins = [1.75,1.80,1.85,1.90,1.95,2.00,2.05,2.10,2.15,2.20,2.25,2.30]
labels = ['1.75-1.80m','1.81-1.85m','1.86-1.90m','1.91-1.95m','1.96-2.00m'
          ,'2.01-2.05m','2.06-2.10m','2.11-2.15m','2.16-2.20m','2.21-2.25m','2.26-2.30m']

# Data preparation.
# include_lowest=True keeps a player measuring exactly 1.75 in the first bin,
# as its '1.75-1.80m' label promises; otherwise that value falls outside every
# right-closed interval and is silently dropped.
height_cut = pd.cut(player_df1.height,bins,labels=labels,right=True,include_lowest=True)
# Count once and derive both lists from the same result so they stay paired.
height_counts = height_cut.value_counts()
x_data = height_counts.index.tolist()
y_data = height_counts.tolist()

# Plot.
pie1 = Pie(init_opts=opts.InitOpts(width="770px",height="400px"))
pie1.add("",list(zip(x_data,y_data)))
pie1.set_global_opts(title_opts=opts.TitleOpts(title="球员身高分布")
                     # "vertical" is the valid legend orientation (was misspelled).
                     ,legend_opts=opts.LegendOpts(orient="vertical"
                                                  ,pos_left="5%"
                                                  ,pos_bottom="10%"))
pie1.set_series_opts(label_opts=opts.LabelOpts(formatter="{b}\n{d}%"))
pie1.render_notebook()

In [None]:
# Player weight distribution (rose chart)

# Bin edges and one label per interval between consecutive edges.
bins = [75,85,95,105,115,125,135,145]
labels = ['75-85kg','86-95kg','96-105kg','106-115kg','116-125kg','126-135kg','136-145kg']

# Data preparation.
# include_lowest=True keeps a player weighing exactly 75 in the first bin, as
# its '75-85kg' label promises; otherwise that value is silently dropped.
weight_cut = pd.cut(player_df1.weight,bins,labels=labels,right=True,include_lowest=True)
# Count once and derive both lists from the same result so they stay paired.
weight_counts = weight_cut.value_counts()
x_data = weight_counts.index.tolist()
y_data = weight_counts.tolist()

# Plot.
pie2 = Pie(init_opts=opts.InitOpts(width="650px",height="400px"))
# "radius" is the documented rose-chart mode; the bare True used before relied
# on any truthy non-"area" value being treated the same way.
pie2.add("",list(zip(x_data,y_data)),rosetype="radius")
pie2.set_global_opts(title_opts=opts.TitleOpts(title="球员体重分布")
                     # "vertical" is the valid legend orientation (was misspelled).
                     ,legend_opts=opts.LegendOpts(orient="vertical"
                                                  ,pos_left="0%"
                                                  ,pos_bottom="10%"))
pie2.set_series_opts(label_opts=opts.LabelOpts(formatter="{b}\n{d}%"))
pie2.render_notebook()

In [None]:
# Nationality distribution (donut chart)

# Countries with fewer players than this are merged into a single "其他" slice.
min_players = 4

# Data preparation: players per country, most frequent first.
country_df1 = (
    player_df1.groupby(by=['country']).size().to_frame('num')
    .sort_values(by=['num'],ascending=False)
)

# Split into countries shown individually and countries folded into "其他".
is_major = country_df1['num'] >= min_players
other_total = int(country_df1.loc[~is_major,'num'].sum())
# Build a new frame instead of assigning a row into a slice of country_df1,
# which raised SettingWithCopyWarning.
country1 = pd.concat([country_df1[is_major]
                      ,pd.DataFrame({'num':[other_total]},index=['其他'])])

y_data = country1.num.tolist()
x_data = country1.index.tolist()

# Plot.
pie3 = Pie(init_opts=opts.InitOpts(width="650px",height="450px"))
pie3.add(""
         ,list(zip(x_data,y_data))
         ,radius=["35%","60%"])
pie3.set_global_opts(title_opts=opts.TitleOpts(title="国籍分布")
                     # "vertical" is the valid legend orientation (was misspelled).
                     ,legend_opts=opts.LegendOpts(orient="vertical"
                                                  ,pos_left="0%"
                                                  ,pos_bottom="10%"))
pie3.set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {c}\n{d}%"))
pie3.render_notebook()

In [None]:
# Career length distribution (bar chart)

# Data preparation: count players per value of the `year` column, then
# re-express each group key as 2020 minus that year.
reference_year = 2020
year_df1 = (
    player_df1.groupby(by=['year'])
    .agg({'year': 'count'})
    .rename(columns={'year': 'num'})
)
year_df1.index = reference_year - year_df1.index

y_data = year_df1['num'].tolist()
x_data = year_df1.index.tolist()

# Plot: bars coloured through a visual map scaled to the largest count.
chart_size = opts.InitOpts(width="800px", height="450px")
bar1 = Bar(init_opts=chart_size)
bar1.add_xaxis(x_data)
bar1.add_yaxis("人数", y_data)
bar1.set_global_opts(
    title_opts=opts.TitleOpts(title="球龄分布"),
    legend_opts=opts.LegendOpts(is_show=False),
    visualmap_opts=opts.VisualMapOpts(max_=max(y_data)),
    xaxis_opts=opts.AxisOpts(name="球龄"),
    yaxis_opts=opts.AxisOpts(name="人数"),
)
bar1.render_notebook()

In [None]:
# Number of players per country (world map)

# Data preparation: one row per country with its player count.
country_df2 = player_df1.groupby(by=['country']).size().reset_index(name='num')

# Chinese country name -> region name used by the ECharts "world" map.
# A value that does not match a map region exactly is silently not drawn, so
# the abbreviated map names ("Czech Rep.", "Dem. Rep. Congo", ...) are required.
replace_dict = {
    '美国':'United States'
    ,'加拿大':'Canada'
    ,'乌克兰':'Ukraine'
    ,'克罗地亚':'Croatia'
    # Democratic Republic of the Congo; plain "Congo" is a different region.
    ,'刚果民主共和国':'Dem. Rep. Congo'
    # South Sudan has its own region; it was previously merged into Sudan.
    ,'南苏丹':'S. Sudan'
    ,'喀麦隆':'Cameroon'
    ,'土耳其':'Turkey'
    ,'埃及':'Egypt'
    ,'塞内加尔':'Senegal'
    ,'塞尔维亚':'Serbia'
    ,'多明尼加共和国':'Dominican Rep.'
    ,'奥地利':'Austria'
    ,'安哥拉':'Angola'
    ,'尼日利亚':'Nigeria'
    ,'巴哈马':'Bahamas'
    ,'巴西':'Brazil'
    ,'希腊':'Greece'
    ,'德国':'Germany'
    ,'意大利':'Italy'
    ,'拉脱维亚':'Latvia'
    # Was the misspelling 'Chech', which matches no region.
    ,'捷克共和国':'Czech Rep.'
    ,'斯洛文尼亚':'Slovenia'
    ,'新西兰':'New Zealand'
    ,'格鲁吉亚':'Georgia'
    ,'法国':'France'
    ,'澳洲':'Australia'
    ,'瑞士':'Switzerland'
    ,'立陶宛':'Lithuania'
    ,'芬兰':'Finland'
    # Was 'sultan', which matches no region.
    ,'苏丹':'Sudan'
    ,'英国':'United Kingdom'
    ,'西班牙':'Spain'
    ,'黑山':'Montenegro'
}

# Countries missing from the dictionary keep their original name.
country_df2.country = country_df2.country.replace(replace_dict)

y_data = country_df2.num.tolist()
x_data = country_df2.country.tolist()

# Plot.
map1 = Map()
map1.add("球员数量",list(zip(x_data,y_data)),"world")
map1.set_global_opts(title_opts=opts.TitleOpts(title="球员数量分布图")
                     ,legend_opts=opts.LegendOpts(is_show=False)
                     ,visualmap_opts=opts.VisualMapOpts(max_=max(y_data)))
map1.set_series_opts(label_opts=opts.LabelOpts(is_show=False))
map1.render_notebook()

### 数据建模

使用因子分析对球员进行综合排名

In [None]:
# Extract the per-player statistics used as indicators.
stat_columns = ['changshu','xianfa','fenzhong','mingzhonglv','sanfen',
                'jingong','fanshou','defen','lanban','zhugong','qiangduan','gaimao','fangui']
# .copy() makes player_c an independent frame, so adding a column below does
# not raise SettingWithCopyWarning.
player_c = player_df1[stat_columns].copy()

# Starting rate with a penalty term: the share of games started
# (xianfa / changshu) is weighted by the min-max scaled number of games
# played, so a high share over few games counts for less.
changshu_z = (player_c.changshu - player_c.changshu.min()) / (player_c.changshu.max() - player_c.changshu.min())
player_c['xianfalv'] = changshu_z * player_c.xianfa / player_c.changshu

# Games played and games started are now folded into xianfalv; drop them.
player_c = player_c.drop(columns=['changshu','xianfa'])


def min_max_scale(col):
    """Rescale a column linearly so its minimum maps to 0 and its maximum to 1."""
    return (col - col.min()) / (col.max() - col.min())


# Normalise every indicator to [0, 1].
player_z = player_c.apply(min_max_scale, axis=0)

# Turnovers are a negative indicator: invert while scaling so that fewer
# turnovers give a value closer to 1.
player_z['shiwu'] = (player_df1.shiwu.max()-player_df1.shiwu) / (player_df1.shiwu.max()-player_df1.shiwu.min())

# Correlation matrix of the normalised indicators, as a plain ndarray.
player_z1 = np.array(player_z.corr())

# KMO test and Bartlett test
def kmo_bartlett(player_c):
    """Return the second element of each adequacy test result, rounded to 5 places.

    Args:
        player_c: data passed unchanged to factor_analyzer.calculate_kmo and
            factor_analyzer.calculate_bartlett_sphericity.

    Returns:
        Tuple ``(kmo, bartlett)``. Per the factor_analyzer documentation these
        second elements are the overall KMO score and the Bartlett p-value.
    """
    _, kmo_overall = factor_analyzer.calculate_kmo(player_c)
    _, bartlett_p = factor_analyzer.calculate_bartlett_sphericity(player_c)
    return round(kmo_overall, 5), round(bartlett_p, 5)

# Run both adequacy tests on player_z, the exact indicator set that is
# factor-analysed below. player_c lacks the `shiwu` column, so testing it
# would describe a different variable set than the one being modelled.
kmo_num , bartlett_num = kmo_bartlett(player_z)
print("kmo：" + str(kmo_num) + "\nbartlett：" + str(bartlett_num))

# Number of common factors.
# The correlation matrix is symmetric, so eigvalsh is the appropriate solver:
# it always returns real eigenvalues. They are sorted in descending order
# explicitly because the cumulative share below depends on that order, which
# np.linalg.eig does not guarantee.
eig_value = np.sort(np.linalg.eigvalsh(player_z1))[::-1]
eig_value_df = pd.DataFrame({
    # Kept for layout compatibility; eigenvalues belong to components, not to
    # the individual indicator named on the same row.
    'column': player_z.columns
    ,'eig_value': eig_value
})
eig_value_df['eig_value_cumsum'] = eig_value_df['eig_value'].cumsum() / eig_value_df['eig_value'].sum()
# Smallest number of components whose cumulative share reaches 0.88: the count
# of components still below the threshold, plus one. Counting (instead of
# index.max()+2) also works when the first component alone reaches it.
m = int((eig_value_df['eig_value_cumsum'] < 0.88).sum()) + 1
print("Factor num：" + str(m))

In [None]:
# Model fitting

# Number of common factors to extract. The factor labels are generated from
# it so the two cannot get out of step.
n_factors = 5
factor_name = ['Factor' + str(i) for i in range(1, n_factors + 1)]

# Instantiate and fit the factor model on the normalised indicators.
fa = FactorAnalyzer(n_factors = n_factors
                    ,method="principal"
                    ,rotation='varimax')
fa.fit(player_z)

# Communalities
print(pd.DataFrame(fa.get_communalities()
                   ,index=player_z.columns
                   ,columns=['Communalities']))

# Factor loading matrix
print(pd.DataFrame(fa.loadings_
                   ,index = player_z.columns
                   ,columns = factor_name))

# Variance contribution and cumulative contribution per factor
print(pd.DataFrame(fa.get_factor_variance()
                   ,index = ['SS Loadings','Proportion Var','Cumulative Var']
                   ,columns = factor_name).T)

# Score coefficient matrix B = R^-1 * loadings, where R is the correlation
# matrix player_z1. Solving R @ B = loadings once replaces two explicit
# matrix inversions and is numerically more stable.
score_coef = np.linalg.solve(player_z1, fa.loadings_)
print(pd.DataFrame(score_coef
                   ,index = player_z.columns
                   ,columns = factor_name))

# Per-player factor scores and the composite score (sum over all factors).
# NOTE(review): the coefficients come from a correlation matrix but are applied
# to min-max scaled data rather than z-scores — confirm this is intended.
factor_score = pd.DataFrame(np.dot(player_z, score_coef)
                            ,index = player_df1['name'].tolist()
                            ,columns = factor_name)
factor_score['factor_all'] = factor_score.sum(axis=1)

# Top players by composite score. The original heading says TOP10 but the
# slice returned only 5 rows; the count now matches the heading.
top_n = 10
factor_score.sort_values(by=['factor_all'],ascending=False).head(top_n)