In [1]:
import pandas as pd
import requests
import os
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, precision_score, recall_score
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

# Use a larger default font for every figure drawn in this notebook.
plt.rcParams.update({"font.size": 16})

ModuleNotFoundError: No module named 'sklearn'

# Data

In [None]:
# Parse the raw text export with pandas' default CSV settings and persist it
# as train.csv; the training frame is then loaded from that CSV copy.
read_file = pd.read_csv("train.txt")
read_file.to_csv("train.csv", index=False)

# Hard-coded target values, assigned positionally: one entry per row of the
# training file, in file order (40 values).
shares = [
    0.9520000000000001, 0.932, 0.955, 0.879, 1.0,
    0.922, 0.986, 0.998, 0.888, 0.977,
    0.98, 0.9690000000000001, 0.873, 0.882, 0.7390000000000001,
    0.8390000000000001, 0.991, 0.8079999999999999, 0.757, 0.904,
    0.998, 0.701, 0.934, 0.857, 0.986,
    0.858, 0.88, 0.852, 0.938, 0.9279999999999999,
    0.691, 0.782, 0.831, 0.94, 0.981,
    0.978, 0.858, 0.96, 0.735, 0.6579999999999999,
]

# Attach the targets as the "Share" column and display the resulting frame.
train = pd.read_csv("train.csv").assign(Share=shares)
train

In [None]:
# Same conversion as for the training data: text export -> test.csv -> frame.
read_file = pd.read_csv("test.txt")
read_file.to_csv("test.csv", index=False)

# Load the CSV copy and discard the "Unnamed: 31" column before displaying.
test = pd.read_csv("test.csv").drop(columns="Unnamed: 31")
test

# Descriptive Analysis

In [None]:
# Grouped bar chart of the three shooting-percentage columns, one group of
# bars per row of `test`, with the "Rk" column on the x axis.
ax = test.plot(x='Rk', y=['FG%', '3P%', 'FT%'], kind='bar', figsize=(15, 5))

# Label the groups 1..N from the actual number of rows instead of a fixed
# list of ten labels, so the tick labels always match the number of bars.
# rotation=0 keeps the labels horizontal (same rendering as the former 360).
ax.set_xticklabels(list(range(1, len(test) + 1)), rotation=0)
ax.set_xlabel('Rank')
ax.set_ylabel('Features')
ax.set_title("Figure 1: Features Vs Rank (2020-21 Season)", pad=20)

In [None]:
# Predictor columns used by the PCA pipeline and the linear regression.
xcols = [
    'G',
    'PTS',
    'FG%',
    '3P%',
    'FT%',
    'TRB',
    'AST',
    'STL',
    'BLK',
]

# Target column that the regression is fitted against.
ycol = 'Share'

# Principal Component Analysis

In [None]:
def explained(scale):
    """Return the cumulative fraction of variance explained by the PCA components.

    Fits an impute -> (optional standardize) -> PCA pipeline on the notebook-level
    ``train[xcols]`` frame, prints the resulting series, and returns it.

    Parameters
    ----------
    scale : bool
        When truthy, a ``StandardScaler`` step is inserted before the PCA so
        every feature contributes on a comparable scale.

    Returns
    -------
    pandas.Series
        Cumulative explained variance divided by the total explained variance,
        indexed by component count starting at 1.
    """
    # Missing values are filled with each column's most frequent value so the
    # later steps receive a complete matrix.
    stages = [("imp", SimpleImputer(strategy="most_frequent"))]

    if scale:
        stages.append(("std", StandardScaler()))
    stages.append(("pca", PCA()))

    p = Pipeline(stages)
    p.fit(train[xcols])

    variances = p["pca"].explained_variance_
    # Size the index from the fitted components rather than from len(xcols):
    # PCA keeps min(n_samples, n_features) components, so the two can differ
    # when the training frame has fewer rows than feature columns.
    s = pd.Series(variances.cumsum() / variances.sum(),
                  index=range(1, len(variances) + 1))
    print(s)
    return s

In [None]:
# Overlay the cumulative explained-variance curves for the unscaled and the
# standardized feature matrix on one set of axes. Each explained() call also
# prints its series.
ax = explained(False).plot.line(label="not scaled", ylim=0)
explained(True).plot.line(label="scaled", ax=ax)
ax.set_xlabel("Number of Components")
ax.set_ylabel("Explained Variance")

# Drop the top/right frame lines for a cleaner look.
ax.spines["top"].set_visible(False)
ax.spines["right"].set_visible(False)
ax.legend(frameon=False)

# The previous title ("Principal Components of Breaks") did not describe the
# data plotted here; name what the figure actually shows.
ax.set_title("Figure 2: Cumulative Explained Variance by Principal Component", pad=20)

# Linear Regression

In [None]:
# Fit an ordinary linear regression of the target column on the feature
# columns of the training frame; fit() returns the estimator itself.
lr = LinearRegression().fit(train[xcols], train[ycol])
lr

In [None]:
# Score every row of the test frame with the fitted model and keep the
# result alongside the original columns as "predicted_shares".
test = test.assign(predicted_shares=lr.predict(test[xcols]))
test

In [None]:
# Horizontal bar chart of the fitted regression weights, one bar per feature.
ax1 = pd.Series(data=lr.coef_, index=xcols).plot(kind="barh")
ax1.set(xlabel="Weight", ylabel="Feature")

# Hide the top/right frame lines.
ax1.spines["right"].set_visible(False)
ax1.spines["top"].set_visible(False)
ax1.set_title("Figure 3: Linear Regression Coefficients", pad=20)

# Prediction

In [None]:
# Print the players in the order they appear in `test`, numbered from 1.
# The column is converted to a list once instead of on every iteration.
for rank, player in enumerate(test["Player"].tolist(), start=1):
    print(rank, " ", player)

In [None]:
# Order the test rows by predicted share, highest first.
test_pred = test.sort_values(by=['predicted_shares'], ascending=False)

# Print each player's position in the predicted ordering next to the "Rk"
# value from the data. Both columns are converted to lists once and walked
# together rather than being rebuilt on every iteration.
for rank, (current_rk, player) in enumerate(
        zip(test_pred["Rk"].tolist(), test_pred["Player"].tolist()), start=1):
    print("Expected: ", rank, " ", "Current: ", current_rk, " ", player)