In [9]:
import pickle
import numpy as np 
import pandas as pd 
from datetime import datetime
from datetime import date

from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error
import warnings

## Reading and preparing data 

In [10]:
# Load the raw player table from a CSV file in the working directory.
# NOTE(review): parse_dates=True is kept exactly as before; per the pandas docs a
# bare True only asks for the index to be parsed as dates, so no column is
# converted by it - confirm whether it is needed at all.
data = pd.read_csv(
    "nba2k20-full.csv",
    parse_dates=True,
)

Let's cast the data columns to their correct data types.

In [11]:
# Pull the numeric part out of the free-text columns. The vectorised .str
# accessors work on any index (the previous data[col][i] lookup only worked with
# a default 0..n-1 index) and avoid a Python-level loop.
#   weight -> 4th whitespace-separated token (presumably the kg figure - confirm against the CSV)
#   height -> last whitespace-separated token (presumably the metric figure - confirm against the CSV)
#   salary -> the text that follows the '$' sign
data['weight'] = data['weight'].str.split().str[3].astype(float)
data['height'] = data['height'].str.split().str[-1].astype(float)
data['salary'] = data['salary'].str.split('$').str[1].astype('int64')

# Birthdays arrive as 'MM/DD/YY' strings and are stored as datetime.date objects.
# NOTE: with a two-digit year, %y maps 69-99 to 1969-1999 and 00-68 to 2000-2068,
# so a birthday before 1969 would be parsed into the future.
data['b_day'] = data['b_day'].apply(lambda x: datetime.strptime(x, '%m/%d/%y').date())

# Age in completed years as of the day the notebook is run (so this feature
# drifts over time). Computed from calendar fields because casting a timedelta
# to '<m8[Y]' raises TypeError on pandas >= 2.0, which only supports the
# s/ms/us/ns resolutions.
today = date.today()
data['age'] = data['b_day'].apply(
    # Subtract one year when this year's birthday has not happened yet.
    lambda born: today.year - born.year - ((today.month, today.day) < (born.month, born.day))
).astype('int64')

# 'Undrafted' becomes round 0; every other value is converted with int().
data['draft_round'] = data['draft_round'].apply(lambda x: 0 if x == 'Undrafted' else int(x))

# Give missing team values an explicit label.
data['team'] = data['team'].fillna('No team')


## Feature engineering

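Clipping outliers: values above the 95th or below the 5th percentile of a column are capped at that percentile (no rows are dropped)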

In [12]:
# Cap extreme values instead of removing rows: anything above the 95th percentile
# or below the 5th percentile of a column is overwritten with that limit.
# For the columns flagged True the limit is truncated with int() before it is
# written; the comparison itself always uses the untruncated quantile.
for col_name, truncate_limit in (('weight', False), ('height', False),
                                 ('age', True), ('rating', True)):
    high_cut = data[col_name].quantile(.95)
    low_cut = data[col_name].quantile(.05)
    as_written = int if truncate_limit else (lambda limit: limit)
    data.loc[data[col_name] > high_cut, col_name] = as_written(high_cut)
    # The low-side mask is evaluated after the high-side cap has been applied.
    data.loc[data[col_name] < low_cut, col_name] = as_written(low_cut)

Merging equivalent positions (C-F with F-C, G-F with F-G) and one-hot encoding the categorical columns

In [13]:
# Mirrored dual-position labels are folded into a single spelling; any label
# that is not a key of the mapping passes through unchanged.
position_aliases = {'C-F': 'F-C', 'G-F': 'F-G'}
data['position'] = data['position'].apply(lambda pos: position_aliases.get(pos, pos))

# Swap each categorical column for its indicator columns. The new columns are
# named after the raw category values (no prefix) and are appended on the right.
for categorical in ('team', 'country', 'position', 'draft_round'):
    indicators = pd.get_dummies(data[categorical])
    data = data.join(indicators).drop(columns=categorical)

Splitting into feature and target datasets

In [14]:
# Remove the columns that are not fed to the model.
unused_columns = ['college', 'full_name', 'b_day', 'jersey', 'draft_peak']
data = data.drop(columns=unused_columns)

# The salary column is the target; every remaining column is a feature.
y = data['salary']
X = data.drop(columns='salary')

# With its defaults, normalize() rescales each ROW to unit L2 norm (it is not a
# per-column scaler) and returns a NumPy array, so X stops being a DataFrame here.
X = preprocessing.normalize(X)


In [15]:
# Write the first 100 preprocessed feature rows to disk as comma-separated text.
np.savetxt(
    "nba2k20-full_preprocess.csv",
    X[:100],
    delimiter=",",
)

## Prediction values

The pretrained model is stored in a pickle file, so let's load it and make predictions.

In [16]:
Pkl_Filename = "Pickle_Salary_Model.pkl"

# NOTE(review): this silences every warning for the rest of the kernel session,
# including any that unpickling or predict() might raise.
warnings.filterwarnings('ignore')

# NOTE(review): pickle.load executes code embedded in the file - only load a
# model file that comes from a trusted source.
with open(Pkl_Filename, mode='rb') as model_file:
    Pickled_LR_Model = pickle.load(model_file)

# Predictions for every row of the preprocessed feature matrix.
y_pred = Pickled_LR_Model.predict(X)

# Persist the model's score on (X, y) as plain text. The output file is opened
# (and therefore truncated) before the score is computed.
with open('new_score', mode='w') as score_file:
    score_text = str(Pickled_LR_Model.score(X, y))
    score_file.write(score_text)