In [1]:
# machine learning imports
import numpy as np
import pandas as pd
import sklearn as skl
from sklearn.linear_model import LinearRegression # does not auto import
from sklearn.metrics import mean_absolute_error # does not auto import
from sklearn.model_selection import train_test_split # does not auto import
from sklearn.preprocessing import StandardScaler # does not auto import

In [2]:
# load the training csv and, in the same step, discard every row that
# contains a NaN (the raw file is known to have a missing value)
pubg_data = pd.read_csv('train_V2.csv').dropna()

In [3]:
# columns used as model inputs
labels = [
    'boosts', 'damageDealt', 'heals', 'killPlace', 'kills',
    'killStreaks', 'longestKill', 'revives', 'rideDistance', 'walkDistance',
    'weaponsAcquired',
]

# input frame: just the selected feature columns
pubg_x = pubg_data[labels]

# cap every column at its own 99.9th percentile to tame outliers;
# lower=None means nothing is clipped from below
# NOTE(review): the percentiles are computed over all rows, i.e. before the
# train/test partition happens later in the notebook
upper_caps = pubg_x.quantile(0.999)
pubg_x = pubg_x.clip(lower=None, upper=upper_caps, axis=1)

# regression target
pubg_y = pubg_data['winPlacePerc']

In [4]:
# standardise every feature column
# NOTE(review): the scaler is fitted on all rows, before the train/test
# partition, so held-out rows contribute to the fitted statistics.
scaler = StandardScaler().fit(pubg_x)

# transform() returns a bare numpy array, so rebuild the dataframe.
# Carry over the original row index along with the column names: a fresh
# 0..n-1 index would no longer match pubg_y, whose index still has the gaps
# left by dropna(), and any label-aligned operation between the two
# (joins, arithmetic, .loc lookups) would silently misalign.
pubg_x = pd.DataFrame(
    scaler.transform(pubg_x),
    columns=labels,
    index=pubg_x.index,
)

In [5]:
# partition into train and test
# train_test_split is imported explicitly rather than reached through
# skl.model_selection, which only resolves if some other import happened to
# load that submodule first.
# test_size is left at the library default; random_state pins the shuffle so
# the same partition is produced on every run.
pubg_x_train, pubg_x_test, pubg_y_train, pubg_y_test = train_test_split(
    pubg_x, pubg_y, random_state=9)

In [6]:
# build an ordinary least-squares regressor and fit it on the training
# partition in one step; fit() returns the estimator itself, so `model`
# is the fitted LinearRegression instance
model = LinearRegression().fit(pubg_x_train, pubg_y_train)

In [7]:
# predict on the rows the model was fitted on and on the held-out rows
predict_train = model.predict(pubg_x_train)
predict_test = model.predict(pubg_x_test)

# mean absolute error of the raw (unclipped) predictions
mae_train = mean_absolute_error(pubg_y_train, predict_train)
mae_test = mean_absolute_error(pubg_y_test, predict_test)

print('Mean absolute error for training set using linear model %.4f' % mae_train)
print('Mean absolute error for the test set using linear model %.4f' % mae_test)

Mean absolute error for training set using linear model 0.0997
Mean absolute error for the test set using linear model 0.0997


In [8]:
# re-score the linear model, this time clamping each prediction into [0, 1]
# before computing the error
# (presumably because the target is a fraction in that range; confirm)
predict_train = model.predict(pubg_x_train)
predict_test = model.predict(pubg_x_test)

clipped_train = np.clip(predict_train, 0, 1)
clipped_test = np.clip(predict_test, 0, 1)

mae_train_clipped = mean_absolute_error(pubg_y_train, clipped_train)
mae_test_clipped = mean_absolute_error(pubg_y_test, clipped_test)

print('Mean absolute error for training set using linear model %.4f' % mae_train_clipped)
print('Mean absolute error for the test set using linear model %.4f' % mae_test_clipped)

Mean absolute error for training set using linear model 0.0977
Mean absolute error for the test set using linear model 0.0978
