In [None]:
# machine learning imports
import sklearn as skl
import pandas as pd
from sklearn import preprocessing
#from sklearn.linear_model import LinearRegression # does not auto import
#from sklearn.linear_model import Ridge # does not auto import
#from sklearn.linear_model import Lasso # does not auto import
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error # also does not auto import
from sklearn.model_selection import train_test_split
#from sklearn.externals import joblib # needed to save classes
import numpy as np
#import matplotlib.pyplot as plt
#import seaborn as sns

In [None]:
# Optional: hide every warning category for the remainder of the session.
import warnings
warnings.simplefilter('ignore')
# Alternative: report each distinct warning once instead of hiding it.
# warnings.simplefilter(action='once')

In [None]:
# set up our feature engineering labels
# (only referenced by the optional column-reduction step that is disabled below)
e_labels = ['matchId', 'groupId', 'killPlace', 'boosts', 'walkDistance', 'weaponsAcquired', 'damageDealt', 'heals',
            'kills', 'longestKill', 'killStreaks', 'rideDistance', 'winPlacePerc', 'matchType']

# list of the variables discovered to be significant in data analysis;
# feature_engineering() derives a player-level and a group-level normalised column for each
variables = ['killPlace', 'boosts', 'walkDistance', 'weaponsAcquired', 'damageDealt', 'heals',
             'kills', 'longestKill', 'killStreaks', 'rideDistance', 'rampage', 'lethality',
             'items', 'totalDistance', 'winPlacePerc']


def _zero_safe_ratio(numerator, denominator):
    '''Element-wise numerator / denominator with inf and NaN results replaced by 0.'''
    return (numerator / denominator).replace(np.inf, 0).fillna(0)


def feature_engineering(pubg_data):
    '''FEATURE ENGINEERING
    GIVEN: a PUBG dataframe holding 'matchId', 'groupId', 'matchType', 'headshotKills',
           'swimDistance' and the raw stats named in `variables`. 'winPlacePerc' is part of
           `variables`, so a test set must carry a dummy 'winPlacePerc' column.
    Conduct data engineering including:
      * derived stats: 'items', 'totalDistance', 'lethality' (headshotKills / kills) and
        'rampage' (killStreaks / kills), with undefined ratios set to 0
      * per-group and per-match means of every numeric column, merged back onto each row
      * '<var>_norm' (player value / match mean) and '<var>_g_norm' (group mean / match mean)
        for every entry of `variables`
      * one-hot encoding of 'matchType' (the original column is dropped)
    SIDE EFFECT: the four derived stat columns are added to the `pubg_data` frame passed in.
    RETURNS: pubg_engineered dataframe, one row per input row in the input order, with a fresh
             0..n-1 index. Numeric columns appear as '<col>_player', '<col>_group' and
             '<col>_match'. No clipping or column selection is done here.
    NOTE: 'winPlacePerc_norm' / 'winPlacePerc_g_norm' are derived from the target and must not
          be used as model inputs.
    '''

    # total the pickups
    pubg_data['items'] = pubg_data['heals'] + pubg_data['boosts'] + pubg_data['weaponsAcquired']

    # total the distance
    pubg_data['totalDistance'] = (pubg_data['rideDistance'] + pubg_data['swimDistance']
                                  + pubg_data['walkDistance'])

    # estimate accuracy of players and how they behave in shootouts.
    # The cleaned ratio is assigned back to the frame in one step: calling replace/fillna with
    # inplace=True on a column selection is not guaranteed to update the frame
    # (it is a no-op under pandas Copy-on-Write).
    pubg_data['lethality'] = _zero_safe_ratio(pubg_data['headshotKills'], pubg_data['kills'])
    pubg_data['rampage'] = _zero_safe_ratio(pubg_data['killStreaks'], pubg_data['kills'])

    # reduce dataframe to the columns we want to use (disabled)
    # pubg_data = pubg_data[e_labels]

    # use groupby to get means for team and for matches.
    # numeric_only=True skips string columns such as the ids and matchType, which newer pandas
    # versions refuse to average.
    pubg_group_means = pubg_data.groupby(['matchId', 'groupId']).mean(numeric_only=True).reset_index()
    pubg_match_means = pubg_data.groupby(['matchId']).mean(numeric_only=True).reset_index()

    # merge back in leaving columns unchanged for one set to allow for future suffixing
    # (only affects shared columns)
    pubg_engineered = pd.merge(pubg_data, pubg_group_means,
                               suffixes=["", "_group"], how="left", on=['matchId', 'groupId'])
    pubg_engineered = pd.merge(pubg_engineered, pubg_match_means,
                               suffixes=["_player", "_match"], how="left", on=['matchId'])

    # norm the player variables (the +0.1 keeps the denominator away from zero)
    for variable in variables:
        pubg_engineered[variable + '_norm'] = (
            pubg_engineered[variable + '_player'] / (pubg_engineered[variable + '_match'] + 0.1))

    # norm the group variables; kept as a second loop so the output column order is unchanged
    for variable in variables:
        pubg_engineered[variable + '_g_norm'] = (
            pubg_engineered[variable + '_group'] / (pubg_engineered[variable + '_match'] + 0.1))

    # one hot encode the matchTypes since different matches may follow different logics.
    # The dtype is pinned so the indicator columns are 0/1 integers on every pandas version
    # (newer versions default to bool).
    one_hot = pd.get_dummies(pubg_engineered['matchType'], dtype=np.uint8)
    # drop matchType as it is now encoded, then join the encoded columns
    pubg_engineered = pubg_engineered.drop('matchType', axis=1)
    pubg_engineered = pubg_engineered.join(one_hot)

    # hand back a frame with a clean positional index
    pubg_engineered = pubg_engineered.reset_index(drop=True)

    return pubg_engineered

In [None]:
# Model input columns, in the order they are fed to the scaler and the model:
#   * match-normalised player stats ('*_norm') and group stats ('*_g_norm')
#   * rideDistance is taken un-normalised ('rideDistance_player') because
#     normalising it worsened the correlation
#   * one indicator column per supported game type
labels = [
    'killPlace_norm', 'boosts_norm', 'walkDistance_norm', 'weaponsAcquired_norm',
    'damageDealt_norm', 'heals_norm', 'kills_norm', 'longestKill_norm',
    'killStreaks_norm', 'rideDistance_player',
    'killPlace_g_norm', 'boosts_g_norm', 'walkDistance_g_norm', 'weaponsAcquired_g_norm',
    'damageDealt_g_norm', 'heals_g_norm', 'kills_g_norm', 'longestKill_g_norm',
    'killStreaks_g_norm',
    'rampage_norm', 'lethality_norm',
    'duo', 'duo-fpp', 'solo', 'solo-fpp', 'squad', 'squad-fpp',
]


def feature_selection(pubg_data):
    '''FEATURE SELECTION
    GIVEN: an engineered dataframe containing every column named in `labels`
    Keep only the `labels` columns and cap each column at its own 99.9th percentile
    (no lower bound is applied). The percentiles come from the frame passed in.
    RETURNS: pubg_x dataframe of model inputs; the frame passed in is left unchanged
    '''
    selected = pubg_data[labels]

    # one upper bound per column, matched to the columns via axis=1
    upper_bounds = selected.quantile(0.999)
    return selected.clip(upper=upper_bounds, axis=1)

In [None]:
# import our training data
# (path is relative to the notebook's working directory; the file must already exist there)
pubg_data = pd.read_csv('../input/train_V2.csv')

In [None]:
# Discard every row that has a missing value, then set the target column aside
# before the frame is expanded with engineered features.
pubg_data = pubg_data.dropna(axis=0, how='any')
pubg_y = pubg_data.loc[:, 'winPlacePerc']

# Turn off pandas' chained-assignment warnings for the rest of the session.
pd.set_option('mode.chained_assignment', None)

# Build the engineered feature frame for the training rows.
pubg_engineered = feature_engineering(pubg_data)

In [None]:
# Reduce the engineered frame to the model input columns (clipped per column).
pubg_x = feature_selection(pubg_engineered)

# The raw and engineered frames are no longer needed; drop the references
# so their memory can be reclaimed before training.
del pubg_data, pubg_engineered

In [None]:
# Standardise every input column so that differing column scales do not skew results.
scaler = preprocessing.StandardScaler()
scaler.fit(pubg_x)
#joblib.dump(scaler, 'pubg_scaler.joblib')

# transform() hands back a numpy array, so wrap it in a dataframe again
# with the original column names.
pubg_x = pd.DataFrame(data=scaler.transform(pubg_x), columns=labels)
# The fitted scaler object is kept so the same scaling can be applied to the test data.

In [None]:
# During development the data was split into train and validation sets:
# pubg_x_train, pubg_x_valid, pubg_y_train, pubg_y_valid = train_test_split(pubg_x, pubg_y, random_state = 9)

# For the final run the full dataset is used; the development variable names
# are kept so the cells below work unchanged.
pubg_x_train, pubg_y_train = pubg_x, pubg_y

# Drop the old names (the data stays reachable through the *_train names).
del pubg_x, pubg_y

In [None]:
# Random forest regressor: 32 trees, fixed seed, all available cores, verbose progress.
model_rf = RandomForestRegressor(
    n_estimators=32,
    random_state=0,
    oob_score=False,
    n_jobs=-1,
    verbose=2,
)

# Train on the full training set.
model_rf.fit(pubg_x_train, pubg_y_train)

# Sanity check on the data the model was trained on. Unseen data will not reach
# this level of fit, but an unexpectedly poor score here points to an obvious error.
# Predictions are limited to the [0, 1] range before scoring.
predict_train_rf = model_rf.predict(pubg_x_train)
print('Mean absolute error for the training set using random forest regressor model %.4f'
      % mean_absolute_error(pubg_y_train, predict_train_rf.clip(0, 1)))
# Validation-set check (development only):
# predict_train_rf = model_rf.predict(pubg_x_valid)
# print('Mean absolute error for the training set using random forest regressor model %.4f' %
#       mean_absolute_error(pubg_y_valid, np.clip(predict_train_rf, 0, 1)))

In [None]:
# Training inputs and predictions are finished with; release them before the
# test data is loaded.
del pubg_x_train, pubg_y_train, predict_train_rf
# del pubg_x_valid, pubg_y_valid   # development only

In [None]:
# Load the test data and append a constant 'winPlacePerc' column of zeros:
# feature_engineering() expects that column to be present.
#print(pubg_data_test.isnull().sum()) # no NaNs
pubg_data_test = pd.read_csv('../input/test_V2.csv').assign(winPlacePerc=0)

In [None]:
# do our feature engineering on the test rows
# (the 'winPlacePerc' columns in the result derive from the dummy column added above
#  and carry no information)
pubg_engineered_t = feature_engineering(pubg_data_test)

In [None]:
# Keep only the model input columns for the test rows (clipped per column).
pubg_x_t = feature_selection(pubg_engineered_t)

# The engineered test frame is no longer needed; free it before predicting.
del pubg_engineered_t

In [None]:
# Apply the scaler that was fitted on the training data to the test inputs,
# restoring the column names on the resulting array.
pubg_x_test = pd.DataFrame(data=scaler.transform(pubg_x_t), columns=labels)

# Predict the target for every test row.
predict_test_rf_t = model_rf.predict(pubg_x_test)

In [None]:
# Force predictions into the [0, 1] range; values outside it cannot be correct.
predict_test_rf_clip_t = predict_test_rf_t.clip(0, 1)

In [None]:
# Pair each test row's Id with its clipped prediction; the two frames are
# matched on their row index.
predict_test_rf_df_t = pd.DataFrame({'winPlacePerc': predict_test_rf_clip_t})
output_df = pubg_data_test[["Id"]].merge(
    predict_test_rf_df_t[['winPlacePerc']],
    left_index=True,
    right_index=True,
)
output_df.head()

# Write the submission file without the index column.
output_df.to_csv("submission.csv", index=False, index_label=False)

In [None]:
# Everything has been written out; drop the remaining large objects.
del (
    pubg_x_test,
    pubg_data_test,
    predict_test_rf_clip_t,
    predict_test_rf_df_t,
    output_df,
    predict_test_rf_t,
)

print("DONE")