In [None]:
import pandas as pd
import numpy as np

# Location of the training CSVs; `path` ends with "/" so file names append directly.
path = "/content/drive/MyDrive/Colab Notebooks/PUBG Project/"
squadFile, duoFile, soloFile, eventFile = (
    "train_squad.csv",
    "train_duo.csv",
    "train_solo.csv",
    "train_event.csv",
)

# Load each training file into its own DataFrame.
squadTr = pd.read_csv(f"{path}{squadFile}")
duoTr = pd.read_csv(f"{path}{duoFile}")
soloTr = pd.read_csv(f"{path}{soloFile}")
eventTr = pd.read_csv(f"{path}{eventFile}")

# Collinearity

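Use variance inflation factor (VIF) calculations to find input variables that are excessively collinear with the other input variables. For each variable $i$, $\mathrm{VIF}_i = 1 / (1 - R_i^2)$, where $R_i^2$ is the R-squared obtained by regressing that variable on all the other input variables.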

In [None]:
import statsmodels.formula.api as smf

def genArgStr(depVar, varLst):
    """Generate the formula argument string for ``smf.ols()``.

    Args:
        depVar: Name of the dependent variable (left-hand side).
        varLst: Iterable of input variable names (right-hand side terms).

    Returns:
        A string of the form ``"depVar ~ var1 + var2 + ..."``. If ``varLst``
        is empty, the intercept-only formula ``"depVar ~ 1"`` is returned.
    """
    terms = " + ".join(varLst)
    # With no input variables there is nothing to put on the right-hand side;
    # fall back to an intercept-only model instead of returning a bare name.
    return depVar + " ~ " + (terms or "1")

def genVIFs(df, varLst):
    """Compute the R-square and VIF of every input variable in a DataFrame.

    Each variable in ``varLst`` is regressed (OLS) on all the other listed
    variables; its VIF is ``1 / (1 - R^2)`` of that regression.

    Args:
        df: DataFrame holding a column for every name in ``varLst``.
        varLst: Iterable of input variable (column) names.

    Returns:
        DataFrame with columns "Variable", "R^2" and "VIF", sorted by VIF in
        descending order and re-indexed from 0. Rows whose VIF is NaN are
        placed last by ``sort_values``.
    """
    def vif(rsq):
        # Perfect collinearity (R^2 of 1, or marginally above it through
        # rounding) has no finite VIF; report it as infinite rather than
        # dividing by zero or returning a negative value.
        if rsq >= 1:
            return np.inf
        return 1 / (1 - rsq)

    # Work on a list copy so any iterable of names is accepted and the
    # caller's object is never modified.
    varLst = list(varLst)

    # Regress each variable on all the others and record the R-square.
    rSqrs = []
    for vr in varLst:
        vrs = varLst[:]
        vrs.remove(vr)
        rSqrs.append(smf.ols(genArgStr(vr, vrs), data=df).fit().rsquared)

    # Build DataFrame with VIF values and return sorted.
    VIF_df = pd.DataFrame({
        "Variable": varLst,
        "R^2": rSqrs,
        "VIF": [vif(rsq) for rsq in rSqrs],
    })
    return VIF_df.sort_values("VIF", ascending=False).reset_index(drop=True)


  import pandas.util.testing as tm


Prepare list of independent variables, then pass the DataFrame and variable list to the function.

In [None]:
# Every column except the dependent variable is a candidate input variable.
dpVar = "winPlacePerc"
varList = squadTr.columns.tolist()
varList.remove(dpVar)

# Rank the squad input variables by VIF (highest first).
squadVIFs = genVIFs(squadTr, varList)
squadVIFs

  return 1 - self.ssr/self.centered_tss


Unnamed: 0,Variable,R^2,VIF
0,total_distance,1.0,inf
1,items,1.0,inf
2,weaponsAcquired,1.0,inf
3,walkDistance,1.0,inf
4,boosts,1.0,inf
5,swimDistance,1.0,inf
6,heals_and_boosts,1.0,inf
7,rideDistance,1.0,inf
8,heals,1.0,inf
9,percent_kill,0.878901,8.257724


The highest 9 VIFs are clearly a problem we have to deal with. First, collect those rows in their own DataFrame.

In [None]:
# Keep the nine top-ranked rows, i.e. the variables whose VIF is infinite.
highVIFs = squadVIFs.iloc[:9]
highVIFs

Unnamed: 0,Variable,R^2,VIF
0,total_distance,1.0,inf
1,items,1.0,inf
2,weaponsAcquired,1.0,inf
3,walkDistance,1.0,inf
4,boosts,1.0,inf
5,swimDistance,1.0,inf
6,heals_and_boosts,1.0,inf
7,rideDistance,1.0,inf
8,heals,1.0,inf


What we have in this list of very highly intercorrelated variables are the engineered features and their inputs, which explains the 1.0 R-square values. So rather than removing all of these from the data, just remove the original features and leave the engineered ones in.

In [None]:
# Start again from every column except the dependent variable.
varList = squadTr.columns.tolist()
varList.remove(dpVar)

# Of the infinite-VIF variables, keep the three engineered features and mark
# only their source columns for removal.
removeList = highVIFs["Variable"].tolist()
for vr in ("total_distance", "items", "heals_and_boosts"):
    removeList.remove(vr)
# Add 'roadKills' to removeList since it's all 0 in the squad data and producing NaN VIF
removeList += ["roadKills"]
# Remove the variables
for vr in removeList:
    varList.remove(vr)

Now generate the VIFs again to make sure the engineered features we left in are within the VIF threshold of 10 which we are using.

In [None]:
# Recompute the VIFs on the reduced variable list.
squadVIFs = genVIFs(squadTr, varList)
squadVIFs

Unnamed: 0,Variable,R^2,VIF
0,items,0.883139,8.557196
1,percent_kill,0.878269,8.214863
2,percent_team_damage,0.866073,7.466757
3,percent_team_kill,0.864004,7.353144
4,heals_and_boosts,0.851875,6.751074
5,killPlace,0.828332,5.825204
6,percent_damage,0.824213,5.688704
7,killStreaks,0.81589,5.431529
8,kills_in_match,0.756739,4.11081
9,DBNOs,0.75123,4.019784


The engineered features' VIFs are all under 10, so they're good. We can now obtain the argument string for the ols() method to build the OLS model.

In [None]:
# Formula string for the squad model: dependent variable ~ remaining inputs.
argStringSquad = genArgStr(dpVar, varList)
argStringSquad

'winPlacePerc ~ assists + DBNOs + killPlace + killStreaks + longestKill + matchDuration + maxPlace + numGroups + rankPoints + revives + teamKills + vehicleDestroys + players_in_match + players_in_team + kills_in_match + percent_kill + percent_team_kill + damage_in_match + percent_damage + percent_team_damage + headshot_rate + heals_and_boosts + items + total_distance'

# Influence Observations

Now we build the OLS model and use it to find influential observations.

In [None]:
# Fit the squad OLS model using the formula built in the collinearity section.
model = smf.ols(argStringSquad, data = squadTr).fit()

# Obtain the influence data included in the model result.
influence = model.get_influence()

# Obtain the Cook's Distances of the dataset from the influence data.
cooksDs = influence.cooks_distance

# cooksDs is a tuple of 2 arrays, the first being of the cooks distances.
cooksDsList = list(cooksDs[0])
# Preview the first ten distances only.
cooksDsList[:10]

[1.1629741068723958e-08,
 3.6535072196440777e-08,
 5.14375260541185e-07,
 2.411862637827793e-08,
 2.3728013816943334e-06,
 1.442640412489424e-06,
 9.4537419412522e-08,
 1.0025692833224854e-07,
 6.531038646564427e-09,
 1.0400342331655867e-08]

To pick out the influential points we will use a common rule of thumb: flag an observation when its Cook's D exceeds 4/N, where N is the number of observations in the dataset. See the following pages:

[Cook's Distance - Yellowbrick](https://www.scikit-yb.org/en/latest/api/regressor/influence.html)

[Statology.org](https://www.statology.org/how-to-identify-influential-data-points-using-cooks-distance/#:~:text=Cook%E2%80%99s%20distance%2C%20often%20denoted%20D%20i%2C%20is%20used,regression%20model%3B%20MSE%20is%20the%20mean%20squared%20error)

In [None]:
# Rule-of-thumb cutoff: 4 divided by the number of observations.
ckStd = 4 / len(cooksDsList)

# Pair every Cook's distance with its row position.
cooksDsIdx = list(enumerate(cooksDsList))

# Row positions whose Cook's distance exceeds the cutoff.
cooksOverStd = [pos for pos, dist in cooksDsIdx if dist > ckStd]

# Fraction of observations flagged as influential.
len(cooksOverStd) / len(cooksDsList)

0.06980350730800129

By the standard we are trying, nearly 7% of the observations are deemed influential. We'll build a model with those removed and compare to the original model.

In [None]:
# cooksOverStd holds row *positions*, while DataFrame.drop() expects index
# *labels*; translate through the index so the intended rows are removed even
# if squadTr does not carry a default 0..N-1 index.
# NOTE(review): assumes the model was fit on every row of squadTr (no rows
# dropped for missing values), so positions line up with the frame — confirm.
squadTrNew = squadTr.drop(squadTr.index[cooksOverStd])
# Refit the same formula on the culled data for comparison.
modelNew = smf.ols(argStringSquad, data = squadTrNew).fit()

In [None]:
## Compare R-square values: original model first, culled-data model second.
print(model.rsquared, modelNew.rsquared)

0.8060661280879198 0.862836685030444


R-squared rises from about 0.806 to 0.863 — roughly a 7% relative improvement — so we'll stick with the 4/N standard and accept the culled dataset.

# Performing the operations on the other datasets.

**train_duo data collinearity**

In [None]:
# Every column except the dependent variable is a candidate input variable.
dpVar = "winPlacePerc"
varList = duoTr.columns.tolist()
varList.remove(dpVar)

# Rank the duo input variables by VIF (highest first).
duoVIFs = genVIFs(duoTr, varList)
duoVIFs

  return 1 - self.ssr/self.centered_tss


Unnamed: 0,Variable,R^2,VIF
0,total_distance,1.0,inf
1,items,1.0,inf
2,weaponsAcquired,1.0,inf
3,walkDistance,1.0,inf
4,boosts,1.0,inf
5,swimDistance,1.0,inf
6,heals_and_boosts,1.0,inf
7,rideDistance,1.0,inf
8,heals,1.0,inf
9,percent_team_kill,0.911324,11.277037


In [None]:
# The nine top-ranked rows are the variables whose VIF is infinite.
highVIFs = duoVIFs.iloc[:9]
varList = duoTr.columns.tolist()
varList.remove(dpVar)

# Keep the three engineered features; mark only their source columns for removal.
removeList = highVIFs["Variable"].tolist()
for vr in ("total_distance", "items", "heals_and_boosts"):
    removeList.remove(vr)
# NOTE(review): roadKills is dropped as in the squad data, presumably for the
# same reason (constant column giving a NaN VIF) — confirm for the duo data.
removeList += ["roadKills"]
for vr in removeList:
    varList.remove(vr)

In [None]:
# Recompute the VIFs on the reduced variable list.
duoVIFs = genVIFs(duoTr, varList)
duoVIFs

Unnamed: 0,Variable,R^2,VIF
0,percent_team_kill,0.911283,11.271761
1,percent_kill,0.909932,11.102693
2,percent_team_damage,0.908619,10.943251
3,percent_damage,0.882518,8.511921
4,items,0.882218,8.49027
5,heals_and_boosts,0.847203,6.544619
6,killPlace,0.84503,6.452876
7,killStreaks,0.831475,5.93383
8,kills_in_match,0.792103,4.810079
9,damage_in_match,0.76243,4.209278


In [None]:
# percent_team_kill has the highest remaining VIF and is above the threshold
# of 10, so drop it and recompute.
varList.remove("percent_team_kill")
duoVIFs = genVIFs(duoTr, varList)
duoVIFs

Unnamed: 0,Variable,R^2,VIF
0,items,0.882217,8.490196
1,percent_kill,0.863971,7.351366
2,heals_and_boosts,0.847156,6.542618
3,killPlace,0.844586,6.434445
4,percent_damage,0.842341,6.342809
5,killStreaks,0.831225,5.925037
6,kills_in_match,0.792098,4.809967
7,damage_in_match,0.762425,4.209202
8,players_in_match,0.731427,3.723378
9,maxPlace,0.710824,3.458104


In [None]:
# Formula string for the duo model: dependent variable ~ remaining inputs.
argStringDuo = genArgStr(dpVar, varList)
argStringDuo

'winPlacePerc ~ assists + DBNOs + killPlace + killStreaks + longestKill + matchDuration + maxPlace + numGroups + rankPoints + revives + teamKills + vehicleDestroys + players_in_match + players_in_team + kills_in_match + percent_kill + damage_in_match + percent_damage + percent_team_damage + headshot_rate + heals_and_boosts + items + total_distance'

**train_duo data influence**

In [None]:
# Fit the duo model and pull the Cook's distances (element 0 of the tuple).
model = smf.ols(argStringDuo, data=duoTr).fit()
cooksDs = model.get_influence().cooks_distance
cooksDsIdx = list(enumerate(cooksDs[0]))

# Rule-of-thumb cutoff: 4 divided by the number of observations.
ckStd = 4 / len(cooksDsIdx)

# Row positions whose Cook's distance exceeds the cutoff.
cooksOverStd = [pos for pos, dist in cooksDsIdx if dist > ckStd]

# Fraction of observations flagged as influential.
len(cooksOverStd) / len(cooksDsIdx)

0.07782788858560814

In [None]:
# cooksOverStd holds row *positions*, while DataFrame.drop() expects index
# *labels*; translate through the index so the intended rows are removed even
# if duoTr does not carry a default 0..N-1 index.
# NOTE(review): assumes the model was fit on every row of duoTr (no rows
# dropped for missing values), so positions line up with the frame — confirm.
duoTrNew = duoTr.drop(duoTr.index[cooksOverStd])
# Refit the same formula on the culled data for comparison.
modelNew = smf.ols(argStringDuo, data = duoTrNew).fit()
## Compare R-square values
print(model.rsquared, modelNew.rsquared)

0.8439303055858012 0.8953371403896558


**train_solo data collinearity**

In [None]:
# Every column except the dependent variable is a candidate input variable.
dpVar = "winPlacePerc"
varList = soloTr.columns.tolist()
varList.remove(dpVar)

# Rank the solo input variables by VIF (highest first).
soloVIFs = genVIFs(soloTr, varList)
soloVIFs

  return 1 - self.ssr/self.centered_tss


Unnamed: 0,Variable,R^2,VIF
0,total_distance,1.0,inf
1,items,1.0,inf
2,weaponsAcquired,1.0,inf
3,walkDistance,1.0,inf
4,heals_and_boosts,1.0,inf
5,boosts,1.0,inf
6,swimDistance,1.0,inf
7,rideDistance,1.0,inf
8,heals,1.0,inf
9,percent_kill,0.903212,10.331816


In [None]:
# The nine top-ranked rows are the variables whose VIF is infinite.
highVIFs = soloVIFs.iloc[:9]
varList = soloTr.columns.tolist()
varList.remove(dpVar)

# Keep the three engineered features; mark only their source columns for removal.
removeList = highVIFs["Variable"].tolist()
for vr in ("total_distance", "items", "heals_and_boosts"):
    removeList.remove(vr)
# NOTE(review): three further columns are dropped for the solo data,
# presumably because they are constant there (team-related stats) — confirm.
removeList += ["roadKills", "revives", "DBNOs"]
for vr in removeList:
    varList.remove(vr)

In [None]:
# Recompute the VIFs on the reduced variable list.
soloVIFs2 = genVIFs(soloTr, varList)
soloVIFs2

Unnamed: 0,Variable,R^2,VIF
0,percent_kill,0.902061,10.21045
1,percent_damage,0.881815,8.461339
2,kills_in_match,0.876368,8.088536
3,items,0.875523,8.033584
4,killPlace,0.873224,7.887939
5,damage_in_match,0.864067,7.35656
6,maxPlace,0.831493,5.93446
7,heals_and_boosts,0.830062,5.884487
8,killStreaks,0.795846,4.898261
9,players_in_match,0.783205,4.612656


In [None]:
# percent_kill is still above the threshold of 10; dropping the
# second-ranked percent_damage and recomputing shows whether that is enough
# to bring it back under.
varList.remove("percent_damage")
soloVIFs3 = genVIFs(soloTr, varList)
soloVIFs3

Unnamed: 0,Variable,R^2,VIF
0,kills_in_match,0.876368,8.088506
1,items,0.875521,8.033494
2,killPlace,0.872459,7.840615
3,damage_in_match,0.864065,7.356451
4,maxPlace,0.831483,5.93411
5,heals_and_boosts,0.829971,5.881354
6,killStreaks,0.795249,4.883984
7,players_in_match,0.783139,4.611241
8,numGroups,0.760922,4.182729
9,percent_kill,0.727033,3.663448


In [None]:
# Formula string for the solo model: dependent variable ~ remaining inputs.
argStringSolo = genArgStr(dpVar, varList)
argStringSolo

'winPlacePerc ~ assists + killPlace + killStreaks + longestKill + matchDuration + maxPlace + numGroups + rankPoints + teamKills + vehicleDestroys + players_in_match + kills_in_match + percent_kill + damage_in_match + headshot_rate + heals_and_boosts + items + total_distance'

**train_solo data influence**

In [None]:
# Fit the solo model and pull the Cook's distances (element 0 of the tuple).
model = smf.ols(argStringSolo, data=soloTr).fit()
cooksDs = model.get_influence().cooks_distance
cooksDsIdx = list(enumerate(cooksDs[0]))

# Rule-of-thumb cutoff: 4 divided by the number of observations.
ckStd = 4 / len(cooksDsIdx)

# Row positions whose Cook's distance exceeds the cutoff.
cooksOverStd = [pos for pos, dist in cooksDsIdx if dist > ckStd]

# Fraction of observations flagged as influential.
len(cooksOverStd) / len(cooksDsIdx)

0.07589983402590166

In [None]:
# cooksOverStd holds row *positions*, while DataFrame.drop() expects index
# *labels*; translate through the index so the intended rows are removed even
# if soloTr does not carry a default 0..N-1 index.
# NOTE(review): assumes the model was fit on every row of soloTr (no rows
# dropped for missing values), so positions line up with the frame — confirm.
soloTrNew = soloTr.drop(soloTr.index[cooksOverStd])
# Refit the same formula on the culled data for comparison.
modelNew = smf.ols(argStringSolo, data = soloTrNew).fit()
## Compare R-square values
print(model.rsquared, modelNew.rsquared)

0.8848761470442806 0.9319753829203304


**train_event data collinearity**

In [None]:
# Every column except the dependent variable is a candidate input variable.
dpVar = "winPlacePerc"
varList = eventTr.columns.tolist()
varList.remove(dpVar)

# Rank the event input variables by VIF (highest first).
eventVIFs = genVIFs(eventTr, varList)
eventVIFs



Unnamed: 0,Variable,R^2,VIF
0,total_distance,1.0,inf
1,items,1.0,inf
2,weaponsAcquired,1.0,inf
3,walkDistance,1.0,inf
4,boosts,1.0,inf
5,swimDistance,1.0,inf
6,heals_and_boosts,1.0,inf
7,rideDistance,1.0,inf
8,heals,1.0,inf
9,maxPlace,0.989681,96.905797


In [None]:
# Take the top 11 rows here: besides the nine infinite-VIF variables, the
# 10th-ranked maxPlace is also far above the threshold.
# NOTE(review): the 11th-ranked variable is not visible in the saved output;
# presumably numGroups, which is absent from the final event formula — confirm.
highVIFs = eventVIFs.iloc[:11]
varList = eventTr.columns.tolist()
varList.remove(dpVar)

# Keep the three engineered features; remove every other high-VIF variable.
removeList = highVIFs["Variable"].tolist()
for vr in ("total_distance", "items", "heals_and_boosts"):
    removeList.remove(vr)
for vr in removeList:
    varList.remove(vr)

In [None]:
# Recompute the VIFs on the reduced variable list.
eventVIFs2 = genVIFs(eventTr, varList)
eventVIFs2

Unnamed: 0,Variable,R^2,VIF
0,damage_in_match,0.908807,10.965799
1,items,0.906363,10.679553
2,percent_kill,0.892088,9.266837
3,percent_team_damage,0.888709,8.985467
4,percent_team_kill,0.886508,8.81122
5,kills_in_match,0.882937,8.54243
6,heals_and_boosts,0.876833,8.119059
7,matchDuration,0.845927,6.490442
8,percent_damage,0.84151,6.309542
9,killStreaks,0.80519,5.133197


In [None]:
# damage_in_match and items are the two variables still above the VIF
# threshold of 10, so drop both and recompute.
varList.remove("damage_in_match")
varList.remove("items")
eventVIFs3 = genVIFs(eventTr, varList)
eventVIFs3

Unnamed: 0,Variable,R^2,VIF
0,percent_kill,0.892049,9.263497
1,percent_team_damage,0.888437,8.963552
2,percent_team_kill,0.88649,8.809803
3,percent_damage,0.84146,6.307558
4,killStreaks,0.804273,5.109156
5,killPlace,0.790811,4.78037
6,kills_in_match,0.753179,4.051517
7,players_in_match,0.750785,4.012601
8,DBNOs,0.63259,2.721754
9,matchDuration,0.559569,2.270505


In [None]:
# Formula string for the event model: dependent variable ~ remaining inputs.
argStringEvent = genArgStr(dpVar, varList)
argStringEvent

'winPlacePerc ~ assists + DBNOs + killPlace + killStreaks + longestKill + matchDuration + rankPoints + revives + roadKills + teamKills + vehicleDestroys + players_in_match + players_in_team + kills_in_match + percent_kill + percent_team_kill + percent_damage + percent_team_damage + headshot_rate + heals_and_boosts + total_distance'

**train_event data influence**

In [None]:
# Fit the event model and pull the Cook's distances (element 0 of the tuple).
model = smf.ols(argStringEvent, data=eventTr).fit()
cooksDs = model.get_influence().cooks_distance
cooksDsIdx = list(enumerate(cooksDs[0]))

# Rule-of-thumb cutoff: 4 divided by the number of observations.
ckStd = 4 / len(cooksDsIdx)

# Row positions whose Cook's distance exceeds the cutoff.
cooksOverStd = [pos for pos, dist in cooksDsIdx if dist > ckStd]

# Fraction of observations flagged as influential.
len(cooksOverStd) / len(cooksDsIdx)

0.07256350571804474

In [None]:
# cooksOverStd holds row *positions*, while DataFrame.drop() expects index
# *labels*; translate through the index so the intended rows are removed even
# if eventTr does not carry a default 0..N-1 index.
# NOTE(review): assumes the model was fit on every row of eventTr (no rows
# dropped for missing values), so positions line up with the frame — confirm.
eventTrNew = eventTr.drop(eventTr.index[cooksOverStd])
# Refit the same formula on the culled data for comparison.
modelNew = smf.ols(argStringEvent, data = eventTrNew).fit()
## Compare R-square values
print(model.rsquared, modelNew.rsquared)

0.8089481063291497 0.8690633313361338


# Argument Strings for Final Model Building

Squad data:

In [None]:
# Display the squad formula string built in the collinearity section.
argStringSquad

'winPlacePerc ~ assists + DBNOs + killPlace + killStreaks + longestKill + matchDuration + maxPlace + numGroups + rankPoints + revives + teamKills + vehicleDestroys + players_in_match + players_in_team + kills_in_match + percent_kill + percent_team_kill + damage_in_match + percent_damage + percent_team_damage + headshot_rate + heals_and_boosts + items + total_distance'

Duo data:

In [None]:
# Display the duo formula string built in the collinearity section.
argStringDuo

'winPlacePerc ~ assists + DBNOs + killPlace + killStreaks + longestKill + matchDuration + maxPlace + numGroups + rankPoints + revives + teamKills + vehicleDestroys + players_in_match + players_in_team + kills_in_match + percent_kill + damage_in_match + percent_damage + percent_team_damage + headshot_rate + heals_and_boosts + items + total_distance'

Solo data:

In [None]:
# Display the solo formula string built in the collinearity section.
argStringSolo

'winPlacePerc ~ assists + killPlace + killStreaks + longestKill + matchDuration + maxPlace + numGroups + rankPoints + teamKills + vehicleDestroys + players_in_match + kills_in_match + percent_kill + damage_in_match + headshot_rate + heals_and_boosts + items + total_distance'

Event data:

In [None]:
# Display the event formula string built in the collinearity section.
argStringEvent

'winPlacePerc ~ assists + DBNOs + killPlace + killStreaks + longestKill + matchDuration + rankPoints + revives + roadKills + teamKills + vehicleDestroys + players_in_match + players_in_team + kills_in_match + percent_kill + percent_team_kill + percent_damage + percent_team_damage + headshot_rate + heals_and_boosts + total_distance'