In [3]:
import pandas as pd
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import numpy as np

### I've thought of mainly two ways to build models for this: Logistic regression and Classification
### 1. Logistic:
#### Split oac ability into: can do oac and can't do oac > simple logistic
#### Split oac ability into 4: multiple oac, single oac, almost oac, no oac > ordinal logistic regression
#### Ordinal logistic regression seems to be supported in R from what I've found
#### I could assign a 'score' to each of the 4 levels
#### multiple oac - score of 4
#### single oac - score of 3
#### almost oac - score of 2
#### no oac - score of 1
#### and then I could do ordinal regression instead of ordinal logistic regression
#### OR I could just use R and do ordinal logistic regression
#### Time to go watch statquest again

### 2. Classification
#### SVM and Decision tree. Haven't given too much thought about these yet 
#### but these algorithms came to mind first


In [58]:
# Read the Excel workbook into `raw`, the frame that the cells below transform.
raw = pd.read_excel("no_dummies_v2.xlsx")

In [43]:
# Show each column's dtype and non-null count to see where values are missing.
raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1127 entries, 0 to 1126
Data columns (total 14 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   height                    1127 non-null   float64
 1   weight                    1127 non-null   float64
 2   upper_arm_length          459 non-null    float64
 3   pull_style                1127 non-null   object 
 4   weight_x_repmax           1127 non-null   object 
 5   orm                       987 non-null    float64
 6   orm_as_%                  987 non-null    float64
 7   orm_bins                  987 non-null    object 
 8   max_pulls                 998 non-null    float64
 9   oac_ability               1127 non-null   object 
 10  best_contribution_to_oac  946 non-null    object 
 11  specific_training_time    1117 non-null   object 
 12  sessions_per_week         860 non-null    float64
 13  years_of_training         870 non-null    float64
dtypes: float

In [59]:
# List the distinct oac_ability answers before recoding them.
raw["oac_ability"].unique()

array(['one-negative', 'neither', 'multiple-negatives/bad-form',
       'never-tried', 'oap', 'both', 'oac', 'multiple-oap',
       'multiple-both', 'multiple-oac'], dtype=object)

In [45]:
def yesno(x):
    """Binary target: 1 if the row's ``oac_ability`` is a "can do it" answer, else 0.

    ``x`` is anything indexable by ``"oac_ability"`` (for example a DataFrame
    row passed in by ``DataFrame.apply(..., axis=1)``). The answers counted as
    1 are the oap / oac / both values and their ``multiple-*`` variants; every
    other answer maps to 0.
    """
    achieved = ('oap', 'oac', 'both', 'multiple-both', 'multiple-oap', 'multiple-oac')
    return 1 if x["oac_ability"] in achieved else 0

In [60]:
# Row-wise apply: yesno receives each row and reads its oac_ability value.
raw["oac_yesno"] = raw.apply(yesno,axis=1)

### I initially classified multiple oac/oap as a separate category, but on further thought, since there are only 7 such responses, I think it's safe to combine it with single oap/oac
### Also, there is a difference between being able to do multiple-negatives/bad form and a single negative (personal experience), so I will split these instead

In [47]:
def fourlevels(x):
    """Collapse the row's ``oac_ability`` answer into one of four rank labels.

    ``x`` is anything indexable by ``"oac_ability"`` (for example a DataFrame
    row). Returns:

    - ``"advanced"``     for oap / oac / both and their ``multiple-*`` variants
    - ``"intermediate"`` for ``'multiple-negatives/bad-form'``
    - ``"beginner"``     for ``'one-negative'``
    - ``"none"``         for every other answer
    """
    level = x["oac_ability"]
    if level in ('oap', 'oac', 'both', 'multiple-both', 'multiple-oap', 'multiple-oac'):
        return "advanced"
    if level == 'multiple-negatives/bad-form':
        return "intermediate"
    return "beginner" if level == 'one-negative' else "none"

In [61]:
# Row-wise apply: fourlevels maps each row's oac_ability to one of four rank labels.
raw["oac_ability_rank"] = raw.apply(fourlevels,axis=1)

In [62]:
# Preview the first rows, including the two derived columns at the end.
raw.head()

Unnamed: 0,height,weight,upper_arm_length,pull_style,weight_x_repmax,orm,orm_as_%,orm_bins,max_pulls,oac_ability,best_contribution_to_oac,specific_training_time,sessions_per_week,years_of_training,oac_yesno,oac_ability_rank
0,180.0,84.0,38.0,wpu,51x1,51.0,60.714286,60~79%,,one-negative,"pulley,high-volume,partials",2/2,2.0,2.0,0,beginner
1,192.0,78.0,,none,0x0,,,,12.0,neither,,0/0,,,0,none
2,168.0,55.0,,wpu,75x1,75.0,136.363636,120~139%,18.0,multiple-negatives/bad-form,"climbers-approach,eccentrics,band",2/1,2.0,1.0,0,intermediate
3,190.0,80.0,14.0,wpu,25x1,25.0,31.25,20~39%,10.0,neither,weighted-pulls,2/1,2.0,1.0,0,none
4,172.0,68.0,,wcu,40x2,47.2,69.411765,60~79%,18.0,neither,,1/0,1.0,,0,none


### didn't like how I did dummies in the data cleaning stage so will do it again
### also forgot to drop some dummy columns

### To dummy:
#### pull_style
#### best_contribution_to_oac
#### oac_ability_rank

### Also going to drop some responses where weighted pull orm % is over roughly 170%

### Columns to drop
#### Arm length > too few responses
#### weight_x_repmax > not needed
#### orm_bins > not needed
#### orm > already have orm as %
#### oac_ability > not needed, already recoded into oac_yesno and oac_ability_rank
#### specific_training_time > not needed

In [63]:
# Remove the columns that will not be used as model inputs.
# NOTE(review): `raw` is rebound here, so re-running this cell raises KeyError
# because the columns are already gone.
raw = raw.drop(
    columns=[
        "upper_arm_length",        # too few responses
        "weight_x_repmax",         # not needed
        "orm",                     # orm_as_% is kept instead
        "orm_bins",                # not needed
        "oac_ability",             # recoded into oac_yesno / oac_ability_rank
        "specific_training_time",  # not needed
    ]
)

In [64]:
# Preview the frame again to confirm which columns remain after the drop.
raw.head()

Unnamed: 0,height,weight,pull_style,orm_as_%,max_pulls,best_contribution_to_oac,sessions_per_week,years_of_training,oac_yesno,oac_ability_rank
0,180.0,84.0,wpu,60.714286,,"pulley,high-volume,partials",2.0,2.0,0,beginner
1,192.0,78.0,none,,12.0,,,,0,none
2,168.0,55.0,wpu,136.363636,18.0,"climbers-approach,eccentrics,band",2.0,1.0,0,intermediate
3,190.0,80.0,wpu,31.25,10.0,weighted-pulls,2.0,1.0,0,none
4,172.0,68.0,wcu,69.411765,18.0,,1.0,,0,none


In [65]:
# One indicator column per distinct pull_style value.
pull_style_dummies = pd.get_dummies(raw["pull_style"])
pull_style_dummies.head()

Unnamed: 0,both,none,npu,rpu,wcu,wpu
0,0,0,0,0,0,1
1,0,1,0,0,0,0
2,0,0,0,0,0,1
3,0,0,0,0,0,1
4,0,0,0,0,1,0


In [66]:
# One indicator column per rank label produced by fourlevels.
oac_ability_rank_dummies = pd.get_dummies(raw["oac_ability_rank"])
oac_ability_rank_dummies.head()

Unnamed: 0,advanced,beginner,intermediate,none
0,0,1,0,0
1,0,0,0,1
2,0,0,1,0
3,0,0,0,1
4,0,0,0,1


In [67]:
from sklearn.preprocessing import MultiLabelBinarizer

In [68]:
def splitter(x):
    """Turn a comma-separated answer into a list of its parts.

    Parameters
    ----------
    x : object
        A cell value from the column being split.

    Returns
    -------
    list
        - ``str`` input: ``x.split(",")`` (so ``""`` gives ``[""]``).
        - ``list`` / ``tuple`` input: the same items as a new list, so a value
          that has already been split keeps its contents if this function is
          applied to it again.
        - anything else (``None``, a float ``NaN`` from a missing cell, ...):
          an empty list.
    """
    if isinstance(x, str):
        return x.split(",")
    if isinstance(x, (list, tuple)):
        # Already split, e.g. the cell that applies this function was re-run.
        # The old bare `except:` turned such values into [] without warning.
        return list(x)
    # Missing or non-text value: no labels.
    return []

In [69]:
# Split each comma-separated answer into a list of labels.
# NOTE(review): the column is overwritten in place, so re-running this cell
# passes the already-split lists back through splitter.
raw["best_contribution_to_oac"] = raw["best_contribution_to_oac"].apply(splitter)
raw.head()

Unnamed: 0,height,weight,pull_style,orm_as_%,max_pulls,best_contribution_to_oac,sessions_per_week,years_of_training,oac_yesno,oac_ability_rank
0,180.0,84.0,wpu,60.714286,,"[pulley, high-volume, partials]",2.0,2.0,0,beginner
1,192.0,78.0,none,,12.0,[],,,0,none
2,168.0,55.0,wpu,136.363636,18.0,"[climbers-approach, eccentrics, band]",2.0,1.0,0,intermediate
3,190.0,80.0,wpu,31.25,10.0,[weighted-pulls],2.0,1.0,0,none
4,172.0,68.0,wcu,69.411765,18.0,[],1.0,,0,none


In [70]:
# Each row of best_contribution_to_oac is a list of labels; expand it into
# one 0/1 column per distinct label, keeping the original row index.
contribution = raw["best_contribution_to_oac"]
mlb = MultiLabelBinarizer()

dummied_contributions = pd.DataFrame(
    mlb.fit_transform(contribution),  # must run first: fitting populates mlb.classes_
    index=contribution.index,
    columns=mlb.classes_,
)
dummied_contributions.head()

Unnamed: 0,band,climbers-approach,climbing,eccentrics,grip,high-volume,isometrics,mixed-grip/mantle,partials,pulley,rope-climbs,weighted-pulls
0,0,0,0,0,0,1,0,0,1,1,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0
2,1,1,0,1,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,0,0
