In [2]:
import re, requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from gamelogs_to_mongo import format_injury_df
from pymongo import MongoClient
from data_cleaning import *
import matplotlib.pyplot as plt

import string
from collections import Counter
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.util import ngrams

In [4]:
# Load the pickled source frame; the path is relative to the notebook's working directory.
# NOTE(review): unpickling can execute arbitrary code, so only load trusted files here.
df0 = pd.read_pickle('../data/df.pkl')
# Distinct player ids, taken from the frame before it is passed to format_injury_df.
players = df0['bbref_id'].unique()
# format_injury_df is imported from gamelogs_to_mongo; presumably it derives the
# per-injury columns displayed further down - TODO confirm against that module.
df_raw = format_injury_df(df0)

In [5]:
# Work on an independent copy so df_raw stays available for re-running the cells below.
df = df_raw.copy(deep=True)
df

Unnamed: 0,Date,player,Team,Status,Notes,bbref_id,from,to,pos,height,...,age,Season,Return_Date,Inj_Duration,New_Inj,Out_of_NBA,Career,League_Years,Num_Inj_Career,Num_Inj_Season
356,1980-10-09,Tree Rollins,Hawks,Injured,placed on IL,rollitr01,1978,1995,C,85,...,25.32,1981,1980-10-24,15 days,1,0,0,3,1,1
368,1980-11-07,Tree Rollins,Hawks,Injured,placed on IL,rollitr01,1978,1995,C,85,...,25.40,1981,1980-11-18,11 days,1,0,0,3,2,2
370,1980-11-17,Rick Mahorn,Bullets,Injured,sprained ankle (DTD),mahorri01,1981,1999,C-F,82,...,22.16,1981,1980-11-21,4 days,1,0,0,0,1,1
377,1980-12-05,John Long,Pistons,Injured,placed on IL,longjo01,1979,1997,G-F,77,...,24.27,1981,1980-12-17,12 days,1,0,0,2,1,1
387,1980-12-21,John Long,Pistons,Injured,placed on IL with knee injury,longjo01,1979,1997,G-F,77,...,24.32,1981,1981-01-02,12 days,1,0,0,2,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62219,2021-06-07,Jordan Nwora,Bucks,Injured,placed on IL with left thigh injury,nworajo01,2021,2021,F,80,...,22.74,2021,2021-06-25,18 days,1,0,0,0,2,2
62221,2021-06-08,Mike Conley,Jazz,Injured,placed on IL with strained right hamstring,conlemi01,2008,2021,G,73,...,33.66,2021,2021-06-18,10 days,1,0,0,13,46,9
62223,2021-06-09,Sam Merrill,Bucks,Injured,placed on IL with sprained right ankle,merrisa01,2021,2021,G,76,...,25.07,2021,2021-06-13,4 days,1,0,0,0,2,2
62237,2021-06-16,Chris Paul,Suns,Injured,placed on IL with COVID-19 protocols,paulch01,2006,2021,G,72,...,36.11,2021,2021-06-24,8 days,1,0,0,15,43,2


In [30]:
# Rows flagged as a new injury and dated 1994-07-01 or later, as an independent copy.
df_new = df.loc[(df['New_Inj'] == 1) & (df['Date'] >= '1994-07-01')].copy()

In [31]:
# Free-text notes: for every row, as a plain Python list, and for the new-injury subset.
notes = df['Notes']
notes_list = notes.tolist()
notes_new = df_new['Notes']


In [32]:
# Lower-case every note, then split it into tokens with NLTK's word_tokenize.
tokens = notes.map(lambda note: word_tokenize(note.lower()))
tokens_new = notes_new.map(lambda note: word_tokenize(note.lower()))
tokens

356                                       [placed, on, il]
368                                       [placed, on, il]
370                           [sprained, ankle, (, dtd, )]
377                                       [placed, on, il]
387                   [placed, on, il, with, knee, injury]
                               ...                        
62219          [placed, on, il, with, left, thigh, injury]
62221    [placed, on, il, with, strained, right, hamstr...
62223       [placed, on, il, with, sprained, right, ankle]
62237          [placed, on, il, with, covid-19, protocols]
62240                  [sprained, right, ankle, (, dtd, )]
Name: Notes, Length: 43099, dtype: object

In [33]:
# Stopword vocabulary: one comma-separated string literal (continued across lines
# with backslashes) that is split into a list of lower-case words.
# filter_tokens tests token membership against this list.
stopwords_ = "a,able,about,across,after,all,almost,also,am,among,an,and,any,\
are,as,at,be,because,been,but,by,can,could,dear,did,do,does,either,\
else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,\
how,however,i,if,in,into,is,it,its,just,least,let,like,likely,may,\
me,might,most,must,my,neither,no,of,off,often,on,only,or,other,our,\
own,rather,said,say,says,she,should,since,so,some,than,that,the,their,\
them,then,there,these,they,this,tis,to,too,twas,us,wants,was,we,were,\
what,when,where,which,while,who,whom,why,will,with,would,yet,you,your".split(',')

In [34]:
punctuation_ = set(string.punctuation)


def filter_tokens(sent):
    """Return the tokens of `sent` that are neither stopwords nor punctuation marks.

    `sent` is an iterable of tokens; order is preserved and a new list is returned.
    """
    kept = []
    for word in sent:
        # Skip anything listed in stopwords_ or that is a single punctuation character.
        if word in stopwords_ or word in punctuation_:
            continue
        kept.append(word)
    return kept


tokens_filtered = tokens.apply(filter_tokens)
tokens_filtered_new = tokens_new.apply(filter_tokens)
tokens_filtered

356                                  [placed, il]
368                                  [placed, il]
370                        [sprained, ankle, dtd]
377                                  [placed, il]
387                    [placed, il, knee, injury]
                           ...                   
62219           [placed, il, left, thigh, injury]
62221    [placed, il, strained, right, hamstring]
62223        [placed, il, sprained, right, ankle]
62237           [placed, il, covid-19, protocols]
62240               [sprained, right, ankle, dtd]
Name: Notes, Length: 43099, dtype: object

In [35]:
# Three normalisers to compare: two stemmers and one lemmatizer.
stemmer_porter = PorterStemmer()
stemmer_snowball = SnowballStemmer('english')
lemmatizer = WordNetLemmatizer()

# Normalised tokens for every note.
tokens_stemporter = tokens_filtered.apply(lambda words: list(map(stemmer_porter.stem, words)))
tokens_stemsnowball = tokens_filtered.apply(lambda words: list(map(stemmer_snowball.stem, words)))
tokens_lemmatizer = tokens_filtered.apply(lambda words: list(map(lemmatizer.lemmatize, words)))

# The same three normalisations for the new-injury subset.
tokens_stemporter_new = tokens_filtered_new.apply(lambda words: list(map(stemmer_porter.stem, words)))
tokens_stemsnowball_new = tokens_filtered_new.apply(lambda words: list(map(stemmer_snowball.stem, words)))
tokens_lemmatizer_new = tokens_filtered_new.apply(lambda words: list(map(lemmatizer.lemmatize, words)))

In [36]:
# One column per normaliser so the outputs can be compared row by row.
stem_lemm_df = pd.DataFrame({
    'porter': tokens_stemporter,
    'snowball': tokens_stemsnowball,
    'lemmatizer': tokens_lemmatizer,
})
stem_lemm_df_new = pd.DataFrame({
    'porter_new': tokens_stemporter_new,
    'snowball_new': tokens_stemsnowball_new,
    'lemmatizer_new': tokens_lemmatizer_new,
})

# Only the last expression of a cell is rendered, so the first frame is not shown.
stem_lemm_df
stem_lemm_df_new

Unnamed: 0,porter_new,snowball_new,lemmatizer_new
1597,"[arthroscop, surgeri, knee, out, 6-8, week]","[arthroscop, surgeri, knee, out, 6-8, week]","[arthroscopic, surgery, knee, out, 6-8, week]"
1598,"[surgeri, knee, repair, torn, acl, out, season]","[surgeri, knee, repair, torn, acl, out, season]","[surgery, knee, repair, torn, acl, out, season]"
1599,"[surgeri, left, ankl, remov, calcium, deposit,...","[surgeri, left, ankl, remov, calcium, deposit,...","[surgery, left, ankle, remove, calcium, deposi..."
1600,"[surgeri, left, knee, remov, loos, cartilag, o...","[surgeri, left, knee, remov, loos, cartilag, o...","[surgery, left, knee, remove, loose, cartilage..."
1601,"[stress, fractur, right, kneecap, out, 8, week]","[stress, fractur, right, kneecap, out, 8, week]","[stress, fracture, right, kneecap, out, 8, week]"
...,...,...,...
62219,"[place, il, left, thigh, injuri]","[place, il, left, thigh, injuri]","[placed, il, left, thigh, injury]"
62221,"[place, il, strain, right, hamstr]","[place, il, strain, right, hamstr]","[placed, il, strained, right, hamstring]"
62223,"[place, il, sprain, right, ankl]","[place, il, sprain, right, ankl]","[placed, il, sprained, right, ankle]"
62237,"[place, il, covid-19, protocol]","[place, il, covid-19, protocol]","[placed, il, covid-19, protocol]"


In [37]:
# Rows where the Porter and Snowball stems differ; only the second selection is rendered.
stem_lemm_df.loc[tokens_stemporter != tokens_stemsnowball]
stem_lemm_df_new.loc[tokens_stemporter_new != tokens_stemsnowball_new]

Unnamed: 0,porter_new,snowball_new,lemmatizer_new
1604,"[injuri, sinu, caviti, out, 10, day]","[injuri, sinus, caviti, out, 10, day]","[injury, sinus, cavity, out, 10, day]"
1611,"[torn, achil, tendon]","[torn, achill, tendon]","[torn, achilles, tendon]"
1619,"[ruptur, left, achil, tendon]","[ruptur, left, achill, tendon]","[ruptured, left, achilles, tendon]"
1632,"[place, il, left, achil, tendin]","[place, il, left, achill, tendin]","[placed, il, left, achilles, tendinitis]"
1712,"[stomach, viru, dnp]","[stomach, virus, dnp]","[stomach, virus, dnp]"
...,...,...,...
62057,"[place, il, right, achil, injuri]","[place, il, right, achill, injuri]","[placed, il, right, achilles, injury]"
62061,"[right, achil, injuri, dtd]","[right, achill, injuri, dtd]","[right, achilles, injury, dtd]"
62084,"[place, il, left, achil, injuri]","[place, il, left, achill, injuri]","[placed, il, left, achilles, injury]"
62129,"[place, il, left, achil, injuri]","[place, il, left, achill, injuri]","[placed, il, left, achilles, injury]"


In [41]:
# Term frequencies over all notes, printed once per normaliser.
for token in (tokens_stemporter, tokens_stemsnowball, tokens_lemmatizer):
    term_occurance = Counter()
    term_occurance.update(term for terms in token for term in terms)
    print(term_occurance.most_common(75),'\n')

[('dnp', 18429), ('place', 16877), ('il', 16115), ('left', 12825), ('right', 12810), ('knee', 7278), ('sprain', 6715), ('ankl', 5885), ('sore', 4904), ('strain', 4834), ('dtd', 4800), ('injuri', 3702), ('back', 3168), ('out', 3074), ('surgeri', 2661), ('bruis', 2329), ('foot', 2224), ('lower', 1520), ('hamstr', 1509), ('season', 1494), ('fractur', 1437), ('torn', 1342), ('tendin', 1299), ('shoulder', 1295), ('groin', 1212), ('recov', 1124), ('spasm', 1034), ('repair', 1015), ('calf', 983), ('hip', 970), ('indefinit', 909), ('flu', 907), ('ill', 816), ('achil', 795), ('rest', 776), ('wrist', 768), ('tendon', 763), ('ir', 762), ('bone', 722), ('toe', 567), ('thumb', 564), ('hand', 545), ('week', 544), ('broken', 524), ('quadricep', 515), ('elbow', 481), ('finger', 480), ('stress', 458), ('concuss', 455), ('leg', 427), ('arthroscop', 398), ('plantar', 364), ('muscl', 363), ('abdomin', 341), ('infect', 319), ('stomach', 313), ('thigh', 306), ('big', 305), ('acl', 299), ('ligament', 293), (

In [42]:
# Term frequencies over the new-injury notes, printed once per normaliser.
for token in (tokens_stemporter_new, tokens_stemsnowball_new, tokens_lemmatizer_new):
    term_occurance = Counter()
    term_occurance.update(term for terms in token for term in terms)
    print(term_occurance.most_common(75),'\n')


[('place', 11774), ('il', 11313), ('dnp', 7097), ('left', 6469), ('right', 6338), ('dtd', 4119), ('knee', 3667), ('sprain', 3497), ('ankl', 3160), ('sore', 2860), ('injuri', 2499), ('strain', 2291), ('back', 1890), ('out', 1839), ('bruis', 1333), ('foot', 973), ('lower', 840), ('tendin', 824), ('season', 815), ('hamstr', 755), ('rest', 726), ('shoulder', 712), ('flu', 696), ('surgeri', 688), ('spasm', 679), ('ill', 670), ('indefinit', 602), ('groin', 587), ('hip', 573), ('calf', 527), ('fractur', 491), ('ir', 460), ('torn', 459), ('achil', 421), ('wrist', 389), ('week', 347), ('quadricep', 316), ('toe', 305), ('tendon', 299), ('thumb', 287), ('finger', 268), ('bone', 262), ('concuss', 254), ('hand', 251), ('stomach', 244), ('elbow', 237), ('repair', 234), ('broken', 234), ('leg', 215), ('thigh', 185), ('infect', 181), ('nba', 175), ('protocol', 171), ('health', 168), ('safeti', 168), ('neck', 166), ('muscl', 158), ('patella', 155), ('viru', 155), ('plantar', 154), ('arthroscop', 153), 

In [82]:
def join_sent_ngrams(input_tokens, n):
    """Return the tokens of one sentence followed by its '-'-joined 2..n-grams.

    Parameters
    ----------
    input_tokens : iterable of str
        Tokens of a single sentence/note, in order. Any iterable is accepted,
        including one-shot iterators such as generators.
    n : int
        Largest n-gram size to append. For ``n < 2`` only the unigrams are returned.

    Returns
    -------
    list of str
        A new list: the original tokens first, then all 2-grams, then all
        3-grams, ... up to n-grams, each n-gram joined with '-'.
    """
    # Materialise once: copying the unigrams must not exhaust a one-shot
    # iterator before the n-grams are built from the same tokens.
    unigrams = list(input_tokens)

    # Start from the 1-gram tokens (a separate list, so the input is never mutated).
    ret_list = list(unigrams)

    # Then append the joined n-grams for each size from 2 up to n.
    for size in range(2, n + 1):
        ret_list.extend('-'.join(gram) for gram in ngrams(unigrams, size))

    return ret_list
# Unigrams plus joined 2- and 3-grams for each new-injury note (Porter-stemmed tokens).
tokens_ngrams = [join_sent_ngrams(stems, 3) for stems in tokens_stemporter_new]
# tokens_ngrams

In [85]:
# Frequencies of the unigrams and joined n-grams across the new-injury notes.
term_occurance = Counter()
term_occurance.update(term for terms in tokens_ngrams for term in terms)
term_occurance.most_common(50)

[('place', 11774),
 ('il', 11313),
 ('place-il', 11313),
 ('dnp', 7097),
 ('left', 6469),
 ('right', 6338),
 ('dtd', 4119),
 ('knee', 3667),
 ('sprain', 3497),
 ('ankl', 3160),
 ('sore', 2860),
 ('injuri', 2499),
 ('strain', 2291),
 ('back', 1890),
 ('out', 1839),
 ('left-knee', 1671),
 ('sprain-left', 1624),
 ('right-knee', 1597),
 ('sprain-right', 1551),
 ('left-ankl', 1550),
 ('right-ankl', 1362),
 ('bruis', 1333),
 ('sprain-left-ankl', 1183),
 ('il-sprain', 1147),
 ('place-il-sprain', 1147),
 ('sore-left', 1116),
 ('ankl-dnp', 1068),
 ('sprain-right-ankl', 1051),
 ('sore-right', 1039),
 ('foot', 973),
 ('il-sore', 918),
 ('place-il-sore', 918),
 ('lower', 840),
 ('tendin', 824),
 ('season', 815),
 ('out-season', 811),
 ('strain-left', 800),
 ('il-strain', 799),
 ('place-il-strain', 799),
 ('lower-back', 778),
 ('injuri-dtd', 772),
 ('strain-right', 766),
 ('knee-dnp', 761),
 ('hamstr', 755),
 ('rest', 726),
 ('shoulder', 712),
 ('flu', 696),
 ('surgeri', 688),
 ('spasm', 679),
 ('i

In [78]:
# Shorthand for pattern searches over the new-injury notes. str.contains treats the
# pattern as a regular expression by default, which is why '^ill' can anchor at the start.
nsc = notes_new.str.contains
## Injury type: Dislocation, torn, surgery, ruptured, stretched, hyperextended, separated, rest
# Scratch cell for keyword exploration: only the final expression is rendered; the two
# selections directly below are evaluated but their results are discarded.
notes_new[(nsc('health', case=False)|nsc('protocol', case=False)|nsc('COVID', case=False))] ### COVID
# NOTE(review): this keeps illness notes that ALSO contain 'chill'; if the intent was to
# exclude such notes, the last term needs a '~' - confirm before relying on it.
notes_new[(nsc('^ill', case=False)|nsc('illn', case=False))&nsc('chill', case=False)].values ## Illness
# notes_new[nsc('infec', case=False)].value_counts() ## Infection
# notes_new[nsc('broke', case=False)|nsc('frac', case=False)] ## Broken
# notes_new[nsc('sprain', case=False)] ## Sprain
# notes_new[nsc('strain', case=False)&~nsc('sprain', case=False)] ## Strain (excluding sprain)
# notes_new[nsc('brus', case=False)|nsc('bruis', case=False)|nsc('contu', case=False)] ## bruise
# notes_new[nsc('sore', case=False)] ## Sore
# notes_new[nsc('sick', case=False)] ## Sick
# Notes mentioning 'feet' (case-insensitive); this is the selection shown as the cell output.
notes_new[nsc('feet', case=False)]

3719                         surgery on feet (out 6 weeks)
6918                                       sore feet (DNP)
12603                 plantar fasciatis in both feet (DNP)
12848                                      sore feet (DNP)
12942                 plantar fasciatis in both feet (DNP)
16912                       placed on IL with bruised feet
19704    underwent shockwave treatment to alleviate pai...
Name: Notes, dtype: object

In [72]:
# Players with at least one new-injury row whose note is exactly 'placed on IL'.
temp_players = set(
    df.loc[(notes == 'placed on IL') & (df.New_Inj == 1), 'bbref_id'].unique().tolist()
)

# Those bare 'placed on IL' new-injury rows, plus every non-new row of the same players;
# show the last 40 of them.
df.loc[
    ((notes == 'placed on IL') & (df.New_Inj == 1))
    | ((df.New_Inj != 1) & df.bbref_id.isin(temp_players))
].tail(40)

Unnamed: 0,Date,player,Team,Status,Notes,bbref_id,from,to,pos,height,...,age,Season,Return_Date,Inj_Duration,New_Inj,Out_of_NBA,Career,League_Years,Num_Inj_Career,Num_Inj_Season
60790,2021-03-11,James Ennis,Magic,Injured,placed on IL with left calf injury,ennisja01,2015,2021,F,78,...,30.69,2021,2021-03-21,10 days,0,0,0,6,17,3
60831,2021-03-13,Matthew Dellavedova,Cavaliers,Injured,placed on IL with appendectomy,dellama01,2014,2021,G,75,...,30.51,2021,2021-04-01,19 days,0,0,0,7,17,1
60856,2021-03-15,Jeremy Lamb,Pacers,Injured,placed on IL with sore left knee,lambje01,2013,2021,G-F,77,...,28.79,2021,2021-03-19,4 days,0,0,0,8,22,2
60876,2021-03-16,Keljin Blevins,Blazers,Injured,placed on IL,blevike01,2021,2021,F,76,...,25.31,2021,2021-04-02,17 days,1,0,0,0,3,3
60886,2021-03-17,Jalen Harris,Raptors,Injured,placed on IL,harrija01,2021,2021,G,77,...,22.59,2021,2021-04-29,43 days,1,0,0,0,1,1
60890,2021-03-17,Sean McDermott,Grizzlies,Injured,placed on IL,mcderse01,2021,2021,F,78,...,24.37,2021,2021-03-27,10 days,1,0,0,0,3,3
60914,2021-03-18,Devon Dotson,Bulls,Injured,placed on IL with NBA health and safety protocols,dotsode01,2021,2021,G,74,...,21.63,2021,2021-03-27,9 days,0,0,0,0,2,2
60934,2021-03-19,Udonis Haslem,Heat,Injured,NBA health and safety protocols (DTD),hasleud01,2004,2021,F-C,80,...,40.78,2021,2021-05-13,55 days,0,0,0,17,30,1
60987,2021-03-23,Stephen Curry,Warriors,Injured,placed on IL with bruised tailbone,curryst01,2010,2021,G,75,...,33.02,2021,2021-03-29,6 days,0,0,0,11,36,2
61007,2021-03-24,Jordan McLaughlin,Timberwolves,Injured,conditioning (DTD),mclaujo01,2020,2021,G,71,...,24.96,2021,2021-03-26,2 days,0,0,0,1,2,2


NameError: name 'df1' is not defined