In [1]:
import pandas as pd
from joblib import load, dump

import string
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [2]:
def ifelse(boolean, ifValue, elseValue):
    """Return ``ifValue`` when ``boolean`` is truthy, otherwise ``elseValue``.

    Both candidate values are evaluated by the caller before this function
    runs, so unlike a native conditional expression there is no
    short-circuiting.
    """
    return ifValue if boolean else elseValue

# Contraction -> expansion table used by clean_text. Built once at definition
# time instead of on every call.
_CONTRACTION_MAP = {
    "ain't": "is not", "aren't": "are not", "can't": "cannot",
    "'cause": "because", "could've": "could have", "couldn't": "could not",
    "didn't": "did not", "doesn't": "does not", "don't": "do not",
    "hadn't": "had not", "hasn't": "has not", "haven't": "have not",
    "he'd": "he would", "he'll": "he will", "he's": "he is",
    "how'd": "how did", "how'd'y": "how do you", "how'll": "how will",
    "how's": "how is",
    "I'd": "I would", "I'd've": "I would have", "I'll": "I will",
    "I'll've": "I will have", "I'm": "I am", "I've": "I have",
    "i'd": "i would", "i'd've": "i would have", "i'll": "i will",
    "i'll've": "i will have", "i'm": "i am", "i've": "i have",
    "isn't": "is not",
    "it'd": "it would", "it'd've": "it would have", "it'll": "it will",
    "it'll've": "it will have", "it's": "it is",
    "let's": "let us", "ma'am": "madam", "mayn't": "may not",
    "might've": "might have", "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have", "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not", "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not", "oughtn't've": "ought not have",
    "shan't": "shall not", "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she would", "she'd've": "she would have",
    "she'll": "she will", "she'll've": "she will have", "she's": "she is",
    "should've": "should have", "shouldn't": "should not",
    "shouldn't've": "should not have",
    "so've": "so have", "so's": "so as", "this's": "this is",
    "that'd": "that would", "that'd've": "that would have",
    "that's": "that is",
    "there'd": "there would", "there'd've": "there would have",
    "there's": "there is", "here's": "here is",
    "they'd": "they would", "they'd've": "they would have",
    "they'll": "they will", "they'll've": "they will have",
    "they're": "they are", "they've": "they have",
    "to've": "to have", "wasn't": "was not",
    "we'd": "we would", "we'd've": "we would have", "we'll": "we will",
    "we'll've": "we will have", "we're": "we are", "we've": "we have",
    "weren't": "were not",
    "what'll": "what will", "what'll've": "what will have",
    "what're": "what are", "what's": "what is", "what've": "what have",
    "when's": "when is", "when've": "when have",
    "where'd": "where did", "where's": "where is", "where've": "where have",
    "who'll": "who will", "who'll've": "who will have", "who's": "who is",
    "who've": "who have",
    "why's": "why is", "why've": "why have",
    "will've": "will have", "won't": "will not", "won't've": "will not have",
    "would've": "would have", "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all", "y'all'd": "you all would",
    "y'all'd've": "you all would have", "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you would", "you'd've": "you would have",
    "you'll": "you will", "you'll've": "you will have",
    "you're": "you are", "you've": "you have",
}

# Regex alternation accepts the first branch that matches, so the keys are
# ordered longest-first; otherwise "I'd" would win over "I'd've" and leave a
# dangling "'ve" behind. Keys are escaped so they are always matched literally.
_CONTRACTION_RE = re.compile(
    '(%s)' % '|'.join(
        re.escape(key)
        for key in sorted(_CONTRACTION_MAP, key=len, reverse=True)
    )
)

# Deletes ASCII punctuation and ASCII digits in a single pass.
_STRIP_TABLE = str.maketrans('', '', string.punctuation + string.digits)

# Filled on first use so defining clean_text does not touch the NLTK corpus.
_STOP_WORDS_CACHE = None


def _english_stop_words():
    """Return NLTK's English stop-word set, reading the corpus only once."""
    global _STOP_WORDS_CACHE
    if _STOP_WORDS_CACHE is None:
        _STOP_WORDS_CACHE = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE


def clean_text(txt, normalize_apostrophes=False):
    """Normalize a piece of text into a space-separated string of word tokens.

    Steps, in order:
      1. (optional) map typographic apostrophes to the ASCII apostrophe
      2. expand contractions using the module-level contraction table
      3. delete ASCII punctuation and ASCII digits
      4. tokenize with ``nltk.word_tokenize``
      5. drop tokens found in NLTK's English stop-word list (the comparison
         is case-sensitive; tokens are not lower-cased)
      6. drop any token that is not purely alphabetic

    Parameters
    ----------
    txt : str
        The raw text to clean.
    normalize_apostrophes : bool, default False
        When True, U+2019 and U+2018 are replaced with ``'`` before
        contraction expansion so that text such as ``won’t`` is expanded too.
        Off by default to keep the output unchanged for existing callers.
        NOTE(review): enabling this changes the tokens produced; confirm that
        any already-fitted vectorizer was trained on text cleaned the same way.

    Returns
    -------
    str
        The surviving tokens joined by single spaces (may be empty).
    """
    if normalize_apostrophes:
        txt = txt.replace('\u2019', "'").replace('\u2018', "'")

    # expand contractions
    txt = _CONTRACTION_RE.sub(lambda m: _CONTRACTION_MAP[m.group(0)], txt)

    # remove punctuation and digits
    txt = txt.translate(_STRIP_TABLE)

    # split into words
    words = word_tokenize(txt)

    # remove stopwords, then anything that is not purely alphabetic
    stop_words = _english_stop_words()
    words = [w for w in words if w not in stop_words and w.isalpha()]

    return ' '.join(words)

In [3]:
# Upper bound on the number of tweets to score, and the seed that makes the
# random draw repeatable across Restart & Run All.
SAMPLE_SIZE = 10000
RANDOM_SEED = 42

raw = pd.read_csv('filtered.csv')

# Keep only rows whose 'injury_report' column equals 'x', restricted to the
# two columns used below, then remove duplicate rows and rows with missing
# values. Chained (no inplace=True) so nothing mutates a slice of `raw`.
flagged = (
    raw.loc[raw['injury_report'] == 'x', ['injury_report', 'tweet']]
    .drop_duplicates()
    .dropna()
)

# DataFrame.sample raises ValueError when n exceeds the number of rows (with
# replace=False), so cap n at what is actually available.
data = (
    flagged
    .sample(n=min(SAMPLE_SIZE, len(flagged)), random_state=RANDOM_SEED)
    .assign(clean=lambda frame: frame['tweet'].apply(clean_text))
)

In [4]:
# NOTE(review): joblib.load unpickles its input, which can execute arbitrary
# code. Only load artifacts that come from a trusted source.

# Fitted text vectorizers.
v_tfidf, v_bool, v_count = (
    load(path) for path in ('tfidf.joblib', 'bool.joblib', 'count.joblib')
)

# Fitted classifiers.
lgr, rfbool, rftfidf, bNB, kNN, svm = (
    load(path)
    for path in (
        'logistic_regression.joblib',
        'random_forest_bool.joblib',
        'random_forest_tfidf.joblib',
        'bernoulliNB.joblib',
        'kNN.joblib',
        'svm.joblib',
    )
)

In [5]:
# Featurize the cleaned tweets once per vectorizer, then store each model's
# predictions as a new column on `data`.
clean_tweets = data['clean']

bool_features = v_bool.transform(clean_tweets)
data['boolNB_predictions'] = bNB.predict(bool_features)

tfidf_features = v_tfidf.transform(clean_tweets)
data['lgr_predictions'] = lgr.predict(tfidf_features)

In [6]:
# Rows for which the logistic-regression model predicted 1.
is_lgr_positive = data['lgr_predictions'] == 1
positives = data[is_lgr_positive]

In [7]:
# Report how many distinct tweets were predicted positive, then print each
# tweet followed by a blank line.
positive_tweets = positives['tweet']
unique_tweets = positive_tweets.drop_duplicates()
print(len(unique_tweets))

for tweet in positive_tweets:
    print(tweet + '\n')

331
Eddie Rosario (abdominal) to begin rehab stint Tue  https://t.co/qRUv30fZV2

Breaking: Coco Gauff announced on social media that she has tested positive for COVID-19 and won’t be able to compete in the Tokyo Olympics.  https://t.co/QXYAq1VNMA

@ballsandgutters I don't think they will push his innings this year after returning from TJ surgery, but we will see. He actually pitched more in 2019 than what you see on paper. He was in extended for a time making starts

Aaron Boone says it's "likely" Darren O'Day's season is over:  https://t.co/nx8Ip5Rt0X

@MichaelV11391 @BleacherNation The only thing I've seen is a note from one of Sharma's articles that he was out with a shoulder injury. No mention of how serious.

Multiple members of UConn coaching staff test positive for COVID-19 or show symptoms; Dan Hurley not at NBA Draft.  https://t.co/EGZHl2Dqbo

Pablo Lopez’s first rehab outing with @JaxShrimp through his scheduled 3 IP:  3 IP, 3 K, 38/24 P/S.  That’s it. That’s the line.  #Marl