In [37]:

# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [38]:
data = r"C:\Users\agaro\Documents\GitHub\Chiron\chiron_content.csv"

df = pd.read_csv(data)
print(df.head())

                                                link  \
0  https://www.eaglenews.ph/zimbabwe-reports-firs...   
1  https://www.eaglenews.ph/cuba-faces-uphill-bat...   
2  https://www.eaglenews.ph/thailand-says-mpox-ca...   
3  https://www.eaglenews.ph/africa-could-start-mp...   
4  https://www.eaglenews.ph/one-case-of-clade-2-m...   

                                               title  annotation  \
0  Zimbabwe reports first two mpox cases of unspe...           1   
1  Cuba faces uphill battle as Oropouche virus sp...           1   
2  Thailand says mpox case recorded in traveller ...           1   
3   Africa could start mpox vaccinations within days           1   
4                                 health agency says           1   

                                             content  
0  HARARE (Reuters) – Zimbabwe has confirmed its ...  
1  HAVANA (Reuters) – Cuban health authorities la...  
2  -Thailand has detected an mpox case in a Europ...  
3  By Anait MiridzhanianThe Democr

In [39]:
import string
string.punctuation

# Checks the contents if there are empty or missing values

uncleanContents = df[df['content'].isna() | (df['content'].str.strip() == '')].index

# Print the missing/empty indices
print(f'Indices with missing or empty content: {uncleanContents}')
# print('empty content example: ' + (df['content'])[217])
# Fill NaN values with empty strings
df['content'].fillna('', inplace=True)

print(f"First file rows: {len(df)}")

Indices with missing or empty content: Index([1222, 1461, 1466, 1473, 1474, 1729, 2201, 2202, 2203, 2204, 3042, 3169,
       3187, 3194, 3258, 3273],
      dtype='int64')
First file rows: 3384


First step we are doing is data cleaning and preprocessing

In [40]:
#defining the function to remove punctuation
def remove_punctuation(text):
    punctuationfree="".join([i for i in text if i not in string.punctuation])
    return punctuationfree
#storing the puntuation free text
df['clean_msg']= df['content'].apply(lambda x:remove_punctuation(x))
df.head()

Unnamed: 0,link,title,annotation,content,clean_msg
0,https://www.eaglenews.ph/zimbabwe-reports-firs...,Zimbabwe reports first two mpox cases of unspe...,1,HARARE (Reuters) – Zimbabwe has confirmed its ...,HARARE Reuters – Zimbabwe has confirmed its fi...
1,https://www.eaglenews.ph/cuba-faces-uphill-bat...,Cuba faces uphill battle as Oropouche virus sp...,1,HAVANA (Reuters) – Cuban health authorities la...,HAVANA Reuters – Cuban health authorities laun...
2,https://www.eaglenews.ph/thailand-says-mpox-ca...,Thailand says mpox case recorded in traveller ...,1,-Thailand has detected an mpox case in a Europ...,Thailand has detected an mpox case in a Europe...
3,https://www.eaglenews.ph/africa-could-start-mp...,Africa could start mpox vaccinations within days,1,By Anait MiridzhanianThe Democratic Republic o...,By Anait MiridzhanianThe Democratic Republic o...
4,https://www.eaglenews.ph/one-case-of-clade-2-m...,health agency says,1,One case of the mpox virus has been detected i...,One case of the mpox virus has been detected i...


In [41]:
# standardization: making all letters into lowercase
df['msg_lower']= df['clean_msg'].apply(lambda x: x.lower())
df.head()

Unnamed: 0,link,title,annotation,content,clean_msg,msg_lower
0,https://www.eaglenews.ph/zimbabwe-reports-firs...,Zimbabwe reports first two mpox cases of unspe...,1,HARARE (Reuters) – Zimbabwe has confirmed its ...,HARARE Reuters – Zimbabwe has confirmed its fi...,harare reuters – zimbabwe has confirmed its fi...
1,https://www.eaglenews.ph/cuba-faces-uphill-bat...,Cuba faces uphill battle as Oropouche virus sp...,1,HAVANA (Reuters) – Cuban health authorities la...,HAVANA Reuters – Cuban health authorities laun...,havana reuters – cuban health authorities laun...
2,https://www.eaglenews.ph/thailand-says-mpox-ca...,Thailand says mpox case recorded in traveller ...,1,-Thailand has detected an mpox case in a Europ...,Thailand has detected an mpox case in a Europe...,thailand has detected an mpox case in a europe...
3,https://www.eaglenews.ph/africa-could-start-mp...,Africa could start mpox vaccinations within days,1,By Anait MiridzhanianThe Democratic Republic o...,By Anait MiridzhanianThe Democratic Republic o...,by anait miridzhanianthe democratic republic o...
4,https://www.eaglenews.ph/one-case-of-clade-2-m...,health agency says,1,One case of the mpox virus has been detected i...,One case of the mpox virus has been detected i...,one case of the mpox virus has been detected i...


In [42]:
import re
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\agaro\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\agaro\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [43]:
#defining function for tokenization

#applying function to the column
df['msg_tokenized'] = df['msg_lower'].apply(lambda x: word_tokenize(x))

print(df['msg_tokenized'][1])

['havana', 'reuters', '–', 'cuban', 'health', 'authorities', 'launched', 'smallscale', 'fumigation', 'efforts', 'in', 'havana', 'on', 'friday', 'to', 'fight', 'the', 'spread', 'of', 'the', 'oropouche', 'virus', 'but', 'a', 'rainy', 'caribbean', 'summer', 'fuel', 'shortages', 'and', 'growing', 'roadside', 'trash', 'heaps', 'are', 'complicating', 'those', 'efforts', 'workers', 'and', 'officials', 'saidmore', 'than', '500', 'cases', 'of', 'the', 'virus', 'have', 'been', 'registered', 'since', 'may', 'when', 'the', 'disease', 'was', 'first', 'detected', 'in', 'fareastern', 'cuba', 'health', 'officials', 'said', 'this', 'weekthe', 'virus', 'also', 'known', 'as', 'sloth', 'fever', 'is', 'transmitted', 'by', 'the', 'bite', 'of', 'mosquitoes', 'and', 'midges', 'and', 'has', 'spread', 'quickly', 'across', 'all', 'of', 'the', 'country', '’', 's', 'provinces', 'and', 'major', 'cities', 'including', 'the', 'capital', 'havana', 'patients', 'complain', 'of', 'fever', 'body', 'aches', 'and', 'nausea'

In [44]:
#importing nlp library
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

# stop_words = set(stopwords.words('english'))
#Stop words present in the library
stopwords = nltk.corpus.stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\agaro\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [45]:
import nltk
# Download stopwords if you haven't already
nltk.download('stopwords')

# Initialize stopwords (use only ONE of these lines)
stopwords = set(nltk.corpus.stopwords.words('english'))

# Add custom stopwords
# custom_stopwords = ["coronavirus", "covid19", "people", "virus", "health", "china", "chinese", "new", "us", "vaccine", "wuhan", "video", "outbreak", "hospital", "says", "novel"]
# stopwords = stopwords | set(custom_stopwords)

# Add custom stopwords
custom_stopwords = ["'", '"', '—', '“', '”', '’', '––', '–', 'also', 'said', 'human', 'people', 'health']
stopwords = stopwords | set(custom_stopwords)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\agaro\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [46]:

# Add custom stopwords
# custom_stopwords = ["coronavirus", "covid19", "people", "virus", "health", "china", "chinese", "new", "us", "vaccine", "wuhan", "video", "outbreak", "hospital", "says", "novel"]  # Add your custom stopwords here
# stopwords = stopwords.union(custom_stopwords)
# stopwords = set(stopwords).union(custom_stopwords)
# stopwords = set(stopwords) | set(custom_stopwords)
# stopwords = set(stopwords) | set(custom_stopwords)
# stopwords = stopwords | set(custom_stopwords)

# defining functions for removing stopwords
def remove_stopwords(text):
    output= [i for i in text if i not in stopwords]
    return output

df['no_stopwords']= df['msg_tokenized'].apply(lambda x:remove_stopwords(x))
print(df['no_stopwords'][1])

['havana', 'reuters', 'cuban', 'authorities', 'launched', 'smallscale', 'fumigation', 'efforts', 'havana', 'friday', 'fight', 'spread', 'oropouche', 'virus', 'rainy', 'caribbean', 'summer', 'fuel', 'shortages', 'growing', 'roadside', 'trash', 'heaps', 'complicating', 'efforts', 'workers', 'officials', 'saidmore', '500', 'cases', 'virus', 'registered', 'since', 'may', 'disease', 'first', 'detected', 'fareastern', 'cuba', 'officials', 'weekthe', 'virus', 'known', 'sloth', 'fever', 'transmitted', 'bite', 'mosquitoes', 'midges', 'spread', 'quickly', 'across', 'country', 'provinces', 'major', 'cities', 'including', 'capital', 'havana', 'patients', 'complain', 'fever', 'body', 'aches', 'nausea', 'though', 'disease', 'rarely', 'fatalfumigation', 'workers', 'using', 'handheld', 'gaspowered', 'blowers', 'fired', 'smoke', 'dark', 'corners', 'alleyways', 'parts', 'havana', 'friday', 'though', 'efforts', 'stymied', 'part', 'limited', 'resources', 'past', 'blocks', 'fumigated', 'every', 'week', '…'

In [47]:
from nltk.stem.porter import PorterStemmer
#defining the object for stemming
porter_stemmer = PorterStemmer()

In [48]:
#defining a function for stemming
def stemming(text):
  stem_text = [porter_stemmer.stem(word) for word in text]
  return stem_text

# consideration: the results of stemming here is not human readable, but according to chatgpt: However, if your goal is purely text classification (like Naive Bayes or TF-IDF), these changes may not be a problem, as long as your model learns meaningful patterns.
df['msg_stemmed'] = df['no_stopwords'].apply(lambda x: stemming(x))
print(df['msg_stemmed'][1])

['havana', 'reuter', 'cuban', 'author', 'launch', 'smallscal', 'fumig', 'effort', 'havana', 'friday', 'fight', 'spread', 'oropouch', 'viru', 'raini', 'caribbean', 'summer', 'fuel', 'shortag', 'grow', 'roadsid', 'trash', 'heap', 'complic', 'effort', 'worker', 'offici', 'saidmor', '500', 'case', 'viru', 'regist', 'sinc', 'may', 'diseas', 'first', 'detect', 'fareastern', 'cuba', 'offici', 'weekth', 'viru', 'known', 'sloth', 'fever', 'transmit', 'bite', 'mosquito', 'midg', 'spread', 'quickli', 'across', 'countri', 'provinc', 'major', 'citi', 'includ', 'capit', 'havana', 'patient', 'complain', 'fever', 'bodi', 'ach', 'nausea', 'though', 'diseas', 'rare', 'fatalfumig', 'worker', 'use', 'handheld', 'gaspow', 'blower', 'fire', 'smoke', 'dark', 'corner', 'alleyway', 'part', 'havana', 'friday', 'though', 'effort', 'stymi', 'part', 'limit', 'resourc', 'past', 'block', 'fumig', 'everi', 'week', '…', 'due', 'fuel', 'shortag', 'focu', 'specif', 'case', 'fever', 'outbreak', 'occur', 'havana', 'fumig'

In [49]:
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
#defining the object for Lemmatization
wordnet_lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\agaro\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [50]:
#defining the function for lemmatization
def lemmatizer(text):
  lemm_text = [wordnet_lemmatizer.lemmatize(word) for word in text]
  return lemm_text

df['msg_lemmatized'] = df['no_stopwords'].apply(lambda x:lemmatizer(x))
print(df['msg_lemmatized'][1])

['havana', 'reuters', 'cuban', 'authority', 'launched', 'smallscale', 'fumigation', 'effort', 'havana', 'friday', 'fight', 'spread', 'oropouche', 'virus', 'rainy', 'caribbean', 'summer', 'fuel', 'shortage', 'growing', 'roadside', 'trash', 'heap', 'complicating', 'effort', 'worker', 'official', 'saidmore', '500', 'case', 'virus', 'registered', 'since', 'may', 'disease', 'first', 'detected', 'fareastern', 'cuba', 'official', 'weekthe', 'virus', 'known', 'sloth', 'fever', 'transmitted', 'bite', 'mosquito', 'midge', 'spread', 'quickly', 'across', 'country', 'province', 'major', 'city', 'including', 'capital', 'havana', 'patient', 'complain', 'fever', 'body', 'ache', 'nausea', 'though', 'disease', 'rarely', 'fatalfumigation', 'worker', 'using', 'handheld', 'gaspowered', 'blower', 'fired', 'smoke', 'dark', 'corner', 'alleyway', 'part', 'havana', 'friday', 'though', 'effort', 'stymied', 'part', 'limited', 'resource', 'past', 'block', 'fumigated', 'every', 'week', '…', 'due', 'fuel', 'shortage

Second step we will have is processing

In [51]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd

# Assuming df contains 'content' and 'annotation' columns
def prepare_data(df):
    # Process content column
    X = df['msg_lemmatized'].apply(lambda tokens: " ".join(tokens) if isinstance(tokens, list) else tokens)
    y = df['annotation']

    print(X[1])

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Vectorize text data
    vectorizer = CountVectorizer()
    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec = vectorizer.transform(X_test)

    return X_train_vec, X_test_vec, y_train, y_test, vectorizer

def train_naive_bayes(X_train_vec, y_train):
    # Initialize and train Naive Bayes classifier
    nb_model = MultinomialNB()
    nb_model.fit(X_train_vec, y_train)
    return nb_model

def train_svm(X_train_vec, y_train):
    # Initialize and train SVM classifier
    # Using LinearSVC for better performance with text data
    svm_model = LinearSVC(random_state=42)
    svm_model.fit(X_train_vec, y_train)
    return svm_model

def train_logistic_regression(X_train_vec, y_train):
    # Train Logistic Regression
    logistic_model = LogisticRegression()
    logistic_model.fit(X_train_vec, y_train)
    return logistic_model

def evaluate_model(model, X_test_vec, y_test, model_name):
    # Make predictions
    y_pred = model.predict(X_test_vec)

    # Calculate and print metrics
    accuracy = accuracy_score(y_test, y_pred)
    print(f'\n{model_name} Results:')
    print(f'Accuracy: {accuracy * 100:.2f}%')
    print('\nClassification Report:')
    print(classification_report(y_test, y_pred))

    return y_pred

In [52]:
# Main execution
from sklearn.model_selection import cross_val_score

# Main execution with cross-validation and validation split
def main(df):
    # Prepare data
    X_train_vec, X_test_vec, y_train, y_test, vectorizer = prepare_data(df)

   # Cross-validation for Naive Bayes
    nb_model = train_naive_bayes(X_train_vec, y_train)
    nb_cv_scores = cross_val_score(nb_model, X_train_vec, y_train, cv=5, scoring='accuracy')
    print(f'Naive Bayes Cross-Validation Accuracy: {nb_cv_scores.mean() * 100:.2f}%')

    # Cross-validation for SVM
    svm_model = train_svm(X_train_vec, y_train)
    svm_cv_scores = cross_val_score(svm_model, X_train_vec, y_train, cv=5, scoring='accuracy')
    print(f'SVM Cross-Validation Accuracy: {svm_cv_scores.mean() * 100:.2f}%')

    # Logistic regression
    logistic_model = train_logistic_regression(X_train_vec, y_train)
    logistic_regression_cv_scores = cross_val_score(logistic_model, X_train_vec, y_train, cv=5, scoring='accuracy')
    print(f'Logistic Regression Cross-Validation Accuracy: {logistic_regression_cv_scores.mean() * 100:.2f}%')

    # Train and evaluate Naive Bayes
    nb_predictions = evaluate_model(nb_model, X_test_vec, y_test, "Naive Bayes")

    # Train and evaluate SVM
    svm_predictions = evaluate_model(svm_model, X_test_vec, y_test, "Support Vector Machine")

    # Train and evaluate logistic regression
    logistic_regression_predictions = evaluate_model(logistic_model, X_test_vec, y_test, "Logistic Regression")


    return {
        'naive_bayes': {
            'model': nb_model,
            'predictions': nb_predictions,
            'cv_scores': nb_cv_scores
        },
        'svm': {
            'model': svm_model,
            'predictions': svm_predictions,
            'cv_scores': svm_cv_scores
        },
        'logistic_regression': {
            'model': logistic_model,
            'predictions': logistic_regression_predictions,
            'val_accuracy': logistic_regression_cv_scores
        },
        'vectorizer': vectorizer
    }

In [53]:
# Example usage
if __name__ == "__main__":
    # Assuming df is your DataFrame
    results = main(df)

havana reuters cuban authority launched smallscale fumigation effort havana friday fight spread oropouche virus rainy caribbean summer fuel shortage growing roadside trash heap complicating effort worker official saidmore 500 case virus registered since may disease first detected fareastern cuba official weekthe virus known sloth fever transmitted bite mosquito midge spread quickly across country province major city including capital havana patient complain fever body ache nausea though disease rarely fatalfumigation worker using handheld gaspowered blower fired smoke dark corner alleyway part havana friday though effort stymied part limited resource past block fumigated every week … due fuel shortage focus specific case fever outbreak occur havana fumigation worker luís aguilarus authority earlier week 21 u citizen visited cuba summer month returned home case oropouchecuba reported relatively case compared country including brazil virus present fatalitiesbut economic crisis shortage f