***
# Pre Process
***

### <font color='saddlebrown'> Import Packages</font>

In [1]:
# Standard library imports
import re

# Basic library imports
import pandas as pd
import pickle
import wikipediaapi
import mwparserfromhell
import nltk
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report

# NLTK submodules and functions
from nltk.corpus import stopwords, wordnet
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize


from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
import pandas as pd


### <font color='saddlebrown'> Fetch from Wikipedia</font>

In [7]:

# Define a proper user agent
user_agent = "MyApp/1.0 (melakemekonnen100@gmail.com)"

# Initialize the Wikipedia object with the user agent
wiki_wiki = wikipediaapi.Wikipedia(
    language='en',
    user_agent=user_agent
)

def get_wiki_page(title):
    page = wiki_wiki.page(title)
    wikicode = mwparserfromhell.parse(page.text)
    infobox = None
    for template in wikicode.filter_templates():
        if template.name.strip().lower().startswith("infobox"):
            infobox = template
            break
    
    return {
        "title": title,
        "text": page.text,
        "categories": list(page.categories.keys()),
        "infobox": str(infobox) if infobox else None
    }

# Test the function
example_topic = 'Cardiology'
page_data = get_wiki_page(example_topic)
print(page_data)


{'title': 'Cardiology', 'text': 'Cardiology (from Ancient Greek  καρδίᾱ (kardiā) \'heart\', and  -λογία (-logia) \'study\') is the study of the heart. Cardiology is a branch of medicine that deals with disorders of the heart and the cardiovascular system. The field includes medical diagnosis and treatment of congenital heart defects, coronary artery disease, heart failure, valvular heart disease, and electrophysiology. Physicians who specialize in this field of medicine are called cardiologists, a specialty of internal medicine. Pediatric cardiologists are pediatricians who specialize in cardiology. Physicians who specialize in cardiac surgery are called cardiothoracic surgeons or cardiac surgeons, a specialty of general surgery.\n\nSpecializations\nAll cardiologists in the branch of medicine study the disorders of the heart, but the study of adult and child heart disorders each require different training pathways. Therefore, an adult cardiologist (often simply called "cardiologist") i

### <font color='saddlebrown'> Text Preprocessing with NLTK</font>

In [8]:

nltk.download('punkt')
nltk.download('stopwords')

def preprocess_text(text):
    # Tokenize and remove stopwords
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(text)
    filtered_words = [word for word in words if word.lower() not in stop_words and word.isalpha()]

    # Stemming
    stemmer = PorterStemmer()
    stemmed_words = [stemmer.stem(word) for word in filtered_words]
    
    return ' '.join(stemmed_words)

# Apply preprocessing to the article text
processed_text = preprocess_text(page_data['text'])


[nltk_data] Downloading package punkt to /home/melak/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/melak/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
processed_text

'cardiolog ancient greek καρδίᾱ kardiā studi heart cardiolog branch medicin deal disord heart cardiovascular system field includ medic diagnosi treatment congenit heart defect coronari arteri diseas heart failur valvular heart diseas electrophysiolog physician special field medicin call cardiologist specialti intern medicin pediatr cardiologist pediatrician special cardiolog physician special cardiac surgeri call cardiothorac surgeon cardiac surgeon specialti gener surgeri special cardiologist branch medicin studi disord heart studi adult child heart disord requir differ train pathway therefor adult cardiologist often simpli call cardiologist inadequ train take care children pediatr cardiologist train treat adult heart diseas surgic aspect includ cardiolog domain cardiothorac surgeri exampl coronari arteri bypass surgeri cabg cardiopulmonari bypass valv replac surgic procedur perform surgeon cardiologist howev minim invas procedur cardiac catheter pacemak implant perform cardiologist a

In [17]:
# Initialize a CountVectorizer and transform the processed text into a vector
vectorizer = CountVectorizer()
X = vectorizer.fit_transform([processed_text])


In [18]:
medical_topics = [
    "Cardiology", "Neurology", "Oncology", "Pediatrics", "Dermatology",
    "Endocrinology", "Gastroenterology", "Hematology", "Nephrology",
    "Ophthalmology", "Otolaryngology", "Pulmonology", "Rheumatology",
    "Urology", "Psychiatry", "Immunology", "Anesthesiology", "Radiology",
    "Pathology", "Infectious Disease", "Obstetrics", "Gynecology",
    "Geriatrics", "Orthopedics", "Surgery", "Virology", "Cardiothoracic Surgery",
    "Neonatology", "Pharmacology", "Nuclear Medicine", "Emergency Medicine",
    "Dental Surgery", "Podiatry", "Occupational Therapy", "Physical Therapy"
]

non_medical_topics = [
    "History", "Geography", "Technology", "Art", "Literature", "Economics",
    "Physics", "Chemistry", "Biology", "Mathematics", "Environmental Science",
    "Political Science", "Philosophy", "Music", "Astronomy", "Linguistics",
    "Sociology", "Archaeology", "Anthropology", "Psychology", "Computer Science",
    "Engineering", "Education", "Media Studies", "Culinary Arts", "Architecture",
    "Cinema", "Fashion Design", "Graphic Design", "Urban Planning", "Photography",
    "Theater", "Dance", "Folklore", "Library Science", "Journalism"
]



### <font color='saddlebrown'> Wikipedia Data Extraction and Text Preprocessing</font>

In [20]:


# Initialize NLTK resources
nltk.download('punkt')
nltk.download('stopwords')

# Define user agent and initialize Wikipedia API
user_agent = "MyApp/1.0 (myemail@example.com)"
wiki_wiki = wikipediaapi.Wikipedia(language='en', user_agent=user_agent)

def get_wiki_page(title):
    page = wiki_wiki.page(title)
    return {
        "title": title,
        "text": page.text if page.exists() else ""
    }

def preprocess_text(text):
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(text)
    filtered_words = [word for word in words if word.lower() not in stop_words and word.isalpha()]
    stemmer = PorterStemmer()
    stemmed_words = [stemmer.stem(word) for word in filtered_words]
    return ' '.join(stemmed_words)


# Collect and preprocess data
data = []
for topic in medical_topics + non_medical_topics:
    page_data = get_wiki_page(topic)
    preprocessed_text = preprocess_text(page_data['text'])
    label = "medical" if topic in medical_topics else "non-medical"
    data.append({"text": preprocessed_text, "label": label})

# Create a DataFrame
df = pd.DataFrame(data)

df

[nltk_data] Downloading package punkt to /home/melak/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/melak/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,text,label
0,cardiolog ancient greek καρδίᾱ kardiā studi he...,medical
1,neurolog greek νεῦρον neûron string nerv suffi...,medical
2,oncolog branch medicin deal studi treatment di...,medical
3,pediatr also spell paediatr pædiatric branch m...,medical
4,dermatolog branch medicin deal skin special me...,medical
...,...,...
66,theatr theater collabor form perform art use l...,non-medical
67,danc art form often classifi sport consist seq...,non-medical
68,folklor whole oral tradit share particular gro...,non-medical
69,librari inform scienc studi li interdisciplina...,non-medical


In [None]:
df

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 71 entries, 0 to 70
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    71 non-null     object
 1   label   71 non-null     object
dtypes: object(2)
memory usage: 1.2+ KB


### <font color='saddlebrown'>Text Preprocessing: Lemmatization and Sentence Filtering
</font>

In [21]:
def get_wiki_page(title):
    page = wiki_wiki.page(title)
    return {
        "title": title,
        "text": page.text if page.exists() else ""
    }


In [22]:
def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)
"""
def preprocess_text(text):
    words = word_tokenize(text.lower())
    lemmatizer = WordNetLemmatizer()
    lemmatized_output = ' '.join([lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in words if w.isalpha()])
    return lemmatized_output
"""


def preprocess_text(text):
    # Remove non-English characters
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)

    # Break text into sentences and filter out very short sentences
    sentences = sent_tokenize(text)
    filtered_sentences = [sent for sent in sentences if len(sent.split()) > 3]

    return ' '.join(filtered_sentences)


### <font color='saddlebrown'>Building a Medical vs. Non-Medical Text Dataset from Wikipedia
</font>

In [23]:

data = []
for topic in medical_topics + non_medical_topics:
    page_data = get_wiki_page(topic)
    preprocessed_text = preprocess_text(page_data['text'])
    label = "medical" if topic in medical_topics else "non-medical"
    data.append({"text": preprocessed_text, "label": label})

df = pd.DataFrame(data)
df

Unnamed: 0,text,label
0,Cardiology (from Ancient Greek (kardi ) 'he...,medical
1,"Neurology (from Greek: (ne ron), ""string, ne...",medical
2,Oncology is a branch of medicine that deals wi...,medical
3,Pediatrics (also spelled paediatrics or p diat...,medical
4,Dermatology is the branch of medicine dealing ...,medical
...,...,...
66,Theatre or theater is a collaborative form of ...,non-medical
67,"Dance is an art form, often classified as a sp...",non-medical
68,Folklore is the whole of oral traditions share...,non-medical
69,Library and information science(s) or studies ...,non-medical


### <font color='saddlebrown'>Feature Extraction and Label Encoding for Text Data
</font>

In [25]:
# Set up TF-IDF Vectorizer with 5000 features for efficiency
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(df['text'])  # Convert text data to TF-IDF vectors

# Use Label Encoder for converting labels to numeric
encoder = LabelEncoder()
y = encoder.fit_transform(df['label'])  # Numeric encoding of 'medical' and 'non-medical'


### <font color='saddlebrown'>Training a Multinomial Naive Bayes Classifier
</font>


In [50]:
# Feature Extraction
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(df['text'])

# Label Encoding
encoder = LabelEncoder()
y = encoder.fit_transform(df['label'])

# Splitting the Dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model Training
model = MultinomialNB()
model.fit(X_train, y_train)

# Model Evaluation
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       1.00      0.50      0.67        10
           1       0.50      1.00      0.67         5

    accuracy                           0.67        15
   macro avg       0.75      0.75      0.67        15
weighted avg       0.83      0.67      0.67        15



In [51]:
# Define a dictionary of models, now including additional classifiers
models = {
    'MultinomialNB': MultinomialNB(),
    'LogisticRegression': LogisticRegression(),
    'KNeighborsClassifier': KNeighborsClassifier(),
    'DecisionTreeClassifier': DecisionTreeClassifier(),
    'SVC': SVC(),
    'RandomForestClassifier': RandomForestClassifier(),
    'RidgeClassifier': RidgeClassifier(),
    'GaussianNB': GaussianNB(),
    'QuadraticDiscriminantAnalysis': QuadraticDiscriminantAnalysis(),
    'AdaBoostClassifier': AdaBoostClassifier(),
    'BaggingClassifier': BaggingClassifier(n_estimators=1000),
    'GradientBoostingClassifier': GradientBoostingClassifier(),
    # Add other models as needed
}

# Initialize an empty list to store model performance metrics
model_metrics = []

# Evaluate each model
for name, model in models.items():
    # Fit the model and predict
    if name in ['GaussianNB', 'QuadraticDiscriminantAnalysis']:
        model.fit(X_train.toarray(), y_train)
        y_pred = model.predict(X_test.toarray())
    else:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

    # Calculate performance metrics
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    accuracy = accuracy_score(y_test, y_pred)

    # Append metrics to the list
    model_metrics.append([name, precision, recall, f1, accuracy])

# Print the metrics in a table format
print(f"{'Model':<35} {'Precision':<10} {'Recall':<10} {'F1 Score':<10} {'Accuracy':<10}")
for metric in model_metrics:
    print(f"{metric[0]:<35} {metric[1]:<10.2f} {metric[2]:<10.2f} {metric[3]:<10.2f} {metric[4]:<10.2f}")




Model                               Precision  Recall     F1 Score   Accuracy  
MultinomialNB                       0.83       0.67       0.67       0.67      
LogisticRegression                  0.77       0.73       0.74       0.73      
KNeighborsClassifier                0.82       0.60       0.59       0.60      
DecisionTreeClassifier              0.87       0.87       0.87       0.87      
SVC                                 0.94       0.93       0.93       0.93      
RandomForestClassifier              1.00       1.00       1.00       1.00      
RidgeClassifier                     0.87       0.87       0.87       0.87      
GaussianNB                          0.94       0.93       0.93       0.93      
QuadraticDiscriminantAnalysis       0.67       0.53       0.53       0.53      
AdaBoostClassifier                  1.00       1.00       1.00       1.00      
BaggingClassifier                   0.87       0.87       0.87       0.87      
GradientBoostingClassifier          0.87

### <font color='saddlebrown'>HyperParameter Tuning
</font>


### <font color='saddlebrown'>Save Best Model
</font>


In [33]:
pickle.dump(model, open(f"../models/text_classifier_model.pkl", 'wb'))