<a href="https://colab.research.google.com/github/Hailemicael/NLP_Project/blob/master/nlp_assignment_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
!pip install wikipedia-api



In [8]:
# Import libraries for our project
import pandas as pd
import wikipediaapi
from bs4 import BeautifulSoup
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download NLTK resources
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [9]:
# Set up wikipediaapi
wiki_wiki = wikipediaapi.Wikipedia('NLP_Project(hailelulseged281913@gmail.com)', 'en')


In [10]:
# List of medical and non-medical keywords
medical_keywords = ["Medicine", "Cardiology", "Surgery", "Health", "Pharmacy", "Immunology",
                    "Pathology", "Pediatrics", "Oncology", "Neurology", "Dentistry", "vascular",
                    "orthopedic", "dermatology", "endocrinology", "gastroenterology", "pulmonary", "neurosurgery",
                    "ophthalmology", "radiology", "anesthesiology", "genetics", "oncologist", "hematology",
                    "immunotherapy", "pediatrician", "psychiatry", "dentist", "Anatomy",
                    "Physiology", "Biochemistry", "balanced diet", "Ailment", "Affliction", "Illness", "Sickness",
                    "Hereditary", "Infectious", "Pandemic", "nurse", "Doctor", "Alzheimer", "virus", "surgery"]

non_medical_keywords = ["Art", "Literature", "Philosophy", "Science", "Technology", "Space",
                        "Environment", "Food", "Cuisine", "Recipes", "Cooking", "History", "Ancient_Civilizations",
                        "Archaeology", "painting", "sculpture", "literary", "fiction", "poetry", "philosopher",
                        "culinary", "gastronomy", "recipe", "culinary", "history", "historical", "architectural",
                        "archaeological", "civilization", "culture", "Engineering", "Astronomy", "Cosmology", "country", "Industry", "ocean", "charger", "battery", "music", "dance", "painting", "sculpture", "artistic", "novel"]


In [45]:
# Function to fetch content from Wikipedia using wikipediaapi
def fetch_content(title, wiki_wiki):
    # Fetches content from Wikipedia for a given title.
    page_py = wiki_wiki.page(title)
    content = page_py.text

    return content

# Fetch content for medical keywords
medical_content_list = [fetch_content(keyword, wiki_wiki) for keyword in medical_keywords]
# print("Fetched content for medical keywords:")
# print(medical_content_list)
# Fetch content for non-medical keywords
non_medical_content_list = [fetch_content(keyword, wiki_wiki) for keyword in non_medical_keywords]
# print("Fetched content for non-medical keywords:")
# print(non_medical_content_list)

In [12]:
# Function to fetch content from Wikipedia using wikipediaapi
def fetch_content(title, wiki_wiki):
    # Fetches content from Wikipedia for a given title.
    page_py = wiki_wiki.page(title)
    content = page_py.text

    # Clean the content
    cleaned_content = clean_text(content)

    return cleaned_content




In [44]:
# Function to clean text (remove HTML tags, references, etc.)
def clean_text(text):
    # Remove HTML tags and comments
    soup = BeautifulSoup(text, "html.parser")
    clean_text = soup.get_text(separator=" ")

    # Remove special characters and non-alphabetic characters
    clean_text = re.sub(r"[^a-zA-Z\s]", "", clean_text)

    # Tokenize the text
    tokens = word_tokenize(clean_text.lower())

    # Remove stop words
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    # Lemmatize the tokens
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Remove extra whitespaces
    clean_text = " ".join(tokens)

    return clean_text
# Clean medical content
cleaned_medical_content_list = [clean_text(content) for content in medical_content_list]
# print("Text cleaning applied to medical content:")
# print(cleaned_medical_content_list)
# Clean non-medical content
cleaned_non_medical_content_list = [clean_text(content) for content in non_medical_content_list]
# print("Text cleaning applied to non-medical content:")
# print(cleaned_non_medical_content_list)


In [14]:
# Create a DataFrame with the fetched and cleaned data
medical_data = {"text": cleaned_medical_content_list, "label": ["medical"] * len(cleaned_medical_content_list)}
non_medical_data = {"text": cleaned_non_medical_content_list, "label": ["non-medical"] * len(cleaned_non_medical_content_list)}

df_medical = pd.DataFrame(medical_data)
df_non_medical = pd.DataFrame(non_medical_data)
# Concatenate the dataframes and shuffle rows
df = pd.concat([df_medical, df_non_medical], ignore_index=True).sample(frac=1)



In [15]:
# Check the updated dataset
print("Updated Dataset:")
print(df)
df.to_csv('medical_non_medical_dataset.csv', index=False)
print("Dataset saved as 'medical_non_medical_dataset'")


Updated Dataset:
                                                 text        label
79  world ocean sea body salt water cover earth en...  non-medical
73  culture kulchr concept encompasses social beha...  non-medical
83  dance art form often classified sport consisti...  non-medical
54  cooking also known cookery professionally culi...  non-medical
59  sculpture branch visual art operates three dim...  non-medical
..                                                ...          ...
66  recipe set instruction describes prepare make ...  non-medical
53  recipe set instruction describes prepare make ...  non-medical
44  art diverse range human activity resulting pro...  non-medical
0   medicine science practice caring patient manag...      medical
41  alzheimers disease ad neurodegenerative diseas...      medical

[88 rows x 2 columns]
Dataset saved as 'medical_non_medical_dataset'


# Model training using Naive_bayes


In [30]:
## Model training using naive_bayes

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE

# Load the cleaned dataset
df = pd.read_csv('medical_non_medical_dataset.csv')

# Check class distribution
print("Original Data Distribution:")
print(df['label'].value_counts())


Original Data Distribution:
non-medical    44
medical        44
Name: label, dtype: int64


In [31]:
# Data Splitting
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['label'], test_size=0.3, random_state=100)

# Handle missing values in X_test
X_test = X_test.fillna('')  # Replace NaN with an empty string or any other placeholder

# Handle missing values in X_train
X_train = X_train.fillna('')  # Replace NaN with an empty string or any other placeholder


In [32]:
# Feature Extraction (TF-IDF)
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)


In [33]:
# Resampling using SMOTE
sampler = SMOTE(sampling_strategy='auto', k_neighbors=5)
X_train_resampled, y_train_resampled = sampler.fit_resample(X_train_tfidf, y_train)


In [34]:
# Model Selection and Training (Naive Bayes)
nb_model = MultinomialNB()
nb_model.fit(X_train_resampled, y_train_resampled)


In [35]:
# Model Evaluation (Naive Bayes)
nb_predictions = nb_model.predict(X_test_tfidf)
nb_accuracy = accuracy_score(y_test, nb_predictions)
print(f"\nNaive Bayes Accuracy: {nb_accuracy}")
print("Naive Bayes Classification Report:\n", classification_report(y_test, nb_predictions))



Naive Bayes Accuracy: 0.9629629629629629
Naive Bayes Classification Report:
               precision    recall  f1-score   support

     medical       1.00      0.92      0.96        13
 non-medical       0.93      1.00      0.97        14

    accuracy                           0.96        27
   macro avg       0.97      0.96      0.96        27
weighted avg       0.97      0.96      0.96        27



In [36]:
# Example of making predictions on Naive Bayes
new_data_extended = [
    "heart disease",
    "expression varies across cultures.",
    "New year in Ethiopia is good.",
    "my health is not good",
    "doctors support patient",
]

new_data_extended_tfidf = tfidf_vectorizer.transform(new_data_extended)
new_predictions_extended = nb_model.predict(new_data_extended_tfidf)

print("\nNaive Bayes Predictions :")
for text, prediction in zip(new_data_extended, new_predictions_extended):
    print(f"{text} - Predicted: {prediction}")



Naive Bayes Predictions :
heart disease - Predicted: medical
expression varies across cultures. - Predicted: non-medical
New year in Ethiopia is good. - Predicted: non-medical
my health is not good - Predicted: medical
doctors support patient - Predicted: medical


# Model training using LogisticRegression


In [37]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE

# Load the cleaned dataset
df = pd.read_csv('medical_non_medical_dataset.csv')

# Check class distribution
print("Original Data Distribution:")
print(df['label'].value_counts())


Original Data Distribution:
non-medical    44
medical        44
Name: label, dtype: int64


In [38]:
# Data Splitting
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['label'], test_size=0.3, random_state=100)

# Handle missing values in X_test
X_test = X_test.fillna('')  # Replace NaN with an empty string or any other placeholder

# Handle missing values in X_train
X_train = X_train.fillna('')  # Replace NaN with an empty string or any other placeholder


In [39]:
# Feature Extraction (TF-IDF)
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)


In [40]:
# Resampling using SMOTE
sampler = SMOTE(sampling_strategy='auto', k_neighbors=5)
X_train_resampled, y_train_resampled = sampler.fit_resample(X_train_tfidf, y_train)


In [41]:
# Model Selection and Training (Logistic Regression)
model = LogisticRegression(random_state=100)
model.fit(X_train_resampled, y_train_resampled)


In [42]:
# Model Evaluation
predictions = model.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, predictions)
print(f"\nAccuracy: {accuracy}")
print("Classification Report:\n", classification_report(y_test, predictions))



Accuracy: 0.9259259259259259
Classification Report:
               precision    recall  f1-score   support

     medical       1.00      0.85      0.92        13
 non-medical       0.88      1.00      0.93        14

    accuracy                           0.93        27
   macro avg       0.94      0.92      0.93        27
weighted avg       0.94      0.93      0.93        27



In [43]:
# Example of making predictions on more new data using Logistic Regression
new_data_extended = [
    "heart disease",
    "expression varies across cultures.",
    "New year in Ethiopia is good."
]

new_data_extended_tfidf = tfidf_vectorizer.transform(new_data_extended)
new_predictions_extended = model.predict(new_data_extended_tfidf)

print("\nLogistic Regression Predictions on more new data:")
for text, prediction in zip(new_data_extended, new_predictions_extended):
    print(f"{text} - Predicted: {prediction}")



Logistic Regression Predictions on more new data:
heart disease - Predicted: medical
expression varies across cultures. - Predicted: non-medical
New year in Ethiopia is good. - Predicted: non-medical
