<a href="https://colab.research.google.com/github/Kidus-Bellete/NLP_Project1/blob/main/nlp_assignment1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **Natural Language Processing Assignment I**
#     Classification of Texts using Wikipedia

In [1]:
import pandas as pd
import requests
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.corpus import stopwords

In [None]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

In [10]:
titles = ["Medicine", "Hospital", "Surgery", "Health", "Heart", "Vaccine","endurance","brain","stomach","Therapy",
          "Pharmacy", "Immunology", "Pathology", "Treatment", "Diabetes", "Disease",
          "Therapy", "Dentistry", "Kidney", "Blood", "Blood pressure", "Virus",
          "Art", "Language", "Literature", "Philosophy",
          "Empire", "Space", "Environment", "Color", "Mountain","rule of law","justice",
          "Forest", "Cooking", "Theology", "Fashion","animal","love","tree",
          "History", "Geography", "Archaeology", "government", "Astronomy"]

# Create a data frame
df = pd.DataFrame(columns=["Text", "Label"])
dataArr = []

def fetch_data(title):
    url = 'https://en.wikipedia.org/w/api.php'
    params = {
        'action': 'query',
        'format': 'json',
        'titles': title,
        'prop': 'extracts',
        'exintro': True,
        'explaintext': True,
    }
    response = requests.get(url, params=params)
    data = response.json()
    return data

# Fetch data and preprocess text
for topic in titles:
    data = fetch_data(topic)
    topic_data = next(iter(data['query']['pages'].values()))
    dataArr.append(topic_data['extract'][:500])

data = {"Text": dataArr, "Label": ["Medical"] * 22 + ["Non-Medical"] * 23}
try:
    df_added = pd.DataFrame(data)
    df = pd.concat([df, df_added], ignore_index=True)
    df.to_csv('nlp_dataset.csv', index=False)

    # Tokenization, lemmatization, and stemming function
    def preprocess_text(text):
        # Tokenization
        tokens = word_tokenize(text.lower())  # Convert to lowercase for consistency

        # Remove stopwords
        stop_words = set(stopwords.words('english'))
        tokens = [token for token in tokens if token.isalnum() and token not in stop_words]

        # Lemmatization
        lemmatizer = WordNetLemmatizer()
        tokens = [lemmatizer.lemmatize(token) for token in tokens]

        # Stemming
        stemmer = PorterStemmer()
        tokens = [stemmer.stem(token) for token in tokens]

        return ' '.join(tokens)

    # Apply preprocessing to the entire dataset
    df['Text'] = df['Text'].apply(preprocess_text)

except ValueError as e:
    print(f"ValueError: {e}")
print(df)

                                                 Text        Label
0   medicin scienc practic care patient manag diag...      Medical
1   hospit healthcar institut provid patient treat...      Medical
2   surgeri medic specialti us manual instrument t...      Medical
3   common usag medicin health accord world health...      Medical
4   heart muscular organ anim organ pump blood blo...      Medical
5   vaccin biolog prepar provid activ acquir immun...      Medical
6   endur also relat suffer forbear resili constit...      Medical
7   brain encephalon organ serf center nervou syst...      Medical
8   stomach muscular hollow organ gastrointestin t...      Medical
9   therapi medic treatment attempt remedi health ...      Medical
10  pharmaci scienc practic discov produc prepar d...      Medical
11  immunolog branch biolog medicin cover studi im...      Medical
12  patholog studi diseas injuri word patholog als...      Medical
13  treatment may refer treatment song 2012 song l...      Med

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
#import pandas as pd

# read data
df = pd.read_csv('nlp_dataset.csv')
print("Original data distribution..")
print(df['Label'].value_counts())

# training and testing
X_train, X_test, y_train, y_test = train_test_split(df['Text'], df['Label'], test_size=0.3, random_state=42)

# Fill missing values
X_train = X_train.fillna('')
X_test = X_test.fillna('')

# feature extraction
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Resampling using SMOTE
sampler = SMOTE(sampling_strategy='auto', k_neighbors=5)
X_train_resampled, y_train_resampled = sampler.fit_resample(X_train_tfidf, y_train)

# Model selection and training
model = MultinomialNB()
model.fit(X_train_resampled, y_train_resampled)

# model evaluation
predictions = model.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, predictions)
print(f"\nAccuracy: {accuracy}")
print("Classification Report:\n", classification_report(y_test, predictions))

# making predictions
# data to predict
new_data = ["head ache is a type of disease ",
            "Ethiopia is the oldest city with many historical heritages.",
            "He has a chance to recover from his disease",
            "I love to see Italian historical places.",
            "Have even been in such kind of discomfort situation? ",
            "How can I feel better and to improve my mood?",
            "does eating food changes my feeling?"
            #add any sentences to check its prediction value
            ]

new_data_tfidf = tfidf_vectorizer.transform(new_data)
new_predictions = model.predict(new_data_tfidf)

print("\nPredictions on new data:\n")

for text, prediction in zip(new_data, new_predictions):
    print(f"{text} - Predicted: {prediction}")

Original data distribution..
Non-Medical    23
Medical        22
Name: Label, dtype: int64

Accuracy: 0.9285714285714286
Classification Report:
               precision    recall  f1-score   support

     Medical       1.00      0.83      0.91         6
 Non-Medical       0.89      1.00      0.94         8

    accuracy                           0.93        14
   macro avg       0.94      0.92      0.93        14
weighted avg       0.94      0.93      0.93        14


Predictions on new data:

head ache is a type of disease  - Predicted: Medical
Ethiopia is the oldest city with many historical heritages. - Predicted: Non-Medical
He has a chance to recover from his disease - Predicted: Medical
I love to see Italian historical places. - Predicted: Non-Medical
Have even been in such kind of discomfort situation?  - Predicted: Non-Medical
How can I feel better and to improve my mood? - Predicted: Medical
does eating food changes my feeling? - Predicted: Non-Medical
