In [17]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# **1.	Load data**

Load the given labeled dataset such that, each sentence from all documents is an element in the list.

Load the given unlabeled dataset such that, each sentence from all documents is an element in the list.


In [18]:
import os

labeled_path = "/content/drive/MyDrive/NLP/Assigment5/labeled_datasets"
unlabeled_path = "/content/drive/MyDrive/NLP/Assigment5/unlabeled_datasets"

# Use the os library to read and combine all lines from each file in the labeled dataset directory
labeled_corpus = []
for filename in os.listdir(labeled_path):
    with open(os.path.join(labeled_path, filename), 'r') as f:
        lines = f.readlines()
        labeled_corpus.extend(lines)


unlabeled_corpus = []
for filename in os.listdir(unlabeled_path):
    with open(os.path.join(unlabeled_path, filename), 'r') as f:
        lines = f.readlines()
        unlabeled_corpus.extend(lines)

print(labeled_corpus[:5])


['### abstract ###\n', 'AIMX\tin this paper we derive the equations for loop corrected belief propagation on a continuous variable gaussian model\n', 'OWNX\tusing the exactness of the averages for belief propagation for gaussian models  a  different way of obtaining the covariances is found   based on belief propagation on cavity graphs\n', 'OWNX\twe discuss the relation of this  loop correction algorithm to expectation propagation  algorithms for the case in which the model is no longer  gaussian  but slightly perturbed by nonlinear terms\n', '### introduction ###\n']


# **2.	Pre-process the text (on both labeled and unlabeled dataset)**
You can perform any of the followings:
•	Remove the '### abstract ###' and '### introduction ###' titles
•	Remove stopwords
•	Remove punctuation
After this step, you should be working on a clean dataset.


In [19]:
!pip install nltk




In [20]:
import nltk
nltk.download('stopwords')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [22]:
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

#  Preprocess the text
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def preprocess(text):
    text = text.lower()
    text = re.sub(r"^#{3}\s?[a-zA-Z]+\s?#{3}$", "", text.strip())
    tokens = word_tokenize(text)
    tokens = [t for t in tokens if t not in string.punctuation]
    tokens = [t for t in tokens if t not in stop_words]
    return " ".join(tokens)

# Extract labels and preprocess each labeled sentence
labels = []
cleaned_labeled_sentences = []

for line in labeled_corpus:
    if "\t" in line:
        tag, sentence = line.strip().split("\t", 1) # split
        labels.append(tag) # store labels
        cleaned_labeled_sentences.append(preprocess(sentence)) # store sentence

cleaned_unlabeled_sentences = [preprocess(s.strip()) for s in unlabeled_corpus]

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# **3.	Vectorize the clean labeled dataset**



In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
vectorizer.fit(cleaned_labeled_sentences + cleaned_unlabeled_sentences)

# label
x = vectorizer.transform(cleaned_labeled_sentences)
y = labels
# unlabels
x_unlabeled = vectorizer.transform(cleaned_unlabeled_sentences)

# **4.	Create the Logistic Regression classifier (sample code available in blackboard)**

In [24]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

log_reg = LogisticRegression()
log_reg.fit(x_train, y_train)
y_pred_log = log_reg.predict(x_test)

# **5.	Evaluate the Logistic Regression classifier**

In [29]:
from sklearn.metrics import classification_report
print("\n=== Logistic Regression Report ===")
log_reg_report = classification_report(
    y_test, y_pred_log,
    labels = ['AIMX', 'OWNX', 'CONT', 'BASE', 'NUMBER', 'MISC'],
    zero_division=0 # to avoid warning
)
print(log_reg_report)


=== Logistic Regression Report ===
              precision    recall  f1-score   support

        AIMX       1.00      0.50      0.67         2
        OWNX       0.43      0.67      0.52         9
        CONT       1.00      0.33      0.50         6
        BASE       0.00      0.00      0.00         1
      NUMBER       0.00      0.00      0.00         0
        MISC       0.71      0.75      0.73        16

    accuracy                           0.62        34
   macro avg       0.52      0.38      0.40        34
weighted avg       0.68      0.62      0.61        34



# **6.	Try other classifiers: support vector machine and decision tree. **

In [30]:
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB

# SVM
svm = SVC()
svm.fit(x_train, y_train)
y_pred_svm = svm.predict(x_test)
print("\n=== SVM Report ===")
print(classification_report(
    y_test, y_pred_svm,
    labels=['AIMX', 'OWNX', 'CONT', 'BASE', 'NUMBER', 'MISC'],
    zero_division=0
))

# Decision Tree
tree = DecisionTreeClassifier()
tree.fit(x_train, y_train)
y_pred_tree = tree.predict(x_test)
print("\n=== Decision Tree Report ===")
print(classification_report(
    y_test, y_pred_tree,
    labels=['AIMX', 'OWNX', 'CONT', 'BASE', 'NUMBER', 'MISC'],
    zero_division=0
))

# NaiveBayes
NaiveBayes= MultinomialNB()
NaiveBayes.fit(x_train, y_train)
y_pred_NB = NaiveBayes.predict(x_test)
print("\n=== Naive Bayes Report ===")
print(classification_report(
    y_test, y_pred_NB,
    labels=['AIMX', 'OWNX', 'CONT', 'BASE', 'NUMBER', 'MISC'],
    zero_division=0
))


=== SVM Report ===
              precision    recall  f1-score   support

        AIMX       1.00      0.50      0.67         2
        OWNX       0.50      1.00      0.67         9
        CONT       1.00      0.33      0.50         6
        BASE       0.00      0.00      0.00         1
      NUMBER       0.00      0.00      0.00         0
        MISC       0.85      0.69      0.76        16

    accuracy                           0.68        34
   macro avg       0.56      0.42      0.43        34
weighted avg       0.77      0.68      0.66        34


=== Decision Tree Report ===
              precision    recall  f1-score   support

        AIMX       1.00      0.50      0.67         2
        OWNX       0.40      0.89      0.55         9
        CONT       1.00      0.50      0.67         6
        BASE       0.00      0.00      0.00         1
      NUMBER       0.00      0.00      0.00         0
        MISC       0.80      0.50      0.62        16

    accuracy               

In [31]:
# === Predict on Unlabeled Data ===
unlabeled_predictions = NaiveBayes.predict(x_unlabeled)

print("\n Printout of Predicted Labels for Unlabeled Sentences:\n")
for label, sentence in zip(unlabeled_predictions, unlabeled_corpus):
    print(f"{label}\t{sentence.strip()}")



 Printout of Predicted Labels for Unlabeled Sentences:

MISC	### abstract ###
OWNX	Whole-genome transporter analyses have been conducted on 141 organisms whose complete genome sequences are available.
OWNX	For each organism, the complete set of membrane transport systems was identified with predicted functions, and classified into protein families based on the transporter classification system.
OWNX	Organisms with larger genome sizes generally possessed a relatively greater number of transport systems.
MISC	In prokaryotes and unicellular eukaryotes, the significant factor in the increase in transporter content with genome size was a greater diversity of transporter types.
OWNX	In contrast, in multicellular eukaryotes, greater number of paralogs in specific transporter families was the more important factor in the increase in transporter content with genome size.
OWNX	Both eukaryotic and prokaryotic intracellular pathogens and endosymbionts exhibited markedly limited transport capabili

# **Summary **
In this project, I worked with two sets of datasets: one labeled and one unlabeled. I first cleaned both datasets by removing headers, stopwords, and punctuation. Then I used TF-IDF to turn the sentences into numeric features.

I trained a Logistic Regression model to classify the sentences into six categories (AIMX, OWNX, CONT, BASE, NUMBER, MISC) using the labeled data. I tested the model's performance using accuracy reports. I also compared it with other models like SVM, Decision Tree, and Naive Bayes.

Finally, I used the trained Logistic Regression model to predict the categories for the unlabeled sentences and printed the predicted labels along with each sentence.

This process showed that Logistic Regression worked well for this classification task, and the model was able to apply the learned categories to unseen data.