## Part 1: Data Collection
- Collect links for all the archives
- For each archive, collect all the articles (main body and categories), and store them

In [None]:
# import packages
from bs4 import BeautifulSoup
import requests

In [None]:
# Root of the archive site; the scraping cells prepend it to the hrefs they find.
base_url = 'http://mlg.ucd.ie/modules/COMP41680/archive/'
# Index page of the archive: the starting point of the crawl.
index_url = base_url + 'index.html'

# Module-level lists; both start out empty.
month_links, articles = list(), list()


In [None]:
# Collect the URL of every monthly archive page linked from the index page.

req = requests.get(index_url)
# Fail loudly on an HTTP error instead of silently parsing an error page.
req.raise_for_status()
soup = BeautifulSoup(req.text, 'html.parser')

list_items = soup.find_all('li')

# Rebuild the list from scratch so that re-running this cell does not append
# duplicate links, and skip any <li> that has no usable <a href="...">.
month_links = []
for li in list_items:
    anchor = li.find('a')
    href = anchor.get('href') if anchor is not None else None
    if href:
        month_links.append(base_url + href)

month_links

In [None]:
# For every monthly archive page, read the category, title and link of each article,
# download the article and store its text in "<number>.txt". The category of each
# stored article is recorded in `article_category` under the same number.
import urllib.request

article_id = 1  # running article number, also used as the text file name
article_category = dict()
for month_link in month_links:
    req = requests.get(month_link)
    # Fail loudly on an HTTP error instead of silently parsing an error page.
    req.raise_for_status()
    month_soup = BeautifulSoup(req.text, 'html.parser')

    tbody = month_soup.find('tbody')
    if tbody is None:
        # Month page without an article table: nothing to collect.
        continue

    for article_row in tbody.find_all('tr'):
        category_elem = article_row.find(class_='category')
        reference_elem = article_row.find('a')

        # Skip rows that do not describe a downloadable article
        # (no category cell, no link, or a link without an href).
        if category_elem is None or reference_elem is None:
            continue
        href = reference_elem.get('href')
        if not href:
            continue

        title = reference_elem.text
        link = base_url + href
        # The context manager closes the HTTP response once it has been read.
        with urllib.request.urlopen(link) as article_response:
            article_html = article_response.read().decode()
        article_soup = BeautifulSoup(article_html, 'html.parser')

        body = article_soup.find("body")
        if body is None:
            continue

        # Article text = headline followed by every paragraph except the
        # navigation link back to the search results. Paragraphs are joined
        # with a space so that words on either side of a paragraph break are
        # not fused into a single token.
        heading = body.find("h2")
        paragraphs = [
            p.text
            for p in body.find_all("p")
            if "Return to article search results" not in p.text
        ]
        article_text = (heading.text if heading is not None else "") + " " + " ".join(paragraphs)

        # Store the text in its own file. The handle keeps the name `content_file`
        # because a later cell in this notebook refers to it.
        with open(str(article_id) + ".txt", "w", encoding="utf-8") as content_file:
            content_file.write(article_text)

        # Record the category only once the file exists, and advance the counter
        # exactly once per article, so numbers, files and categories stay aligned.
        article_category[str(article_id)] = category_elem.text
        article_id += 1

In [None]:
# Persist the article-number -> category mapping as a two-column CSV file.
import csv

# csv.writer quotes any category that contains a comma, quote or newline, which
# the hand-built string could not; the context manager closes the file even if
# writing fails part-way. "\n" line endings match the previous output format.
with open("categories4.csv", "w", encoding="utf-8", newline="") as newfile:
    writer = csv.writer(newfile, lineterminator="\n")
    writer.writerow(["id", "category"])
    writer.writerows(article_category.items())

## Part 2: Text Classification

- Load the set of raw documents into the notebook, together with their class labels:

In [None]:
import pandas as pd
# Load the id/category table from the CSV file and preview its first rows.
f = pd.read_csv(filepath_or_buffer="categories4.csv")
f.head(n=5)

In [None]:
# Check that every article listed in the categories file has its text file on disk.
# (Displaying the leftover `content_file` handle only works in the kernel session
# that ran the scraper, and only shows the name of the last file written.)
from pathlib import Path

missing_ids = [
    article_id for article_id in f["id"]
    if not Path(str(article_id) + ".txt").is_file()
]
print("%d articles listed, %d text files missing" % (len(f), len(missing_ids)))
missing_ids

In [None]:
# Read the stored text of every article, in the same order as the rows of `f`.
article_bodies = []
import numpy as np
# Index with [] so that a missing "id" column raises a clear KeyError.
id_list = f["id"]
for article_id in id_list:
    # The context manager closes the file even if reading or decoding fails.
    with open(str(article_id) + ".txt", "r", encoding="utf-8") as article_file:
        article_bodies.append(article_file.read())

In [None]:
# Peek at the first four article texts.
article_bodies[0:4]

- Create a document-term matrix, using appropriate text pre-processing and term weighting steps:

In [None]:
# Report how many article texts were loaded.
n_documents = len(article_bodies)
print("Read %d raw text documents" % n_documents)

In [None]:
# Read the id/category table again and attach each article's text as a "body" column.
df = pd.read_csv("categories4.csv").assign(body=article_bodies)
df.head()

In [None]:
import nltk
#nltk.download()
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# The "body" column of `df` (the raw article texts).
# NOTE(review): this rebinds `articles`, which the setup cell created as an empty list,
# and nothing visible below reads it — confirm it is unused before removing.
articles = df["body"]
    
def lemmatizer_token(text):
    """Tokenise ``text`` and return its lower-cased, lemmatised tokens as a list.

    The text is split with NLTK's ``word_tokenize``; each token is lower-cased
    and passed to ``WordNetLemmatizer.lemmatize`` without a part-of-speech
    argument. Token order is preserved.
    """
    words = nltk.tokenize.word_tokenize(text)
    wordnet = nltk.stem.WordNetLemmatizer()
    return [wordnet.lemmatize(word.lower()) for word in words]

# Build the TF-IDF document-term matrix: English stop words removed, tokens produced
# by lemmatizer_token, and terms kept only if they occur in at least 5 documents.
vectorizer = TfidfVectorizer(stop_words="english", tokenizer=lemmatizer_token, min_df=5)
tfidf = vectorizer.fit_transform(df["body"])

# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement and
# fall back to the old method only on releases that predate get_feature_names_out().
if hasattr(vectorizer, "get_feature_names_out"):
    document = list(vectorizer.get_feature_names_out())
else:
    document = vectorizer.get_feature_names()

# Dense copy of the matrix, wrapped in a DataFrame (rows = article ids, columns = terms).
tf_idf = tfidf.toarray()
tf_df = pd.DataFrame(data=tf_idf, index=df['id'], columns=document)
tf_df.head()

- Build two multi-class classification models:

In [None]:
# KNN
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
# Features (dense TF-IDF matrix) and class labels used by the classifiers.
X, y = tf_idf, df['category']

# 3-nearest-neighbour classifier: fitted on all the data, then scored with
# 5-fold cross-validation.
model = KNeighborsClassifier(n_neighbors=3)
model.fit(X, y)
score1 = cross_val_score(estimator=model, X=X, y=y, cv=5)
print(score1)

In [None]:
# Naive Bayes
from sklearn.naive_bayes import GaussianNB
# Gaussian Naive Bayes on the same features: fitted on all the data, then scored
# with 5-fold cross-validation.
model2 = GaussianNB()
model2.fit(X, y)
score2 = cross_val_score(estimator=model2, X=X, y=y, cv=5)
print(score2)

- Evaluation:

In [None]:
# KNN
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_predict

# Build the confusion matrix from out-of-fold predictions. Predicting the same rows
# the model was fitted on overstates accuracy (with KNN every article is its own
# nearest neighbour), so each article is predicted by a model that never saw it.
predicted = cross_val_predict(model, X, y, cv=5)
cm = confusion_matrix(y, predicted)
print(cm)

In [None]:
# NB
from sklearn.model_selection import cross_val_predict

# Build the confusion matrix from out-of-fold predictions: predicting the rows the
# model was fitted on measures how well it memorised them, not how well it generalises.
predicted = cross_val_predict(model2, X, y, cv=5)
cm2 = confusion_matrix(y, predicted)
print(cm2)

- Visualisation of the evaluation results:

In [None]:
import matplotlib.pyplot as plt
# Histogram of the KNN cross-validation scores, labelled so that the figure can be
# read on its own and told apart from the Naive Bayes one.
fig, ax = plt.subplots()
ax.hist(score1)
ax.set_title("KNN: distribution of cross-validation scores")
ax.set_xlabel("Accuracy per fold")
ax.set_ylabel("Number of folds")
plt.show()

In [None]:
# Histogram of the Naive Bayes cross-validation scores, labelled so that the figure
# can be read on its own and told apart from the KNN one.
fig, ax = plt.subplots()
ax.hist(score2, color="red")
ax.set_title("Naive Bayes: distribution of cross-validation scores")
ax.set_xlabel("Accuracy per fold")
ax.set_ylabel("Number of folds")
plt.show()

# The same scores in fold order.
fig, ax = plt.subplots()
ax.plot(score2)
ax.set_title("Naive Bayes: cross-validation score per fold")
ax.set_xlabel("Fold index")
ax.set_ylabel("Accuracy")
plt.show()

- Conclusion:
 With all the data stored and the document-term matrix created, the next step was to build two multi-class classification models: the k-Nearest Neighbour (KNN) classifier and the Naive Bayes (NB) classifier. From the results obtained by both, the KNN classifier has more accurate scores than the NB classifier, so the first one is more precise. To check this in the evaluation stage, I used the confusion matrix as a tool to measure accuracy, and in the visual representation a larger accumulation of higher scores can be seen for the first classifier than for the second one. Therefore, I can affirm that the first classifier (KNN) has worked better than the second one (NB).