In [None]:
# Attach Google Drive to the Colab runtime's filesystem.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn import preprocessing

import nltk
# Fetch the NLTK resources, in the same order as before.
for nltk_resource in ('punkt', 'stopwords'):
    nltk.download(nltk_resource)
  

import time
import numpy as np
from nltk.corpus import stopwords

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Build the stop-word list, load the training data and normalise its text columns.

# Stop words: NLTK's English list plus the extra words in stopwords.txt
# (one word per line).
stop_words = set(stopwords.words('english'))
with open('/content/stopwords.txt') as f:
    for line in f:
        # strip() rather than line[:-1]: slicing drops the last character even
        # when the line has no trailing newline (typically the final line of
        # the file), which would silently truncate that word.
        word = line.strip()
        if word:  # skip blank lines so '' never becomes a stop word
            stop_words.add(word)
# Sorted so the resulting list is the same on every run.
stop_words = sorted(stop_words)


def _clean_text(column):
    """Return `column` with non-ASCII characters dropped, text lower-cased
    and literal '<br />' tags removed.

    Parameters
    ----------
    column : pandas.Series
        Series of strings to normalise.

    Returns
    -------
    pandas.Series
        A new Series with the cleaned strings; the input is not modified.
    """
    return (
        column
        .str.encode('ascii', 'ignore')   # drop anything that is not ASCII
        .str.decode('ascii')
        .str.lower()
        # regex=False: '<br />' is a literal tag, not a pattern
        .str.replace('<br />', '', regex=False)
    )


# Read the csv file and apply the same clean-up to every text column.
df_train = pd.read_csv('/content/drive/MyDrive/bigdata2023-exercise1-classification/train.csv', encoding='utf-8')
for text_column in ('Title', 'Content', 'Label'):
    df_train[text_column] = _clean_text(df_train[text_column])

# New column: the title repeated three times (each copy followed by a space)
# and then the content, so title terms occur three extra times in the text.
df_train['Combined'] = (df_train['Title'] + ' ').str.repeat(3) + df_train['Content']

In [None]:
# Set-up shared by the evaluation cells: integer-encoded targets and the k-NN model.
le = preprocessing.LabelEncoder()
le.fit(df_train['Label'])
y = le.transform(df_train['Label'])  # one integer class id per training row

clf = KNeighborsClassifier(n_neighbors=10)


In [None]:
# k-NN on TF-IDF features, evaluated with 5-fold cross-validation.
total_time = time.time()

# NOTE(review): the vectorizer is fitted on every document before the data is
# split, so the IDF weights used in each fold also reflect that fold's test
# documents. Kept as is so the reported numbers stay comparable.
vectorizer = TfidfVectorizer(stop_words=stop_words)
X = vectorizer.fit_transform(df_train['Combined'])

kfold_time = time.time()
# NOTE(review): KFold is used without shuffle, so folds are contiguous slices
# of the file in its original row order.
kf = KFold(n_splits=5)

# Per-fold scores, one list per metric. Averages are taken over the folds that
# actually ran instead of dividing by a hard-coded 5, so changing n_splits
# above cannot leave the averages wrong.
fold_scores = {'accuracy': [], 'precision': [], 'recall': [], 'fmeasure': []}

for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    clf.fit(X_train, y_train)
    predictions = clf.predict(X_test)

    fold_scores['accuracy'].append(accuracy_score(y_test, predictions))
    # macro average: every class contributes equally to the score
    fold_scores['precision'].append(precision_score(y_test, predictions, average='macro'))
    fold_scores['recall'].append(recall_score(y_test, predictions, average='macro'))
    fold_scores['fmeasure'].append(f1_score(y_test, predictions, average='macro'))

accuracy, precision, recall, fmeasure = (
    sum(fold_scores[metric]) / len(fold_scores[metric])
    for metric in ('accuracy', 'precision', 'recall', 'fmeasure')
)

print('accuracy = {}, precision = {}, recall = {}, f1-measure = {}'.format(round(accuracy, 4), round(precision,4), round(recall,4), round(fmeasure,4)))
print('5-fold time: {} s'.format(time.time() - kfold_time))
print('Total for KNeighborsClassifier: {} s'.format(time.time() - total_time))



accuracy = 0.9725, precision = 0.9703, recall = 0.9691, f1-measure = 0.9697
5-fold time: 783.4761099815369 s
Total for KNeighborsClassifier: 815.9080400466919 s


In [None]:
# k-NN on SVD-reduced TF-IDF features, evaluated with 5-fold cross-validation.
total_time = time.time()
# with SVD
svd = TruncatedSVD(n_components=20, random_state=42)
# Store the projection under its own name instead of overwriting X.
# With `X = svd.fit_transform(X)`, re-running this cell would apply SVD to the
# already-reduced 20-column matrix, and the TF-IDF matrix would be lost.
# NOTE(review): the SVD is fitted on all rows before splitting, so each fold's
# test rows influence the projection.
X_svd = svd.fit_transform(X)

print('Starting 5-fold for SVD')
kfold_time = time.time()
kf = KFold(n_splits=5)

# Per-fold scores, one list per metric. Averages are taken over the folds that
# actually ran instead of dividing by a hard-coded 5.
fold_scores = {'accuracy': [], 'precision': [], 'recall': [], 'fmeasure': []}

for train_index, test_index in kf.split(X_svd):
    X_train, X_test = X_svd[train_index], X_svd[test_index]
    y_train, y_test = y[train_index], y[test_index]

    clf.fit(X_train, y_train)
    predictions = clf.predict(X_test)

    fold_scores['accuracy'].append(accuracy_score(y_test, predictions))
    # macro average: every class contributes equally to the score
    fold_scores['precision'].append(precision_score(y_test, predictions, average='macro'))
    fold_scores['recall'].append(recall_score(y_test, predictions, average='macro'))
    fold_scores['fmeasure'].append(f1_score(y_test, predictions, average='macro'))

accuracy, precision, recall, fmeasure = (
    sum(fold_scores[metric]) / len(fold_scores[metric])
    for metric in ('accuracy', 'precision', 'recall', 'fmeasure')
)

print('accuracy = {}, precision = {}, recall = {}, f1-measure = {}'.format(round(accuracy, 4), round(precision,4), round(recall,4), round(fmeasure,4)))
print('5-fold time: {} s'.format(time.time() - kfold_time))
print('Total for KNeighborsClassifier with SVD: {} s'.format(time.time() - total_time))


Starting 5-fold for SVD


KeyboardInterrupt: ignored