In [1]:
# Mount Google Drive into the Colab runtime; the training CSV is read from
# a path under /content/drive later in the notebook.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn import preprocessing

import nltk
# Fetch the NLTK resources this notebook asks for, in the same order as before.
for nltk_resource in ('punkt', 'stopwords'):
    nltk.download(nltk_resource)
  

import time
import numpy as np
from nltk.corpus import stopwords

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [3]:
# Load stop words and the training data, then normalise the text columns.

# How many times the title is repeated in front of the content, so that title
# terms carry more weight in the term statistics.
TITLE_WEIGHT = 3


def _clean_text(column):
    """Return `column` with non-ASCII characters dropped, text lower-cased,
    and literal '<br />' tags removed.

    Parameters
    ----------
    column : pandas.Series of str

    Returns
    -------
    pandas.Series of str (missing values stay missing).
    """
    return (
        column.str.encode('ascii', 'ignore')
        .str.decode('ascii')
        .str.lower()
        # regex=False: the tag is a literal string, and this keeps the result
        # independent of the pandas version's default for `regex`.
        .str.replace('<br />', '', regex=False)
    )


# NLTK's English stop words plus the extra words listed one per line in the file.
stop_words = set(stopwords.words('english'))
with open('/content/stopwords.txt') as f:
    for line in f:
        # strip() rather than line[:-1]: slicing would cut the last character
        # off the final word when the file has no trailing newline.
        word = line.strip()
        if word:  # ignore blank lines instead of adding '' as a stop word
            stop_words.add(word)
# TfidfVectorizer expects a list; sorting makes the order deterministic.
stop_words = sorted(stop_words)

# Read the csv file, then clean every text column the same way.
df_train = pd.read_csv('/content/drive/MyDrive/bigdata2023-exercise1-classification/train.csv', encoding='utf-8')
for column_name in ('Title', 'Content', 'Label'):
    df_train[column_name] = _clean_text(df_train[column_name])

# New column: the title repeated TITLE_WEIGHT times, followed by the content.
# Missing titles/contents are treated as empty text so that 'Combined' never
# holds NaN (the vectorizer rejects NaN documents).
title_text = df_train['Title'].fillna('')
content_text = df_train['Content'].fillna('')
df_train['Combined'] = (title_text + ' ').str.repeat(TITLE_WEIGHT) + content_text

In [4]:
# Shared setup: integer-encoded target vector and the classifier that the
# evaluation cells fit.
le = preprocessing.LabelEncoder()
le.fit(df_train['Label'])
y = le.transform(df_train['Label'])

clf = RandomForestClassifier(
    n_estimators=100,
    max_depth=18,
    random_state=42,
)

In [5]:
# TF-IDF features + RandomForest, evaluated with k-fold cross-validation.

# Single source of truth for the fold count: used by KFold, by the averaging
# below, and by the timing message, so they cannot drift apart.
N_SPLITS = 5

total_time = time.time()
vectorizer = TfidfVectorizer(stop_words=stop_words)
X = vectorizer.fit_transform(df_train['Combined'])

kfold_time = time.time()
kf = KFold(n_splits=N_SPLITS)
# Per-fold scores, keyed by the name of the variable that receives the mean.
fold_scores = {'accuracy': [], 'precision': [], 'recall': [], 'fmeasure': []}

for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # fit() retrains the classifier from scratch on this fold's training part.
    clf.fit(X_train, y_train)
    predictions = clf.predict(X_test)

    fold_scores['accuracy'].append(accuracy_score(y_test, predictions))
    fold_scores['precision'].append(precision_score(y_test, predictions, average='macro'))
    fold_scores['recall'].append(recall_score(y_test, predictions, average='macro'))
    fold_scores['fmeasure'].append(f1_score(y_test, predictions, average='macro'))

# Mean over folds (same left-to-right summation as a running total).
accuracy = sum(fold_scores['accuracy']) / N_SPLITS
precision = sum(fold_scores['precision']) / N_SPLITS
recall = sum(fold_scores['recall']) / N_SPLITS
fmeasure = sum(fold_scores['fmeasure']) / N_SPLITS

print('accuracy = {}, precision = {}, recall = {}, f1-measure = {}'.format(round(accuracy, 4), round(precision,4), round(recall,4), round(fmeasure,4)))
print('{}-fold time: {} s'.format(N_SPLITS, time.time() - kfold_time))
print('Total for RandomForestClassifier: {} s'.format(time.time() - total_time))

# Restart the timer: the SVD cell that follows reports its total time
# relative to this value.
total_time = time.time()



accuracy = 0.7569, precision = 0.866, recall = 0.6749, f1-measure = 0.7228
5-fold time: 467.7227358818054 s
Total for RandomForestClassifier: 509.08939146995544 s


In [None]:
# RandomForest on SVD-reduced TF-IDF features, evaluated with k-fold
# cross-validation.

# Single source of truth for the fold count: used by KFold, by the averaging
# below, and by the progress/timing messages.
N_SPLITS = 5

# Keep the TF-IDF matrix `X` intact and store the reduced features under a new
# name. Overwriting `X` made this cell non-idempotent: a second run would
# apply SVD to the already-reduced matrix.
svd = TruncatedSVD(n_components=20, random_state=42)
X_svd = svd.fit_transform(X)

print('Starting {}-fold for SVD'.format(N_SPLITS))
kfold_time = time.time()
kf = KFold(n_splits=N_SPLITS)
# Per-fold scores, keyed by the name of the variable that receives the mean.
fold_scores = {'accuracy': [], 'precision': [], 'recall': [], 'fmeasure': []}

for train_index, test_index in kf.split(X_svd):
    X_train, X_test = X_svd[train_index], X_svd[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # fit() retrains the classifier from scratch on this fold's training part.
    clf.fit(X_train, y_train)
    predictions = clf.predict(X_test)

    fold_scores['accuracy'].append(accuracy_score(y_test, predictions))
    fold_scores['precision'].append(precision_score(y_test, predictions, average='macro'))
    fold_scores['recall'].append(recall_score(y_test, predictions, average='macro'))
    fold_scores['fmeasure'].append(f1_score(y_test, predictions, average='macro'))

# Mean over folds (same left-to-right summation as a running total).
accuracy = sum(fold_scores['accuracy']) / N_SPLITS
precision = sum(fold_scores['precision']) / N_SPLITS
recall = sum(fold_scores['recall']) / N_SPLITS
fmeasure = sum(fold_scores['fmeasure']) / N_SPLITS

print('accuracy = {}, precision = {}, recall = {}, f1-measure = {}'.format(round(accuracy, 4), round(precision,4), round(recall,4), round(fmeasure,4)))
print('{}-fold time: {} s'.format(N_SPLITS, time.time() - kfold_time))
# `total_time` is the timer restarted at the end of the TF-IDF evaluation cell.
print('Total for RandomForestClassifier with SVD: {} s'.format(time.time() - total_time))




[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Starting 5-fold for BOW
accuracy = 0.7558, precision = 0.8656, recall = 0.6746, f1-measure = 0.7225
5-fold time: 428.73845505714417 s
Total for BOW: 469.54180216789246 s
Starting 5-fold for SVD
accuracy = 0.9351, precision = 0.9305, recall = 0.9243, f1-measure = 0.9273
5-fold time: 282.126202583313 s
Total for SVD: 295.05093145370483 s
