In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn import preprocessing


import nltk
nltk.download('punkt')
nltk.download('stopwords')
  

import time
import numpy as np
from nltk.corpus import stopwords

import tensorflow as tf


from keras.models import Sequential
from keras.layers import Dense
from keras.utils import plot_model

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
#import stop words, data and do some pre processing
stop_words = set(stopwords.words('english'))
with open('/content/drive/My Drive/stopwords.txt') as f:
    for line in f:
        stop_words.add(line[:-1])
stop_words = list(stop_words)
#Read the csv file and change the encoding, remove tags,lower them
df_train = pd.read_csv('/content/drive/MyDrive/bigdata2023-exercise1-classification/train.csv', encoding='utf-8')
df_train['Title'] = df_train['Title'].str.encode('ascii', 'ignore').str.decode('ascii').str.lower().str.replace('<br />','')
df_train['Content'] = df_train['Content'].str.encode('ascii', 'ignore').str.decode('ascii').str.lower().str.replace('<br />','')
df_train['Label'] = df_train['Label'].str.encode('ascii', 'ignore').str.decode('ascii').str.lower().str.replace('<br />','')

#make a new column as a combination of title & content 
df_train['Combined']  = 3*(df_train['Title'] + ' ')  + df_train['Content']

In [None]:
#initializations
le = preprocessing.LabelEncoder()
y = le.fit_transform(df_train['Label'])


In [None]:
vectorizer = TfidfVectorizer(stop_words=stop_words, max_features=1000)
X = vectorizer.fit_transform(df_train['Combined']).toarray()


In [None]:
X.shape

(111795, 1000)

In [None]:
type(X)

numpy.ndarray

In [None]:
model = Sequential()
model.add(Dense(256, activation='relu', input_dim=1000))
model.add(Dense(128, activation='relu'))
model.add(Dense(4, activation='softmax'))
model.compile(loss='sparse_categorical_crossentropy',optimizer='adam',metrics=['accuracy'])


In [None]:
print(y)

[1 1 3 ... 2 1 1]


In [22]:
total_time = time.time()



kfold_time = time.time()
kf = KFold(n_splits=5)
accuracy = 0
precision = 0
recall = 0
fmeasure = 0


for train_index, test_index in kf.split(X):
    
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    model.fit(X_train, y_train,epochs=3, batch_size=1, verbose=1)
    plot_model(model, to_file='dnn.png', show_shapes=True)
    y_pred = model.predict(X_test)
    y_pred = np.argmax(y_pred, axis = 1)
    
    accuracy += accuracy_score(y_test, y_pred)
    precision += precision_score(y_test, y_pred, average='macro')
    recall += recall_score(y_test, y_pred, average='macro')
    fmeasure += f1_score(y_test, y_pred, average='macro')

accuracy /= 5
precision /= 5
recall /= 5
fmeasure /= 5

print('accuracy = {}, precision = {}, recall = {}, f1-measure = {}'.format(round(accuracy, 4), round(precision,4), round(recall,4), round(fmeasure,4)))
print('3-fold time: {} s'.format(time.time() - kfold_time))
print('Total for Neural Network Classifier: {} s'.format(time.time() - total_time))

accuracy = 0.9923, precision = 0.992, recall = 0.9909, f1-measure = 0.9914
3-fold time: 747.9303843975067 s
Total for Neural Network Classifier: 747.9315810203552 s
