In [1]:
import pandas as pd
import numpy as np
from google.colab import drive


# Mount Google Drive
drive.mount('/content/drive')

# read train data
with open('/content/drive/MyDrive/LUN_data/raw_data/fulltrain.csv', 'r') as file:
    train = pd.read_csv(file, names=['class', 'text'])

# read test data
with open('/content/drive/MyDrive/LUN_data/raw_data/balancedtest.csv', 'r') as file:
    test = pd.read_csv(file, names=['class', 'text'])

# read stop words
with open('/content/drive/MyDrive/LUN_data/raw_data/stopwords_en.txt', 'r') as file:
    stop_words = file.read().splitlines()

Mounted at /content/drive


## Feature Engineering

In [2]:
def remove_inconsistent(df):
    grouped = df.groupby('text')['class']
    consistent_duplicates = grouped.transform(lambda x: x.nunique() == 1)
    inconsistent_duplicates = df[~consistent_duplicates].copy()
    df.drop(inconsistent_duplicates.index, inplace=True)
    return df

def remove_all_duplicates(df):
    return df.drop_duplicates(subset='text', keep='first')

# Remove inconsistent entries
train = remove_inconsistent(train)

# Remove all remaining duplicates
train = remove_all_duplicates(train)

In [8]:
import re
import string
import nltk

def preprocess_text(text):
    """
    Preprocesses text data:
      * Lowercasing
      * Removing square brackets and content
      * Removing links
      * Removing punctuation
      * Removing numbers
      * Removing stop words
      * Stemming (optional)

    Args:
       text: Input text string

    Returns:
       Cleaned text string
    """
    text = text.lower()
    # text = re.sub('\[.*?\]', '', text)
    # text = re.sub('https?://\S+|www\.\S+', '', text)
    # text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    # text = re.sub('\w*\d\w*', '', text)

    tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(text)

    words = [w for w in tokens if w not in stop_words]

    return ' '.join(words)

In [9]:
train['text'] = train['text'].apply(preprocess_text)
test['text'] = test['text'].apply(preprocess_text)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import train_test_split

train_data, eval_data = train_test_split(train, test_size=0.2, random_state=42)
test_data = test

X_train = train['text']
X_test = test['text']

# Create TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=20000)

train_data_X = tfidf_vectorizer.fit_transform(train_data['text'])
eval_data_X = tfidf_vectorizer.transform(eval_data['text'])
test_data_X = tfidf_vectorizer.transform(test['text'])

from sklearn.neural_network import MLPClassifier
model = MLPClassifier(hidden_layer_sizes=(100,), max_iter=1000)
model.fit(train_data_X, train_data['class'])

# # Convert labels to one-hot form
# y_train = pd.get_dummies(df['class'])

In [20]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report
y_pred = model.predict(eval_data_X)

# Calculate metrics
accuracy = accuracy_score(eval_data['class'], y_pred)
f1 = f1_score(eval_data['class'], y_pred, average='macro')
precision = precision_score(eval_data['class'], y_pred, average='macro')
recall = recall_score(eval_data['class'], y_pred, average='macro')
report = classification_report(eval_data['class'], y_pred)

# Print formatted output
print(f'Accuracy: {accuracy:.4f}') # 4 decimal places for accuracy
print(f'F1 Score (Macro Average): {f1:.4f}')
print(f'Precision (Macro Average): {precision:.4f}')
print(f'Recall (Macro Average): {recall:.4f}')
print('Classification Report:\n', report)

Accuracy: 0.9617
F1 Score (Macro Average): 0.9599
Precision (Macro Average): 0.9616
Recall (Macro Average): 0.9583
Classification Report:
               precision    recall  f1-score   support

           1       0.96      0.96      0.96      2764
           2       0.97      0.96      0.97      1366
           3       0.97      0.98      0.97      3594
           4       0.95      0.94      0.94      2007

    accuracy                           0.96      9731
   macro avg       0.96      0.96      0.96      9731
weighted avg       0.96      0.96      0.96      9731



In [21]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report
y_pred = model.predict(test_data_X)

accuracy = accuracy_score(test_data['class'], y_pred)
f1 = f1_score(test_data['class'], y_pred, average='macro')
precision = precision_score(test_data['class'], y_pred, average='macro')
recall = recall_score(test_data['class'], y_pred, average='macro')
report = classification_report(test_data['class'], y_pred)

print(f'Accuracy: {accuracy:.4f}') # 4 decimal places for accuracy
print(f'F1 Score (Macro Average): {f1:.4f}')
print(f'Precision (Macro Average): {precision:.4f}')
print(f'Recall (Macro Average): {recall:.4f}')
print('Classification Report:\n', report)

Accuracy: 0.7370
F1 Score (Macro Average): 0.7304
Precision (Macro Average): 0.7503
Recall (Macro Average): 0.7370
Classification Report:
               precision    recall  f1-score   support

           1       0.84      0.78      0.81       750
           2       0.78      0.48      0.60       750
           3       0.61      0.78      0.69       750
           4       0.76      0.91      0.83       750

    accuracy                           0.74      3000
   macro avg       0.75      0.74      0.73      3000
weighted avg       0.75      0.74      0.73      3000



In [None]:
# train_vectors = pd.DataFrame(X_train, columns=tfidf_vectorizer.get_feature_names_out())
# train_vectors.to_csv('fulltrain_tfidf_vectors.csv', index=False)

# test_vectors = pd.DataFrame(X_test, columns=tfidf_vectorizer.get_feature_names_out())
# test_vectors.to_csv('test_tfidf_vectors.csv', index=False)