In [None]:
import pandas as pd
import numpy as np
from google.colab import drive


# Mount Google Drive so the raw data files are reachable from the Colab runtime
drive.mount('/content/drive')

# Folder on the mounted drive that holds the raw data files
RAW_DATA_DIR = '/content/drive/MyDrive/LUN_data/raw_data'

# The encoding is passed explicitly to every open() so decoding does not depend
# on the runtime's locale default.
# NOTE(review): presumably the files are UTF-8 (what a Colab runtime would use
# by default) -- confirm if a file fails to decode.

# read train data; column names are supplied here, so the first line of the
# file is read as data rather than as a header
with open(f'{RAW_DATA_DIR}/fulltrain.csv', 'r', encoding='utf-8') as file:
    train = pd.read_csv(file, names=['class', 'text'])

# read test data (same two-column layout as the train file)
with open(f'{RAW_DATA_DIR}/balancedtest.csv', 'r', encoding='utf-8') as file:
    test = pd.read_csv(file, names=['class', 'text'])

# read stop words: one entry per line, kept as a list of strings
with open(f'{RAW_DATA_DIR}/stopwords_en.txt', 'r', encoding='utf-8') as file:
    stop_words = file.read().splitlines()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Feature Engineering

In [None]:
def remove_inconsistent(df):
    """
    Drop every row whose text appears with more than one distinct class.

    Rows are grouped by the 'text' column; a group is kept only when its
    'class' column holds exactly one distinct value. All rows of any other
    group are removed.

    Args:
       df: DataFrame with 'text' and 'class' columns. It is modified in
           place. Rows are dropped by index label, so if index labels are
           not unique, every row sharing a dropped label is removed too.

    Returns:
       The same DataFrame object that was passed in, after the drop.
    """
    # Built-in 'nunique' kernel broadcast back to row level; avoids calling a
    # Python lambda once per distinct text.
    classes_per_text = df.groupby('text')['class'].transform('nunique')
    inconsistent_labels = df.index[classes_per_text != 1]
    # In-place drop is kept on purpose: existing callers may rely on the
    # argument itself being modified.
    df.drop(inconsistent_labels, inplace=True)
    return df

def remove_all_duplicates(df):
    """
    Keep only the first row for each distinct value of the 'text' column.

    Args:
       df: DataFrame with a 'text' column. It is not modified.

    Returns:
       A new DataFrame without the later repeats; surviving rows keep their
       original index labels.
    """
    first_occurrences = df.drop_duplicates(subset=['text'], keep='first')
    return first_occurrences

# Clean the training set in two passes:
#   1. remove texts that occur with conflicting classes,
#   2. collapse the remaining repeated texts to their first occurrence.
train = (
    train
    .pipe(remove_inconsistent)
    .pipe(remove_all_duplicates)
)

In [None]:
import re
import string
import nltk

def preprocess_text(text):
    """
    Clean a raw document and return it as a space-separated string of tokens.

    Steps, applied in this order:
      * Lowercasing
      * Removing square brackets and their content
      * Removing links (http://, https:// and www.)
      * Removing angle-bracket tags (<...>)
      * Removing punctuation
      * Replacing newlines with spaces
      * Removing every word that contains a digit
      * Tokenizing on runs of word characters
      * Removing stop words (taken from the module-level ``stop_words``)

    Args:
       text: Input text string

    Returns:
       Cleaned text string: the remaining tokens joined by single spaces
    """
    text = text.lower()
    # Raw string literals keep the regex escapes (\[, \S, \w, \d) from being
    # parsed as invalid Python string escapes.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # A newline becomes a space, not '': otherwise the last word of one line
    # is glued to the first word of the next line.
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'\w*\d\w*', '', text)

    # Yields the same tokens as nltk's RegexpTokenizer(r'\w+') without
    # building a tokenizer object on every call.
    tokens = re.findall(r'\w+', text)

    # Set membership is O(1) per token; testing against the list scanned the
    # whole stop-word list for every token.
    stop_set = set(stop_words)
    words = [w for w in tokens if w not in stop_set]

    return ' '.join(words)

In [None]:
# Replace the raw text of both splits with its cleaned form, in place.
for frame in (train, test):
    frame['text'] = frame['text'].apply(preprocess_text)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

# Create TF-IDF vectorizer, keeping at most 20000 vocabulary terms
tfidf_vectorizer = TfidfVectorizer(max_features=20000)

# The vocabulary and IDF weights are fitted on the training text only; the
# test text is transformed with that already-fitted vectorizer.
# NOTE(review): toarray() turns each sparse result into a dense array with one
# column per vocabulary term -- confirm downstream code needs dense input,
# since this can use a lot of memory.
X_train = tfidf_vectorizer.fit_transform(train['text']).toarray()
X_test = tfidf_vectorizer.transform(test['text']).toarray()

# # Convert labels to one-hot form
# y_train = pd.get_dummies(df['class'])

In [None]:
# train_vectors = pd.DataFrame(X_train, columns=tfidf_vectorizer.get_feature_names_out())
# train_vectors.to_csv('fulltrain_tfidf_vectors.csv', index=False)

# test_vectors = pd.DataFrame(X_test, columns=tfidf_vectorizer.get_feature_names_out())
# test_vectors.to_csv('test_tfidf_vectors.csv', index=False)