# **Text classification and Authorship Attribution**

## Import libraries:

In [1]:
# PyTorch and Transformers installation
# (not needed for the basic classificaiton models):
! pip install transformers datasets
! pip3 install torch
! pip install spacy
!python -m spacy download en_core_web_lg

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting requests (from transformers)
  Downloading requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.5.0,>=2023.1.0 (from fsspec[http]<=2024.5.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.5.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m28.1 MB/s[0m eta [36m0:00:00[0m


In [2]:
import numpy as np
from scipy.special import softmax
import csv
import urllib.request
import nltk, scipy
import spacy
import time
import sklearn
import torch
from tqdm import tqdm
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score, make_scorer, recall_score, average_precision_score,precision_score
from sklearn.svm import SVC, LinearSVC
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
from datetime import datetime, date
import gensim
from torch import nn
from torch.utils.data import Dataset, DataLoader, random_split
import matplotlib.pyplot as plt
import re
import torch.optim as optim
import tensorflow as tf
from torch.utils.data import TensorDataset, DataLoader
from keras.optimizers import Adam
from keras import metrics
from keras import backend as K
import datetime
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
from sklearn.model_selection import (
    cross_val_score,
    train_test_split,
    GridSearchCV,
    KFold)

In [3]:
import nltk
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [4]:
def who_am_i():  # this is not a class method
    """Returns a ductionary with your name, id number and email. keys=['name', 'id','email']
        Make sure you return your own info!
    """
    return {'name': 'Maxim Katz & Yuval Levi', 'id': '322406604 and 325120384', 'email': 'katzmax@post.bgu.ac.il and yulev@post.bgu.ac.il' }

## Data Processing:

In [5]:
def clean_text(text):
    lemmatizer = WordNetLemmatizer()
    stopwords_set = set(stopwords.words('english'))

    # Function to remove specific patterns
    def remove_patterns(text, patterns):
        for pattern in patterns:
            text = re.sub(pattern, ' ', text)
        return text

    # Convert to lowercase:
    text = text.lower()

    # Define patterns to be removed:
    patterns = [
        r'\d+',  # Remove digits
        r'[^\x00-\x7F]+',  # Remove non-ASCII characters
        r'@\w+',  # Remove mentions
        r'#\w+',  # Remove hashtags
        r'http\S+|www\S+|https\S+',  # Remove URLs
        r'[^\w\s]',  # Remove punctuation
        r'\s+'  # Remove extra white space
    ]
    # Apply pattern removal:
    text = remove_patterns(text, patterns).strip()
    # Tokenize the text:
    words = text.split()
    # Remove stopwords and lemmatize:
    words = [lemmatizer.lemmatize(word) for word in words if word not in stopwords_set]
    # Join words back to a string:
    text = ' '.join(words)
    return text

In [6]:
def preprocess_df(trump_data, split=False, scale_type=None, nonvector=False):
    trump_df = trump_data.copy()
    trump_df['y_true'] = trump_df['device'].apply(lambda x: 0 if x == 'android' else 1)

    # Label encoding user_handle as categorical:
    label_encoder = LabelEncoder()
    trump_df['user_handle_encoded'] = label_encoder.fit_transform(trump_df['user_handle'])

    # Define regex pattern for the format '%Y-%m-%d %H:%M:%S'
    pattern = r'\b\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\b'
    # Extract timestamps from rows with invalid timestamps
    trump_df['corrected_timestamp'] = trump_df['timestamp']
    invalid_timestamp_mask = pd.to_datetime(trump_df['timestamp'], errors='coerce').isnull()
    invalid_timestamp_df = trump_df[invalid_timestamp_mask].copy()
    invalid_timestamp_df['corrected_timestamp'] = invalid_timestamp_df['tweet_text'].apply(
        lambda text: re.findall(pattern, text)[0] if re.findall(pattern, text) else None)
    trump_df.loc[invalid_timestamp_mask, 'corrected_timestamp'] = invalid_timestamp_df['corrected_timestamp']
    # Ensure the corrected_timestamp column is datetimelike
    trump_df['corrected_timestamp'] = pd.to_datetime(trump_df['corrected_timestamp'], errors='coerce')
    # Preprocess timestamp column:
    trump_df['timestamp_year'] = trump_df['corrected_timestamp'].dt.year
    trump_df['timestamp_month'] = trump_df['corrected_timestamp'].dt.month
    trump_df['timestamp_day'] = trump_df['corrected_timestamp'].dt.day
    trump_df['timestamp_hour'] = trump_df['corrected_timestamp'].dt.hour
    trump_df['timestamp_minute'] = trump_df['corrected_timestamp'].dt.minute
    trump_df['timestamp_second'] = trump_df['corrected_timestamp'].dt.second

    # Number of some unique characters:
    trump_df['num_hashtags'] = trump_df['tweet_text'].apply(lambda x: x.count('#'))
    trump_df['num_mentions'] = trump_df['tweet_text'].apply(lambda x: x.count('@'))
    trump_df['num_urls'] = trump_df['tweet_text'].apply(lambda x: x.count('http://') + x.count('https://'))
    trump_df['num_questions'] = trump_df['tweet_text'].apply(lambda x: x.count('?'))
    trump_df['num_exclamations'] = trump_df['tweet_text'].apply(lambda x: x.count('!'))
    trump_df['num_words'] = trump_df['tweet_text'].apply(lambda x: len(x.split()))
    trump_df['num_uppercase_words'] = trump_df['tweet_text'].apply(lambda x: sum([1 for word in x.split() if word.isupper()]))
    trump_df['cleaned_tweet_text'] = trump_df['tweet_text'].apply(clean_text)
    if nonvector==False:
      # Vectorize the words in the text in specific column:
      nlp = spacy.load('en_core_web_lg')
      vector_list = []
      for plot in tqdm(trump_df['cleaned_tweet_text']):
          vector_list.append(nlp(plot).vector)

      features = [f"Feature {i}" for i in range(len(vector_list[0]))]
      features_data = pd.DataFrame(vector_list, columns=features)
      new_trump_df = pd.concat([trump_df, features_data], axis=1)
    else:
      new_trump_df = trump_df
    # Clean the dataframe:
    new_trump_df = new_trump_df.dropna()
    new_trump_df = new_trump_df.drop_duplicates()

    # Remove specific columns:
    columns_to_remove = ['tweet_id', 'user_handle', 'tweet_text', 'timestamp', 'device', 'cleaned_tweet_text',"corrected_timestamp"]
    new_trump_df = new_trump_df.drop(columns_to_remove, axis=1)

    x = new_trump_df.drop('y_true', axis=1)
    y = new_trump_df['y_true']

    # Apply scaling if specified
    if scale_type == 'min_max':
        scaler = MinMaxScaler()
    elif scale_type == 'standard':
        scaler = StandardScaler()
    else:
        scaler = None

    if scaler:
        x = pd.DataFrame(scaler.fit_transform(x), columns=x.columns)

    if split:
        X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.20, shuffle=True)
        return X_train, X_test, y_train, y_test

    return x, y

In [7]:
def preprocess_df_test(trump_data, scale_type=None, nonvector=False):
    trump_df = trump_data.copy()

    # Preprocess user_handle column:
    label_encoder = LabelEncoder()
    trump_df['user_handle_encoded'] = label_encoder.fit_transform(trump_df['user_handle'])

    # Define regex pattern for the format '%Y-%m-%d %H:%M:%S'
    pattern = r'\b\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\b'

    # Extract timestamps from rows with invalid timestamps
    trump_df['corrected_timestamp'] = trump_df['timestamp']
    invalid_timestamp_mask = pd.to_datetime(trump_df['timestamp'], errors='coerce').isnull()
    invalid_timestamp_df = trump_df[invalid_timestamp_mask].copy()
    invalid_timestamp_df['corrected_timestamp'] = invalid_timestamp_df['tweet_text'].apply(
        lambda text: re.findall(pattern, text)[0] if re.findall(pattern, text) else None)
    trump_df.loc[invalid_timestamp_mask, 'corrected_timestamp'] = invalid_timestamp_df['corrected_timestamp']
    # Ensure the corrected_timestamp column is datetimelike
    trump_df['corrected_timestamp'] = pd.to_datetime(trump_df['corrected_timestamp'], errors='coerce')

    # Continue with timestamp column preprocessing
    trump_df['timestamp_year'] = trump_df['corrected_timestamp'].dt.year
    trump_df['timestamp_month'] = trump_df['corrected_timestamp'].dt.month
    trump_df['timestamp_day'] = trump_df['corrected_timestamp'].dt.day
    trump_df['timestamp_hour'] = trump_df['corrected_timestamp'].dt.hour
    trump_df['timestamp_minute'] = trump_df['corrected_timestamp'].dt.minute
    trump_df['timestamp_second'] = trump_df['corrected_timestamp'].dt.second

    # Number of some unique characters:
    trump_df['num_hashtags'] = trump_df['tweet_text'].apply(lambda x: x.count('#'))
    trump_df['num_mentions'] = trump_df['tweet_text'].apply(lambda x: x.count('@'))
    trump_df['num_urls'] = trump_df['tweet_text'].apply(lambda x: x.count('http://') + x.count('https://'))
    trump_df['num_questions'] = trump_df['tweet_text'].apply(lambda x: x.count('?'))
    trump_df['num_exclamations'] = trump_df['tweet_text'].apply(lambda x: x.count('!'))
    trump_df['num_words'] = trump_df['tweet_text'].apply(lambda x: len(x.split()))
    trump_df['num_uppercase_words'] = trump_df['tweet_text'].apply(lambda x: sum([1 for word in x.split() if word.isupper()]))
    trump_df['cleaned_tweet_text'] = trump_df['tweet_text'].apply(clean_text)

    if not nonvector:
        # Vectorize the words in the text in specific column:
        nlp = spacy.load('en_core_web_lg')
        vector_list = []
        for plot in tqdm(trump_df['cleaned_tweet_text']):
            vector_list.append(nlp(plot).vector)
        features = [f"Feature {i}" for i in range(len(vector_list[0]))]
        features_df = pd.DataFrame(vector_list, columns=features)
        trump_df = pd.concat([trump_df, features_df], axis=1)

    # Clean the dataframe:
    trump_df = trump_df.dropna()
    trump_df = trump_df.drop_duplicates()

    # Remove specific columns:
    columns_to_remove = ['user_handle', 'tweet_text', 'timestamp', 'cleaned_tweet_text', 'corrected_timestamp']
    trump_df = trump_df.drop(columns_to_remove, axis=1)

    # Apply scaling if specified
    if scale_type == 'min_max':
        scaler = MinMaxScaler()
    elif scale_type == 'standard':
        scaler = StandardScaler()
    else:
        scaler = None

    if scaler:
        trump_df = pd.DataFrame(scaler.fit_transform(trump_df), columns=trump_df.columns)

    return trump_df

## Data about Trump (train and test):

In [8]:
from google.colab import drive
drive.mount('/content/drive')
columns = ['tweet_id', 'user_handle', 'tweet_text', 'timestamp', 'device']
train_trump = pd.read_csv('/content/drive/MyDrive/NLP3/trump_train.tsv', sep='\t', quoting=3, header=None, names=columns)

Mounted at /content/drive


In [9]:
test = pd.read_csv('/content/drive/MyDrive/NLP3/trump_tweets_test_a.tsv', sep='\t', quoting=3, header=None, names=columns[1:-1])

## Models:

In [10]:
def perform_GS(clf, parameters, X, y, verbose=False):
    """
    Perform Grid Search Cross Validation for hyperparameters of a given classifier and return the best one.

    Args:
    clf (classifier): Classifier to apply grid search on
    parameters (dict): Model parameters to use in search
    X (pd.DataFrame): Training feature set
    y (pd.Series or np.ndarray): Training label set
    verbose (bool): Flag to print results

    Returns:
    best_estimator (classifier): Best estimator found by grid search
    """
    scoring = {
        'AUC': 'roc_auc',
        'Accuracy': make_scorer(accuracy_score),
        'Precision': make_scorer(average_precision_score, average='weighted'),
        'Recall': make_scorer(recall_score, average='weighted'),
        'F1': make_scorer(f1_score, average='weighted')
    }

    grid_search = GridSearchCV(clf, parameters, scoring=scoring, return_train_score=True, refit='Accuracy')
    start_time = time.time()
    # Ensuring column names are strings
    X.columns = X.columns.astype(str)
    grid_search.fit(X, y)
    elapsed_time = round(time.time() - start_time)
    results = pd.DataFrame(grid_search.cv_results_)
    if verbose:
        print(f"Search Time: {elapsed_time} sec")
        print("Best Parameters: ", grid_search.best_params_)
        print("Best Accuracy Score: ", grid_search.best_score_)

    return grid_search.best_estimator_

## 1. Logistic Regression

In [None]:
def val_LReg(X, y):
    # Initialize Logistic Regression model
    model = LogisticRegression()
    logistic_parameters = {'max_iter': [int(x) for x in np.linspace(start = 10000, stop = 15000, num = 5)],
                       'C': [float(x) for x in np.linspace(start = 0.2, stop = 0.8, num = 4)]}
    logistic_reg_model = perform_GS(model, logistic_parameters,X, y,True)
    return logistic_reg_model

In [None]:
def train_LReg(X, y):
    model = LogisticRegression(C=0.2, max_iter=10000)
    model.fit(X, y)
    return model

Experiment Logistic Regression

In [None]:
X_train, Y_train = preprocess_df(train_trump,False,"standard",True)
model_logreg = val_LReg(X_train,Y_train)
print(model_logreg)

Search Time: 8 sec
Best Parameters:  {'C': 0.2, 'max_iter': 10000}
Best Accuracy Score:  0.823124585618709
LogisticRegression(C=0.2, max_iter=10000)


## 2. SVM (linear):

In [None]:
def val_SVM_lnkernel(X, y):
    model = SVC(kernel='linear')
    SVC_parameters = {"max_iter": [int(x) for x in np.linspace(start = 400, stop = 800, num = 4)],
                       "C": [float(x) for x in np.linspace(start = 0.2, stop = 1.0, num = 4)], "degree": [int(x) for x in np.linspace(start = 1, stop = 5, num = 5)]}
    SVC_model = perform_GS(model, SVC_parameters,X, y,True)
    return SVC_model

In [None]:
def train_SVM_lnkernel(X, y):
    model = SVC(C=0.2, degree=1, kernel='linear', max_iter=800)
    model.fit(X, y)
    return model

Experiment SVM (linear)

In [None]:
X_strain, Y_strain = preprocess_df(train_trump,False,"standard",True)
model_svmln = val_SVM_lnkernel(X_strain,Y_strain)
print(model_svmln)



Search Time: 162 sec
Best Parameters:  {'C': 0.2, 'degree': 1, 'max_iter': 800}
Best Accuracy Score:  0.818305105177506
SVC(C=0.2, degree=1, kernel='linear', max_iter=800)




## 2. SVM (non-linear):

In [None]:
def val_SVM_nonlnkernel(X, y):
    model = SVC(kernel='sigmoid')
    SVC_parameters = {'max_iter': [int(x) for x in np.linspace(start = 400, stop = 800, num = 4)],
                       'C': [float(x) for x in np.linspace(start = 0.2, stop = 1.0, num = 4)], "degree": [int(x) for x in np.linspace(start = 1, stop = 5, num = 5)]}
    SVC_model = perform_GS(model, SVC_parameters,X, y,True)
    return SVC_model

In [None]:
def train_SVM_nonlnkernel(X, y):
    model = SVC(C=0.2, degree=1, kernel='sigmoid', max_iter=800)
    model.fit(X, y)
    return model

Experiment SVM (non-linear)

In [None]:
X_nstrain, Y_nstrain = preprocess_df(train_trump,False,"standard",True)
model_svmnonln = val_SVM_nonlnkernel(X_nstrain,Y_nstrain)
print(model_svmnonln)



Search Time: 308 sec
Best Parameters:  {'C': 0.2, 'degree': 1, 'max_iter': 800}
Best Accuracy Score:  0.8044196652803729
SVC(C=0.2, degree=1, kernel='sigmoid', max_iter=800)




## 3. FFNN:

In [19]:
import torch
import torch.nn as nn

class FFNN(nn.Module):
    def __init__(self, input_size, num_classes):
        super(FFNN, self).__init__()
        self.seq1 = nn.Sequential(
            nn.Linear(input_size, 4096),
            nn.BatchNorm1d(4096),
            nn.LeakyReLU())

        self.seq2 = nn.Sequential(
            nn.Linear(4096, 2048),
            nn.BatchNorm1d(2048),
            nn.LeakyReLU())

        self.seq3 = nn.Sequential(
            nn.Linear(2048, 1024),
            nn.BatchNorm1d(1024),
            nn.LeakyReLU())

        self.seq4 = nn.Sequential(
            nn.Linear(1024, 512),
            nn.BatchNorm1d(512),
            nn.LeakyReLU())

        self.seq5 = nn.Sequential(
            nn.Linear(512, 128),
            nn.BatchNorm1d(128),
            nn.LeakyReLU())

        self.seq6 = nn.Sequential(
            nn.Linear(128, 64),
            nn.BatchNorm1d(64),
            nn.LeakyReLU())

        self.seq7 = nn.Sequential(
            nn.Linear(64, 16),
            nn.BatchNorm1d(16),
            nn.LeakyReLU())

        self.cls = nn.Sequential(
            nn.Linear(16, num_classes),
            nn.Softmax(dim=1))

    def forward(self, X):
        X = self.seq1(X)
        X = self.seq2(X)
        X = self.seq3(X)
        X = self.seq4(X)
        X = self.seq5(X)
        X = self.seq6(X)
        X = self.seq7(X)
        X = self.cls(X)
        return X



In [None]:
class DataSetTrump(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y
    def __len__(self):
        return len(self.X)
    def __getitem__(self, idx):
        curr_y = [0, 0]
        curr_y[int(self.y[idx])] = 1
        return (self.X[idx], torch.Tensor(curr_y))

In [None]:
def ffnn_train(train_loader, val_loader, input_size, num_classes,verbose=True):
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = FFNN(input_size, num_classes)
    model.to(device)
    epochs = 100
    criterion = nn.BCELoss()
    opt = torch.optim.Adam(model.parameters(), lr=1e-5)
    training_loss = []
    validation_loss = []

    for epoch in range(epochs):
        curlos_train, curlos_val = 0, 0
        model.train()
        for x, y in train_loader:
            input = x.to(torch.float32).to(device)
            labels = y.to(torch.float32).to(device)
            pred = model(input)
            loss = criterion(pred, labels)
            curlos_train += loss.item()
            opt.zero_grad()
            loss.backward()
            opt.step()
        training_loss.append(curlos_train / len(train_loader))
        model.eval()
        with torch.no_grad():
            for x, y in val_loader:
                input = x.to(torch.float32).to(device)
                labels = y.to(torch.float32).to(device)
                pred = model(input)
                loss = criterion(pred, labels)
                curlos_val += loss.item()
            validation_loss.append(curlos_val / len(val_loader))
        if verbose:
          print(f"Epoch {epoch+1}/{epochs}.. Train loss: {training_loss[-1]:.4f}.. Val loss: {validation_loss[-1]:.4f}")

    return model, training_loss, validation_loss

In [None]:
def predict_FFNN(model, X_test):
    # convert DataFrame to numpy arrays, then to PyTorch tensors
    tensor_x = torch.Tensor(X_test.values)
    with torch.no_grad():
        outputs = model(tensor_x)
    _, predicted = torch.max(outputs.data, 1)
    return predicted.numpy().tolist()

In [None]:
def val_FFNN(model, X_val, y_val):
    # Convert DataFrame to numpy arrays, then to PyTorch tensors
    tensor_x = torch.Tensor(X_val.values)
    tensor_y = torch.Tensor(y_val.values)

    with torch.no_grad():
        outputs = model(tensor_x)

    _, predicted = torch.max(outputs.data, 1)
    y_true = tensor_y.numpy().astype(int)  # Ensure the labels are integer type
    y_pred = predicted.cpu().numpy()

    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='weighted')
    recall = recall_score(y_true, y_pred, average='weighted')
    f1 = f1_score(y_true, y_pred, average='weighted')

    print(f'Accuracy: {accuracy:.4f}')
    print(f'Precision: {precision:.4f}')
    print(f'Recall: {recall:.4f}')
    print(f'F1-Score: {f1:.4f}')

    return accuracy, precision, recall, f1

Experiment Neural Network (FFNN)

In [None]:
from torch.utils.data import Dataset, DataLoader

X_train, X_val, y_train, y_val = preprocess_df(train_trump,True,None,False)

# Create DataLoader instances
train_dataset = DataSetTrump(X_train.values, y_train.values)
val_dataset = DataSetTrump(X_val.values, y_val.values)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)

# Train the model with validation
input_size = X_train.shape[1]
num_classes = 2
model, train_loss, val_loss = ffnn_train(train_loader, val_loader, input_size, num_classes)

# Predict the validation with the trained model
predictions = val_FFNN(model, X_val,y_val)

100%|██████████| 3528/3528 [00:36<00:00, 97.01it/s] 


Epoch 1/100.. Train loss: 0.6868.. Val loss: 0.6784
Epoch 2/100.. Train loss: 0.5674.. Val loss: 0.6316
Epoch 3/100.. Train loss: 0.5281.. Val loss: 0.6233
Epoch 4/100.. Train loss: 0.4957.. Val loss: 0.5994
Epoch 5/100.. Train loss: 0.4741.. Val loss: 0.6045
Epoch 6/100.. Train loss: 0.4692.. Val loss: 0.6034
Epoch 7/100.. Train loss: 0.4474.. Val loss: 0.5978
Epoch 8/100.. Train loss: 0.4269.. Val loss: 0.5925
Epoch 9/100.. Train loss: 0.4190.. Val loss: 0.5785
Epoch 10/100.. Train loss: 0.4106.. Val loss: 0.5895
Epoch 11/100.. Train loss: 0.4076.. Val loss: 0.5691
Epoch 12/100.. Train loss: 0.3987.. Val loss: 0.5812
Epoch 13/100.. Train loss: 0.3908.. Val loss: 0.5671
Epoch 14/100.. Train loss: 0.3813.. Val loss: 0.5744
Epoch 15/100.. Train loss: 0.3781.. Val loss: 0.5531
Epoch 16/100.. Train loss: 0.3796.. Val loss: 0.5474
Epoch 17/100.. Train loss: 0.3732.. Val loss: 0.5673
Epoch 18/100.. Train loss: 0.3584.. Val loss: 0.5590
Epoch 19/100.. Train loss: 0.3501.. Val loss: 0.5554
Ep

## 4. XGB classifier:

In [11]:
import xgboost as xgb
def val_XGB(X, y):
    XGBoost = xgb.XGBClassifier()
    xgb_params = {
        'booster': ['gbtree', 'gblinear', 'dart'],
        'learning_rate': [0.01, 0.1, 0.2, 0.3],
        'n_estimators': [50, 100, 200]
    }
    xgb_model = perform_GS(XGBoost, xgb_params,X, y,True)
    return xgb_model

In [12]:
def train_XGB(X, y):
    XGBoost = xgb.XGBClassifier(booster='gbtree', learning_rate=0.1, n_estimators=200)
    XGBoost.fit(X, y)  # train the model on the training data
    return XGBoost  # return the trained model

Experiment XGB

In [16]:
X_xgtrain, Y_xgtrain = preprocess_df(train_trump,False,None,False)
model_xgb = val_XGB(X_xgtrain,Y_xgtrain)
print(model_xgb)

100%|██████████| 3528/3528 [00:40<00:00, 87.73it/s] 


Search Time: 9944 sec
Best Parameters:  {'booster': 'gbtree', 'learning_rate': 0.1, 'n_estimators': 200}
Best Accuracy Score:  0.8599774978401944
XGBClassifier(base_score=None, booster='gbtree', callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.1, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=200, n_jobs=None,
              num_parallel_tree=None, random_state=None, ...)


## 5. RNN:

In [18]:
class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super(RNN, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.rnn = nn.RNN(input_size, hidden_size, num_layers, batch_first=True, dropout=0.5)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        out, _ = self.rnn(x, h0)
        out = self.fc(out[:, -1, :])
        return out

In [None]:
class rDataSetTrump(Dataset):
    def __init__(self, X, y, sequence_length=20):
        self.sequence_length = sequence_length

        # Reshape X to have sequence length
        self.X = self._reshape_data(X, sequence_length)
        self.y = torch.LongTensor(y)  # Ensure y is a LongTensor for indexing

    def _reshape_data(self, data, sequence_length):
        num_samples, input_size = data.shape
        # Compute the number of sequences
        num_sequences = num_samples // sequence_length
        reshaped_data = data[:num_sequences * sequence_length].reshape(-1, sequence_length, input_size)
        return torch.Tensor(reshaped_data)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        curr_y = torch.zeros(2)  # Assuming binary classification with 2 classes
        curr_y[int(self.y[idx])] = 1
        return (self.X[idx], curr_y)

In [None]:
def train_RNN(train_loader, val_loader, input_size, num_classes, epochs=50, batch_size=32, learning_rate=1e-3, verbose=True):

    model = RNN(input_size, hidden_size=100, num_layers=2, num_classes=num_classes)
    criterion = nn.BCEWithLogitsLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=1e-4)

    training_loss = []
    validation_loss = []
    training_accuracy = []
    validation_accuracy = []

    for epoch in range(epochs):
        model.train()
        curlos_train = 0
        correct_train = 0
        total_train = 0
        num_batches_train = 0

        for x, y in tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}"):
            optimizer.zero_grad()
            outputs = model(x)
            loss = criterion(outputs, y)
            loss.backward()
            optimizer.step()

            curlos_train += loss.item()
            num_batches_train += 1

            probabilities = torch.sigmoid(outputs)
            predictions = (probabilities > 0.5).float()
            correct_train += (predictions == y).sum().item()
            total_train += y.numel()

        avg_train_loss = curlos_train / num_batches_train
        avg_train_accuracy = correct_train / total_train
        training_loss.append(avg_train_loss)
        training_accuracy.append(avg_train_accuracy)

        model.eval()
        curlos_val = 0
        correct_val = 0
        total_val = 0
        num_batches_val = 0

        with torch.no_grad():
            for x, y in val_loader:
                outputs = model(x)
                loss = criterion(outputs, y)
                curlos_val += loss.item()
                num_batches_val += 1

                probabilities = torch.sigmoid(outputs)
                predictions = (probabilities > 0.5).float()
                correct_val += (predictions == y).sum().item()
                total_val += y.numel()

        avg_val_loss = curlos_val / num_batches_val
        avg_val_accuracy = correct_val / total_val
        validation_loss.append(avg_val_loss)
        validation_accuracy.append(avg_val_accuracy)

        if verbose:
            print(f"Epoch {epoch+1}/{epochs}.. Train loss: {avg_train_loss:.4f}.. Train accuracy: {avg_train_accuracy:.4f}.. Val loss: {avg_val_loss:.4f}.. Val accuracy: {avg_val_accuracy:.4f}")

    return model, training_loss, validation_loss

In [None]:
def predict_RNN(model, X_test):
    tensor_x = torch.Tensor(X_test.values).unsqueeze(1)
    with torch.no_grad():
        outputs = model(tensor_x)
    _, predicted = torch.max(outputs.data, 1)
    return predicted.numpy().tolist()

Experiment RNN

In [None]:
def preprocess_text(trump_data, split=False):
    trump_df = trump_data.copy()
    trump_df['y_true'] = trump_df['device'].apply(lambda x: 0 if x == 'android' else 1)
    trump_df['cleaned_tweet_text'] = trump_df['tweet_text'].apply(clean_text)
    # Vectorize the words in the text in specific column:
    nlp = spacy.load('en_core_web_lg')
    vector_list = []
    for plot in tqdm(trump_df['cleaned_tweet_text']):
        vector_list.append(nlp(plot).vector)

    features = [f"Feature {i}" for i in range(len(vector_list[0]))]
    features_data = pd.DataFrame(vector_list, columns=features)
    new_trump_df = pd.concat([trump_df, features_data], axis=1)

    # Remove specific columns:
    columns_to_remove = ['tweet_id', 'user_handle', 'tweet_text', 'timestamp', 'device', 'cleaned_tweet_text']
    new_trump_df = new_trump_df.drop(columns_to_remove, axis=1)

    x = new_trump_df.drop('y_true', axis=1)
    y = new_trump_df['y_true']

    if split:
        X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.20, shuffle=True)
        return X_train, X_test, y_train, y_test

    return x, y

In [None]:
X_rtrain, X_rval, y_rtrain, y_rval = preprocess_text(train_trump,True)
sequence_length = 10
rtrain_dataset = rDataSetTrump(X_rtrain.values, y_rtrain.values, sequence_length=sequence_length)
rval_dataset = rDataSetTrump(X_rval.values, y_rval.values, sequence_length=sequence_length)
rtrain_loader = DataLoader(rtrain_dataset, batch_size=64, shuffle=True)
rval_loader = DataLoader(rval_dataset, batch_size=64, shuffle=False)

100%|██████████| 3528/3528 [00:35<00:00, 99.23it/s] 


In [None]:
for x, y in rtrain_loader:
    print(x.shape)  # Should be [batch_size, sequence_length, input_size]
    print(y.shape)  # Should be [batch_size, num_classes]
    break

torch.Size([64, 10, 300])
torch.Size([64, 2])


In [None]:
# Define model parameters
input_size = X_rtrain.shape[1]  # Number of features
num_classes = 2  # Binary classification

# Train the RNN model:
model, train_loss, val_loss = train_RNN(
    rtrain_loader,
    rval_loader,
    input_size,
    num_classes,
    epochs=40,
    batch_size=64,
    learning_rate=1e-4,
    verbose=True
)

Epoch 1/40: 100%|██████████| 5/5 [00:00<00:00, 57.95it/s]


Epoch 1/40.. Train loss: 0.6950.. Train accuracy: 0.4876.. Val loss: 0.6508.. Val accuracy: 0.6357


Epoch 2/40: 100%|██████████| 5/5 [00:00<00:00, 64.31it/s]


Epoch 2/40.. Train loss: 0.6858.. Train accuracy: 0.5567.. Val loss: 0.6212.. Val accuracy: 0.7357


Epoch 3/40: 100%|██████████| 5/5 [00:00<00:00, 56.48it/s]


Epoch 3/40.. Train loss: 0.6855.. Train accuracy: 0.5798.. Val loss: 0.6004.. Val accuracy: 0.7643


Epoch 4/40: 100%|██████████| 5/5 [00:00<00:00, 53.19it/s]


Epoch 4/40.. Train loss: 0.6802.. Train accuracy: 0.6082.. Val loss: 0.5844.. Val accuracy: 0.7571


Epoch 5/40: 100%|██████████| 5/5 [00:00<00:00, 53.00it/s]


Epoch 5/40.. Train loss: 0.6690.. Train accuracy: 0.6152.. Val loss: 0.5771.. Val accuracy: 0.7643


Epoch 6/40: 100%|██████████| 5/5 [00:00<00:00, 68.15it/s]


Epoch 6/40.. Train loss: 0.6687.. Train accuracy: 0.6028.. Val loss: 0.5712.. Val accuracy: 0.7571


Epoch 7/40: 100%|██████████| 5/5 [00:00<00:00, 71.00it/s]


Epoch 7/40.. Train loss: 0.6736.. Train accuracy: 0.5993.. Val loss: 0.5684.. Val accuracy: 0.7571


Epoch 8/40: 100%|██████████| 5/5 [00:00<00:00, 67.61it/s]


Epoch 8/40.. Train loss: 0.6803.. Train accuracy: 0.6117.. Val loss: 0.5665.. Val accuracy: 0.7500


Epoch 9/40: 100%|██████████| 5/5 [00:00<00:00, 64.69it/s]


Epoch 9/40.. Train loss: 0.6644.. Train accuracy: 0.6082.. Val loss: 0.5660.. Val accuracy: 0.7500


Epoch 10/40: 100%|██████████| 5/5 [00:00<00:00, 60.10it/s]


Epoch 10/40.. Train loss: 0.6581.. Train accuracy: 0.6206.. Val loss: 0.5653.. Val accuracy: 0.7429


Epoch 11/40: 100%|██████████| 5/5 [00:00<00:00, 63.57it/s]


Epoch 11/40.. Train loss: 0.6540.. Train accuracy: 0.6099.. Val loss: 0.5655.. Val accuracy: 0.7429


Epoch 12/40: 100%|██████████| 5/5 [00:00<00:00, 65.84it/s]


Epoch 12/40.. Train loss: 0.6445.. Train accuracy: 0.6170.. Val loss: 0.5641.. Val accuracy: 0.7429


Epoch 13/40: 100%|██████████| 5/5 [00:00<00:00, 71.58it/s]


Epoch 13/40.. Train loss: 0.6623.. Train accuracy: 0.6152.. Val loss: 0.5607.. Val accuracy: 0.7429


Epoch 14/40: 100%|██████████| 5/5 [00:00<00:00, 44.41it/s]


Epoch 14/40.. Train loss: 0.6486.. Train accuracy: 0.6206.. Val loss: 0.5607.. Val accuracy: 0.7429


Epoch 15/40: 100%|██████████| 5/5 [00:00<00:00, 40.67it/s]


Epoch 15/40.. Train loss: 0.6570.. Train accuracy: 0.6206.. Val loss: 0.5607.. Val accuracy: 0.7429


Epoch 16/40: 100%|██████████| 5/5 [00:00<00:00, 46.12it/s]


Epoch 16/40.. Train loss: 0.6553.. Train accuracy: 0.6152.. Val loss: 0.5622.. Val accuracy: 0.7429


Epoch 17/40: 100%|██████████| 5/5 [00:00<00:00, 51.79it/s]


Epoch 17/40.. Train loss: 0.6463.. Train accuracy: 0.6206.. Val loss: 0.5648.. Val accuracy: 0.7429


Epoch 18/40: 100%|██████████| 5/5 [00:00<00:00, 58.59it/s]


Epoch 18/40.. Train loss: 0.6324.. Train accuracy: 0.6277.. Val loss: 0.5683.. Val accuracy: 0.7429


Epoch 19/40: 100%|██████████| 5/5 [00:00<00:00, 65.93it/s]


Epoch 19/40.. Train loss: 0.6386.. Train accuracy: 0.6170.. Val loss: 0.5676.. Val accuracy: 0.7429


Epoch 20/40: 100%|██████████| 5/5 [00:00<00:00, 68.66it/s]


Epoch 20/40.. Train loss: 0.6400.. Train accuracy: 0.6277.. Val loss: 0.5664.. Val accuracy: 0.7429


Epoch 21/40: 100%|██████████| 5/5 [00:00<00:00, 64.45it/s]


Epoch 21/40.. Train loss: 0.6418.. Train accuracy: 0.6507.. Val loss: 0.5664.. Val accuracy: 0.7429


Epoch 22/40: 100%|██████████| 5/5 [00:00<00:00, 46.40it/s]


Epoch 22/40.. Train loss: 0.6327.. Train accuracy: 0.6543.. Val loss: 0.5670.. Val accuracy: 0.7429


Epoch 23/40: 100%|██████████| 5/5 [00:00<00:00, 55.21it/s]


Epoch 23/40.. Train loss: 0.6322.. Train accuracy: 0.6489.. Val loss: 0.5685.. Val accuracy: 0.7357


Epoch 24/40: 100%|██████████| 5/5 [00:00<00:00, 56.87it/s]


Epoch 24/40.. Train loss: 0.6382.. Train accuracy: 0.6489.. Val loss: 0.5694.. Val accuracy: 0.7357


Epoch 25/40: 100%|██████████| 5/5 [00:00<00:00, 65.46it/s]


Epoch 25/40.. Train loss: 0.6213.. Train accuracy: 0.6365.. Val loss: 0.5686.. Val accuracy: 0.7357


Epoch 26/40: 100%|██████████| 5/5 [00:00<00:00, 62.48it/s]


Epoch 26/40.. Train loss: 0.6206.. Train accuracy: 0.6312.. Val loss: 0.5669.. Val accuracy: 0.7357


Epoch 27/40: 100%|██████████| 5/5 [00:00<00:00, 63.89it/s]


Epoch 27/40.. Train loss: 0.6139.. Train accuracy: 0.6560.. Val loss: 0.5681.. Val accuracy: 0.7286


Epoch 28/40: 100%|██████████| 5/5 [00:00<00:00, 72.85it/s]


Epoch 28/40.. Train loss: 0.6067.. Train accuracy: 0.6649.. Val loss: 0.5665.. Val accuracy: 0.7286


Epoch 29/40: 100%|██████████| 5/5 [00:00<00:00, 62.31it/s]


Epoch 29/40.. Train loss: 0.6038.. Train accuracy: 0.6809.. Val loss: 0.5657.. Val accuracy: 0.7286


Epoch 30/40: 100%|██████████| 5/5 [00:00<00:00, 68.91it/s]


Epoch 30/40.. Train loss: 0.6133.. Train accuracy: 0.6472.. Val loss: 0.5655.. Val accuracy: 0.7286


Epoch 31/40: 100%|██████████| 5/5 [00:00<00:00, 38.03it/s]


Epoch 31/40.. Train loss: 0.5963.. Train accuracy: 0.6560.. Val loss: 0.5671.. Val accuracy: 0.7286


Epoch 32/40: 100%|██████████| 5/5 [00:00<00:00, 35.42it/s]


Epoch 32/40.. Train loss: 0.5956.. Train accuracy: 0.6809.. Val loss: 0.5635.. Val accuracy: 0.7286


Epoch 33/40: 100%|██████████| 5/5 [00:00<00:00, 27.75it/s]


Epoch 33/40.. Train loss: 0.5962.. Train accuracy: 0.6649.. Val loss: 0.5619.. Val accuracy: 0.7286


Epoch 34/40: 100%|██████████| 5/5 [00:00<00:00, 11.34it/s]


Epoch 34/40.. Train loss: 0.5921.. Train accuracy: 0.6933.. Val loss: 0.5610.. Val accuracy: 0.7286


Epoch 35/40: 100%|██████████| 5/5 [00:00<00:00,  9.07it/s]


Epoch 35/40.. Train loss: 0.5855.. Train accuracy: 0.6773.. Val loss: 0.5643.. Val accuracy: 0.7143


Epoch 36/40: 100%|██████████| 5/5 [00:00<00:00, 42.99it/s]


Epoch 36/40.. Train loss: 0.5802.. Train accuracy: 0.6915.. Val loss: 0.5687.. Val accuracy: 0.7071


Epoch 37/40: 100%|██████████| 5/5 [00:00<00:00, 41.21it/s]


Epoch 37/40.. Train loss: 0.5821.. Train accuracy: 0.7234.. Val loss: 0.5685.. Val accuracy: 0.7071


Epoch 38/40: 100%|██████████| 5/5 [00:00<00:00, 41.44it/s]


Epoch 38/40.. Train loss: 0.5742.. Train accuracy: 0.7305.. Val loss: 0.5724.. Val accuracy: 0.7071


Epoch 39/40: 100%|██████████| 5/5 [00:00<00:00, 46.85it/s]


Epoch 39/40.. Train loss: 0.5623.. Train accuracy: 0.7163.. Val loss: 0.5754.. Val accuracy: 0.6929


Epoch 40/40: 100%|██████████| 5/5 [00:00<00:00, 45.27it/s]

Epoch 40/40.. Train loss: 0.5404.. Train accuracy: 0.7394.. Val loss: 0.5754.. Val accuracy: 0.7000





In [None]:
def preprocess_text_test(trump_data):
    trump_df = trump_data.copy()
    trump_df['cleaned_tweet_text'] = trump_df['tweet_text'].apply(clean_text)
    # Vectorize the words in the text in specific column:
    nlp = spacy.load('en_core_web_lg')
    vector_list = []
    for plot in tqdm(trump_df['cleaned_tweet_text']):
        vector_list.append(nlp(plot).vector)
    features = [f"Feature {i}" for i in range(len(vector_list[0]))]
    features_data = pd.DataFrame(vector_list, columns=features)
    new_trump_df = pd.concat([trump_df, features_data], axis=1)
    # Remove specific columns:
    columns_to_remove = ['tweet_id', 'user_handle', 'tweet_text', 'timestamp', 'device', 'cleaned_tweet_text']
    new_trump_df = new_trump_df.drop(columns_to_remove, axis=1)
    return new_trump_df

## Train functions:

In [13]:
def train_models(algorithm, url_trump):
    columns = ['tweet_id', 'user_handle', 'tweet_text', 'timestamp', 'device']
    train_trump = pd.read_csv(url_trump, sep='\t', quoting=3, header=None, names=columns)
    if algorithm == 1:
        X_LRtrain, y_LRtrain = preprocess_df(train_trump,False,"standard",True)
        model = train_LReg(X_LRtrain, y_LRtrain)

    if algorithm == 2:
        X_strain, y_strain = preprocess_df(train_trump,False,"standard",True)
        model = train_SVM_lnkernel(X_strain, y_strain)

    if algorithm == 3:
        X_nstrain, Y_nstrain = preprocess_df(train_trump,False,"standard",True)
        model = train_SVM_nonlnkernel(X_nstrain, Y_nstrain)

    if algorithm == 4:
        X_train, X_val, y_train, y_val = preprocess_df(train_trump,True,None,False)
        # Create DataLoader instances
        train_dataset = DataSetTrump(X_train.values, y_train.values)
        val_dataset = DataSetTrump(X_val.values, y_val.values)
        train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)
        # Train the model with validation
        input_size = X_train.shape[1]
        num_classes = 2
        model, train_loss, val_loss = ffnn_train(train_loader, val_loader, input_size, num_classes)

    if algorithm == 5:
        X_xgtrain, Y_xgtrain = preprocess_df(train_trump,False,None,False)
        model = train_XGB(X_xgtrain,Y_xgtrain)

    if algorithm == 6:
        X_rtrain, X_rval, y_rtrain, y_rval = preprocess_text(train_trump,True)
        sequence_length = 10
        rtrain_dataset = rDataSetTrump(X_rtrain.values, y_rtrain.values, sequence_length=sequence_length)
        rval_dataset = rDataSetTrump(X_rval.values, y_rval.values, sequence_length=sequence_length)
        rtrain_loader = DataLoader(rtrain_dataset, batch_size=64, shuffle=True)
        rval_loader = DataLoader(rval_dataset, batch_size=64, shuffle=False)
        # Define model parameters
        input_size = X_rtrain.shape[1]  # Number of features
        num_classes = 2  # Binary classification
        # Train the RNN model:
        model, train_loss, val_loss = train_RNN(
            rtrain_loader,
            rval_loader,
            input_size,
            num_classes,
            epochs=40,
            batch_size=64,
            learning_rate=1e-4,
            verbose=True)

    return model

In [14]:
def training_pipeline(alg, train_fn):
    """Returns a trained model given the specific task and algorithm.
    Args:
        alg (int): an integer between 1-5, indicating the algorithmic approach as
                    specified above (1: logistic regression, 2: svm, 3:FFNN, etc.).
        train_ fn (str): full path to the file containing the training data.
    """
    model = train_models(alg, train_fn)
    return model

In [15]:
def retrain_best_model():
  """ Retrains and returns the best performing model.
  """
  res_model = training_pipeline(5, "trump_train.tsv")
  return res_model

In [32]:
def predict(m, fn):
    """ Returns a list of 0s and 1s, corresponding to the lines in the specified file.
        Args:
        model: the trained model to be used.
        fn: the full path to a file in the same format as the test set we have provided."""
    columns = ['user_handle', 'tweet_text', 'timestamp']
    test = pd.read_csv(fn, sep='\t', quoting=3, header=None, names=columns)
    if type(m) == FFNN:
        test_data = preprocess_df_test(test,None,False)
        res = predict_FFNN(m, test_data)
    elif type(m) == RNN:
        test_data = preprocess_text_test(test)
        res = predict_RNN(m, test_data)
    elif type(m) == xgb.XGBClassifier:
        test_data = preprocess_df_test(test,None,False)
        res = list(m.predict(test_data))
    else:
        test_data = preprocess_df_test(test,"standard",True)
        res = list(m.predict(test_data))
    return res

##Train and Test Experiments:

In [33]:
best_model = retrain_best_model()
predictions = predict(best_model, "trump_tweets_test_a.tsv")

100%|██████████| 3528/3528 [00:30<00:00, 114.41it/s]
100%|██████████| 200/200 [00:02<00:00, 80.72it/s]


In [34]:
str_pred = []
for i in predictions:
  str_pred.append(str(i))
pred = " ".join(str_pred)

In [35]:
def write_list_to_txt(data, filename):
    """
    Writes a list of items to a text file, each item on a new line.

    Args:
    - data (list): The list of items to write to the file.
    - filename (str): The name of the file to write the data to.
    """
    with open(filename, 'w') as file:
      file.write(data)

In [36]:
write_list_to_txt(pred, 'predictions.txt')