In [None]:
import pandas as pd
import numpy as np

# Load the precomputed feature table from CSV and print its column/dtype summary.
df = pd.read_csv("tfidf_features.csv")
df.info()

In [None]:
# Show the loaded frame as the cell output.
df

In [None]:
# Per-column summary statistics of the feature table.
df.describe()

In [None]:
# Work on a copy so the loaded frame itself is never modified by later steps.
X = df.copy()

In [None]:
# Mean pairwise Euclidean distance as a function of the number of leading features used.
random_state = 10  # not used in this cell (nothing here is random)
X_np = X.to_numpy()

dims = [2, 5, 10, 20, 50, 100, 200]
mean_dists = []

for d in dims:
    Xd = X_np[:, :d]  # Select the first d features
    n = len(Xd)  # Number of samples

    if n < 2:
        # No pair exists, so the mean distance is undefined.
        mean_dists.append(np.nan)
        continue

    # For each sample i, compute the distances to every later sample j > i in one
    # vectorised call. Each unordered pair is still counted exactly once, but the
    # Python-level loop runs n times instead of n*(n-1)/2 times.
    dists = np.concatenate(
        [np.linalg.norm(Xd[i + 1:] - Xd[i], axis=1) for i in range(n - 1)]
    )
    mean_dists.append(np.mean(dists))

mean_dists_df = pd.DataFrame({"Dimension": dims, "Mean Euclidean Distance": mean_dists})
print(mean_dists_df)

In [None]:
import matplotlib.pyplot as plt

# Line plot of the mean pairwise distance against the number of dimensions.
fig, ax = plt.subplots(figsize=(9, 5))
ax.plot(
    mean_dists_df["Dimension"],
    mean_dists_df["Mean Euclidean Distance"],
    marker='o',
)
ax.set_xlabel("Number of dimensions")
ax.set_ylabel("Mean Euclidean Distance")
ax.grid(True)
plt.show()

In [None]:
# Relative contrast log((d_max - d_min) / d_min) between each sample's farthest and
# nearest neighbour, averaged over samples, for a growing number of leading features.
random_state = 10  # not used in this cell (nothing here is random)
X_np = X.to_numpy()

dims = [2, 5, 10, 20, 50, 100, 200]
log_dists = []

for d in dims:
    Xd = X_np[:, :d]
    n = len(Xd)

    # Nearest / farthest neighbour distance per sample.
    d_min = np.full(n, np.inf)  # Smallest distance -> start from inf
    d_max = np.zeros(n)         # Largest distance -> start from 0

    for i in range(n):
        # Distances from sample i to every sample in one vectorised call,
        # then drop the self-distance at position i.
        others = np.delete(np.linalg.norm(Xd - Xd[i], axis=1), i)
        if others.size:
            d_min[i] = others.min()
            d_max[i] = others.max()

    # Keep samples whose ratio is well defined:
    #   d_min > 0       -> no exact duplicate (avoids division by zero)
    #   d_min finite    -> the sample has at least one neighbour
    #   d_max > d_min   -> avoids log(0) = -inf, which would make the mean -inf
    valid = (d_min > 0) & np.isfinite(d_min) & (d_max > d_min)
    d_min_valid = d_min[valid]
    d_max_valid = d_max[valid]

    if valid.any():
        log_dists.append(np.mean(np.log((d_max_valid - d_min_valid) / d_min_valid)))
    else:
        log_dists.append(np.nan)  # no usable sample at this dimensionality

log_df = pd.DataFrame({"Dimension": dims, "log((max-min)/min)": log_dists})
print(log_df)

In [None]:
import matplotlib.pyplot as plt

# Line plot of the mean log relative contrast against the number of dimensions.
fig, ax = plt.subplots(figsize=(9, 5))
ax.plot(log_df["Dimension"], log_df["log((max-min)/min)"], marker='o')
ax.set_xlabel("Number of dimensions")
ax.set_ylabel("log((max-min)/min)")
ax.grid(True)
plt.show()

In [None]:
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# Cumulative explained variance for 1..200 principal components.
max_components = 200
comps_num = np.arange(1, max_components + 1)

# One fit is enough: components come out ordered by explained variance, so the
# cumulative sum of the ratios gives the total for every prefix of components.
# (The previous version refitted PCA once per component count, i.e. 200 fits.)
# Values may differ marginally from per-k fits if sklearn picks a different SVD
# solver for small k.
pca = PCA(n_components=max_components, random_state=10)
pca.fit(X.values)
comps_variance = np.cumsum(pca.explained_variance_ratio_).tolist()
total_explained_variance = comps_variance[-1]  # total for all 200 components

plt.figure(figsize=(12, 6))
plt.grid(True)
plt.plot(comps_num, comps_variance, marker='o')
plt.xlabel("# Components")
plt.ylabel(" Cumulative explained variance")
plt.show()

In [None]:
# Tabulate the cumulative explained variance against the number of components.
pd.DataFrame(comps_variance, index=comps_num, columns=["total_explained_variance"])

In [None]:
# `pca` is the model left over from the explained-variance cell above (fitted with
# 200 components), so the first five ratios are the variance shares of PC1..PC5.
top5_each = pca.explained_variance_ratio_[:5]
top5_total = sum(top5_each)

print("Top 5 each:", top5_each)
print("Top 5 explained variance:", top5_total)

In [None]:
from sklearn.decomposition import PCA

# Project the features onto 50 principal components; X_pca50 is the input of the
# t-SNE sweeps in the next cells.
pca_tsne = PCA(n_components=50, random_state=10)
X_pca50 = pca_tsne.fit_transform(X.values)
print("Shape after PCA:", X_pca50.shape)

In [None]:
from sklearn.manifold import TSNE
random_state = 10  # also read by the later t-SNE cells

# Compare the final KL divergence of t-SNE embeddings with 1, 2 and 3 output dimensions
# (default perplexity), fitted on the 50-component PCA projection.
tsne_comps_num = np.arange(1, 3+1)
tsne_comps_divergence = []

for num_components in tsne_comps_num:
    tsne = TSNE(n_components=num_components, random_state=10)
    tsne.fit(X_pca50)
    tsne_comps_divergence.append(tsne.kl_divergence_)
    print(f"KL Divergence for {num_components}: {tsne.kl_divergence_}")

In [None]:
# Sweep perplexity 5, 10, ..., 50 for a 2-D t-SNE on the 50-component PCA projection
# and record the final KL divergence of each fit.
tsne_perplexity_num = np.arange(5, 50+1, 5)
tsne_perplexity_divergence = []

for perplexity in tsne_perplexity_num:
    tsne = TSNE(n_components=2, perplexity=perplexity, random_state=10)
    tsne.fit(X_pca50)
    tsne_perplexity_divergence.append(tsne.kl_divergence_)
    print(f"KL Divergence for perplexity of {perplexity}: {tsne.kl_divergence_}")

In [None]:
# KL divergence against perplexity for the 2-D sweep above.
plt.figure(1, figsize=(12, 6))
plt.grid()
plt.plot(tsne_perplexity_num, tsne_perplexity_divergence, marker='o')
plt.xlabel("Perplexity")
plt.ylabel("KL Divergence")
plt.show()

In [None]:
# PCA down to 2 dimensions. Note: rebinds `pca` and defines `X_pca`.
pca = PCA(n_components=2, random_state=10)
X_pca = pca.fit_transform(X.values)
print("Total explained variance:", sum(pca.explained_variance_ratio_))

In [None]:
# Scatter of the samples in the plane of the first two principal components.
plt.figure(1, figsize=(12, 6))
plt.grid()

scatter = plt.scatter(X_pca[:, 0], X_pca[:, 1], s=20)
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.show()

In [None]:
# PCA down to 3 dimensions. Note: overwrites the 2-D `pca` / `X_pca` from above.
pca = PCA(n_components=3, random_state=10)
X_pca = pca.fit_transform(X.values)
print("Total explained variance:", sum(pca.explained_variance_ratio_))

In [None]:
# 3-D scatter of the samples on the first three principal components.
fig = plt.figure(1, figsize=(10, 10))
ax = fig.add_subplot(projection="3d")

scatter = ax.scatter(X_pca[:, 0], X_pca[:, 1], X_pca[:, 2], s=20)
ax.set(xlabel="PC1", ylabel="PC2", zlabel="PC3")
plt.show()

In [None]:
# Final 2-D t-SNE embedding with perplexity 50.
# NOTE(review): the 2-D perplexity sweep above was fitted on X_pca50, but this fit
# uses the raw X.values - confirm which input is intended.
tsne = TSNE(n_components=2, perplexity=50, random_state=random_state)
X_tsne = tsne.fit_transform(X.values)
print("KL Divergence with 2 components and perplexity of 50:", tsne.kl_divergence_)

In [None]:
# Scatter of the 2-D t-SNE embedding.
plt.figure(1, figsize=(12, 6))
plt.grid()
scatter = plt.scatter(X_tsne[:, 0], X_tsne[:, 1], s=20)
plt.xlabel("C1")
plt.ylabel("C2")
plt.show()

In [None]:
# Perplexity sweep for a 3-D t-SNE, fitted on the raw feature matrix.
# Overwrites the 2-D sweep results stored under the same names.
tsne_perplexity_num = np.arange(5, 50+1, 5)
tsne_perplexity_divergence = []

for perplexity in tsne_perplexity_num:
    tsne= TSNE(n_components=3, perplexity=perplexity, random_state=random_state)
    tsne.fit(X.values)
    tsne_perplexity_divergence.append(tsne.kl_divergence_)
    print(f"KL Divergence for perplexity of {perplexity}: {tsne.kl_divergence_}")

In [None]:
# Final 3-D t-SNE embedding with perplexity 50 (overwrites the 2-D `tsne` / `X_tsne`).
tsne = TSNE(n_components=3, perplexity=50, random_state=random_state)
X_tsne = tsne.fit_transform(X.values)
print("KL Divergence with 3 components and perplexity of 50:", tsne.kl_divergence_)

In [None]:
# 3-D scatter of the t-SNE embedding.
fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(projection="3d")
scatter = ax.scatter(X_tsne[:, 0], X_tsne[:, 1], X_tsne[:, 2], s=20)
ax.set(xlabel=("C1"), ylabel="C2", zlabel="C3")
plt.show()

## 2.1 Preprocessing

In [None]:
import pandas as pd
import numpy as np  # was aliased as `ns` by mistake; the rest of the notebook uses `np`
import matplotlib.pyplot as plt

# Load the kick dataset. na_filter=False turns off pandas' missing-value detection,
# so empty fields are kept as strings instead of NaN.
# Note: this rebinds `df`, which held the TF-IDF table in the previous section.
df = pd.read_csv("kick.csv", na_filter=False)
df.info()

In [None]:
# Print describe(), value counts and the distinct values of each variable of interest,
# in the order: VehOdo, MMRAcquisitionAuctionAveragePrice, Make, WarrantyCost, IsBadBuy.
for column in ['VehOdo', 'MMRAcquisitionAuctionAveragePrice', 'Make', 'WarrantyCost', 'IsBadBuy']:
    series = df[column]
    print(series.describe())
    print(series.value_counts())
    print("=====================")
    print(series.unique())

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Density histograms (with KDE) of the three numeric variables and a count plot of
# the target, before any rows are removed.
# NOTE(review): the handle names (regdens_dist, medhhinc_dist, meanhhsz_dist) do not
# match the plotted columns; they look like leftovers from another notebook.

# Distribution of VehOdo
regdens_dist = sns.histplot(df['VehOdo'].dropna(), kde=True, stat="density",
kde_kws=dict(cut=3))
plt.show()
# Distribution of MMRAcquisitionAuctionAveragePrice
medhhinc_dist = sns.histplot(df['MMRAcquisitionAuctionAveragePrice'].dropna(), kde=True, stat="density",
kde_kws=dict(cut=3))
plt.show()
# Distribution of WarrantyCost
meanhhsz_dist = sns.histplot(df['WarrantyCost'].dropna(), kde=True, stat="density",
kde_kws=dict(cut=3))
plt.show()
# Count plot of IsBadBuy (binary target)
sns.countplot(x='IsBadBuy', data=df)
plt.title('Distribution of IsBadBuy (Target Variable)')
plt.show()

In [None]:
# For several candidate VehOdo cut-offs, flag the rows beyond the cut-off and compare
# the histogram of a second variable between flagged and unflagged rows.
# Plot order: price (low cut-offs, high cut-offs), then warranty cost (low, high).
# Side effect: adds boolean HasError_VehOdo_<t> and HasError_MMRA columns to df.
low_cutoffs = [1000, 2500, 5000]
high_cutoffs = [110000, 130000, 150000]

for value_col in ('MMRAcquisitionAuctionAveragePrice', 'WarrantyCost'):
    for VehOdo_threshold, symbol in ((low_cutoffs, "<"), (high_cutoffs, ">")):
        for t in VehOdo_threshold:
            flag_col = f'HasError_VehOdo_{t}'
            if symbol == "<":
                df[flag_col] = df['VehOdo'] < t
            else:
                df[flag_col] = df['VehOdo'] > t
            g = sns.FacetGrid(df, col=flag_col)
            g = g.map(plt.hist, value_col, bins=100)
            plt.suptitle(f"VehOdo {symbol} {t}", y=1.05)
            plt.show()

# Same comparison for implausible acquisition prices (below 500 or above 20000).
mmra = df['MMRAcquisitionAuctionAveragePrice']
df['HasError_MMRA'] = (mmra < 500) | (mmra > 20000)
g = sns.FacetGrid(df, col='HasError_MMRA')
g = g.map(plt.hist, 'WarrantyCost', bins=100)
plt.show()

In [None]:
# Keep only rows with 5000 <= VehOdo <= 110000, reporting the row count before and after.
# Note: this rebinds `df`, so the filtered-out rows are gone for all later cells.
print("Row VehOdo before dropping errorneous rows", len(df))
df = df[(df['VehOdo'] >= 5000)&(df['VehOdo'] <= 110000)]
# after
print("Row VehOdo after dropping errorneous rows", len(df))

# Then keep only rows with 500 <= MMRAcquisitionAuctionAveragePrice <= 20000.
print("Row MMRA before dropping errorneous rows", len(df))
df = df[(df['MMRAcquisitionAuctionAveragePrice'] >= 500)&(df['MMRAcquisitionAuctionAveragePrice'] <= 20000)]
print("Row MMRA after dropping errorneous rows", len(df))

In [None]:
# Same four plots as before the filtering, now on the filtered frame.

# Distribution of VehOdo
regdens_dist = sns.histplot(df['VehOdo'].dropna(), kde=True, stat="density",
kde_kws=dict(cut=3))
plt.show()
# Distribution of MMRAcquisitionAuctionAveragePrice
medhhinc_dist = sns.histplot(df['MMRAcquisitionAuctionAveragePrice'].dropna(), kde=True, stat="density",
kde_kws=dict(cut=3))
plt.show()
# Distribution of WarrantyCost
meanhhsz_dist = sns.histplot(df['WarrantyCost'].dropna(), kde=True, stat="density",
kde_kws=dict(cut=3))
plt.show()
# Count plot of IsBadBuy (binary target)
sns.countplot(x='IsBadBuy', data=df)
plt.title('Distribution of IsBadBuy (Target Variable)')
plt.show()

## 2.2 Clustering Model

In [None]:
from sklearn.preprocessing import StandardScaler

# Keep only the three numeric clustering variables. The .copy() makes df2 an
# independent frame, so adding Cluster_ID to it later does not trigger pandas warnings.
df2 = df[['VehOdo', 'MMRAcquisitionAuctionAveragePrice', 'WarrantyCost']].copy()
# Convert df2 to a matrix (rebinds `X`, which held the TF-IDF features earlier).
X = df2.to_numpy()

# Standardise each column so no variable dominates the Euclidean distance.
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [None]:
from sklearn.cluster import KMeans
# Fit K-means (K=3) with several seeds to see how much inertia and centroids
# depend on the random initialisation.
for seed in [1, 5, 10, 42, 100]:
    model = KMeans(n_clusters=3, random_state=seed)
    model.fit(X)
    print(f"Seed {seed} -> Inertia: {model.inertia_}")
    print("Centroid locations:")
    for centroid in model.cluster_centers_:
        print(centroid)

In [None]:
# Fit K-means for K = 1..10 (fixed seed) and print inertia and centroids for each K.
for k in range(1,11):
    model = KMeans(n_clusters=k, random_state=42)
    model.fit(X)
    print("The number of cluster:", k)
    print("Sum of intra-cluster distance:", model.inertia_)
    print("Centroid locations:")
    for centroid in model.cluster_centers_:
         print(centroid)
    print("====================================================")

In [None]:
# Elbow plot: fit K-means for K = 1..10 and keep every fitted model, so that
# clusters[i] is the model with K = i + 1 and inertia_vals[i] is its inertia.
k_values = range(1, 11)
clusters = [KMeans(n_clusters=k, random_state=42).fit(X) for k in k_values]
inertia_vals = [fitted.inertia_ for fitted in clusters]

# plot the inertia vs K values
plt.plot(k_values, inertia_vals, marker='*')
plt.show()

In [None]:
from sklearn.metrics import silhouette_score
# Silhouette scores of the stored K=2, 3 and 4 models (clusters[k - 1] was fitted with K=k).
for position, k in enumerate((2, 3, 4)):
    if position:
        print("=========================================")
    fitted = clusters[k - 1]
    print(fitted)
    print(f"Silhouette score for k={k}", silhouette_score(X, fitted.predict(X)))

In [None]:
# Refit the K=3 solution used for the visualisation below; `model` is read by the next cell.
model = KMeans(n_clusters=3, random_state=42)
model.fit(X)
# sum of intra-cluster distances
print("Sum of intra-cluster distance:", model.inertia_)
print("Centroid locations:")
for centroid in model.cluster_centers_:
 print(centroid)

## 2.3 The optimal number of clusters

In [None]:
# Assign each row to its cluster (K=3 model from the previous cell) and attach the label.
y = model.predict(X)
df2['Cluster_ID'] = y
# how many in each
print("Cluster membership")
print(df2['Cluster_ID'].value_counts()) 

# Pairplot of the unscaled variables coloured by cluster, histograms on the diagonal.
cluster_g = sns.pairplot(df2, hue='Cluster_ID', diag_kind='hist')
plt.show()

## 2.4 New Clustering Model

In [None]:
# Column overview of the filtered frame (includes the HasError_* flag columns added earlier).
df.info()

In [None]:
# Task 2.4 data: the three numeric variables plus the categorical Make column.
df_4 = df[['VehOdo', 'MMRAcquisitionAuctionAveragePrice', 'WarrantyCost', 'Make']].copy()
print("Task 2.4. data:")
print(df_4.info())
print(df_4['Make'].value_counts())

In [None]:
from sklearn.preprocessing import LabelEncoder
# Replace each Make string by an integer code so the frame can be turned into a matrix.
le = LabelEncoder()
df_4['Make_encoded'] = le.fit_transform(df_4['Make'].values)
print(df_4[['Make', 'Make_encoded']].head(10))
# Keep the encoded column in place of the original string column (column index 3).
df_4 = df_4[['VehOdo', 'MMRAcquisitionAuctionAveragePrice', 'WarrantyCost', 'Make_encoded']]

In [None]:
# convert df to matrix
X = df_4.to_numpy()
# scaling
# NOTE(review): StandardScaler is applied to all four columns, including the
# label-encoded Make (index 3) that is later passed as categorical - confirm intended.
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [None]:
from kmodes.kmodes import KModes
from kmodes.kprototypes import KPrototypes
# Fit K-prototypes for K = 1..10 with column 3 (encoded Make) as the categorical variable.
# clusters[i] is the model with K = i + 1; this replaces the K-means list of the same name.
clusters = []
cost_vals = []
# this process is computationally expensive and may take some time
for k in range(1,11):
    # train clustering with the specified K
    model = KPrototypes(n_clusters=k, random_state=42, n_jobs=-1)
    model.fit_predict(X, categorical=[3])
 
    # append model to cluster list
    clusters.append(model)
    cost_vals.append(model.cost_)
# plot the cost vs K values
plt.plot(range(1,11), cost_vals, marker='*')
plt.show()

In [None]:
# Silhouette scores for the K-prototypes models with K=3 and K=4.
# Numeric and categorical variables are scored separately (Euclidean vs Hamming) and averaged.
X_num = X[:, :3]   # the three scaled numeric variables
X_cat = X[:, [3]]  # the encoded Make column, kept 2-D for silhouette_score

for k in (3, 4):
    model = clusters[k - 1]  # clusters[i] holds the K-prototypes model with K = i + 1

    # Reuse the labels from the fit in the elbow cell. Refitting here is expensive,
    # and the previous K=3 code refitted with categorical=[2], i.e. with WarrantyCost
    # treated as categorical and Make as numeric, so it scored a different model.
    labels = model.labels_

    # Calculate the Silhouette Score for the numeric and categorical variables separately
    silScoreNums = silhouette_score(X_num, labels, metric='euclidean')
    print("Silscore for numeric variables:", silScoreNums)
    silScoreCats = silhouette_score(X_cat, labels, metric='hamming')
    print("Silscore for categorical variables:", silScoreCats)

    # Average the silhouette scores
    silScore = (silScoreNums + silScoreCats) / 2
    print(f"The avg silhouette score for k={k}:", silScore)

In [None]:
# Optimal K=4: visualise the chosen K-prototypes solution.
import seaborn as sns
import matplotlib.pyplot as plt
model = clusters[3]
# Note: fit_predict refits the stored model rather than reusing its existing labels.
y=model.fit_predict(X, categorical=[3])
df_4['Cluster_ID'] = y
# how many records are in each cluster
print("Cluster membership")
print(df_4['Cluster_ID'].value_counts())
# pairplot the cluster distribution.
cluster_g = sns.pairplot(df_4, hue='Cluster_ID', diag_kind='hist',
                        height=3,       
                        aspect=1,       
                        plot_kws={'alpha': 0.6, 's': 30})
plt.show()

In [None]:
import pandas as pd
import numpy as np

# Load the TSDM time-series table (rebinds `df`) and print its column/dtype summary.
df = pd.read_csv('tsdm.csv')
df.info()

## 3.1 Preprocessing

In [None]:
# Parse OBSERVATION_DATE as datetime so rows can be ordered chronologically.
df['OBSERVATION_DATE'] = pd.to_datetime(df['OBSERVATION_DATE'])

# Sort by paddock, then by date: data_prep slices each paddock positionally
# (the last test_steps rows are the test set), so rows must be in time order.
df = df.sort_values(['PADDOCK_ID','OBSERVATION_DATE']).reset_index(drop=True)

df.info()

In [None]:
# Number of observations per paddock.
df.groupby("PADDOCK_ID").size()

### Function

- create_sequences(sequence, lookback, forecast_horizon, target_col)
- data_prep(df, feature_columns, lookback, test_steps, target_col)
- MyLSTMNet(nn.Module)
- train_predict_model(model, n_epochs, lr, X_all, y_all, lengths, validation_split=0.2)
- pred_eval(model, X, y, lengths, train_d, test_d, lookback, target_col)

In [None]:
# create_sequences function
def create_sequences(sequence, lookback, forecast_horizon, target_col):
    """Slice one series into fixed-width (context, target) training samples.

    For every split point ``t`` the context is the most recent ``lookback`` rows
    before ``t`` (fewer near the start of the series) and the target is the next
    ``forecast_horizon`` values of column ``target_col``.

    Short contexts are zero-padded at the END of the window. MyLSTMNet packs its
    input with ``pack_padded_sequence``, which keeps the first ``length`` steps of
    each row, so the real observations must come first. With padding at the front
    the network would read the zeros and drop the data.

    Args:
        sequence: 2-D array of shape (T, num_features), rows in time order.
        lookback: Maximum number of past rows per sample; must be positive.
        forecast_horizon: Number of future rows to predict.
        target_col: Column index of the variable to predict.

    Returns:
        ``(X, y, lengths)``: ``X`` has shape (n, lookback, num_features), ``y`` has
        shape (n, forecast_horizon), and ``lengths`` is a list with the number of
        real (unpadded) rows in each sample.

    Raises:
        ValueError: If ``lookback`` is not positive.
    """
    if lookback <= 0:
        raise ValueError("lookback must be a positive integer")

    T, num_features = sequence.shape
    X, y, lengths = [], [], []

    for t in range(1, T - forecast_horizon + 1):
        # Most recent `lookback` rows before t (all of them while t < lookback).
        context = sequence[max(0, t - lookback):t]

        # Real rows first, zero padding after them.
        padded_context = np.zeros((lookback, num_features))
        padded_context[:len(context)] = context

        X.append(padded_context)
        y.append(sequence[t:t + forecast_horizon, target_col])
        lengths.append(len(context))

    return np.array(X), np.array(y), lengths

In [None]:
# data_prep function: each location, split the data, scaler
import numpy as np
import torch
from sklearn.preprocessing import MinMaxScaler

def data_prep(df, feature_columns, lookback, test_steps, target_col, min_length=194):
    """Build scaled LSTM training samples and per-paddock train/test splits.

    The last ``test_steps`` rows of every paddock are held out as its test set.
    A single MinMaxScaler is fitted on the training rows of all sufficiently long
    paddocks and applied to both the train and the test part of each retained one.

    Args:
        df: Frame with a ``PADDOCK_ID`` column and ``feature_columns``; rows of each
            paddock are used in their existing order.
        feature_columns: Columns used as model inputs.
        lookback: Context length passed to ``create_sequences``.
        test_steps: Number of trailing rows per paddock held out for testing; also
            the forecast horizon of the training targets.
        target_col: Index (within ``feature_columns``) of the variable to predict.
        min_length: Paddocks with this many rows or fewer are skipped. Defaults to
            194, the value previously hard-coded in the body.

    Returns:
        ``(X_all, y_all, lengths_all, train_data, test_data)``: the first three are
        tensors concatenated over all retained paddocks; ``train_data`` is a list of
        ``(paddock_id, scaled_train_rows)`` and ``test_data`` a list of
        ``(paddock_id, scaled_test_rows, scaler)``, both in the same paddock order.

    Raises:
        ValueError: If no paddock is long enough to fit the scaler or to train on.
    """
    # Pass 1: fit ONE scaler on the training rows of every paddock. Previously the
    # fit, the second loop and the return were all nested inside this loop, so the
    # scaler only ever saw the first paddock.
    train_blocks = []
    for _, group in df.groupby("PADDOCK_ID"):
        feature_values = group[feature_columns].values
        if len(feature_values) > lookback + test_steps:
            train_blocks.append(feature_values[:-test_steps])

    if not train_blocks:
        raise ValueError("No paddock has more than lookback + test_steps rows")

    global_scaler = MinMaxScaler()
    global_scaler.fit(np.vstack(train_blocks))

    # Pass 2: split, scale and window each retained paddock.
    X_all, y_all, lengths_all = [], [], []
    train_data = []  # (paddock_id, scaled training rows)
    test_data = []   # (paddock_id, scaled test rows, scaler)

    for location_id, group in df.groupby("PADDOCK_ID"):
        feature_values = group[feature_columns].values

        if len(feature_values) <= min_length:
            continue

        # split and scale
        train_sample = global_scaler.transform(feature_values[:-test_steps])
        test_sample = global_scaler.transform(feature_values[-test_steps:])

        train_data.append((location_id, train_sample))
        test_data.append((location_id, test_sample, global_scaler))

        # prepare LSTM sequence data for training
        X_location, y_location, lengths = create_sequences(
            train_sample, lookback, test_steps, target_col)

        X_all.append(X_location)
        y_all.append(y_location)
        lengths_all.append(lengths)

    if not X_all:
        raise ValueError(f"No paddock has more than {min_length} rows")

    # concatenate all paddocks' training data for model training
    X_all = np.concatenate(X_all, axis=0)
    y_all = np.concatenate(y_all, axis=0)
    lengths_all = np.concatenate(lengths_all, axis=0)

    return (torch.Tensor(X_all), torch.Tensor(y_all),
            torch.Tensor(lengths_all), train_data, test_data)
        

In [None]:
# Defining the LSTM network

import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

class MyLSTMNet(nn.Module):
    """Stacked LSTM whose final hidden state feeds a dropout layer and a linear head.

    Args:
        num_features: Number of input features per time step.
        hidden_layer_size: Width of each LSTM layer.
        num_layers: Number of stacked LSTM layers.
        output_size: Number of values predicted per sample.
        dropout_prob: Dropout probability applied before the linear head.
    """

    def __init__(self, num_features, hidden_layer_size, num_layers, output_size, dropout_prob):
        super().__init__()
        lstm_config = dict(
            input_size=num_features,
            hidden_size=hidden_layer_size,
            num_layers=num_layers,
            batch_first=True,
        )
        self.lstm = nn.LSTM(**lstm_config)
        self.dropout = nn.Dropout(dropout_prob)
        self.fc = nn.Linear(hidden_layer_size, output_size)

    def forward(self, data, lengths):
        """Return predictions of shape (batch, output_size).

        ``data`` is a batch-first padded batch and ``lengths`` a tensor with the
        number of valid steps per row; only those steps are fed to the LSTM.
        """
        packed = pack_padded_sequence(
            data, lengths.cpu(), batch_first=True, enforce_sorted=False
        )

        # Only the final hidden state is needed; hidden[-1] is the top layer's.
        _, (hidden, _) = self.lstm(packed)

        return self.fc(self.dropout(hidden[-1]))

In [None]:
# Defining Training Process
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader, TensorDataset, random_split

def train_predict_model(model, n_epochs, lr, X_all, y_all, lengths, validation_split=0.2):
    """Train ``model`` with Adam/MSE and return it restored to its best checkpoint.

    The samples are split randomly into a training and a validation subset. Every
    100 epochs (epochs 0, 100, 200, ...) the loss on both subsets is recorded, and
    the weights with the lowest validation loss seen so far are snapshotted.

    Args:
        model: Module called as ``model(X, lengths)``.
        n_epochs: Number of training epochs.
        lr: Adam learning rate.
        X_all: Input tensor, first dimension indexes samples.
        y_all: Target tensor aligned with ``X_all``.
        lengths: Per-sample valid-length tensor aligned with ``X_all``.
        validation_split: Fraction of samples held out for validation.

    Returns:
        ``(train_loss_history, val_loss_history, model)``: one loss per evaluation
        point, and the model loaded with the best recorded weights.
    """
    batch_size = 32

    # split data into train and validation sets
    dataset = TensorDataset(X_all, y_all, lengths)
    val_size = int(len(dataset) * validation_split)
    train_size = len(dataset) - val_size
    train_set, val_set = random_split(dataset, [train_size, val_size])

    train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_set, batch_size=batch_size, shuffle=False)

    loss_fn = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    print(f"The model has {sum(p.numel() for p in model.parameters() if p.requires_grad)} trainable parameters")

    train_loss_history = []
    val_loss_history = []

    best_val_loss = float('inf')
    best_model_state = None

    for epoch in range(n_epochs):
        model.train()
        for X_batch, y_batch, lengths_batch in train_loader:
            y_pred = model(X_batch, lengths_batch)
            loss = loss_fn(y_pred, y_batch)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # validation check every 100 epochs
        if epoch % 100 == 0:
            model.eval()
            with torch.no_grad():
                train_preds = model(X_all[train_set.indices], lengths[train_set.indices])
                train_loss = loss_fn(train_preds, y_all[train_set.indices]).item()

                val_preds = model(X_all[val_set.indices], lengths[val_set.indices])
                val_loss = loss_fn(val_preds, y_all[val_set.indices]).item()

                print(f"Epoch {epoch+1}: train loss {train_loss:.4f}, val_loss {val_loss:.4f}")

                train_loss_history.append(train_loss)
                val_loss_history.append(val_loss)

                # save best model
                if val_loss < best_val_loss:
                    best_val_loss = val_loss
                    # state_dict() returns references to the live tensors, which
                    # later optimiser steps overwrite in place. Clone them so the
                    # snapshot really is the state at this epoch.
                    best_model_state = {
                        name: tensor.detach().clone()
                        for name, tensor in model.state_dict().items()
                    }

    # restore best model state
    if best_model_state is not None:
        model.load_state_dict(best_model_state)

    return train_loss_history, val_loss_history, model
    

In [None]:
# visualisation of train loss
def vis_train_loss(train_loss_history, val_loss_history, eval_every=100):
    """Plot training and validation loss against the epoch at which each was recorded.

    Args:
        train_loss_history: Training losses, one per evaluation point.
        val_loss_history: Validation losses, same length as ``train_loss_history``.
        eval_every: Number of epochs between two evaluation points. The default of
            100 matches the interval used by ``train_predict_model``.
    """
    # Derive the x-axis from the history itself rather than from the global
    # `n_epochs`, which may have changed since the model was trained.
    epochs = range(0, eval_every * len(train_loss_history), eval_every)
    plt.plot(epochs, train_loss_history, label='Training Loss')
    plt.plot(epochs, val_loss_history, label='Validation Loss')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.title('Loss Convergence')
    plt.legend()
    plt.grid()
    plt.show()

In [None]:
# computing the RMSE: root_mean_squared_error
from sklearn.metrics import root_mean_squared_error
from sklearn.metrics import r2_score

def pred_eval(model, X, y, lengths, train_d, test_d, lookback, target_col):
    """Print RMSE and R2 on the training samples and on the held-out steps, then plot the latter.

    For each paddock the test input is the last ``lookback`` rows of its scaled
    training series, and the test target is column ``target_col`` of its held-out
    rows. Metrics are computed on the scaled values, flattened across paddocks.

    Args:
        model: Trained module called as ``model(X, lengths)``.
        X, y, lengths: Training tensors, as used for fitting.
        train_d: List of ``(paddock_id, scaled_train_rows)``.
        test_d: List of ``(paddock_id, scaled_test_rows, scaler)`` in the same order.
        lookback: Number of trailing training rows used as test context.
        target_col: Column index of the predicted variable.
    """
    model.eval()
    with torch.no_grad():
        fitted = model(X, lengths).flatten().tolist()
        observed = y.flatten().tolist()
        print("Training RMSE:", root_mean_squared_error(observed, fitted))
        print("Training R2:", r2_score(observed, fitted))

        contexts, targets, context_lengths = [], [], []
        for idx, (_paddock, held_out, _scaler) in enumerate(test_d):
            # Trailing window of this paddock's training series.
            window = train_d[idx][1][-lookback:]
            contexts.append(window)
            targets.append(held_out[:, target_col])
            # actual length of the window (same convention as in training)
            context_lengths.append(len(window))

        X_test = torch.Tensor(np.array(contexts))
        y_test = torch.Tensor(np.array(targets))
        lengths_test = torch.Tensor(context_lengths).long()
        test_preds = model(X_test, lengths_test)

        expected = y_test.flatten().tolist()
        predicted = test_preds.flatten().tolist()

        print("Test RMSE:", root_mean_squared_error(expected, predicted))
        print("Test R2:", r2_score(expected, predicted))

        plt.figure(figsize = (10, 6))
        plt.plot(expected, label="Expected Value")
        plt.plot(predicted, label="Predicted Value")
        plt.grid()
        plt.legend(fontsize=10)
        plt.tight_layout()
        plt.show()

In [None]:
# Hold out the last 5 time steps of each paddock for testing (test_steps=5) and
# build univariate (TSDM only) training windows with a lookback of 5.

lookback = 5
test_steps = 5
target_col = 0
X_5, y_5, lengths_5, train_d_5, test_d_5 = data_prep(df, ['TSDM'], lookback, test_steps, target_col)

print("Shape of input data after sequence creation:", X_5.shape)
print("Shape of targets after sequence creation:", y_5.shape)

## 3.2 Univariate LSTM Model 1

In [None]:
# Univariate LSTM model (lookback=5, predict=5): 2 layers of width 10.
num_features = X_5.shape[2]
hidden_layer_size = 10
output_size = test_steps  # one output per forecast step
num_layers = 2
dropout_prob = 0.2
model_lstm_5 = MyLSTMNet(num_features, hidden_layer_size, num_layers, output_size, dropout_prob)

print(model_lstm_5)
print("============================================================")

# training the Univariate LSTM Model
# 201 epochs so that the every-100-epochs check also runs at epoch index 200
n_epochs = 201
lr = 0.001
train_loss_history_5, val_loss_history_5, model_lstm_5 = train_predict_model(model_lstm_5, n_epochs, lr, X_5, y_5, lengths_5)

# visualisation of train loss
vis_train_loss(train_loss_history_5, val_loss_history_5)
print( )

# RMSE of Univariate LSTM Model(lookback=5, predict=5)
lookback = 5
target_col = 0
pred_eval(model_lstm_5, X_5, y_5, lengths_5, train_d_5, test_d_5, lookback, target_col)


In [None]:
# Second configuration for the same task: 1 layer of width 15 instead of 2 layers of width 10.
# Rebuilds the data and overwrites model_lstm_5 and its loss histories from the previous cell.

lookback = 5
test_steps = 5  # ensure the last 5 timesteps of each paddock for test: test_steps=5
target_col = 0
X_5, y_5, lengths_5, train_d_5, test_d_5 = data_prep(df, ['TSDM'], lookback, test_steps, target_col)

print("Shape of input data after sequence creation:", X_5.shape)
print("Shape of targets after sequence creation:", y_5.shape)
print("============================================================")

# Univariate LSTM model (lookback=5, predict=5)
num_features = X_5.shape[2]
hidden_layer_size = 15
output_size = test_steps
num_layers = 1 # simpler model to reduce overfitting risk
dropout_prob = 0.2
model_lstm_5 = MyLSTMNet(num_features, hidden_layer_size, num_layers, output_size, dropout_prob)

print(model_lstm_5)
print("============================================================")

# training the Univariate LSTM Model
n_epochs = 201
lr = 0.001
train_loss_history_5, val_loss_history_5, model_lstm_5 = train_predict_model(model_lstm_5, n_epochs, lr, X_5, y_5, lengths_5)

# visualisation of train loss
vis_train_loss(train_loss_history_5, val_loss_history_5)
print( )

# RMSE of Univariate LSTM Model(lookback=5, predict=5)
lookback = 5
target_col = 0
pred_eval(model_lstm_5, X_5, y_5, lengths_5, train_d_5, test_d_5, lookback, target_col)


## 3.3 Univariate LSTM Model 2

In [None]:
# Univariate LSTM Model (Lookback=10, Predict=5): same pipeline with a longer context.
lookback = 10
test_steps = 5
target_col = 0
X_10, y_10, lengths_10, train_d_10, test_d_10 = data_prep(df, ['TSDM'], lookback, test_steps, target_col)

print("Shape of input data after sequence creation:", X_10.shape)
print("Shape of targets after sequence creation:", y_10.shape)
print("============================================================")

num_features = X_10.shape[2]
hidden_layer_size = 15
output_size = test_steps
num_layers = 2
dropout_prob = 0.2 # a larger value can cause underfitting
model_lstm_10 = MyLSTMNet(num_features, hidden_layer_size, num_layers, output_size, dropout_prob)

print(model_lstm_10)
print("============================================================")

n_epochs = 201
lr = 0.001
train_loss_history_10, val_loss_history_10, model_lstm_10 = train_predict_model(model_lstm_10, n_epochs, lr, X_10, y_10, lengths_10)
print( )
vis_train_loss(train_loss_history_10, val_loss_history_10)

print( )
lookback = 10
target_col = 0
pred_eval(model_lstm_10, X_10, y_10, lengths_10, train_d_10, test_d_10, lookback, target_col)

## 3.4 Multivariate LSTM Model

## 3.4.1

In [None]:
# Functions for no restriction to a fixed lookback
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import root_mean_squared_error, r2_score
from torch.utils.data import DataLoader, TensorDataset, random_split
from torch.nn.utils.rnn import pad_sequence

def create_sequences(sequence, lookback, forecast_horizon, target_col, pad_value=0.0):
    """Slice one series into (context, target) samples, with or without a lookback limit.

    For every split point ``t`` the target is the next ``forecast_horizon`` values of
    column ``target_col``. The context is:

    * ``lookback > 0``: the most recent ``lookback`` rows before ``t``;
    * ``lookback <= 0``: all rows before ``t`` (unrestricted history).

    In both modes shorter contexts are padded with ``pad_value`` at the END, so the
    real observations come first. This is the layout ``pack_padded_sequence`` (used
    by MyLSTMNet) expects, since it keeps the first ``length`` steps of each row.

    Args:
        sequence: 2-D array of shape (T, num_features), rows in time order.
        lookback: Maximum context length; zero or negative means no limit.
        forecast_horizon: Number of future rows to predict.
        target_col: Column index of the variable to predict.
        pad_value: Fill value for the padded steps.

    Returns:
        ``(X, y, lengths)`` as NumPy arrays plus a list: ``X`` has shape
        (n, width, num_features) where ``width`` is ``lookback`` or, without a limit,
        the longest context; ``y`` has shape (n, forecast_horizon); ``lengths`` holds
        the number of real rows per sample.
    """
    T, num_features = sequence.shape
    X, y, lengths = [], [], []

    if lookback > 0:
        for t in range(1, T - forecast_horizon + 1):
            # Most recent `lookback` rows before t; never empty because t >= 1.
            context = sequence[max(0, t - lookback):t]

            # Real rows first, padding after them.
            padded_context = np.full((lookback, num_features), pad_value, dtype=float)
            padded_context[:len(context)] = context

            X.append(padded_context)
            y.append(sequence[t:t + forecast_horizon, target_col])
            lengths.append(len(context))

        return np.array(X), np.array(y), lengths

    # Unrestricted history: contexts grow with t and are padded to the longest one.
    for t in range(1, T - forecast_horizon + 1):
        X.append(torch.tensor(sequence[:t], dtype=torch.float32))
        y.append(torch.tensor(sequence[t:t + forecast_horizon, target_col], dtype=torch.float32))
        lengths.append(t)

    X_padded = pad_sequence(X, batch_first=True, padding_value=pad_value)
    y_tensor = torch.stack(y)

    return X_padded.numpy(), y_tensor.numpy(), lengths


## 3.4.2

In [None]:
# Multivariate LSTM: TSDM plus seven climate features, unrestricted history.
lookback = 0 # 0 means no restriction on the lookback (uses the redefined create_sequences)
test_steps = 5
target_col = 0  # TSDM is the first entry of climate_features

climate_features = ['TSDM','15D_AVG_DAILY_RAIN', '15D_AVG_MAX_TEMP', '15D_AVG_MIN_TEMP',
                    '15D_AVG_RH_TMAX', '15D_AVG_RH_TMIN','15D_AVG_EVAP_SYN', '15D_AVG_RADIATION']

X_f, y_f, lengths_f, train_d_f, test_d_f = data_prep(df, climate_features, lookback, test_steps, target_col)

print("Shape of input data after sequence creation:", X_f.shape)
print("Shape of targets after sequence creation:", y_f.shape)
print("============================================================")

num_features = X_f.shape[2]
hidden_layer_size = 20 # more input features, so a wider hidden layer
output_size = test_steps
n_epochs = 201 # to reduce running time
lr = 0.001
num_layers = 2
dropout_prob = 0.2

model_lstm_f = MyLSTMNet(num_features, hidden_layer_size, num_layers, output_size, dropout_prob)
print(model_lstm_f)
print("============================================================")

train_loss_history_f, val_loss_history_f, model_lstm_f = train_predict_model(model_lstm_f, n_epochs, lr, X_f, y_f, lengths_f)
print( )
vis_train_loss(train_loss_history_f, val_loss_history_f)
print( )
# With lookback = 0, pred_eval slices train_values[-0:], i.e. the whole training
# series of each paddock is used as test context.
pred_eval(model_lstm_f, X_f, y_f, lengths_f, train_d_f, test_d_f, lookback, target_col)


## 4.1 Preprocessing

In [None]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.metrics import roc_auc_score, roc_curve
import torch
from transformers import BertweetTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments 
#Roberta model, Auto Tokenizer
from transformers import DataCollatorWithPadding, EarlyStoppingCallback
from datasets import Dataset
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
df = pd.read_csv('hydrogen_small.csv')
df.info()

In [None]:
df["label"].unique()

In [None]:
df["text"].unique()

In [None]:
df["text"].iloc[18]

In [None]:
def clean_message(text):
    """Return ``text`` with URLs, @mentions and non-ASCII characters removed.

    Runs of whitespace are collapsed to a single space and the result is
    stripped of leading/trailing whitespace.
    """
    # (pattern, replacement) pairs, applied in this order.
    substitutions = (
        (r"http\S+|www\S+|https\S+", ""),  # links
        (r"@\w+", ""),                      # user mentions
        (r"[^\x00-\x7F]+", ""),             # anything outside the ASCII range
        (r"\s+", " "),                      # whitespace runs -> one space
    )
    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

In [None]:
df["text"] = df["text"].apply(clean_message)

In [None]:
df["text"].iloc[18]

In [None]:
df["text"].unique()

In [None]:
# Encode the string labels as integers.
label_to_id = {
    'Irrelevant': 0, # Negative = 0
    'Relevant': 1 # Positive = 1
}
# Values that are not keys of the mapping (for example labels already encoded
# by an earlier run of this cell) are passed through unchanged, so re-running
# the cell no longer turns every label into NaN.
df["label"] = df["label"].map(lambda value: label_to_id.get(value, value))


In [None]:
df["label"].unique()

In [None]:
df[df["label"] == 0].head(5)

In [None]:
df["label"].value_counts()

## 4.2 Two pre-trained BERT models

In [None]:
# Hold out 30% of the messages for testing; stratify=y keeps the class
# proportions the same in both parts.
random_state = 42
X, y = df["text"].values, df["label"].values

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    stratify=y,
    random_state=random_state,
)
print("Training set size:", len(X_train))
print("Testing set size:", len(X_test))

In [None]:
# Put each split into a two-column frame (text, label) and convert it to a
# Hugging Face Dataset, which is what the tokenising map() calls operate on.
train_df = pd.DataFrame({"text": X_train, "label": y_train})
test_df = pd.DataFrame({"text": X_test, "label": y_test})
train_ds = Dataset.from_pandas(train_df)
test_ds = Dataset.from_pandas(test_df)

# Printing a Dataset shows its column names and number of rows.
print("Train dataset:", train_ds)
print("Test dataset:", test_ds)


## 4.2.1 Bertweet Model (Vinai)

In [None]:
model_name1 = "vinai/bertweet-base"
tokenizer1 = BertweetTokenizer.from_pretrained(model_name1)

In [None]:
def tokenize_bertweet(batch):
    """Tokenise the ``text`` column of ``batch`` with the BERTweet tokenizer.

    ``truncation=True`` cuts off messages that are too long (not all models
    need this; a warning saying it has no effect may be shown).
    ``padding=True`` pads the messages of the batch to a common length.
    """
    texts = batch['text']
    encoded = tokenizer1(texts, truncation=True, padding=True)
    return encoded


# Tokenise the training and testing datasets. batched=True passes groups of
# samples to the function instead of one sample at a time.
train_ds_bertweet, test_ds_bertweet = (
    split.map(tokenize_bertweet, batched=True) for split in (train_ds, test_ds)
)

In [None]:
train_ds

In [None]:
# Ensure the resources for any existing model has been freed.
# NOTE(review): this deletes the name `model`, whereas the model created below
# is bound to `model1` - confirm that `model` is the name meant to be released.
try:
    del model
except NameError:
    pass
# Download/load the base model. We use the "vinai/bertweet-base" model here.
# Set the number of labels to the number of unique labels in the dataframe, which is 2.
# Set the problem type to single label classification, since we want one class for each sample.
# NOTE(review): model1 is not passed to any Trainer in this section; the model
# that gets fine-tuned is EarlyStopping_model1, loaded separately further down.
model1 = RobertaForSequenceClassification.from_pretrained(
    model_name1,
    num_labels=df["label"].nunique(),
    problem_type="single_label_classification")

In [None]:
def compute_metrics(pred):
    """Compute accuracy, precision, recall and F1 for one evaluation pass.

    ``pred.label_ids`` holds the true labels and ``pred.predictions`` the
    per-class scores; the predicted class is the argmax over the last axis.
    Precision, recall and F1 are reported for the positive class (label 1).
    """
    y_true = pred.label_ids
    y_hat = pred.predictions.argmax(-1)
    precision, recall, f1, _support = precision_recall_fscore_support(
        y_true, y_hat, average="binary", pos_label=1)
    return dict(
        accuracy=accuracy_score(y_true, y_hat),
        precision=precision,
        recall=recall,
        f1=f1,
    )

In [None]:
# Load a fresh BERTweet classifier; this is the model that is fine-tuned below.
EarlyStopping_model1 = RobertaForSequenceClassification.from_pretrained(
 model_name1,
 num_labels=df["label"].nunique(),
 problem_type="single_label_classification")
EarlyStopping_model1.train()
# Evaluate and save a checkpoint once per epoch so the best one can be restored.
EarlyStopping_training_args = TrainingArguments(
 output_dir="./results",
 num_train_epochs=10,
 per_device_train_batch_size=16,
 per_device_eval_batch_size=64,
 eval_strategy="epoch",
 save_strategy="epoch",
 learning_rate=1e-5,
 weight_decay=0.01,
 logging_dir="./logs",
 logging_steps=10,
 # Added for early stopping.
 metric_for_best_model = "loss",
 load_best_model_at_end = True
)
# NOTE(review): eval_dataset is the test split, so the per-epoch evaluation that
# drives early stopping and best-checkpoint selection is computed on test data.
# Consider a separate validation split if the test scores must stay unbiased.
EarlyStopping_trainer1 = Trainer(
 model=EarlyStopping_model1,
 args=EarlyStopping_training_args,
 train_dataset=train_ds_bertweet,
 eval_dataset=test_ds_bertweet,
 processing_class=tokenizer1,
 data_collator=DataCollatorWithPadding(tokenizer1),
 compute_metrics=compute_metrics,
 # Stop once the monitored metric has not improved for 3 evaluations in a row.
 callbacks = [EarlyStoppingCallback(early_stopping_patience=3)],
)
EarlyStopping_trainer1.train()


In [None]:
# Switch the fine-tuned model to evaluation mode, disabling dropout etc layers.
# EarlyStopping_model1 is the model held by EarlyStopping_trainer1; model1 is a
# separate copy that was never trained, so calling eval() on it did not affect
# the model being evaluated here.
EarlyStopping_model1.eval()
# Evaluate the datasets.
train_results_bertweet = EarlyStopping_trainer1.evaluate(train_ds_bertweet)
test_results_bertweet = EarlyStopping_trainer1.evaluate(test_ds_bertweet)

In [None]:
def display_evaluation(setname_bertweet, results_bertweet):
    """Print accuracy, precision, recall and F1 (rounded to 3 decimals).

    ``results_bertweet`` is a mapping with the keys ``eval_accuracy``,
    ``eval_precision``, ``eval_recall`` and ``eval_f1``.
    """
    reported = (
        ("Accuracy", "eval_accuracy"),
        ("Precision", "eval_precision"),
        ("Recall", "eval_recall"),
        ("F1 score", "eval_f1"),
    )
    for caption, key in reported:
        print(f"{setname_bertweet} Set {caption}:", round(results_bertweet[key], 3))
display_evaluation("Training", train_results_bertweet)
display_evaluation("Testing", test_results_bertweet)

In [None]:
# Rebuild the train/test split and the tokenised BERTweet datasets.
# The call uses the same X, y, test_size, stratify and random_state as the
# split made at the start of section 4.2.
X_train, X_test, y_train, y_test = train_test_split(
 X, y, stratify=y, test_size=0.3, random_state=random_state)
train_df = pd.DataFrame({"text": X_train, "label": y_train})
test_df = pd.DataFrame({"text": X_test, "label": y_test})
# These assignments overwrite the tokenised datasets created earlier.
train_ds_bertweet = Dataset.from_pandas(train_df)
test_ds_bertweet = Dataset.from_pandas(test_df)
train_ds_bertweet = train_ds_bertweet.map(tokenize_bertweet, batched=True)
test_ds_bertweet = test_ds_bertweet.map(tokenize_bertweet, batched=True)
print("Training set size:", len(train_df))
print("Testing set size:", len(test_df))

## 4.2.2 Roberta

In [None]:
model_name2  = 'roberta-base'
tokenizer2 = AutoTokenizer.from_pretrained(model_name2)

In [None]:
def tokenize_roberta(batch):
    """Tokenise the ``text`` column of ``batch`` with the RoBERTa tokenizer,
    truncating over-long messages and padding the batch to a common length."""
    texts = batch['text']
    encoded = tokenizer2(texts, truncation=True, padding=True)
    return encoded


# Tokenise the training and testing datasets, passing samples in batches.
train_ds_roberta, test_ds_roberta = (
    split.map(tokenize_roberta, batched=True) for split in (train_ds, test_ds)
)

In [None]:
train_ds

In [None]:
try:
    del model
except NameError:
    pass

In [None]:
model2 = AutoModelForSequenceClassification.from_pretrained(
    model_name2,
    num_labels=df["label"].nunique(),
    problem_type="single_label_classification")

In [None]:
# Load a fresh RoBERTa classifier; this is the model that is fine-tuned below.
EarlyStopping_model2 = AutoModelForSequenceClassification.from_pretrained(
 model_name2,
 num_labels=df["label"].nunique(),
 problem_type="single_label_classification")
EarlyStopping_model2.train()
# Same settings as the BERTweet run except for the learning rate (2e-5 here).
# Rebinding EarlyStopping_training_args replaces the object created for BERTweet.
EarlyStopping_training_args = TrainingArguments(
 output_dir="./results",
 num_train_epochs=10,
 per_device_train_batch_size=16,
 per_device_eval_batch_size=64,
 eval_strategy="epoch",
 save_strategy="epoch",
 learning_rate=2e-5,
 weight_decay=0.01,
 logging_dir="./logs",
 logging_steps=10,
 # Added for early stopping.
 metric_for_best_model = "loss",
 load_best_model_at_end = True
)
# NOTE(review): eval_dataset is the test split, so the per-epoch evaluation that
# drives early stopping and best-checkpoint selection is computed on test data.
EarlyStopping_trainer2 = Trainer(
 model=EarlyStopping_model2,
 args=EarlyStopping_training_args,
 train_dataset=train_ds_roberta,
 eval_dataset=test_ds_roberta,
 processing_class=tokenizer2
    ,
 data_collator=DataCollatorWithPadding(tokenizer2),
 compute_metrics=compute_metrics,
 # Stop once the monitored metric has not improved for 3 evaluations in a row.
 callbacks = [EarlyStoppingCallback(early_stopping_patience=3)],
)
EarlyStopping_trainer2.train()

In [None]:
# Switch the fine-tuned model to evaluation mode. EarlyStopping_model2 is the
# model held by EarlyStopping_trainer2; model2 is a separate copy that was
# never trained, so calling eval() on it did not affect this evaluation.
EarlyStopping_model2.eval()
# Evaluate the datasets.
train_results_roberta = EarlyStopping_trainer2.evaluate(train_ds_roberta)
test_results_roberta = EarlyStopping_trainer2.evaluate(test_ds_roberta)

In [None]:
def display_evaluation(setname_roberta, results_roberta):
    """Print accuracy, precision, recall and F1 (rounded to 3 decimals).

    ``results_roberta`` is a mapping with the keys ``eval_accuracy``,
    ``eval_precision``, ``eval_recall`` and ``eval_f1``.
    """
    reported = (
        ("Accuracy", "eval_accuracy"),
        ("Precision", "eval_precision"),
        ("Recall", "eval_recall"),
        ("F1 score", "eval_f1"),
    )
    for caption, key in reported:
        print(f"{setname_roberta} Set {caption}:", round(results_roberta[key], 3))
display_evaluation("Training", train_results_roberta)
display_evaluation("Testing", test_results_roberta)

In [None]:
# Rebuild the train/test split and the tokenised RoBERTa datasets.
# The call uses the same X, y, test_size, stratify and random_state as the
# split made at the start of section 4.2.
X_train, X_test, y_train, y_test = train_test_split(
 X, y, stratify=y, test_size=0.3, random_state=random_state)
train_df = pd.DataFrame({"text": X_train, "label": y_train})
test_df = pd.DataFrame({"text": X_test, "label": y_test})
# These assignments overwrite the tokenised datasets created earlier.
train_ds_roberta = Dataset.from_pandas(train_df)
test_ds_roberta = Dataset.from_pandas(test_df)
train_ds_roberta = train_ds_roberta.map(tokenize_roberta, batched=True)
test_ds_roberta = test_ds_roberta.map(tokenize_roberta, batched=True)
print("Training set size:", len(train_df))
print("Testing set size:", len(test_df))

In [None]:
def compute_attention_matrix(tokenizer, model, text):
    """Run ``model`` on ``text`` and summarise its attention weights.

    Returns a tuple ``(attentions, pred_class, token_strs)``:

    * ``attentions`` - the attention weights averaged over layers and heads,
      one row/column per token (e.g. 10x10 for a 10-token message);
    * ``pred_class`` - index of the highest logit;
    * ``token_strs`` - string form of the input tokens, for axis labels.
    """
    # Encode as PyTorch tensors ("pt") and place them on the model's device
    # (needed when the model lives on a GPU).
    encoded = tokenizer(text, return_tensors="pt").to(model.device)

    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        output = model(**encoded, output_attentions=True)

    # Stack the per-layer attention tensors, move them to the CPU and drop
    # the batch axis (axis 1 after stacking; there is a single message).
    per_layer = torch.stack(output.attentions).cpu().squeeze(1)
    # Average over the leading axis twice: first the layers, then the heads.
    per_head = per_layer.mean(dim=0)
    token_to_token = per_head.mean(dim=0)

    predicted = output.logits.cpu().argmax(-1).item()
    # Token strings are more meaningful on a plot than integer token IDs.
    token_strs = tokenizer.convert_ids_to_tokens(encoded["input_ids"][0])
    return (token_to_token, predicted, token_strs)


In [None]:
def plot_attention(attentions, tokens, title):
    """Draw ``attentions`` as a heatmap with ``tokens`` labelling both axes."""
    heatmap_options = dict(
        xticklabels=tokens,  # token names on the X axis
        yticklabels=tokens,  # token names on the Y axis
        cmap='binary',       # matplotlib's 'binary' map: white for low, black for high
        cbar=True,           # show the colour bar
    )
    # Enlarged figure so the heatmap uses more of the width.
    plt.figure(figsize=(10, 8))
    plt.title(title)
    sns.heatmap(attentions, **heatmap_options)
    plt.show()

In [None]:
def display_attention_matrix(tokenizer, model, text, model_name="Model"):
    """Compute the attention matrix of ``model`` for ``text`` and plot it.

    The plot title shows ``model_name``, the text itself and the predicted
    class ("Positive" for class 1, "Negative" otherwise).
    """
    attention, pred_class, tokens = compute_attention_matrix(tokenizer, model, text)
    if pred_class == 1:
        pred_label = "Positive"
    else:
        pred_label = "Negative"
    title = "\n".join([f"{model_name}", f"{text}", f"Predicted class: {pred_label}"])
    plot_attention(attention, tokens, title)

In [None]:
# Visualise attention with the fine-tuned models. model1/model2 are separately
# loaded copies that were never trained, so their classification output is not
# the result of fine-tuning. eval() returns the module itself and makes sure
# dropout is disabled for the forward pass.
display_attention_matrix(tokenizer1, EarlyStopping_model1.eval(), df[df["label"] == 0].iloc[18]["text"], model_name="BERTweet Model")
display_attention_matrix(tokenizer2, EarlyStopping_model2.eval(), df[df["label"] == 0].iloc[18]["text"], model_name="RoBERTa Model")

In [None]:
# Visualise attention with the fine-tuned models. model1/model2 are separately
# loaded copies that were never trained, so their classification output is not
# the result of fine-tuning. eval() returns the module itself and makes sure
# dropout is disabled for the forward pass.
display_attention_matrix(tokenizer1, EarlyStopping_model1.eval(), df[df["label"] == 0].iloc[72]["text"], model_name="BERTweet Model")
display_attention_matrix(tokenizer2, EarlyStopping_model2.eval(), df[df["label"] == 0].iloc[72]["text"], model_name="RoBERTa Model")

In [None]:
pred_bertweet = EarlyStopping_trainer1.predict(test_ds_bertweet)
pred_roberta = EarlyStopping_trainer2.predict(test_ds_roberta)

In [None]:
# Convert the logits to a PyTorch tensor, apply softmax over the class axis
# (the last axis), and convert back to a numpy array. The axis is passed
# explicitly: calling softmax without `dim` relies on a deprecated implicit
# choice and emits a warning.
pred_probs_bertweet = torch.nn.functional.softmax(torch.Tensor(pred_bertweet.predictions), dim=-1).numpy()
pred_probs_roberta = torch.nn.functional.softmax(torch.Tensor(pred_roberta.predictions), dim=-1).numpy()

In [None]:
# Compute the ROC index. The true labels come from the prediction outputs
# (label_ids), which are in the same order as the predicted probabilities.
# This avoids depending on the global y_test, which a later cell reassigns
# to string labels.
y_true_bertweet = pred_bertweet.label_ids
y_true_roberta = pred_roberta.label_ids
roc_index_bertweet = roc_auc_score(y_true_bertweet, pred_probs_bertweet[:, 1])
roc_index_roberta = roc_auc_score(y_true_roberta, pred_probs_roberta[:, 1])
# Compute the ROC curve (column 1 holds the probability of class 1, 'Relevant').
fpr_bertweet,tpr_bertweet, thresholds_bertweet = roc_curve(y_true_bertweet, pred_probs_bertweet[:,1])
fpr_roberta,tpr_roberta, thresholds_roberta = roc_curve(y_true_roberta, pred_probs_roberta[:,1])
# Plot both curves on one line graph; the dashed diagonal is the chance level.
plt.plot(fpr_bertweet, tpr_bertweet, label="BERTweet Model: {:.3f}".format(roc_index_bertweet),
color='red', lw=0.5)
plt.plot(fpr_roberta, tpr_roberta, label="RoBerta Model: {:.3f}".format(roc_index_roberta),
color='navy', lw=0.5)
plt.plot([0, 1], [0, 1], color='black', lw=0.5, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
# The positive class of this task is 'Relevant' (label 1), not a sentiment.
plt.title("Receiver operating characteristic for the 'Relevant' class")
plt.legend(loc="lower right")
plt.show()

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

tfidf_df = pd.read_csv('tfidf_features_small.csv')
tfidf_df.info()

In [None]:
X = tfidf_df

# Read the labels into their own frame. Reusing the name `df` here would
# replace the cleaned, integer-encoded frame that the BERT cells above rely on
# (e.g. df[df["label"] == 0]), breaking them if they are re-run afterwards.
# NOTE(review): the TF-IDF rows and the label rows are paired by position -
# confirm that both files list the messages in the same order.
labels_df = pd.read_csv("hydrogen_small.csv")
y = labels_df['label'].values

random_state = 42
test_set_size = 0.3
X_train, X_test, y_train, y_test = train_test_split(X.values, y, test_size=test_set_size, stratify=y,
                                                    random_state=random_state)
# Baseline classifier on the TF-IDF features.
model = LogisticRegression(random_state=random_state)

model.fit(X_train, y_train)

In [None]:
from sklearn.metrics import classification_report
# training and test accuracy
print("Train accuracy:", model.score(X_train, y_train))
print("Test accuracy:", model.score(X_test, y_test))

# classification report on test data
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

In [None]:
# The labels in y_test are looked up in label_map to obtain 0/1 values for the
# ROC computation.
label_map = {'Irrelevant': 0, 'Relevant': 1}
y_test_bin = np.array([label_map[y] for y in y_test])

# Second column of predict_proba, used as the score of the positive class.
# NOTE(review): this assumes model.classes_ lists 'Relevant' second - confirm.
y_pred_proba = model.predict_proba(X_test)[:, 1]

# ROC/AUC
fpr, tpr, thresholds = roc_curve(y_test_bin, y_pred_proba) 
roc_auc = roc_auc_score(y_test_bin, y_pred_proba)

plt.figure(figsize=(8,6))
plt.plot(fpr, tpr, label=f'ROC (AUC={roc_auc:.2f})')
# Dashed diagonal used as a visual reference line.
plt.plot([0,1], [0,1], linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend()
plt.show()


In [None]:
# Overlay the ROC curves of the three classifiers on one figure.
plt.figure(figsize=(8,6))

# BERTweet ROC
plt.plot(fpr_bertweet, tpr_bertweet,
         label=f'BERTweet Model (AUC = {roc_index_bertweet:.3f})',
         color='red', lw=1.5)
# RoBERTa ROC
plt.plot(fpr_roberta, tpr_roberta,
         label=f'Roberta Model (AUC = {roc_index_roberta:.3f})',
         color='blue', lw=1.5)

# Logistic regression ROC
plt.plot(fpr, tpr,
         label=f'Logistic Regression (AUC = {roc_auc:.3f})',
         color='black', lw=1.5)

# Dashed diagonal used as a visual reference line.
plt.plot([0,1], [0,1], color='grey', linestyle='--', lw=1)

plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel("False Positive Rate (1 - Specificity)")
plt.ylabel("True Positive Rate (Sensitivity)")
# The title names all three plotted models (it previously left out RoBERTa).
plt.title("ROC Curve Comparison: BERTweet vs RoBERTa vs Logistic Regression")

plt.legend(loc="lower right")

plt.show()

# Task 5

In [None]:
import os
import torch
import numpy as np
from datasets import load_dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments, DataCollatorForSeq2Seq
from rouge_score import rouge_scorer
from tqdm import tqdm
from transformers import logging as transformers_logging

In [None]:
# Set seed for reproducibility
torch.manual_seed(42)

## 5.1 Preprocessing

In [None]:
# Load and preprocess SQuAD dataset

dataset = load_dataset("squad")

print("Number of training examples:", len(dataset['train']))
print("Number of validation examples:", len(dataset['validation']))

dataset['train'][0]

In [None]:
dataset['train'][1]

In [None]:
# Data Filtering
# Keep a sample only if it has at least one answer text and that text is not
# the "CANNOTANSWER" marker. The emptiness check comes first so a sample with
# an empty answer list is dropped instead of raising IndexError on [0].
filtered_dataset = dataset.filter(
    lambda x: len(x["answers"]['text']) > 0
    and x["answers"]['text'][0].upper() != "CANNOTANSWER")
print("Size of training set after removing unanswerable questions:", len(filtered_dataset['train']))
print("Size of validation set after removing unanswerable questions:", len(filtered_dataset['validation']))

In [None]:
# Creating a Testing Set

# Take subsets to avoid overload. The subsets are drawn from filtered_dataset
# so that the filtering step above actually takes effect (the unfiltered
# `dataset` was used here before, which silently discarded the filter).
train_dataset = filtered_dataset["train"].select(range(10000,11000))
val_dataset = filtered_dataset["validation"].select(range(3000,3100))
test_dataset = filtered_dataset["validation"].select(range(3100, 3200))  # No official SQuAD test set

# Aliases used by the evaluation cells.
training_set = train_dataset
validation_set = val_dataset
testing_set = test_dataset

print("Size of training set:", len(train_dataset))
print("Size of validation set:", len(val_dataset))
print("Size of testing set:", len(test_dataset))

In [None]:
# Loading the Tokenizer
MODEL_NAME = "t5-small"

MAX_INPUT_LENGTH = 512
MAX_OUTPUT_LENGTH = 128

# Load tokenizer and model
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)

In [None]:
def encode_question_and_context(question, context):
    """Build the model prompt: the question followed by its context passage."""
    prompt_template = "question: {}  context: {}"
    return prompt_template.format(question, context)

def extract_sample_parts(sample):
    """Pull the pieces needed for training/inference out of one sample.

    Returns ``(question_with_context, question, answer)`` where ``answer`` is
    the first entry of the sample's answer texts.
    """
    context, question = sample["context"], sample["question"]
    first_answer = sample["answers"]['text'][0]
    prompt = encode_question_and_context(question, context)
    return (prompt, question, first_answer)

def preprocess(sample):
    """Encode one sample into token IDs for seq2seq training.

    Returns the tokenizer output for the prompt ({input_ids, attention_mask})
    with an added ``labels`` entry holding the answer token IDs. Padding
    positions in ``labels`` are set to -100 so the loss ignores them.
    """
    # Extract data from sample. The prompt already contains the question.
    question_with_context, _question, answer = extract_sample_parts(sample)

    # Using truncation causes the tokenizer to emit a warning for every sample.
    # This will generate a significant amount of messages, and likely crash
    # your browser tab. We temporarily disable log messages to work around this.
    # See https://github.com/huggingface/transformers/issues/14285
    old_level = transformers_logging.get_verbosity()
    transformers_logging.set_verbosity_error()
    try:
        # Generate tokens for the input. Only the prompt is passed: it already
        # includes both question and context, and this matches the input that
        # generate_response() tokenizes at inference time. Passing the question
        # again as a second sequence would append a duplicate of it.
        input_tokens = tokenizer(question_with_context, padding="max_length",
                                 truncation=True, max_length=MAX_INPUT_LENGTH)

        # Generate tokens for the expected answer. There is no need to include
        # the question or the context here.
        output_tokens = tokenizer(answer, padding="max_length", truncation=True,
                                  max_length=MAX_OUTPUT_LENGTH)
    finally:
        # Restore old logging level, see above, even if tokenization fails.
        transformers_logging.set_verbosity(old_level)

    # The output of the tokenizer is a map containing {input_ids, attention_mask}.
    # For training, we need to add the labels (answer/output tokens) to the map.
    labels = np.array(output_tokens["input_ids"])
    # -100 is the index the loss ignores; without this the model is also
    # trained (and scored) on predicting the padding tokens.
    labels[labels == tokenizer.pad_token_id] = -100
    input_tokens["labels"] = labels

    return input_tokens

In [None]:
# Preprocess the datasets
training_set_enc = train_dataset.map(preprocess, batched=False)
validation_set_enc = val_dataset.map(preprocess, batched=False)
testing_set_enc = test_dataset.map(preprocess, batched=False)

In [None]:
# Prepare 20 data points for qualitative analysis
q_data = test_dataset.select(range(20))
q_data

## 5.2 Fine-tuning the T5 Model

In [None]:
# Loading the Model
# Ensure the resources for any existing model have been freed.
try:
    del model
except NameError:
    pass
    
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)

In [None]:
columns = ["input_ids", "attention_mask", "labels"]
training_set_enc.set_format(type="torch", columns=columns)
validation_set_enc.set_format(type="torch", columns=columns)
testing_set_enc.set_format(type="torch", columns=columns)

### First Setting

In [None]:
from transformers import EarlyStoppingCallback

# hyperparameter: setting1
# Evaluate and save a checkpoint once per epoch; keep at most 2 checkpoints.

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3, #5-10
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-4, 
    weight_decay=0.01,
    save_total_limit=2,
    logging_dir="./logs",
    logging_steps=10,
    metric_for_best_model = "loss", 
    load_best_model_at_end = True #Early Stopping
)

# Train T5 model: setting1
# NOTE(review): with num_train_epochs=3 there are only three evaluations, so an
# early-stopping patience of 3 presumably can never be reached in this run -
# confirm whether a smaller patience or more epochs was intended.

model.train()
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=training_set_enc,
    eval_dataset=validation_set_enc,
    processing_class=tokenizer,
    data_collator=DataCollatorForSeq2Seq(tokenizer),
    callbacks = [EarlyStoppingCallback(early_stopping_patience=3)], #Early Stopping
)

trainer.train()

### Second Setting

In [None]:
try:
    del model
except NameError:
    pass

model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)

In [None]:
# hyperparameter: setting2
# Compared with setting1: 8 epochs instead of 3, learning rate 2e-4 instead of
# 3e-4, eval batch size 16 instead of 8, logging every 20 steps instead of 10.
# NOTE(review): output_dir is the same "./results" directory as setting1 -
# confirm that mixing the checkpoints of both runs in one directory is intended.

training_args_change = TrainingArguments(
    output_dir="./results",
    num_train_epochs=8, 
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-4,
    weight_decay=0.01,
    save_total_limit=2,
    logging_dir="./logs",
    logging_steps=20,
    metric_for_best_model = "loss", 
    load_best_model_at_end = True #Early Stopping
)

# Train T5 model: setting2

model.train()
trainer_change = Trainer(
    model=model,
    args=training_args_change,
    train_dataset=training_set_enc,
    eval_dataset=validation_set_enc,
    processing_class=tokenizer,
    data_collator=DataCollatorForSeq2Seq(tokenizer),
    callbacks = [EarlyStoppingCallback(early_stopping_patience=3)], #Early Stopping
)

trainer_change.train()

In [None]:
# Saving the Model
# NOTE(review): `trainer` is the Trainer built for the first setting; the
# second setting was trained through `trainer_change`. Confirm which of the two
# models is meant to be saved here.
trainer.save_model("t5_pretrained")

## 5.3 Evaluation of the fine-tuned model

#### Evaluation Functions

- display_evaluation(setname, results)
- generate_response(tokenizer, model, question)
- generate_answers(tokenizer, model, dataset, use_context=True, limit=None)
- display_answer_and_references(question, answer, reference)
- compute_average_score(scores, metric, key)
- compute_rouge(predictions, references)

In [None]:
from itertools import batched
def display_evaluation(setname, results):
    """Print the evaluation loss of ``results`` rounded to 3 decimals."""
    loss = round(results["eval_loss"], 3)
    print(f"{setname} Set Loss:", loss)

def generate_response(tokenizer, model, question):
    """Generate the model's answer(s) for one prompt or a list of prompts.

    ``question`` is passed straight to the tokenizer, so it may be a single
    string or a list of strings (a batch). Returns a list of decoded answer
    strings, one per prompt.
    """
    # Convert the sentences into a list of numeric tokens. We instruct the tokenizer
    # to return PyTorch tensors ("pt") so that we can feed them directly into the model.
    # Inputs are truncated to MAX_INPUT_LENGTH, the same limit used when the
    # training data was encoded; truncating them to MAX_OUTPUT_LENGTH cut off
    # most of the context passage.
    tokenized = tokenizer(question, return_tensors="pt", padding=True, truncation=True,
                          max_length=MAX_INPUT_LENGTH).to(model.device)
    # Generate outputs using the model. MAX_OUTPUT_LENGTH bounds the number of
    # generated tokens, matching the answer length used for the labels.
    with torch.no_grad():
        outputs = model.generate(**tokenized, max_new_tokens=MAX_OUTPUT_LENGTH)

    # The model outputs a list of numeric tokens. To convert these tokens back to
    # sentences, we can use the batch_decode function from the tokenizer.
    outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    return outputs
    
def generate_answers(tokenizer, model, dataset, use_context=True, limit=None, batch_size=128):
    """Generate an answer for every question in ``dataset``.

    Parameters:
        tokenizer, model: passed through to generate_response().
        dataset: samples accepted by extract_sample_parts().
        use_context: when True the prompt is the question plus its context,
            otherwise the bare question.
        limit: if set, only the first ``limit`` samples are used.
        batch_size: number of prompts sent to the model per call.

    Returns ``(outputs, references, questions)``: the generated answers, the
    reference answers and the original questions, all in dataset order.
    """
    # Subsampling if requested.
    if limit is not None:
        dataset = dataset.select(range(limit))

    # Collect the prompts up front so they can be processed in batches,
    # which speeds up inference.
    questions = []
    inputs = []
    references = []
    for sample in dataset:
        question_with_context, question, answer = extract_sample_parts(sample)

        # Only include the context if the caller requested it.
        inputs.append(question_with_context if use_context else question)

        # Include the original question/answer.
        questions.append(question)
        references.append(answer)

    # Generate responses for each of the prompts/inputs.
    # Submitting each question to the model separately would significantly
    # increase processing time, especially if the model is located on the GPU,
    # so the prompts are grouped into batches of `batch_size`.
    outputs = []
    for samples in batched(inputs, batch_size):
        # Python's batched() function returns a tuple of the batch
        # size, which we have to first convert to a list.
        responses = generate_response(tokenizer, model, list(samples))
        # Collect the responses of EVERY batch. This used to run once after
        # the loop, which kept only the last batch and failed on empty input.
        outputs.extend(responses)

    # The length of the reference responses should equal the length of the
    # generated responses.
    assert (len(outputs) == len(references))
    return outputs, references, questions

def display_answer_and_references(question, answer, reference):
    """Print a question with its generated and reference answers, followed
    by a blank line."""
    rows = (
        ("Question:", question),
        ("Generated answer:", answer),
        ("Reference answer:", reference),
    )
    for caption, value in rows:
        print(caption, value)
    print()


#### ROUGE

In [None]:
def compute_average_score(scores, metric, key):
    """Average one component of one metric over a list of ROUGE scores.

    Each element of ``scores`` is indexed by ``metric``; the resulting score
    object exposes ``key`` (e.g. "precision") as an attribute rather than a
    mapping entry, hence the getattr.
    """
    values = [getattr(entry[metric], key) for entry in scores]
    return sum(values) / len(scores)
    
def compute_rouge(predictions, references):
    """Compute average ROUGE-1, ROUGE-2 and ROUGE-L scores.

    ``predictions`` and ``references`` are parallel sequences of generated and
    reference answers. Returns a dict keyed ``"<metric>_<component>"`` with the
    average precision, recall and fmeasure of each metric.
    """
    metrics = ["rouge1", "rouge2", "rougeL"]

    # The Porter stemmer strips word suffixes to improve matching.
    scorer = rouge_scorer.RougeScorer(metrics, use_stemmer=True)

    # One score object per answer/reference pair.
    scores = [
        scorer.score(reference, prediction)
        for prediction, reference in zip(predictions, references)
    ]

    # Average every component of every metric over all pairs.
    return {
        f"{metric}_{k}": compute_average_score(scores, metric, k)
        for metric in metrics
        for k in ["precision", "recall", "fmeasure"]
    }

#### ROUGE Metrics: Fine-tuned model

In [None]:
# Switch the model to evaluation mode, disabling dropout etc layers.
# NOTE(review): `model` was last reassigned in the "Second Setting" cell and is
# the model given to `trainer_change`, while the evaluations below go through
# `trainer` (first setting). Confirm that the loss shown here and the answers
# generated later from `model` are meant to come from the same model.
model.eval()

# Evaluate the datasets.
display_evaluation("Training", trainer.evaluate(training_set_enc))
display_evaluation("Testing", trainer.evaluate(testing_set_enc))

In [None]:
answers_ctx, refs_ctx, questions_ctx = generate_answers(tokenizer, model, testing_set, True, 100)
answers_noctx, refs_noctx, questions_noctx = generate_answers(tokenizer, model, testing_set, False, 100)

In [None]:
print("ROUGE with context:", compute_rouge(answers_ctx, refs_ctx))
print()
print("ROUGE without context:", compute_rouge(answers_noctx, refs_noctx))

## 5.4 Generative Analysis

In [None]:
# 5.4 Generative Analysis
def display_answer_and_references(question, answer, reference):
    """Print a question with its generated and reference answers, followed
    by a blank line."""
    labelled = {
        "Question:": question,
        "Generated answer:": answer,
        "Reference answer:": reference,
    }
    for caption in labelled:
        print(caption, labelled[caption])
    print()

# 5.4.a question + context
# Show the first five generated answers next to their reference answers.
print("*** With context ***")
for i in range(5):
    display_answer_and_references(questions_ctx[i], answers_ctx[i],
                                  refs_ctx[i])

# 5.4.b question only (no context in the prompt)
print("*** Without context ***")
for i in range(5):
    display_answer_and_references(questions_noctx[i],
                                  answers_noctx[i], refs_noctx[i])

## 5.5 Comparison with a Pre-trained model

In [None]:
# Load and preprocess SQuAD dataset
dataset = load_dataset("squad")

In [None]:
# Take subsets to avoid overload
# Same index ranges as the subsets selected in section 5.1.
# NOTE(review): these are taken from the unfiltered `dataset`; they match the
# 5.1 subsets only as long as both cells index the same underlying rows.
train_dataset = dataset["train"].select(range(10000,11000))
val_dataset = dataset["validation"].select(range(3000,3100))
test_dataset = dataset["validation"].select(range(3100, 3200))  # No official SQuAD test set

# Aliases used by the evaluation cells.
training_set = train_dataset
validation_set = val_dataset
testing_set = test_dataset

print("Size of training set:", len(train_dataset))
print("Size of validation set:", len(val_dataset))
print("Size of testing set:", len(test_dataset))

In [None]:
MODEL_NAME = "mrm8488/t5-base-finetuned-squadv2"

MAX_INPUT_LENGTH = 512
MAX_OUTPUT_LENGTH = 128

In [None]:
try:
    del model
except NameError:
    pass

tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)

In [None]:
# Evaluation
# Switch the model to evaluation mode, disabling dropout etc layers.
model.eval()

In [None]:
answers_ctx, refs_ctx, questions_ctx = generate_answers(
    tokenizer, model, testing_set, True, 100)
answers_noctx, refs_noctx, questions_noctx = generate_answers(
    tokenizer, model, testing_set, False, 100)

In [None]:
print("ROUGE with context:", compute_rouge(answers_ctx, refs_ctx))
print()
print("ROUGE without context:", compute_rouge(answers_noctx, refs_noctx))

In [None]:
def display_answer_and_references(question, answer, reference):
    """Print a question with its generated and reference answers, followed
    by a blank line."""
    captions = ("Question:", "Generated answer:", "Reference answer:")
    values = (question, answer, reference)
    for caption, value in zip(captions, values):
        print(caption, value)
    print()
      
# Pre-trained model, prompts with context: show the first five answers.
print("*** With context ***")
for i in range(5):
    display_answer_and_references(questions_ctx[i], answers_ctx[i],
                                  refs_ctx[i])

# Pre-trained model, question only (no context in the prompt).
print("*** Without context ***")
for i in range(5):
    display_answer_and_references(questions_noctx[i],
                                  answers_noctx[i], refs_noctx[i])