In [96]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from skorch import NeuralNetClassifier
import torch
import torch.nn as nn
import torch.optim as optim
from deep_translator import GoogleTranslator
from langdetect import detect, DetectorFactory
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import (
    confusion_matrix,
    roc_curve,
    auc,
    RocCurveDisplay,
    ConfusionMatrixDisplay,
    accuracy_score,
    classification_report,
)
from nltk.sentiment import SentimentIntensityAnalyzer
import matplotlib
from textblob import TextBlob

In [104]:
# Function to translate text if not in English
def translate_if_not_english(text):
    try:
        lang = detect(text)
        if lang != "en":
            return GoogleTranslator(source=lang, target="en").translate(text)
        else:
            return text
    except Exception as e:
        print(f"Error detecting or translating text: {text}, Error: {e}")
        return text  # Return original text in case of error

In [105]:
# Read and preprocess Jomama CSV
df_jomama = pd.read_csv("jomama.csv")
df_jomama = df_jomama[["q21", "q22", "label"]]
df_jomama = df_jomama[df_jomama["q21"].notna() | df_jomama["q22"].notna()]
df_jomama["combined"] = df_jomama["q21"].fillna("") + " " + df_jomama["q22"].fillna("")
df_jomama["combined"] = df_jomama["combined"].str.strip()
df_jomama["translated_text"] = df_jomama["combined"].apply(translate_if_not_english)

# Prepare training data from Jomama CSV
df_train = df_jomama[["translated_text", "label"]]
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(
    df_train["translated_text"]
).toarray()  # Convert to array
y_train = df_train["label"].values  # Assuming 'label' is your target

# Convert to proper dtypes for training
X_train = torch.FloatTensor(X_train)  # Convert input features to FloatTensor
y_train = torch.LongTensor(y_train)  # Convert labels to LongTensor

In [106]:
# Read and preprocess Examples CSV
df_examples = pd.read_csv("examples.csv")
df_examples = df_examples[["q21", "q22", "label"]]
df_examples = df_examples[df_examples["q21"].notna() | df_examples["q22"].notna()]
df_examples["combined"] = (
    df_examples["q21"].fillna("") + " " + df_examples["q22"].fillna("")
)
df_examples["combined"] = df_examples["combined"].str.strip()
df_examples["translated_text"] = df_examples["combined"].apply(translate_if_not_english)

# Prepare testing data from Examples CSV
df_test = df_examples[["translated_text", "label"]]
X_test = vectorizer.transform(
    df_test["translated_text"]
).toarray()  # Use transform for test data
y_test = df_test["label"].values  # Assuming 'label' is your target

# Convert to proper dtypes for testing
X_test = torch.FloatTensor(X_test)  # Convert input features to FloatTensor
y_test = torch.LongTensor(y_test)  # Convert labels to LongTensor

In [107]:
class FRMod(nn.Module):
    def __init__(self, inp_size, hidden_sizes, num_classes=6, nonlin=nn.ReLU()):
        super().__init__()
        self.hidden = nn.ModuleList()
        self.nonlin = nonlin  # Store the non-linear activation function
        for i in range(len(hidden_sizes)):
            if i == 0:
                self.hidden.append(nn.Linear(inp_size, hidden_sizes[i]))
            else:
                self.hidden.append(nn.Linear(hidden_sizes[i - 1], hidden_sizes[i]))
            self.hidden.append(self.nonlin)  # Append the non-linear function

    def forward(self, X):
        for i, layer in enumerate(self.hidden):
            X = layer(X)
            if (
                i % 2 == 0 and i < len(self.hidden) - 1
            ):  # Apply nonlin only for Linear layers
                X = self.nonlin(X)  # Apply the non-linear activation
        return X  # Return raw logits for CrossEntropyLoss

In [108]:
# Neural Network Classifier
net = NeuralNetClassifier(
    FRMod,
    module__inp_size=X_train.shape[1],
    batch_size=256,
    module__hidden_sizes=[32, 16],  # Specify a hidden layer configuration
    optimizer=torch.optim.Adam,
    criterion=torch.nn.CrossEntropyLoss,
    iterator_train__shuffle=True,
)

# Deactivate skorch-internal train-valid split and verbose logging
net.set_params(train_split=False, verbose=0)

# Define the parameters you want to search over as a dict
params = {
    "lr": [0.01],  # Learning rates to test
    "max_epochs": [100],  # Number of training epochs to test
    "module__hidden_sizes": [
        [32, 16],
        [64, 32],
        [128, 64],
    ],  # Different hidden layer sizes
    "module__nonlin": [
        nn.ReLU(),
        nn.LeakyReLU(),
    ],  # Different activation functions
}

# Define your GridSearchCV()
gs = GridSearchCV(net, params, cv=3, scoring="accuracy", refit=True)

In [114]:
# train your model
gs.fit(X_train, y_train)

# print best params
print("best score: {:.3f}, best params: {}".format(gs.best_score_, gs.best_params_))

best score: 0.983, best params: {'lr': 0.01, 'max_epochs': 100, 'module__hidden_sizes': [64, 32], 'module__nonlin': LeakyReLU(negative_slope=0.01)}


In [115]:
# Get the best model from your GridSearchCV object.
net = gs.best_estimator_

# Get predictions on the test data
y_pred_test = net.predict(X_test)

# Calculate accuracy and print
print("Best Accuracy on Test: ", accuracy_score(y_test, y_pred_test))

Best Accuracy on Test:  0.975


In [None]:
df_examples["label_predictions"] = y_pred_test

Unnamed: 0,translated_text,label,label_predictions
0,"As a surviving child, the weekend seminar with...",3,3
1,As a surviving child who attended the TAPS Sem...,3,3
2,"""The TAPS Seminar was incredibly valuable for ...",4,4
3,"""I attended the TAPS Seminar recently, and it ...",2,2
4,"During our unforgettable weekend, we cherishin...",3,3
5,"Over the past weekend, I had the profound hono...",6,6
6,"""I attended the TAPS seminar this past weekend...",1,1
7,"At a recent TAPS seminar, a panel discussion o...",6,6
8,"""I recently attended a TAPS Seminar, and I fou...",4,4
10,"""During the weekend seminar, my favorite momen...",1,1


# TEST

In [None]:
# Read and preprocess Jomama CSV
df_jomama = pd.read_csv("jomama.csv")
df_jomama = df_jomama[["q21", "q22", "label"]]
df_jomama = df_jomama[df_jomama["q21"].notna() | df_jomama["q22"].notna()]
df_jomama["combined"] = df_jomama["q21"].fillna("") + " " + df_jomama["q22"].fillna("")
df_jomama["combined"] = df_jomama["combined"].str.strip()
df_jomama["translated_text"] = df_jomama["combined"].apply(translate_if_not_english)

# Prepare the training data from Jomama CSV
df_jomama = df_jomama[["translated_text", "label"]]

# Read and preprocess Examples CSV
df_examples = pd.read_csv("examples.csv")
df_examples = df_examples[["q21", "q22", "label"]]
df_examples = df_examples[df_examples["q21"].notna() | df_examples["q22"].notna()]
df_examples["combined"] = (
    df_examples["q21"].fillna("") + " " + df_examples["q22"].fillna("")
)
df_examples["combined"] = df_examples["combined"].str.strip()
df_examples["translated_text"] = df_examples["combined"].apply(translate_if_not_english)

# Prepare the training data from Examples CSV
df_examples = df_examples[["translated_text", "label"]]

# Combine both datasets into one DataFrame
df_combined = pd.concat([df_jomama, df_examples], ignore_index=True)



# Vectorization using a new instance of CountVectorizer
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(
    df_combined["translated_text"]
).toarray()  # Fit and transform combined data
y = df_combined["label"].values  # Assuming 'label' is your target

# Convert to proper dtypes for training
X = torch.FloatTensor(X)  # Convert input features to FloatTensor
y = torch.LongTensor(y)  # Convert labels to LongTensor

In [150]:
class FRMod(nn.Module):
    def __init__(self, inp_size, hidden_sizes, num_classes=6, nonlin=nn.ReLU()):
        super().__init__()
        self.hidden = nn.ModuleList()
        self.nonlin = nonlin  # Store the non-linear activation function
        for i in range(len(hidden_sizes)):
            if i == 0:
                self.hidden.append(nn.Linear(inp_size, hidden_sizes[i]))
            else:
                self.hidden.append(nn.Linear(hidden_sizes[i - 1], hidden_sizes[i]))
            self.hidden.append(self.nonlin)  # Append the non-linear function

    def forward(self, X):
        for i, layer in enumerate(self.hidden):
            X = layer(X)
            if (
                i % 2 == 0 and i < len(self.hidden) - 1
            ):  # Apply nonlin only for Linear layers
                X = self.nonlin(X)  # Apply the non-linear activation
        return X  # Return raw logits for CrossEntropyLoss

In [None]:
# Neural Network Classifier
net = NeuralNetClassifier(
    FRMod,
    module__inp_size=X.shape[1],
    batch_size=256,
    module__hidden_sizes=[32, 16],  # Specify a hidden layer configuration
    optimizer=torch.optim.Adam,
    criterion=torch.nn.CrossEntropyLoss,
    iterator_train__shuffle=True,
)

# Deactivate skorch-internal train-valid split and verbose logging
net.set_params(train_split=False, verbose=0)

# Define the parameters you want to search over as a dict
params = {
    "lr": [0.01],  # Learning rates to test
    "max_epochs": [100],  # Number of training epochs to test
    "module__hidden_sizes": [
        [32, 16],
        [64, 32],
        [128, 64],
    ],  # Different hidden layer sizes
    "module__nonlin": [
        nn.ReLU(),
        nn.LeakyReLU(),
    ],  # Different activation functions
}

# Define your GridSearchCV()
gs = GridSearchCV(net, params, cv=3, scoring="accuracy", refit=True)

In [152]:
gs.fit(X, y)

# print best params
print("best score: {:.3f}, best params: {}".format(gs.best_score_, gs.best_params_))

best score: 0.985, best params: {'lr': 0.01, 'max_epochs': 100, 'module__hidden_sizes': [32, 16], 'module__nonlin': LeakyReLU(negative_slope=0.01)}


In [161]:
X_test.shape

torch.Size([2226, 14718])

In [157]:
# Read and preprocess test
test = pd.read_csv("test.csv")
test = test[["q21", "q22"]]
test = test[test["q21"].notna() | test["q22"].notna()]
test["combined"] = test["q21"].fillna("") + " " + test["q22"].fillna("")
test["combined"] = test["combined"].str.strip()
test["translated_text"] = test["combined"].apply(translate_if_not_english)

# Prepare training data from Jomama CSV
df_test = test[["translated_text"]]
vectorizer = CountVectorizer()
X_test = vectorizer.fit_transform(
    df_test["translated_text"]
).toarray()  # Convert to array


# Convert to proper dtypes for training
X_test = torch.FloatTensor(X_test)  # Convert input features to FloatTensor

In [159]:
# Get the best model from your GridSearchCV object.
net = gs.best_estimator_

# Get predictions on the test data
y_predictions = net.predict(X_test)

RuntimeError: mat1 and mat2 shapes cannot be multiplied (256x14718 and 3050x32)