In [1]:
import pickle

# Load the transcription_training.pkl file
with open('/kaggle/input/first-impression-v2-train-dataset/train-transcription/transcription_training.pkl', 'rb') as file:
    data = pickle.load(file)

# Inspect the first few entries
for key, value in list(data.items())[:5]:
    print(f"{key}: {value}")


J4GQm9j0JZ0.003.mp4: He's cutting it and then turn around and see the end result, but I'm glad he didn't do that because I probably would've lost my mind. As it was getting cut, I was just excited. I saw the snippets of hair falling to the floor and I was like, "Yes!"
zEyRyTnIw5I.005.mp4: Responsibility to house the organ I had been given and I needed to tell them I was going to take good care of that organ and that I so appreciated what they had done. Almost immediately I sent a letter to them
nskJh7v6v1U.004.mp4: I actually got quite a few sets of black pens this year, because I bought one pack. I think I bought two packs, actually, that I really liked, and then I found ... Some people at my work had these really cool pens that I liked a lot, and I liked how they wrote-
6wHQsN5g2RM.000.mp4: I ate a lot. I'd like a lot of foods. I remember I have favorite, maybe Mexican chicken or barbecue, pork chops. I don't know. I've got a lot of favorite foods. What's your favorite ice cream?
dQO

In [2]:
from transformers import BertTokenizer, BertModel
import torch
import numpy as np

# Load Multilingual BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
model = BertModel.from_pretrained('bert-base-multilingual-cased')

# Maximum feature size for FIv2
MAX_FEATURE_SIZE = 104
EMBEDDING_DIM = 768



tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

In [3]:
def extract_deep_features(text):
    # Tokenize and encode the text
    inputs = tokenizer(text, return_tensors="pt", max_length=512, truncation=True, padding="max_length")
    with torch.no_grad():
        outputs = model(**inputs)
    # Extract embeddings from the last hidden layer
    embeddings = outputs.last_hidden_state.squeeze(0).numpy()
    # Zero-pad to match MAX_FEATURE_SIZE × EMBEDDING_DIM
    padded_embeddings = np.zeros((MAX_FEATURE_SIZE, EMBEDDING_DIM))
    padded_embeddings[:min(embeddings.shape[0], MAX_FEATURE_SIZE), :] = embeddings[:MAX_FEATURE_SIZE, :]
    return padded_embeddings

# Apply to the dataset
deep_features = {key: extract_deep_features(value) for key, value in data.items()}

# Save the extracted deep features
np.save('deep_features_fiv2.npy', deep_features)


KeyboardInterrupt: 

In [None]:
# Dummy LIWC implementation
LIWC_CATEGORIES = ["work", "perceptual_processes", "cognitive_processes", "anxiety"]  # Extend with real categories
CATEGORY_COUNT = 64
MAX_SEQUENCE_LENGTH = 89

def liwc_feature_extraction(text):
    words = text.split()  # Split text into words
    features = np.zeros((len(words), CATEGORY_COUNT))
    for i, word in enumerate(words):
        # Dummy example: Randomly assign categories (replace with real LIWC logic)
        features[i, np.random.choice(CATEGORY_COUNT, size=5, replace=False)] = 1
    # Zero-pad to match MAX_SEQUENCE_LENGTH × CATEGORY_COUNT
    padded_features = np.zeros((MAX_SEQUENCE_LENGTH, CATEGORY_COUNT))
    padded_features[:min(features.shape[0], MAX_SEQUENCE_LENGTH), :] = features[:MAX_SEQUENCE_LENGTH, :]
    return padded_features

# Apply to the dataset
hand_crafted_features = {key: liwc_feature_extraction(value) for key, value in data.items()}

# Save the extracted hand-crafted features
np.save('hand_crafted_features_fiv2.npy', hand_crafted_features)


In [1]:
import pandas as pd
import pickle
from transformers import BertTokenizer, BertModel
import torch
import re

# Load transcription data
with open('/kaggle/input/first-impression-v2-train-dataset/train-transcription/transcription_training.pkl', 'rb') as f:
    transcriptions = pickle.load(f)

# Initialize BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
model = BertModel.from_pretrained('bert-base-multilingual-cased')

# Load the LIWC dictionary from the provided file path
liwc_dict_path = '/kaggle/input/liwc2007/LIWC2007.txt'

def load_liwc_dict(file_path):
    """Parse the LIWC dictionary and return a mapping from words to categories."""
    liwc_dict = {}
    with open(file_path, 'r') as file:
        for line in file:
            if not line.startswith('%'):  # Skip comment lines
                parts = line.strip().split('\t')
                word = parts[0].lower()
                categories = parts[1:]  # All the categories for the word
                liwc_dict[word] = categories
    return liwc_dict

liwc_dict = load_liwc_dict(liwc_dict_path)

def extract_bert_features(text):
    """Extract deep features using the multilingual BERT model."""
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=104)
    with torch.no_grad():
        outputs = model(**inputs)
    sentence_embedding = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
    return sentence_embedding

def extract_liwc_features(text):
    """Extract hand-crafted features using the LIWC dictionary."""
    words = re.findall(r'\b\w+\b', text.lower())  # Normalize to lowercase and split into words
    feature_vector = [0] * 64  # LIWC categories have 64 features
    for word in words:
        if word in liwc_dict:
            categories = liwc_dict[word]
            for category in categories:
                # Ensure the category index is valid and increment the corresponding category count
                try:
                    category_index = int(category) - 1  # LIWC categories are 1-indexed
                    if 0 <= category_index < 64:
                        feature_vector[category_index] += 1
                except ValueError:
                    continue  # Skip if the category isn't a valid number
    return feature_vector

def extract_features_and_store(transcriptions):
    """Extract features for each transcription and store them."""
    deep_features_list = []
    handcrafted_features_list = []
    filenames = []

    for filename, text in transcriptions.items():
        # Extract deep and hand-crafted features
        deep_features = extract_bert_features(text)
        handcrafted_features = extract_liwc_features(text)

        # Store features and filenames
        deep_features_list.append(deep_features)
        handcrafted_features_list.append(handcrafted_features)
        filenames.append(filename)

    # Convert to DataFrames for easier export to CSV
    df_deep_features = pd.DataFrame(deep_features_list)
    df_handcrafted_features = pd.DataFrame(handcrafted_features_list)

    # Add filenames to the DataFrames
    df_deep_features['Filename'] = filenames
    df_handcrafted_features['Filename'] = filenames

    # Export to CSV
    df_deep_features.to_csv('text_deep_features.csv', index=False)
    df_handcrafted_features.to_csv('text_handcrafted_features.csv', index=False)

    print("Feature extraction complete and saved to CSV files.")

# Call the function to extract features and store them
extract_features_and_store(transcriptions)


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Feature extraction complete and saved to CSV files.


In [3]:
import pandas as pd
import pickle
from transformers import BertTokenizer, BertModel
import torch
import re

# Load transcription data
with open('/kaggle/input/first-impression-v2-train-dataset/train-transcription/transcription_training.pkl', 'rb') as f:
    transcriptions = pickle.load(f)

# Initialize BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
model = BertModel.from_pretrained('bert-base-multilingual-cased')

# Load the LIWC dictionary from the provided file path
liwc_dict_path = '/kaggle/input/liwc2007/LIWC2007.txt'

def load_liwc_dict(file_path):
    """Parse the LIWC dictionary and return a mapping from words to categories."""
    liwc_dict = {}
    with open(file_path, 'r') as file:
        for line in file:
            if not line.startswith('%'):  # Skip comment lines
                parts = line.strip().split('\t')
                word = parts[0].lower()
                categories = parts[1:]  # All the categories for the word
                liwc_dict[word] = categories
    return liwc_dict

liwc_dict = load_liwc_dict(liwc_dict_path)

def extract_bert_features(text):
    """Extract deep features using the multilingual BERT model."""
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=104)
    with torch.no_grad():
        outputs = model(**inputs)
    sentence_embedding = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
    return sentence_embedding

def extract_liwc_features(text):
    """Extract hand-crafted features using the LIWC dictionary."""
    words = re.findall(r'\b\w+\b', text.lower())  # Normalize to lowercase and split into words
    feature_vector = [0] * 64  # LIWC categories have 64 features
    for word in words:
        if word in liwc_dict:
            categories = liwc_dict[word]
            for category in categories:
                try:
                    category_index = int(category) - 1  # LIWC categories are 1-indexed
                    if 0 <= category_index < 64:
                        feature_vector[category_index] += 1
                except ValueError:
                    continue  # Skip if the category isn't a valid number
    return feature_vector

def extract_features_and_store(transcriptions):
    """Extract features for each transcription and store them."""
    deep_features_list = []
    handcrafted_features_list = []
    filenames = []

    for filename, text in transcriptions.items():
        # Extract deep and hand-crafted features
        deep_features = extract_bert_features(text)
        handcrafted_features = extract_liwc_features(text)

        # Store features and filenames
        deep_features_list.append(deep_features)
        handcrafted_features_list.append(handcrafted_features)
        filenames.append(filename)

    # Convert to DataFrames for easier export to CSV
    df_deep_features = pd.DataFrame(deep_features_list)
    df_handcrafted_features = pd.DataFrame(handcrafted_features_list)

    # Add filenames to the DataFrames as the first column
    df_deep_features.insert(0, 'Filename', filenames)
    df_handcrafted_features.insert(0, 'Filename', filenames)

    # Export to CSV
    df_deep_features.to_csv('text_deep_features.csv', index=False)
    df_handcrafted_features.to_csv('text_handcrafted_features.csv', index=False)

    print("Feature extraction complete and saved to CSV files.")

# Call the function to extract features and store them
extract_features_and_store(transcriptions)



Feature extraction complete and saved to CSV files.


In [4]:
print("working done")

done


In [11]:
##############

import pandas as pd
import pickle
import numpy as np
# import liwc
import torch
import re
from transformers import BertTokenizer, BertModel

# Load the transcription data (adjust the path if necessary)
with open('/kaggle/input/first-impression-v2-train-dataset/train-transcription/transcription_training.pkl', 'rb') as f:
    transcriptions = pickle.load(f)

# Initialize BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
model = BertModel.from_pretrained('bert-base-multilingual-cased')

# Define the LIWC categories and translation feature extraction function
class FeatureExtractor:
    def __init__(self, liwc_dict_path):
        self.__lang_traslate = ["en", "es"]  # Example languages (adjust accordingly)
        self.__contractions_dict = {}  # Add actual contractions dictionary as required
        self.__category_text_features = self.load_liwc_dict(liwc_dict_path)

    def load_liwc_dict(self, liwc_dict_path):
        """Load and parse the LIWC2007 dictionary file."""
        liwc_dict = {}
        with open(liwc_dict_path, 'r') as file:
            lines = file.readlines()
        
        # Parse the LIWC dictionary file line by line
        for line in lines:
            parts = line.strip().split("\t")
            if len(parts) > 1:
                category = parts[0]  # LIWC category
                words = parts[1].split()  # List of words associated with the category
                liwc_dict[category] = words
        return liwc_dict

    def __parse_text_features(self, word):
        """Parse the word and return the associated LIWC categories."""
        features = []
        for category, words in self.__category_text_features.items():
            if word.lower() in [w.lower() for w in words]:
                features.append(category)
        return features

    def __translate_and_extract_features(self, text, lang, show_text=False, last=False, out=True):
        """Extract LIWC and BERT features from text."""
        contractions_re = re.compile("(%s)" % "|".join(self.__contractions_dict.keys()))
        expand_contractions = lambda s: contractions_re.sub(lambda match: self.__contractions_dict[match.group(0)], s)

        get_norm_text = lambda text: re.sub(
            r"(?<=[.,])(?=[^\s])", " ", re.sub(r"\[[^\[\]]+\]", "", expand_contractions(re.sub(r'[.,"\'?:!/;]', "", text.lower().strip()))))

        norm_features = lambda feature, length: np.pad(
            feature[:length, :], ((0, max(0, length - feature.shape[0])), (0, 0)), "constant"
        )

        # Normalize text and get features
        text = get_norm_text(text)

        # BERT feature extraction
        encoded_input = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
        with torch.no_grad():
            outputs = model(**encoded_input)
        features_bert = outputs.last_hidden_state.mean(dim=1).cpu().numpy()

        # LIWC feature extraction
        features_liwc = np.zeros((len(self.__category_text_features),))
        for word in text.split(" "):
            categories = self.__parse_text_features(word)
            for cat in categories:
                idx = self.__category_text_features.get(cat, [])
                if idx:
                    features_liwc[self.__category_text_features.keys().index(cat)] += 1

        # Normalize features
        features_bert = norm_features(features_bert, 414)
        features_liwc = norm_features(features_liwc, 365)

        return features_liwc, features_bert

# Initialize FeatureExtractor with the path to LIWC dictionary
liwc_dict_path = '/kaggle/input/liwc2007/LIWC2007.txt'
extractor = FeatureExtractor(liwc_dict_path)

def extract_features_and_store(transcriptions):
    """Extract features for each transcription and store them."""
    deep_features_list = []
    handcrafted_features_list = []
    filenames = []

    # Loop through each transcription in the pkl file
    for filename, text in transcriptions.items():
        # Extract both LIWC and BERT features
        features_liwc, features_bert = extractor.__translate_and_extract_features(text, "en")

        # Store features and filenames
        handcrafted_features_list.append(features_liwc)
        deep_features_list.append(features_bert)
        filenames.append(filename)

    # Convert lists to DataFrames for easier export to CSV
    df_deep_features = pd.DataFrame(deep_features_list)
    df_handcrafted_features = pd.DataFrame(handcrafted_features_list)

    # Add filenames as the first column in both DataFrames
    df_deep_features.insert(0, 'Filename', filenames)
    df_handcrafted_features.insert(0, 'Filename', filenames)

    # Export DataFrames to CSV
    df_deep_features.to_csv('deep_features_with_filenames.csv', index=False)
    df_handcrafted_features.to_csv('handcrafted_features_with_filenames.csv', index=False)

    print("Feature extraction complete and saved to CSV files.")

# Call the function to extract features and store them
extract_features_and_store(transcriptions)
print("done")



AttributeError: 'FeatureExtractor' object has no attribute '__translate_and_extract_features'

In [5]:
import pandas as pd

In [8]:
df = pd.read_csv("/kaggle/working/text_deep_features.csv")


In [9]:
df

Unnamed: 0,Filename,0,1,2,3,4,5,6,7,8,...,758,759,760,761,762,763,764,765,766,767
0,J4GQm9j0JZ0.003.mp4,-0.242928,-0.339531,-0.025601,0.244540,0.892297,0.147015,-0.313380,0.809669,-0.370543,...,-0.226408,0.032308,-0.379697,0.127955,0.015482,-0.714135,0.967615,0.191498,-0.306697,-0.206918
1,zEyRyTnIw5I.005.mp4,-0.069200,-0.434665,0.073010,-0.173166,0.602467,-0.232156,-0.256193,0.220219,-0.522540,...,0.090561,-0.015783,-0.123446,-0.782116,-0.165112,-0.754044,0.533481,0.594238,-0.516909,-0.126553
2,nskJh7v6v1U.004.mp4,0.043944,-0.367641,-0.114857,0.099605,0.565259,0.055890,-0.370536,0.483487,-0.522987,...,0.027890,-0.073298,-0.901172,-0.156638,-0.123582,-0.825829,0.455202,0.932887,-0.204750,0.140392
3,6wHQsN5g2RM.000.mp4,-0.439144,-0.092683,0.282453,0.116274,0.824320,-0.048884,-0.357818,0.408998,-0.100275,...,-0.501510,-0.320901,-0.611902,-0.306970,0.170360,-0.208912,0.569401,0.160111,-0.503446,0.189523
4,dQOeQYWIgm8.000.mp4,-0.171071,-0.237357,-0.067170,0.171221,0.589509,0.148646,-0.310589,0.400486,-0.428959,...,-0.031038,-0.073519,-0.280926,-0.544391,-0.094637,-0.647683,0.089279,0.205052,-0.541694,0.195269
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5995,mhF4kYTlVUE.001.mp4,-0.207325,-0.727157,-0.574352,0.312974,0.782626,0.416122,-0.146443,0.527809,-0.679955,...,-0.239749,-0.178334,-0.372753,-0.079485,-0.145676,-0.337447,0.702685,0.566176,-0.400116,0.125432
5996,2q8orkMs2Jg.003.mp4,-0.468130,-0.482583,0.096771,0.292015,0.733364,0.404647,-0.344417,0.406257,-0.031403,...,-0.077539,0.211370,-0.664456,0.001076,-0.169727,-1.204941,0.683210,0.321280,-0.442748,0.229803
5997,F1lAPYh4t3U.000.mp4,-0.277300,-0.429958,0.136949,0.067262,0.895780,0.126993,-0.182677,0.415356,-0.016152,...,0.169418,-0.366907,-0.741146,-0.578648,0.226593,-0.783851,0.411136,0.449866,0.037044,-0.061017
5998,cxJ0u6r0-pU.001.mp4,-0.225750,-0.326381,0.120616,0.143940,0.329312,0.214944,-0.267022,0.581226,-0.007218,...,-0.276070,0.045374,0.198921,-0.568740,-0.006747,-0.503992,0.475720,0.794023,-0.221327,0.168648
