In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from tqdm import tqdm

from transformers import BertTokenizer, BertModel
import torch
from torch.utils.data import DataLoader, TensorDataset

In [2]:
def preprocess_text(text):
    words = word_tokenize(text)
    words = [word.lower() for word in words]
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word.isalnum() and word not in stop_words]
    return words

def build_vocabulary(corpus):
    vectorizer = CountVectorizer(tokenizer=preprocess_text, stop_words='english')
    vectorizer.fit(tqdm(corpus, desc='Building vocabulary'))
    return vectorizer

def compute_bow(corpus, vectorizer):
    return vectorizer.transform(tqdm(corpus, desc='Computing BoW'))

def compute_tfidf(corpus, word2ind):
    vectorizer = TfidfVectorizer(vocabulary=word2ind, tokenizer=preprocess_text, stop_words='english')
    return vectorizer.fit_transform(tqdm(corpus, desc='Computing TF-IDF'))

In [3]:
from collections import Counter
import re
import json

stop_words = set(stopwords.words('english'))
train_file = '../../dataset/processed/train_data.csv'
data = pd.read_csv(train_file)

word_freq = Counter()


for steps in data['steps']:
    steps_list = eval(steps)    
    for step in steps_list:
        step_clean = re.sub(r'[^\w\s]', '', step).lower()
        words = step_clean.split()
        words_filtered = [word for word in words if word not in stop_words]

        word_freq.update(words_filtered)


In [4]:
most_common_words = word_freq.most_common(1000)
most_common_words = {word for word, _ in most_common_words}
most_common_words

{'ball',
 'base',
 'cold',
 'ham',
 'small',
 'would',
 'seasoning',
 'rings',
 'along',
 'right',
 'refrigerated',
 'blender',
 'preheated',
 'mash',
 'core',
 'except',
 'shredded',
 'chives',
 'pizza',
 'vinegar',
 'well',
 'vanilla',
 'loaf',
 'try',
 'big',
 'brown',
 'cook',
 'pink',
 'minced',
 'end',
 'stuff',
 'twice',
 'chunks',
 'raisins',
 'even',
 'broil',
 'crab',
 'gradually',
 'ketchup',
 'bars',
 'skim',
 'stand',
 'larger',
 'custard',
 'broiler',
 'necessary',
 'tea',
 'beaten',
 'walnuts',
 'rack',
 'carrots',
 'place',
 'crush',
 'balsamic',
 'texture',
 'blended',
 'tender',
 'swirl',
 'check',
 'slowly',
 'seconds',
 'freshly',
 'microwave',
 'thin',
 'crock',
 'bubbling',
 'dry',
 'wash',
 'roll',
 'give',
 'prepared',
 'trim',
 'next',
 'cool',
 'tops',
 'chops',
 'shells',
 'half',
 'container',
 'dissolves',
 'hour',
 'tin',
 'toasted',
 'skillet',
 'completely',
 'thermometer',
 'beans',
 'continue',
 'steam',
 'bananas',
 'coarsely',
 'aluminum',
 'allow',


In [5]:
'add' in most_common_words

True

In [6]:

new_steps = []  # 创建一个新的列表来存放更新后的步骤
for steps_str in data['steps']:
    steps_list = eval(steps_str)
    steps_list = ''.join(steps_list)
    # 使用列表推导式过滤单词
    words_filtered = []

    for word in steps_list.split(" "):

        if word in most_common_words:

            words_filtered.append(word)

    # 将过滤后的steps添加到新的列表中
    new_steps.append(' '.join(words_filtered))

# 将原来的data['steps']更新为新的步骤列表


In [17]:
steps_text = new_steps

print("Preprocessing text and building vocabulary...")
vectorizer = build_vocabulary(steps_text)

print("Computing BoW...")
bow_matrix = compute_bow(steps_text, vectorizer)
print("BoW representation computed.")

print("Computing TF-IDF...")
tfidf_matrix = compute_tfidf(steps_text, vectorizer.vocabulary_)
print("TF-IDF matrix computed.")

Preprocessing text and building vocabulary...


Building vocabulary: 100%|████████████| 138981/138981 [00:17<00:00, 7869.29it/s]


Computing BoW...


Computing BoW: 100%|██████████████████| 138981/138981 [00:17<00:00, 7851.85it/s]


BoW representation computed.
Computing TF-IDF...


Computing TF-IDF: 100%|███████████████| 138981/138981 [00:17<00:00, 7868.71it/s]


TF-IDF matrix computed.


In [85]:
bow_matrix.shape

(138981, 942)

In [86]:
tfidf_matrix.shape

(138981, 942)

In [8]:
steps_text = new_steps

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

# 1. 加载预训练的 BERT Tokenizer 和 BERT 模型
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

model.to(device)

# 2. 准备数据处理的函数
def encode_texts(texts, tokenizer, max_length=512):
    input_ids = []
    attention_masks = []

    for text in tqdm(texts, desc="Encoding texts"):
        encoded_dict = tokenizer.encode_plus(
            text,
            add_special_tokens=True, # 添加 '[CLS]' 和 '[SEP]'
            max_length=max_length,   # 设定最大文本长度
            padding='max_length',    # 添加 padding
            return_attention_mask=True, # 返回 attention mask
            return_tensors='pt',     # 返回 PyTorch tensors
            truncation=True
        )
        
        # 添加编码后的文本到列表
        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])
    
    # 转换为 tensor
    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    return input_ids, attention_masks

# 3. 文本编码
input_ids, attention_masks = encode_texts(steps_text, tokenizer)

# 4. 创建 DataLoader
batch_size = 16
dataset = TensorDataset(input_ids, attention_masks)
dataloader = DataLoader(dataset, batch_size=batch_size)

bert_embeddings = []

# 遍历 DataLoader
for batch in tqdm(dataloader, desc="Extracting features"):
    batch = tuple(t.to(device) for t in batch)
    input_ids, attention_masks = batch

    with torch.no_grad():
        # 前向传播，获取编码层的输出
        outputs = model(input_ids, attention_mask=attention_masks)
    
    # 获取最后一层的隐藏状态
    last_hidden_state = outputs.last_hidden_state
    
    # 对所有 token 的输出进行平均，作为句子表示
    sentence_embedding = torch.mean(last_hidden_state, dim=1)
    bert_embeddings.append(sentence_embedding)

bert_embeddings = torch.cat(bert_embeddings, dim=0)
bert_embeddings = bert_embeddings.to('cpu').numpy()
print(bert_embeddings.shape)

Using device: cuda


Encoding texts: 100%|█████████████████| 138981/138981 [01:07<00:00, 2056.90it/s]
Extracting features: 100%|██████████████████| 8687/8687 [24:42<00:00,  5.86it/s]


(138981, 768)


In [9]:
torch.save(bert_embeddings, 'bert_embeddings.pt')
torch.save(model.state_dict(), 'bert_model_weights.pt')

In [97]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

bert_embeddings = torch.load('bert_embeddings.pt')
model = BertModel.from_pretrained('bert-base-uncased')
model.load_state_dict(torch.load('bert_model_weights.pt'))
model.to(device)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  

In [10]:
bert_embeddings.shape

(138981, 768)

In [21]:
train_file = '../../dataset/processed/train_data.csv'
train_data = pd.read_csv(train_file)
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 138981 entries, 0 to 138980
Data columns (total 25 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   name               138981 non-null  object 
 1   id                 138981 non-null  int64  
 2   minutes            138981 non-null  int64  
 3   contributor_id     138981 non-null  int64  
 4   submitted          138981 non-null  object 
 5   tags               138981 non-null  object 
 6   n_steps            138981 non-null  int64  
 7   steps              138981 non-null  object 
 8   description        136025 non-null  object 
 9   ingredients        138981 non-null  object 
 10  n_ingredients      138981 non-null  int64  
 11  calories           138981 non-null  float64
 12  total_fat          138981 non-null  float64
 13  sugar              138981 non-null  float64
 14  sodium             138981 non-null  float64
 15  protein            138981 non-null  float64
 16  sa

In [None]:
# 我们现在有一个通过bert提取的对于数据集中step的feature
# 现在希望把这个数据通过PCA降低到十个dim
# 然后作为新的col写回到数据集中

import pandas as pd
from sklearn.decomposition import PCA


train_file = '../../dataset/processed/train_data.csv'
train_data = pd.read_csv(train_file)

# 定义PCA变换器，指定我们想要的组件数量为10
pca = PCA(n_components=768)

# 对BERT特征应用PCA变换
bert_embeddings_reduced = pca.fit_transform(bow_matrix)

print("PCA features added to the data and saved to CSV.")

In [28]:
test_file = '../../dataset/processed/train_data.csv'
test_data = pd.read_csv(test_file)

In [30]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np

# 导入数据和BERT特征
valid_data = pd.read_csv(test_file)

# 定义BERT特征的维度和数据点的数量
n_samples = valid_data.shape[0]
n_features = 765

# 生成随机特征
random_features = np.random.rand(n_samples, n_features)

X_random = pd.DataFrame(random_features, columns=[f'random_feature_{i+1}' for i in range(n_features)])


# 准备训练数据（BERT特征列）

X_other_features = valid_data[['total_fat', 'sugar', 'sodium', 'protein', 'saturated_fat']]

# 合并随机特征和其他特征列
# X = pd.DataFrame(bow_matrix.toarray())

X = pd.concat([pd.DataFrame(tfidf_matrix.toarray())], axis=1)


# 准备目标值（calories列）
y = valid_data['calories_log']

# 划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 初始化线性回归模型
regressor = LinearRegression()

# 拟合线性回归模型
regressor.fit(X_train, y_train)

# 使用模型对测试集进行预测
y_pred = regressor.predict(X_test)

# 计算预测的均方误差（MSE）
mse = mean_squared_error(y_test, y_pred)
print(f"The mean squared error (MSE) on test set: {mse:.2f}")

# 如果需要，可以保存模型，以便以后使用
# from joblib import dump
# dump(regressor, 'calories_regressor.joblib')


The mean squared error (MSE) on test set: 0.74


In [None]:
# The mean squared error (MSE) on test set: 0.52 -> 0.50 5% -> 0.48
# Random 0.55
# tf-idf 0.74
# bow 0.76
# bert 0.75

# bert + tfidf = 0.73
# bert + bow = 0.74
# bow + tfidf = 0.74
# bert + tfidf + bow = 0.73

# tfidf = 0.74
# bert = 0.75
# bow = 0.76

In [None]:
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
import numpy as np


test_file = '../../dataset/processed/train_proced.csv'
test_data = pd.read_csv(test_file)
valid_data = pd.read_csv(test_file)

X_other_features = valid_data[['total_fat', 'sugar', 'sodium', 'protein', 'saturated_fat']]


# 假设您已经有了目标变量 y （例如，您的回归目标）
y = valid_data['calories_log']


# 定义BERT特征的维度和数据点的数量
n_samples = valid_data.shape[0]
n_features = 10

# 生成随机特征
random_features = np.random.rand(n_samples, n_features)

X_random = pd.DataFrame(random_features, columns=[f'random_feature_{i+1}' for i in range(n_features)])


# 分割数据集为训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(bow_matrix, y, test_size=0.2, random_state=42)

# 初始化Lasso回归模型
# alpha是正则化强度，可以通过交叉验证来选择最佳值
lasso_regressor = Lasso(alpha=0.000000001)

# 拟合模型
lasso_regressor.fit(X_train, y_train)

# 可以使用X_test来进行模型评估
from sklearn.metrics import mean_squared_error

y_pred = lasso_regressor.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')


In [None]:
random 0.92

Bert Mean Squared Error: 0.9226066329299135
Ran Mean Squared Error: 0.9226066232306231