In [4]:
import random
import pandas as pd
import numpy as np
import re
import pickle
import sklearn

In [21]:
df = pd.read_csv('merged_product_info_list.csv')

In [46]:
df.to_csv('merged_product_info_list.csv', index=False)

In [19]:
df.count()

product_id            3437
product_name          3437
product_price         3437
product_link          3437
Product_desc          3437
Expert Review         1194
Customer Review        110
Appearance            1231
Nose                  1307
Food Pairing          3437
Alcohol percentage    3373
Bottle volume ml      3437
Gift box              3369
Country of origin     3381
Varietal              2485
Brand                 3437
alcohol_type          3437
Varietals              330
Organic                 33
Vintage                345
Style                   16
Score                  519
Finish                  89
Type                    96
Palate                1301
dtype: int64

In [11]:
df['Merged Brand'] = df.apply(lambda row: row['Palate'] if pd.notna(row['Palate']) else row['palate  '], axis=1)
df.drop(['Palate', 'palate  '], axis=1, inplace=True)
df.rename(columns={'Merged Brand': 'Palate'}, inplace=True)

In [29]:
def create_taste_description(row):
    parts = []
    if not pd.isna(row['Appearance']):
        parts.append(f"It is {row['Appearance']}")
    if not pd.isna(row['Nose']):
        parts.append(f"smells like {row['Nose']}")
    if not pd.isna(row['Palate']):
        parts.append(f"tastes like {row['Palate']}")
    if not pd.isna(row['Finish']):
        parts.append(f"finished with {row['Finish']}")
    return ', '.join(parts)

df['taste_description'] = df.apply(create_taste_description, axis=1)

In [30]:
print(df['taste_description'].head())

0    It is Golden yellow, smells like Pear, peach, ...
1                                                     
2                                                     
3                                                     
4                                                     
Name: taste_description, dtype: object


In [35]:
df['Food Pairing'] = df['Food Pairing'].str.strip('[]')

In [40]:
def create_food_pair_description(row):
    if not row:
        return ""
    else:
        return f"suitable serving for {''.join(row)}"

df['food_pair_description'] = df['Food Pairing'].apply(create_food_pair_description).str.replace("'", '')

In [41]:
print(df['food_pair_description'].head())

0    suitable serving for Seafood, Pasta, Cheese
1                                               
2                                               
3                                               
4                                               
Name: food_pair_description, dtype: object


In [43]:
def create_brand_and_varietal(row):
    parts = ["It's"]
    if not pd.isna(row['Varietal']):
        parts.append(f"a {row['Varietal']}")
    if not pd.isna(row['alcohol_type']):
        parts.append(f"{row['alcohol_type']}")
    if not pd.isna(row['Country of origin']):
        parts.append(f"from {row['Country of origin']}")
    if not pd.isna(row['Varietals']):
        parts.append(f",{row['Varietals']}")
    if not pd.isna(row['Brand']):
        parts.append(f",made by {row['Brand']}")
    return ' '.join(parts)

df['varietal_description'] = df.apply(create_brand_and_varietal, axis=1)

In [44]:
print(df['varietal_description'].head())

0     It's a Wheat Beer from Germany ,made by Erdinger
1     It's a Wheat Beer from Germany ,made by Erdinger
2    It's a Wheat Beer from Germany ,made by Franzi...
3     It's a Lager Beer from Germany ,made by Paulaner
4     It's a Wheat Beer from Germany ,made by Paulaner
Name: varietal_description, dtype: object


In [72]:
def create_training_text(text, output_file):
    cleaned_text = text.strip()
    cleaned_text = re.sub(r'\n', ' ', cleaned_text)
    cleaned_text = re.sub(r'([^.!?])\n', r'\1.\n', cleaned_text)
    
    with open(output_file, 'a', encoding='utf-8') as file:
        file.write(cleaned_text + '\n')

In [87]:
output_file = 'training_text.txt'

for column_name in ['Product_desc', 'Expert Review', 'Customer Review', 'varietal_description', 'taste_description']:
    for text in df[column_name]:
        if not pd.isna(text): 
            create_training_text(text, output_file)


In [88]:
import nltk

nltk.download('punkt')

from nltk.tokenize import sent_tokenize

with open('training_text.txt', 'r', encoding='utf-8') as file:
    text = file.read()

sentences = sent_tokenize(text)
fixed_sentences = []
i = 0
while i < len(sentences):
    if i < len(sentences) - 1 and re.search(r'\b[Nn]o\.\s?', sentences[i]):
        # 如果当前句子以 "no." 结尾，且下一句以数字开头，则合并它们
        fixed_sentence = sentences[i] + " " + sentences[i + 1]
        fixed_sentences.append(fixed_sentence)
        i += 2  # 跳过下一句
    else:
        fixed_sentences.append(sentences[i])
        i += 1

with open('training_text.txt', 'w', encoding='utf-8') as file:
    for sentence in fixed_sentences:
            file.write(sentence + '\n')



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\20571\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [89]:
with open('training_text.txt', 'r', encoding='utf-8') as file:
    lines = file.readlines()

unique_lines = list(set(lines))
max_seq_len = max(len(line) for line in lines)

with open('training_text.txt', 'w', encoding='utf-8') as file:
    for line in lines:
        if line.strip():
            file.write(line)
    file.writelines(unique_lines)
    
print("Max Sequence Length:", max_seq_len)

Max Sequence Length: 649
