In [None]:
# Amazon Trend Prediction based on TikTok Hashtags

This repository contains code for the analysis of TikTok hashtag descriptions to predict trending products on Amazon. It includes a Python script that processes datasets and calculates cosine similarity between TikTok descriptions and Amazon search terms.

## Python Script

Below is the Python code used in this project. Each code block corresponds to a portion of the script with a specific function.

### Dependencies

Before running the script, ensure you have Dask, Pandas, and Scikit-learn installed in your Python environment.

```python
import dask.dataframe as dd
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
```

## Data Loading:
The datasets are loaded from Parquet files using Dask.

In [None]:
# Open both Parquet datasets as Dask DataFrames.
amazon_df = dd.read_parquet('nswAmz.parquet')
tiktok_df = dd.read_parquet('nswTik.parquet')

# Turn the two text columns into plain Python lists of strings; every value
# is passed through str() so non-string entries still yield text downstream.
amazon_titles = list(map(str, amazon_df['Search_Term']))
tiktok_text = list(map(str, tiktok_df['description']))

## Cosine Similarity Calculation:
Define a function to calculate cosine similarity between two lists of text.

In [None]:
def calculate_cosine_similarity(list1, list2):
    """Return pairwise TF-IDF cosine similarities between two lists of texts.

    Args:
        list1: List of strings; each one becomes a row of the result.
        list2: List of strings; each one becomes a column of the result.

    Returns:
        The output of ``cosine_similarity`` for the two groups of TF-IDF
        vectors: entry ``[i][j]`` compares ``list1[i]`` with ``list2[j]``.
    """
    boundary = len(list1)

    # Fit one vectorizer on the concatenation of both lists so that both
    # groups are embedded with the same vocabulary and IDF weights.
    tfidf_matrix = TfidfVectorizer().fit_transform(list1 + list2)

    # Rows before `boundary` belong to list1, the remainder to list2.
    first_vectors = tfidf_matrix[:boundary]
    second_vectors = tfidf_matrix[boundary:]

    return cosine_similarity(first_vectors, second_vectors)

## Saving Results to CSV:
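Define a function that writes, for each TikTok description, its best-matching Amazon search term (together with both dates and the similarity score) to a CSV file.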

In [None]:
def save_to_csv(similarity_matrix, list1, list2, dates_collected, reporting_dates, output_file, top_n=1):
    """Write the best-matching ``list2`` entries for every ``list1`` entry to a CSV file.

    For each row ``i`` of ``similarity_matrix`` the ``top_n`` highest-scoring
    columns are selected (best first) and one CSV row is emitted per match.

    Args:
        similarity_matrix: 2-D NumPy array; row ``i`` holds the scores of
            ``list1[i]`` against every element of ``list2``.
        list1: Texts written to the 'Hashtag Description' column.
        list2: Texts written to the 'SearchTerm' column.
        dates_collected: Sequence indexed in parallel with ``list1``
            (``dates_collected[i]`` is written next to ``list1[i]``). When
            ``list1`` is a slice of a larger list, pass the matching slice here.
        reporting_dates: Sequence indexed in parallel with ``list2``.
        output_file: Path of the CSV file to write (written without the index).
        top_n: Number of best matches to keep per ``list1`` entry. Defaults to
            1, which reproduces the previous single-best-match behaviour.

    Raises:
        ValueError: If ``top_n`` is smaller than 1.
    """
    if top_n < 1:
        raise ValueError("top_n must be at least 1")

    columns = [
        'Hashtag Description',
        'Date Collected',
        'SearchTerm',
        'Reporting Date',
        'Cosine Similarity'
    ]

    rows = []
    for i in range(len(list1)):
        scores = similarity_matrix[i]
        # argsort() is ascending, so reverse it and keep the first top_n
        # positions to get the highest scores in descending order.
        top_indices = scores.argsort()[::-1][:top_n]
        for index in top_indices:
            rows.append([
                list1[i],
                dates_collected[i],
                list2[index],
                reporting_dates[index],
                scores[index]
            ])

    pd.DataFrame(rows, columns=columns).to_csv(output_file, index=False)

In [None]:
# Collect the 'date_collected' column of tiktok_df into a plain Python list
dates_collected = [*tiktok_df['date_collected']]

# Collect the 'Reporting Date' column of amazon_df into a plain Python list
reporting_dates = [*amazon_df['Reporting Date']]

## Processing in Chunks
Due to memory limits, the script processes data in chunks.

In [None]:
# Due to the memory limit, the TikTok descriptions are compared against the
# Amazon search terms chunk_size rows at a time; each chunk's best matches are
# written to their own CSV part.
chunk_size = 12000

# Ceiling division: every description is covered exactly once and no empty
# chunk is ever produced, whatever the dataset size.
# NOTE: the merge step later in this notebook reads a fixed number of part
# files; keep it in sync if the dataset size changes the number of chunks.
num_chunks = (len(tiktok_text) + chunk_size - 1) // chunk_size

for i in range(num_chunks):
    start_index = i * chunk_size
    end_index = min(start_index + chunk_size, len(tiktok_text))
    current_list1_chunk = tiktok_text[start_index:end_index]
    # save_to_csv indexes dates_collected by position within the chunk, so the
    # dates must be sliced with the same bounds as the descriptions.
    current_dates_chunk = dates_collected[start_index:end_index]
    similarity_matrix = calculate_cosine_similarity(current_list1_chunk, amazon_titles)
    output_csv = f'cosine_similarity_output_part_{i+1}.csv'
    save_to_csv(similarity_matrix, current_list1_chunk, amazon_titles, current_dates_chunk, reporting_dates, output_csv)
    print(f"Results saved to {output_csv}")


## Merging CSV Files
After processing, merge all CSV parts into a single file.

In [None]:
# Number of part files written by the chunked similarity step
# (cosine_similarity_output_part_1.csv .. cosine_similarity_output_part_20.csv).
num_parts = 20

# range() excludes its stop value, so stop at num_parts + 1 to make sure the
# final part file is included in the merge.
dataframes = [
    pd.read_csv(f'cosine_similarity_output_part_{part}.csv', encoding='UTF-8')
    for part in range(1, num_parts + 1)
]

combined_df = pd.concat(dataframes, ignore_index=True)
combined_df.to_csv('combined_output.csv', index=False)
print("All files have been merged into combined_output.csv")


## Sample Output
Below is a sample output from the script, showcasing the data structure of the results.

In [None]:
Hashtag Description Date Collected                 SearchTerm     Reporting Date    Cosine Similarity
0                   _     04/07/2023             zzzquil liquid          04/15/2023             0.000000
1                 ___     04/07/2023             zzzquil liquid          04/15/2023             0.000000
2              _marwe     04/12/2023             zzzquil liquid          04/15/2023             0.000000
3           musically     04/07/2023             zzzquil liquid          04/15/2023             0.000000