In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import json
import numpy as np
import threading
from concurrent.futures import ThreadPoolExecutor
import pandas as pd
import gc
import os

# **Read data**

In [3]:
# Load the raw diaries file (tab-separated). header=None means the file's first
# line is treated as data and the columns get the integer labels 0..3, which is
# how the rest of the notebook addresses them (row[0], row[2], ...).
data = pd.read_csv('/content/drive/MyDrive/Final Project/mfp-diaries.tsv', sep='\t', header=None)

In [5]:
data.head()

Unnamed: 0,0,1,2,3
0,1,2014-09-14,"[{""meal"": ""MY food"", ""dishes"": [{""nutritions"":...","{""total"": [{""name"": ""Calories"", ""value"": 2924}..."
1,1,2014-09-15,"[{""meal"": ""MY food"", ""dishes"": [{""nutritions"":...","{""total"": [{""name"": ""Calories"", ""value"": 2430}..."
2,1,2014-09-16,"[{""meal"": ""MY food"", ""dishes"": [{""nutritions"":...","{""total"": [{""name"": ""Calories"", ""value"": 1862}..."
3,1,2014-09-17,"[{""meal"": ""MY food"", ""dishes"": [{""nutritions"":...","{""total"": [{""name"": ""Calories"", ""value"": 2251}..."
4,1,2014-09-18,"[{""meal"": ""MY food"", ""dishes"": [{""nutritions"":...","{""total"": [{""name"": ""Calories"", ""value"": 2001}..."


# **Value of the first row**

In [4]:
data.iloc[[0],:].values

array([[1, '2014-09-14',
        '[{"meal": "MY food", "dishes": [{"nutritions": [{"name": "Calories", "value": "412"}, {"name": "Carbs", "value": "29"}, {"name": "Fat", "value": "24"}, {"name": "Protein", "value": "21"}, {"name": "Sodium", "value": "258"}, {"name": "Sugar", "value": "29"}], "name": "my - McDonalds Espresso Pronto\\u00ae Flat White, 2 TALL"}, {"nutritions": [{"name": "Calories", "value": "170"}, {"name": "Carbs", "value": "25"}, {"name": "Fat", "value": "5"}, {"name": "Protein", "value": "20"}, {"name": "Sodium", "value": "260"}, {"name": "Sugar", "value": "2"}], "name": "Quest Bar - Banana Nut Muffin Natural Protein Bar, 60 g"}, {"nutritions": [{"name": "Calories", "value": "176"}, {"name": "Carbs", "value": "33"}, {"name": "Fat", "value": "1"}, {"name": "Protein", "value": "5"}, {"name": "Sodium", "value": "195"}, {"name": "Sugar", "value": "0"}], "name": "Uncle Tobys Australia - Vita Brits, 3 Biscuits 33.3g"}, {"nutritions": [{"name": "Calories", "value": "342"}, {"

# **Convert given row from string to json**

In [None]:
def convert_to_json(row):
    """Decode the two JSON columns of one raw diary row.

    Parameters
    ----------
    row : indexable
        Positions 0 and 1 are passed through unchanged; positions 2 and 3
        must hold JSON text and are decoded with ``json.loads``.

    Returns
    -------
    pandas.DataFrame
        A single-row frame with the columns ``Id``, ``Date``, ``Details``
        and ``Summary`` (the last two holding the decoded JSON objects).
    """
    output_columns = ['Id', 'Date', 'Details', 'Summary']
    record = [row[0], row[1], json.loads(row[2]), json.loads(row[3])]
    return pd.DataFrame([record], columns=output_columns)

In [None]:
def create_threads_for_rows(data):
    """Convert every raw row of ``data`` with ``convert_to_json`` and stack the results.

    A bounded ``ThreadPoolExecutor`` is used instead of starting one thread per
    row, and ``executor.map`` is used so that:

    * the output rows are in the same order as the input rows (appending from
      free-running threads produced completion order instead);
    * an exception raised while converting a row propagates to the caller
      rather than being lost inside a worker thread, which would silently
      drop that row from the result.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw rows; each row is handed to ``convert_to_json`` as ``data.iloc[i, :]``.

    Returns
    -------
    pandas.DataFrame
        The per-row frames concatenated with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If ``data`` has no rows (there is nothing to concatenate).
    Exception
        Whatever ``convert_to_json`` raises for a row (e.g. malformed JSON).
    """
    num_rows = data.shape[0]
    if num_rows == 0:
        raise ValueError('create_threads_for_rows received no rows to convert')

    rows = [data.iloc[i, :] for i in range(num_rows)]

    if num_rows == 1:
        # Single-row input: a thread pool would only add overhead.
        results = [convert_to_json(rows[0])]
    else:
        with ThreadPoolExecutor() as executor:
            # map() yields results in input order and re-raises worker errors here.
            results = list(executor.map(convert_to_json, rows))

    return pd.concat(results, ignore_index=True)

# **Find all unique nutrient names in the whole dataset and build the column set**

In [None]:
def create_columnset(data):
    """Collect the nutrient names that occur anywhere in ``data``.

    Column 2 of every row is decoded as the meals JSON and column 3 as the
    summary JSON. Names are gathered separately for dish nutritions, the
    ``total`` list and the ``goal`` list.

    Returns
    -------
    list
        The three alphabetically sorted name lists joined end to end
        (dish names, then total names, then goal names). The joined list is
        not de-duplicated or re-sorted as a whole.
    """
    dish_names, total_names, goal_names = set(), set(), set()

    for _, record in data.iterrows():
        raw_meals, raw_summary = record[2], record[3]

        for meal in json.loads(raw_meals):
            for dish in meal['dishes']:
                dish_names.update(entry['name'] for entry in dish['nutritions'])

        summary = json.loads(raw_summary)
        total_names.update(entry['name'] for entry in summary['total'])
        goal_names.update(entry['name'] for entry in summary['goal'])

    dish_words_list = sorted(dish_names)
    total_words_list = sorted(total_names)
    goal_words_list = sorted(goal_names)
    sorted_combined_list = [*dish_words_list, *total_words_list, *goal_words_list]

    # Report each group, then the joined list that is returned.
    for label, words in (
        ("Dish Nutrition Words:", dish_words_list),
        ("Total Nutrition Words:", total_words_list),
        ("Goal Nutrition Words:", goal_words_list),
        ("Combined Sorted List:", sorted_combined_list),
    ):
        print(label, words)

    return sorted_combined_list
# Scan the full dataset once; the result is the dish, total and goal name lists
# joined end to end (see the printed output below).
columnset = create_columnset(data)

Dish Nutrition Words: ['Calcium', 'Calories', 'Carbs', 'Chol', 'Fat', 'Fiber', 'Iron', 'Mon Fat', 'Ply Fat', 'Potass.', 'Protein', 'Sat Fat', 'Sodium', 'Sugar', 'Trn Fat', 'Vit A', 'Vit C']
Total Nutrition Words: ['Calcium', 'Calories', 'Carbs', 'Chol', 'Fat', 'Fiber', 'Iron', 'Mon Fat', 'Ply Fat', 'Potass.', 'Protein', 'Sat Fat', 'Sodium', 'Sugar', 'Trn Fat', 'Vit A', 'Vit C']
Goal Nutrition Words: ['Calcium', 'Calories', 'Carbs', 'Chol', 'Fat', 'Fiber', 'Iron', 'Mon Fat', 'Ply Fat', 'Potass.', 'Protein', 'Sat Fat', 'Sodium', 'Sugar', 'Trn Fat', 'Vit A', 'Vit C']
Combined Sorted List: ['Calcium', 'Calories', 'Carbs', 'Chol', 'Fat', 'Fiber', 'Iron', 'Mon Fat', 'Ply Fat', 'Potass.', 'Protein', 'Sat Fat', 'Sodium', 'Sugar', 'Trn Fat', 'Vit A', 'Vit C', 'Calcium', 'Calories', 'Carbs', 'Chol', 'Fat', 'Fiber', 'Iron', 'Mon Fat', 'Ply Fat', 'Potass.', 'Protein', 'Sat Fat', 'Sodium', 'Sugar', 'Trn Fat', 'Vit A', 'Vit C', 'Calcium', 'Calories', 'Carbs', 'Chol', 'Fat', 'Fiber', 'Iron', 'Mon Fat

In [None]:
# Prepend the four identifier columns in one step. This shifts the nutrient
# names so that they start at position 4 of the list.
columnset[:0] = ['Id', 'Date', 'Meal', 'Sequence']
columnset

['Id',
 'Date',
 'Meal',
 'Sequence',
 'Calcium',
 'Calories',
 'Carbs',
 'Chol',
 'Fat',
 'Fiber',
 'Iron',
 'Mon Fat',
 'Ply Fat',
 'Potass.',
 'Protein',
 'Sat Fat',
 'Sodium',
 'Sugar',
 'Trn Fat',
 'Vit A',
 'Vit C',
 'Calcium',
 'Calories',
 'Carbs',
 'Chol',
 'Fat',
 'Fiber',
 'Iron',
 'Mon Fat',
 'Ply Fat',
 'Potass.',
 'Protein',
 'Sat Fat',
 'Sodium',
 'Sugar',
 'Trn Fat',
 'Vit A',
 'Vit C',
 'Calcium',
 'Calories',
 'Carbs',
 'Chol',
 'Fat',
 'Fiber',
 'Iron',
 'Mon Fat',
 'Ply Fat',
 'Potass.',
 'Protein',
 'Sat Fat',
 'Sodium',
 'Sugar',
 'Trn Fat',
 'Vit A',
 'Vit C']

In [None]:
# Persist the column set to Drive; parse_data reads this same file back to
# build its output column names.
pd.DataFrame(columnset).to_csv('/content/drive/MyDrive/Final Project/columnset.csv', index=False)

# **Parse data according to nutrition values**

In [None]:
# Location of the column set written earlier in the notebook.
_COLUMNSET_PATH = '/content/drive/MyDrive/Final Project/columnset.csv'

# Dish-level nutrients, in the order they are mapped onto columnset[4], columnset[5], ...
_DISH_NUTRIENTS = (
    'Calcium', 'Calories', 'Carbs', 'Chol', 'Fat', 'Fiber', 'Iron',
    'Mon Fat', 'Ply Fat', 'Potass.', 'Protein', 'Sat Fat', 'Sodium',
    'Sugar', 'Trn Fat', 'Vit A', 'Vit C',
)

# Filled on first use by _load_columnset(); re-run this cell to force a re-read.
_columnset_cache = None


def _load_columnset():
    """Return the column set stored at ``_COLUMNSET_PATH``, reading the file only once."""
    global _columnset_cache
    if _columnset_cache is None:
        # If several threads get here at the same time they each read the same
        # file and store an equal list, so the duplicated read is harmless.
        _columnset_cache = pd.read_csv(_COLUMNSET_PATH).values.flatten().tolist()
    return _columnset_cache


def parse_data(row, columnset=None):
    """Flatten one decoded diary row into one output row per dish.

    Parameters
    ----------
    row : pandas.DataFrame
        Single-row frame whose first three cells are the id, the date and the
        decoded meals list (a list of dicts with optional ``sequence``,
        ``meal`` and ``dishes`` keys).
    columnset : list, optional
        Column names to use. ``columnset[0]`` names the id column and
        ``columnset[4:21]`` supply the prefixes of the 17 ``*_value`` columns.
        When omitted, the list is loaded from the column set CSV on Drive;
        the file is read once and cached instead of being re-read on every
        call (this function is invoked once per diary row).

    Returns
    -------
    pandas.DataFrame
        One row per dish with the id, ``Date``, ``Sequence``, ``MealName``,
        ``DishName`` and the 17 nutrient value columns. Nutrients a dish does
        not list are NaN. The frame is empty when the row has no dishes.

    Notes
    -----
    A nutrient name outside the 17 known ones is reported with ``print`` and
    otherwise ignored. Values are stored exactly as found in the JSON.
    """
    if columnset is None:
        columnset = _load_columnset()

    # .values on the mixed-dtype frame builds a new object array, so take it once.
    record = row.values[0]
    diary_id, date, meals = record[0], record[1], record[2]

    id_column = columnset[0]
    value_columns = [f"{columnset[4 + offset]}_value" for offset in range(len(_DISH_NUTRIENTS))]

    rows = []
    for meal_data in meals:
        sequence = meal_data.get('sequence', np.nan)
        meal_name = meal_data.get('meal', np.nan)

        for dish in meal_data.get('dishes', []):
            nutrients = dict.fromkeys(_DISH_NUTRIENTS, np.nan)

            for nutrition in dish.get('nutritions', []):
                nutrient_name = nutrition.get('name', '')
                if nutrient_name in nutrients:
                    nutrients[nutrient_name] = nutrition.get('value', np.nan)
                else:
                    print(f"Unexpected nutrient: {nutrient_name}")

            row_data = {
                id_column: diary_id,
                'Date': date,
                'Sequence': sequence,
                'MealName': meal_name,
                'DishName': dish.get('name', np.nan),
            }
            # nutrients keeps the _DISH_NUTRIENTS order, which lines up with value_columns.
            row_data.update(zip(value_columns, nutrients.values()))
            rows.append(row_data)

    return pd.DataFrame(rows)

# **Parse data according to customers' total and goal nutritions**

In [None]:
def parse_data_total_goal(row):
    """Spread the ``total`` and ``goal`` lists of one diary row into columns.

    The summary dict is taken from the fourth cell of the single-row frame
    ``row``. Every known nutrient gets a ``<name>_total`` and a
    ``<name>_goal`` column; entries whose name is not one of the known
    nutrients are ignored and nutrients that are not listed stay NaN.

    Returns
    -------
    pandas.DataFrame
        One row: the 17 ``*_total`` columns followed by the 17 ``*_goal`` columns.
    """
    nutrient_names = (
        'Calcium', 'Calories', 'Carbs', 'Chol', 'Fat', 'Fiber', 'Iron',
        'Mon Fat', 'Ply Fat', 'Potass.', 'Protein', 'Sat Fat', 'Sodium',
        'Sugar', 'Trn Fat', 'Vit A', 'Vit C',
    )

    def to_frame(entries, suffix):
        # Start with every column present and empty, then fill in what is listed.
        slots = {nutrient + suffix: np.nan for nutrient in nutrient_names}
        for entry in entries:
            column = entry.get('name', '') + suffix
            if column in slots:
                slots[column] = entry.get('value', np.nan)
        return pd.DataFrame([slots])

    summary = row.values[0][3]
    totals_df = to_frame(summary.get('total', []), '_total')
    goals_df = to_frame(summary.get('goal', []), '_goal')

    return pd.concat([totals_df, goals_df], axis=1)

In [None]:
def create_threads_for_processing(data, max_workers=None):
    """Parse every raw diary row of ``data`` into per-dish output rows.

    For each input row this decodes the JSON (``create_threads_for_rows``),
    builds one row per dish (``parse_data``), builds the one-row totals/goals
    frame (``parse_data_total_goal``) and repeats the latter next to every
    dish row.

    Each worker receives the position of the row it must process as an
    argument. Reading the loop variable of the submitting loop from inside
    the worker is unsafe: a worker that runs after the loop has advanced
    would process a different row, duplicating some days and dropping others.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw rows (id, date, meals JSON text, summary JSON text).
    max_workers : int, optional
        Upper bound on concurrent worker threads; ``None`` uses the
        ``ThreadPoolExecutor`` default.

    Returns
    -------
    pandas.DataFrame
        All dish rows, in input-row order, with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If no dish rows were produced (empty ``data`` or no row has a dish).
    Exception
        Any error raised while processing a row propagates to the caller
        instead of being lost inside a worker thread.
    """
    def process_row(position):
        raw_row = data.iloc[[position], :]
        parsed_row = create_threads_for_rows(raw_row)

        dishes_df = parse_data(parsed_row)
        dish_count = dishes_df.shape[0]
        if dish_count == 0:
            # A day without dishes contributes no output rows.
            return None

        totals_goals_df = parse_data_total_goal(parsed_row)
        # Repeat the single totals/goals row so it sits beside every dish row.
        repeated_df = pd.concat([totals_goals_df] * dish_count, ignore_index=True)
        return pd.concat([dishes_df, repeated_df], axis=1)

    num_rows = data.shape[0]
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # map() keeps input order and re-raises worker exceptions here.
        frames = [
            frame
            for frame in executor.map(process_row, range(num_rows))
            if frame is not None
        ]

    if not frames:
        raise ValueError('No dish rows were produced from the given data')

    return pd.concat(frames, ignore_index=True)

# **Save parsed data in batches**

In [None]:
def process_and_save_in_batches(data, batch_size=50000, save_dir='/content/drive/MyDrive/Final Project/'):
    """Parse ``data`` in consecutive slices and write each slice to its own CSV.

    Slice ``k`` (1-based) is parsed with ``create_threads_for_processing``,
    sorted by ``Id`` and ``Date`` and written to ``<save_dir>/batch_<k>.csv``.

    Batch starts are generated with ``range(0, num_rows, batch_size)``, so no
    empty trailing batch is produced when the row count is an exact multiple
    of ``batch_size`` and nothing is processed for empty input.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw diary rows.
    batch_size : int
        Maximum number of input rows per batch file; must be positive.
    save_dir : str
        Output directory; created if it does not exist.

    Raises
    ------
    ValueError
        If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError('batch_size must be a positive integer')

    os.makedirs(save_dir, exist_ok=True)

    num_rows = len(data)

    for batch_number, start_idx in enumerate(range(0, num_rows, batch_size), start=1):
        end_idx = min(start_idx + batch_size, num_rows)
        partial_data = data.iloc[start_idx:end_idx]

        processed_rows = create_threads_for_processing(partial_data)
        processed_rows = processed_rows.sort_values(by=['Id', 'Date'])

        batch_path = os.path.join(save_dir, f'batch_{batch_number}.csv')
        processed_rows.to_csv(batch_path, index=False)

        print(f'Saved batch {batch_number} to {batch_path}')

        # Release the batch before the next one is built to keep peak memory down.
        del processed_rows
        del partial_data
        gc.collect()

    print('All batches processed and saved successfully.')
# Full run over every row of `data` with the default batch size (50000) and
# the default Drive output directory.
process_and_save_in_batches(data)

Saved batch 1 to /content/drive/MyDrive/Final Project/batch_1.csv
Saved batch 2 to /content/drive/MyDrive/Final Project/batch_2.csv
Saved batch 3 to /content/drive/MyDrive/Final Project/batch_3.csv
Saved batch 4 to /content/drive/MyDrive/Final Project/batch_4.csv
Saved batch 5 to /content/drive/MyDrive/Final Project/batch_5.csv
Saved batch 6 to /content/drive/MyDrive/Final Project/batch_6.csv
Saved batch 7 to /content/drive/MyDrive/Final Project/batch_7.csv
Saved batch 8 to /content/drive/MyDrive/Final Project/batch_8.csv
Saved batch 9 to /content/drive/MyDrive/Final Project/batch_9.csv
Saved batch 10 to /content/drive/MyDrive/Final Project/batch_10.csv
Saved batch 11 to /content/drive/MyDrive/Final Project/batch_11.csv
Saved batch 12 to /content/drive/MyDrive/Final Project/batch_12.csv
All batches processed and saved successfully.


# **Aggregate all batches into one file**

In [None]:
def consolidate_batches_into_csv(save_dir='/content/drive/MyDrive/Final Project/'):
    """Merge every ``batch_*.csv`` in ``save_dir`` into ``consolidated_data.csv``.

    Files are merged in ascending batch number (``batch_2`` before
    ``batch_10``). ``os.listdir`` returns names in arbitrary order, so without
    an explicit sort the row order of the consolidated file is unspecified.
    Matching files whose middle part is not a number are appended last, in
    name order. All batches are concatenated in one call rather than by
    re-copying a growing frame on every iteration.

    Parameters
    ----------
    save_dir : str
        Directory that holds the batch files and receives the merged file.
    """
    prefix, suffix = 'batch_', '.csv'

    def batch_order(filename):
        # Sort numerically on the part between the prefix and the suffix.
        stem = filename[len(prefix):-len(suffix)]
        if stem.isdigit():
            return (0, int(stem), filename)
        return (1, 0, filename)

    csv_files = sorted(
        (f for f in os.listdir(save_dir) if f.startswith(prefix) and f.endswith(suffix)),
        key=batch_order,
    )

    frames = [
        pd.read_csv(os.path.join(save_dir, csv_file), low_memory=False)
        for csv_file in csv_files
    ]

    # With no batch files an empty frame is written, as before.
    if frames:
        consolidated_data = pd.concat(frames, ignore_index=True, axis=0)
    else:
        consolidated_data = pd.DataFrame()

    consolidated_file_path = os.path.join(save_dir, 'consolidated_data.csv')
    consolidated_data.to_csv(consolidated_file_path, index=False)

    print(f'Consolidated data saved to {consolidated_file_path}')

# Merge the batch_*.csv files found in the default Drive directory into
# consolidated_data.csv in that same directory.
consolidate_batches_into_csv()

Consolidated data saved to /content/drive/MyDrive/Final Project/consolidated_data.csv


# **Inspect how the data was parsed**

In [None]:
import pandas as pd
# Reload the merged file from Drive so the parsed layout can be compared with
# a raw row of `data` in the cells below.
Consolidated_data = pd.read_csv('/content/drive/MyDrive/Final Project/consolidated_data.csv', low_memory=False)

In [None]:
Consolidated_data.iloc[[1],:].values

array([[1, '2014-09-14', 1, 'MY food',
        'Quest Bar - Banana Nut Muffin Natural Protein Bar, 60 g', nan,
        '170', '25', nan, '5.0', nan, nan, nan, nan, nan, '20', nan,
        '260', '2.0', nan, nan, nan, nan, 2924.0, 340.0, nan, 114.0, nan,
        nan, nan, nan, nan, 186.0, nan, 3658.0, 109.0, nan, nan, nan,
        nan, 3173.0, 396.0, nan, 105.0, nan, nan, nan, nan, nan, 160.0,
        nan, 2300.0, 119.0, nan, nan, nan]], dtype=object)

In [None]:
data.iloc[[0],:].values

array([[1, '2014-09-14',
        '[{"meal": "MY food", "dishes": [{"nutritions": [{"name": "Calories", "value": "412"}, {"name": "Carbs", "value": "29"}, {"name": "Fat", "value": "24"}, {"name": "Protein", "value": "21"}, {"name": "Sodium", "value": "258"}, {"name": "Sugar", "value": "29"}], "name": "my - McDonalds Espresso Pronto\\u00ae Flat White, 2 TALL"}, {"nutritions": [{"name": "Calories", "value": "170"}, {"name": "Carbs", "value": "25"}, {"name": "Fat", "value": "5"}, {"name": "Protein", "value": "20"}, {"name": "Sodium", "value": "260"}, {"name": "Sugar", "value": "2"}], "name": "Quest Bar - Banana Nut Muffin Natural Protein Bar, 60 g"}, {"nutritions": [{"name": "Calories", "value": "176"}, {"name": "Carbs", "value": "33"}, {"name": "Fat", "value": "1"}, {"name": "Protein", "value": "5"}, {"name": "Sodium", "value": "195"}, {"name": "Sugar", "value": "0"}], "name": "Uncle Tobys Australia - Vita Brits, 3 Biscuits 33.3g"}, {"nutritions": [{"name": "Calories", "value": "342"}, {"

# **Function for parsing test data**

In [None]:
def for_test(test):
    """Parse the raw rows in ``test`` and return the dish rows ordered by ``Id`` then ``Date``."""
    parsed_rows = create_threads_for_processing(test)
    return parsed_rows.sort_values(by=['Id', 'Date'])

In [None]:
# Parse only the first raw row (kept as a one-row DataFrame by the [[0]]
# selector) and save the result for inspection.
for_test(data.iloc[[0],:]).to_csv('/content/drive/MyDrive/Final Project/real_testing.csv', index=False)