In [3]:
import pandas as pd
import json
import matplotlib.pyplot as plt
from os import listdir

Parse text files into usable csv datasets

In [40]:
def pairwise(iterable):
    "s -> (s0, s1), (s2, s3), (s4, s5), ..."
    a = iter(iterable)
    return zip(a, a)

for file_name in listdir('./data/text/closures/'):
    file = open('data/text/closures/{jars_doc}'.format(jars_doc=file_name))
    lines = file.readlines()

    value_list = []
    for line in lines:
        entry = {}
        parsed_line = json.loads(line)
        for column, value in pairwise(parsed_line):
            entry[column] = value
        value_list.append(entry)

    df = pd.DataFrame(value_list)
    df.to_csv('./data/csv/closures/{jars_doc}.csv'.format(jars_doc=file_name.split('.')[0]))

Merge all datasets into one Dataframe

In [2]:
df = pd.DataFrame()
for file in listdir('./data/csv/closures/'):
    current_df = pd.read_csv('./data/csv/closures/{csv_file}'.format(csv_file=file))
    df = pd.concat([df, current_df])

Clean Dataframe

In [4]:
# Drop columns, change SKU Ref. to Id, and remove duplicates by Id
df = df.drop(['Unnamed: 0', 'Diameter', 'Height', 'Finish', 'Shape', 'Weight', 'Capacity'], axis=1)
df = df.rename(columns={'SKU reference': 'Id'})
df.drop_duplicates(subset=['Id'], inplace=True)
df.reset_index(drop=True, inplace=True)

Material

In [5]:
# Only use rows where Material column is not null
df = df[df['Material'].notna()]
for index in df.index.values:
    material = df.loc[index,'Material']
    if ('Synthetic' in material) | ('PPCP' in material) | ('Latex' in material):
        material = 'Plastic'
    df.loc[index, 'Material'] = material

Color

In [6]:
# Only use rows where Color column is not null
df = df[df['Color'].notna()]
# Remove material from color attribute
for index in df.index.values:
    color = df.loc[index,'Color']
    color = str(color).split('-').pop()
    df.loc[index, 'Color'] = color
df['Color'] = df['Color'].str.capitalize()

In [7]:
# Group colors
for index in df.index.values:
    color = df.loc[index,'Color']
    if 'Golden' in color:
        color = 'Gold'
    elif ('Cork' in color) | ('Wood' in color):
        color = 'Brown'
    elif 'Blue' in color:
        color = 'Blue'
    elif 'Cherry' in color:
        color = 'Red'
    elif 'Silverish' in color:
        color = 'Silver'
    elif 'White' in color:
        color = 'White'
    elif ('Green' in color) | ('green' in color):
        color = 'Green'
    elif ('Bee' in color) | ('Cell' in color):
        color = 'Gold'
    df.loc[index, 'Color'] = color

Name

In [9]:
df

Unnamed: 0,Id,Material,Name,Color,Type
25,,Metal,,Gold,
26,00738-5,Metal,sealing-cap-terrine-le-parfait-familia-wiss-10...,Gold,
27,00738-4,Metal,sealing-cap-terrine-le-parfait-familia-wiss-11...,Gold,
28,00738-6,Metal,sealing-cap-terrine-le-parfait-familia-wiss-82-mm,Gold,
29,00738-3,Plastic,rubber-gasket-genuine-latex-100-mm,Orange,
...,...,...,...,...,...
173,007072-217,Plastic,plastic-lid-for-60-mm-weck-jars,White,
174,007072-216,Plastic,plastic-lid-for-80-mm-weck-jars,White,
175,007072-215,Plastic,plastic-lid-for-weck-100-mm,Transparent,
176,007072-214,Plastic,plastic-lid-for-weck-60-mm,Transparent,


In [10]:
df['Type'] = ''
df = df[df['Name'].notna()]
for index in df.index.values:
    name = df.loc[index, 'Name']
    type = 'Twist Off'
    if 'wiss' in name:
        type = 'Wiss'
    elif 'rubber-gasket' in name:
        type = 'Rubber Ring'
    elif 'weck' in name:
        type = 'Weck'
    df.loc[index, 'Type'] = type

Basic Affordances

In [11]:
df['Lift'] = True
df['Unscrew'] = False
df['Pull'] = False
df['Puncture'] = False
for index in df.index.values:
    if df.loc[index, 'Type'] == 'Wiss':
        df.loc[index, 'Puncture'] = True
    if df.loc[index, 'Type'] == 'Rubber Ring':
        df.loc[index, 'Pull'] = True
    if df.loc[index, 'Type'] == 'Twist Off':
        df.loc[index, 'Unscrew'] = True

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Lift'] = True
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Unscrew'] = False
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Pull'] = False
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the

In [12]:
df = df.drop(['Id', 'Name'], axis=1)
df.reset_index(drop=True, inplace=True)

In [14]:
df.to_csv('./data/csv/jar_closures.csv')