In [1]:
import pandas as pd

# Load the cleaned shoe listings; every later step in this cell derives from this frame.
df = pd.read_csv('clean_shoes.csv')

# Coerce both price bounds to numbers. Entries that cannot be parsed become NaN
# (errors='coerce') instead of aborting the whole run.
price_columns = ['prices.amountMin', 'prices.amountMax']
for column in price_columns:
    df[column] = pd.to_numeric(df[column], errors='coerce')

# Per-listing price = mean of the two bounds. mean(axis=1) skips NaN, so a listing
# with only one usable bound takes that bound, and one with none stays NaN.
df['average_price'] = df[price_columns].mean(axis=1)

# 1. Calculate the average price for each distinct brand
average_prices = df.groupby('brand')['average_price'].mean().reset_index()

average_prices.to_csv('output.csv', index=False)

# 2. Top 5 brands with the highest average price
# Brands without any usable price have a NaN mean; they are excluded from the
# ranking so they cannot pad the list when fewer than five brands have a value.
top_5_avg_price = (
    average_prices
    .dropna(subset=['average_price'])
    .sort_values(by='average_price', ascending=False)
    .head(5)
)
# astype(str): str.join raises TypeError on non-string labels (e.g. a numeric brand).
top_5_avg_price_list = ', '.join(top_5_avg_price['brand'].astype(str))

# 3. Top 5 brands with the biggest variance
# var() is the sample variance, which is NaN for a brand with a single priced
# listing; such brands stay in price_variance but are left out of the ranking.
price_variance = df.groupby('brand')['average_price'].var().reset_index()
top_5_variance = (
    price_variance
    .dropna(subset=['average_price'])
    .sort_values(by='average_price', ascending=False)
    .head(5)
)
top_5_variance_list = ', '.join(top_5_variance['brand'].astype(str))


# Explicit UTF-8 keeps the report consistent with output.csv (pandas writes UTF-8
# by default) and avoids locale-dependent encoding errors on non-ASCII brand names.
with open('output.txt', 'w', encoding='utf-8') as f:
    f.write(top_5_avg_price_list + '\n')
    f.write(top_5_variance_list)

In [9]:
import pandas as pd

# The two price-bound columns that the cleaning steps below operate on.
PRICE_COLUMNS = ['prices.amountMin', 'prices.amountMax']


def coerce_price_columns(frame):
    """Return a copy of ``frame`` whose price columns are numeric.

    Values that cannot be parsed as numbers become NaN (``errors='coerce'``).
    The input frame is not modified.
    """
    return frame.assign(
        **{column: pd.to_numeric(frame[column], errors='coerce') for column in PRICE_COLUMNS}
    )


def fill_and_average_prices(frame):
    """Drop rows without any price, mirror the surviving bound, add ``average_price``.

    Expects the price columns to already be numeric. Returns a new frame; the
    input frame is not modified.
    """
    # .copy() so the column assignments below write to an independent frame.
    cleaned = frame.dropna(subset=PRICE_COLUMNS, how='all').copy()

    low, high = PRICE_COLUMNS
    # Every remaining row has at least one bound, so after these two fills
    # neither column contains NaN.
    cleaned[low] = cleaned[low].fillna(cleaned[high])
    cleaned[high] = cleaned[high].fillna(cleaned[low])

    cleaned['average_price'] = cleaned[PRICE_COLUMNS].mean(axis=1)
    return cleaned


try:
    # Malformed CSV lines are skipped rather than aborting the load.
    old_df = coerce_price_columns(
        pd.read_csv('OLD-lab0-data.csv', on_bad_lines='skip', low_memory=False)
    )

    old_df_clean = fill_and_average_prices(old_df)

    old_df_clean.to_csv('my_clean_shoes.csv', index=False)

    # Document the cleaning process (one line per step performed above).
    with open('clean.txt', 'w', encoding='utf-8') as f:
        f.write('Cleaning Process for OLD-lab0-data.csv:\n')
        f.write('Skipped lines with parsing errors.\n')
        f.write('Converted prices.amountMin and prices.amountMax to numeric values.\n')
        f.write('Removed rows where both prices.amountMin and prices.amountMax were NaN.\n')
        f.write('Filled missing values in prices.amountMin with prices.amountMax.\n')
        f.write('Filled missing values in prices.amountMax with prices.amountMin.\n')
        f.write('Calculated a new column average_price based on the cleaned prices.\n')

except Exception as e:
    # Best-effort: the failure is recorded in clean.txt instead of stopping the notebook.
    # NOTE(review): when this branch runs, my_clean_shoes.csv is not (re)written and
    # old_df_clean may be undefined, yet the cell still finishes without an error.
    with open('clean.txt', 'w', encoding='utf-8') as f:
        f.write(f'An error occurred while processing the file: {e}\n')
