In [1]:
# 1_data_analysis.ipynb
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Load the shipment records (path is relative to the current working directory).
data = pd.read_csv('../data/shipping_data.csv')

# Add calculated columns.
# A zero distance or weight would turn the ratio into +/-inf, and a single inf
# distorts every mean and plot derived from these columns. Masking the zero
# denominators yields NaN for those rows instead, which pandas aggregations
# skip by default.
data['cost_per_km'] = data['shipping_cost'] / data['distance_km'].mask(data['distance_km'] == 0)
data['cost_per_kg'] = data['shipping_cost'] / data['weight_kg'].mask(data['weight_kg'] == 0)

# Basic statistics for every numeric column, including the derived ones.
print("Basic Statistics:")
print(data.describe())

# Visualize shipping costs by carrier.
# sns.barplot aggregates each category with the mean by default, hence the
# "Average" in the title.
plt.figure(figsize=(10, 6))
sns.barplot(x='carrier', y='shipping_cost', data=data)
plt.title('Average Shipping Cost by Carrier')
plt.ylabel('Shipping Cost ($)')
# savefig does not create missing directories, so make sure the output folder
# exists before writing the image.
Path('../results').mkdir(parents=True, exist_ok=True)
plt.savefig('../results/carrier_costs.png')
plt.show()

# Visualize cost per km by route.
plt.figure(figsize=(12, 6))
# One "<origin> to <destination>" label per row; it shares data's index, so it
# lines up with the 'cost_per_km' column passed by name below.
routes = data['origin'] + ' to ' + data['destination']
sns.barplot(x=routes, y='cost_per_km', data=data)
plt.title('Cost per Kilometer by Route')
# The route vector has no name, so label the axes explicitly (mirrors the
# carrier plot, which also sets a readable y label).
plt.xlabel('Route')
plt.ylabel('Cost per km ($)')
plt.xticks(rotation=45)
plt.tight_layout()
# savefig does not create missing directories, so make sure the output folder
# exists before writing the image.
Path('../results').mkdir(parents=True, exist_ok=True)
plt.savefig('../results/route_costs.png')
plt.show()

# Rank routes by their mean cost per kilometre and show the five priciest.
print("\nMost Expensive Routes (by cost per km):")
# as_index=False keeps origin/destination as ordinary columns in the result.
route_costs = data.groupby(['origin', 'destination'], as_index=False)['cost_per_km'].mean()
print(
    route_costs
    .sort_values(by='cost_per_km', ascending=False)
    .head(5)
)

# Find the cheapest carrier for each route.
print("\nCheapest Carrier for Each Route:")
# Mean shipping cost for every (origin, destination, carrier) combination.
cheapest_carriers = data.groupby(['origin', 'destination', 'carrier'])['shipping_cost'].mean().reset_index()
# A combination with no recorded cost has a NaN mean; if every carrier on a
# route is NaN, idxmin has no valid label to return and the .loc lookup below
# would fail. Drop those rows so such routes are simply omitted.
cheapest_carriers = cheapest_carriers.dropna(subset=['shipping_cost'])
# idxmin returns, per route, the row label of the lowest mean cost; .loc then
# selects exactly those rows.
cheapest_carriers = cheapest_carriers.loc[cheapest_carriers.groupby(['origin', 'destination'])['shipping_cost'].idxmin()]
print(cheapest_carriers[['origin', 'destination', 'carrier', 'shipping_cost']])


ModuleNotFoundError: No module named 'pandas'