In [None]:
!pip install seaborn

In [None]:
import pandas as pd

# Location of the raw sales-packaging export read by this notebook.
RAW_CSV_PATH = '/Users/keshavsaraogi/Desktop/indorama/eureka-data/raw-csv/sales_packaging.csv'

# Load the raw CSV into the working DataFrame used by the cells below.
df = pd.read_csv(RAW_CSV_PATH)

In [None]:
# df.head(15)

In [None]:
# Print a concise summary of the DataFrame (columns, non-null counts, dtypes).
df.info()

In [None]:
# Show descriptive statistics; by default describe() summarises the numeric columns.
df.describe()

In [None]:
# Share of null entries per column, expressed as a percentage and
# ordered from the most incomplete column to the least.
missing_percent = (
    df.isnull()
    .mean()
    .mul(100)
    .sort_values(ascending=False)
)

print(missing_percent.round(2))

In [None]:
import pandas as pd

def _count_hashes(col):
    """Return the total number of '#' characters across all values of one column."""
    return col.astype(str).str.count('#').sum()


# Null entries per column, keeping only columns that have at least one.
null_totals = df.isnull().sum()
nan_counts = null_totals[null_totals > 0]

# '#' characters per object-dtype column, keeping only columns that contain any.
text_columns = df.select_dtypes(include='object')
hash_totals = text_columns.apply(_count_hashes)
hash_counts = hash_totals[hash_totals > 0]

# Align both counts on the column name; a column missing from one side gets 0.
combined = pd.DataFrame({"NaN_Count": nan_counts, "#_Count": hash_counts})
combined = combined.fillna(0).astype(int)

# Rank columns by the combined number of NaN entries and '#' characters.
combined = combined.assign(Total_Issues=combined["NaN_Count"] + combined["#_Count"])
combined = combined.sort_values("Total_Issues", ascending=False)

print("📊 Columns with NaN or '#' values and their counts:")
print(combined)


In [None]:
# Divider printed after each column's report.
separator = "-" * 80

# For every column: its name, the number of distinct values (NaN counted
# as a value), and the distinct values themselves.
for col in df.columns:
    column_values = df[col]
    distinct_total = column_values.nunique(dropna=False)
    print(f"📌 Column: {col}")
    print(f"🔢 Unique Values ({distinct_total}):")
    print(column_values.unique())
    print(separator)

In [None]:
# Show the column labels as a plain Python list.
df.columns.to_list()

## Handling Missing Values

In [None]:
# Show the column labels again as a reference for the rename mapping in the next cell.
df.columns.to_list()

In [None]:
# Mapping from the raw CSV headers to the cleaned column names.
# Several raw headers carry an embedded newline ("\n") before the unit suffix.
COLUMN_RENAMES = {
    "Company Code": "Company ID",
    "Sales Organization": "Sales Organization ID",
    "Plant": "Plant ID",
    "Material": "Material ID",
    "Incoterms 1": "Incoterms Type",
    "Tax amount": "Tax Amount",
    "Sales Invoice Price\n(USD/MT)": "Sales Invoice Price(USD/MT)",
    "Marine Insurance\n(USD/MT)": "Marine Insurance(USD/MT)",
    "Freight Charge road(USD/MT)": "Freight Charge Road(USD/MT)",
    "FOBBING Charge sea O/B(USD/MT)": "Fobbing Charge Sea O/B(USD/MT)",
    "Destination Charge sea(USD/MT)": "Destination Charge Sea(USD/MT)",
    "Freight charge Air(USD/MT)": "Freight Charge Air(USD/MT)",
    "Credit Insurance Cost\n(USD/MT)": "Credit Insurance Cost(USD/MT)",
    "Interest Cost (CC)\n(USD/MT)": "Interest Cost (CC)(USD/MT)",
    "Power Fuel Utilities\n(USD/MT)": "Power Fuel Utilities(USD/MT)",
    "Packing Cost\n(USD/MT)": "Packing Cost(USD/MT)",
    "MB Cost\n(USD/MT)": "MB Cost(USD/MT)",
}

# rename() silently skips keys that are not present, so report every entry
# for which neither the old nor the new header exists (likely a typo or a
# changed export). Checking the new name too keeps the cell quiet on re-run.
unmatched_headers = [
    old_name
    for old_name, new_name in COLUMN_RENAMES.items()
    if old_name not in df.columns and new_name not in df.columns
]
if unmatched_headers:
    print(f"⚠️ Rename skipped, header(s) not found: {unmatched_headers}")

# Rebind the result instead of using inplace=True: same outcome, but the
# assignment is explicit and safe to re-run.
df = df.rename(columns=COLUMN_RENAMES)

In [None]:
# Parse the invoice dates without an explicit format; values that cannot be
# parsed become NaT because of errors='coerce'.
parsed_invoice_dates = pd.to_datetime(df['Sales Invoice Date'], errors='coerce')

# Store the dates back as 'YYYY-MM-DD' text (e.g. for export or SQL).
df['Sales Invoice Date'] = parsed_invoice_dates.dt.strftime('%Y-%m-%d')


In [None]:
# Destination file for the cleaned dataset.
output_path = "/Users/keshavsaraogi/Desktop/indorama/eureka-data/clean-csv/cleaned_sales_packaging.csv"
# Write the frame without its index; "utf-8-sig" emits UTF-8 with a leading BOM.
df.to_csv(output_path, index=False, encoding="utf-8-sig")

print(f"✅ DataFrame saved to: {output_path}")

In [None]:
# Re-parse 'Sales Invoice Date' as datetime so the .dt accessor works
# (an earlier cell stores it as formatted text); unparseable values become NaT.
df['Sales Invoice Date'] = pd.to_datetime(df['Sales Invoice Date'], errors='coerce')

# Rows for the specific customer invoiced in 2024.
# .copy() makes the subset an independent frame, so adding a column below
# does not trigger pandas' SettingWithCopyWarning.
is_target_customer = df['Customer'] == 'SEVEN UP BOTTLING CO PLC'
is_target_year = df['Sales Invoice Date'].dt.year == 2024
df_filtered = df[is_target_customer & is_target_year].copy()

# 'month' column in YYYY-MM format
df_filtered['month'] = df_filtered['Sales Invoice Date'].dt.to_period('M').astype(str)

# "profit" here is the monthly sum of 'Invoice Net value' minus the monthly
# sum of 'Tax Amount'. Aggregating the two columns directly (rather than
# groupby().apply with a lambda) gives the same numbers and does not depend
# on how apply() shapes its result when the filter matches no rows.
monthly_totals = df_filtered.groupby('month')[['Invoice Net value', 'Tax Amount']].sum()
result = (
    (monthly_totals['Invoice Net value'] - monthly_totals['Tax Amount'])
    .reset_index(name='profit')
    .sort_values('month')
)

print(result)

# FORMULAS FOR SALES PACKAGING

## LIST OF FORMULAS I WANT:

1. Profit Margin
2. Profit Margin Ratio
3. Lower Bound
4. Upper Bound
5. Outliers
6. IQR
7. EBITDA
8. Quartile

In [None]:
import pandas as pd

# Per-metric-tonne cost components that are summed into the unit cost.
# NOTE(review): 'Freight Charge Sea(USD/MT)' is not produced by the rename
# mapping earlier in this notebook, so it must already exist under exactly
# this name in the CSV; otherwise the selection below raises KeyError — confirm.
cost_columns_per_mt = [
    'Marine Insurance(USD/MT)',
    'Freight Charge Road(USD/MT)',
    'Freight Charge Sea(USD/MT)',
    'Fobbing Charge Sea O/B(USD/MT)',
    'Destination Charge Sea(USD/MT)',
    'Freight Charge Air(USD/MT)',
    'Credit Insurance Cost(USD/MT)',
    'Interest Cost (CC)(USD/MT)',
    'Power Fuel Utilities(USD/MT)',
    'Packing Cost(USD/MT)',
    'MB Cost(USD/MT)'
]

# Total cost per MT: row-wise sum of the cost components.
df['Total Cost per MT'] = df[cost_columns_per_mt].sum(axis=1)

# Total cost for the entire invoiced quantity.
df['Total Cost'] = df['Quantity MT'] * df['Total Cost per MT']

# The export incentive (per MT, scaled by quantity) is subtracted from the cost.
df['Export Incentive Reduction'] = df['Quantity MT'] * df['Export Incentive(USD/MT)']
df['Adjusted Total Cost'] = df['Total Cost'] - df['Export Incentive Reduction']

# Profit (Revenue - Cost)
df['Profit'] = df['Invoice Net value'] - df['Adjusted Total Cost']

# Profit Margin Ratio.
# A zero 'Invoice Net value' would make the division return ±inf, which then
# corrupts the quantiles, means and plots computed from this column.
# where() turns zero revenue into NaN so those rows get a missing ratio instead.
nonzero_revenue = df['Invoice Net value'].where(df['Invoice Net value'] != 0)
df['Profit Margin Ratio (%)'] = (df['Profit'] / nonzero_revenue) * 100

print(df[['Invoice Net value', 'Profit', 'Profit Margin Ratio (%)']].head())


In [None]:
# Calculate IQR for Profit Margin Ratio (%)
margin = df['Profit Margin Ratio (%)']
q1 = margin.quantile(0.25)
q3 = margin.quantile(0.75)
iqr = q3 - q1

# Bounds placed 1.5 * IQR below Q1 and above Q3.
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr

print(f"Q1 (25th percentile): {q1:.2f}%")
print(f"Q3 (75th percentile): {q3:.2f}%")
print(f"IQR: {iqr:.2f}%")
print(f"Lower Bound: {lower_bound:.2f}%")
print(f"Upper Bound: {upper_bound:.2f}%")

# Identify outliers: values outside [lower_bound, upper_bound].
# between() is False for NaN, so a bare negation would flag every row with a
# missing margin as an outlier; require a value to be present as well.
df['Is Outlier'] = margin.notna() & ~margin.between(lower_bound, upper_bound)

# View number of outliers
outlier_count = df['Is Outlier'].sum()
print(f"Number of outliers: {outlier_count}")

# Optional: view few outlier rows
outliers_df = df[df['Is Outlier']]
print(outliers_df[['Invoice Net value', 'Profit', 'Profit Margin Ratio (%)']].head())


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Histogram (50 bins) of the profit margin ratio with a KDE curve overlaid.
margin_ax = sns.histplot(df['Profit Margin Ratio (%)'], bins=50, kde=True)
margin_ax.set_title("Profit Margin Distribution")
plt.show()

In [None]:
# Scatter of each row's invoice net value (x) against its profit margin ratio (y).
# NOTE(review): the title says "Volume" while x is 'Invoice Net value' — confirm which is intended.
scatter_ax = sns.scatterplot(data=df, x='Invoice Net value', y='Profit Margin Ratio (%)')
scatter_ax.set_title("Volume vs Margin")
plt.show()

In [None]:
# Bucket every row into its invoice month (needs 'Sales Invoice Date' to be
# datetime at this point, since the .dt accessor is used).
df['Month'] = df['Sales Invoice Date'].dt.to_period('M')

# Unweighted mean of the per-row margin ratios within each month.
monthly_margin = df['Profit Margin Ratio (%)'].groupby(df['Month']).mean()

monthly_margin.plot(title='Monthly Profit Margin Trend', marker='o', figsize=(10, 6))
plt.show()

In [None]:
# Cost components correlated against the margin.
# NOTE(review): 'Fobbing Charge Sea O/B(USD/MT)' and 'Destination Charge Sea(USD/MT)'
# are part of cost_columns_per_mt in the margin calculation but are absent
# here — confirm whether the omission is intentional.
cost_cols = [
    'Marine Insurance(USD/MT)',
    'Freight Charge Road(USD/MT)',
    'Freight Charge Sea(USD/MT)',
    'Freight Charge Air(USD/MT)',
    'Credit Insurance Cost(USD/MT)',
    'Interest Cost (CC)(USD/MT)',
    'Power Fuel Utilities(USD/MT)',
    'Packing Cost(USD/MT)',
    'MB Cost(USD/MT)',
]

# Pairwise correlations between the cost columns and the margin ratio.
margin_col = 'Profit Margin Ratio (%)'
corr = df[[*cost_cols, margin_col]].corr()

# Annotated heatmap of the correlation matrix.
sns.heatmap(data=corr, cmap='coolwarm', annot=True)
plt.title("Correlation Matrix: Costs vs Margin")
plt.show()