In [3]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt 
import pandas as pd
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [4]:
df = pd.read_csv(r"C:\Users\USER\Desktop\sales pred\Test.csv")
df.head()

Unnamed: 0,Product code,Product name,Unit name,Quantity,Rate,Amount,Tax,Gross amount,Month
0,4,180ML DISPOSABLE GLASSES,PIECE,4.1,3000,10169,1813,12000.0,October
1,7,300MLS DISPOSABLE GLASSES,PIECE,22.0,5000,93220,16780,110000.0,October
2,9,500MLS DISPOSABLE GLASSES,PIECE,1.0,6000,5085,915,6000.0,October
3,12,7 DAYS BABY WIPES,PIECE,2.0,8500,14407,2593,17000.0,October
4,15,7DAYS ANTIBACTERIAL BABY WIPES,PIECE,2.0,8500,14407,2593,17000.0,October


In [5]:
df.shape

(4988, 9)

In [6]:
df.columns

Index(['Product code', 'Product name ', 'Unit name', 'Quantity', 'Rate',
       'Amount', 'Tax', 'Gross amount', 'Month'],
      dtype='object')

In [7]:
df.info

<bound method DataFrame.info of       Product code                   Product name  Unit name  Quantity   Rate  \
0                4        180ML DISPOSABLE GLASSES     PIECE       4.1   3000   
1                7       300MLS DISPOSABLE GLASSES     PIECE      22.0   5000   
2                9       500MLS DISPOSABLE GLASSES     PIECE       1.0   6000   
3               12               7 DAYS BABY WIPES     PIECE       2.0   8500   
4               15  7DAYS ANTIBACTERIAL BABY WIPES     PIECE       2.0   8500   
...            ...                             ...       ...       ...    ...   
4983          8191            MICROPHONE TOY 18000     PIECE       1.0  17000   
4984          8192              WATER GUN TOY 6000     PIECE       1.0   6000   
4985          8193                  WATER GUN 8000     PIECE       2.0   8000   
4986          8200                    DIARIES 7OOO     PIECE       1.0   7000   
4987          8201                   DIARIES 17000     PIECE       1.0  17000

In [8]:
df.isnull().sum()

Product code     0
Product name     0
Unit name        0
Quantity         0
Rate             0
Amount           0
Tax              0
Gross amount     0
Month            0
dtype: int64

In [10]:


# --- Data Cleaning ---
# Clean column names
df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_')

# Standardize the month column (keep as string, just clean up formatting)
df['month'] = df['month'].astype(str).str.strip().str.title()

# Standardize product_code and product_name mapping
code_name_mapping = df.groupby('product_code')['product_name'].nunique()
inconsistent_codes = code_name_mapping[code_name_mapping > 1]
for code in inconsistent_codes.index:
    most_frequent_name = df[df['product_code'] == code]['product_name'].mode()[0]
    df.loc[df['product_code'] == code, 'product_name'] = most_frequent_name

# Trim spaces and standardize strings
df['product_name'] = df['product_name'].str.strip()
df['unit_name'] = df['unit_name'].str.strip().str.title()

# Recalculate tax (since amount and tax are present, but gross_amount is not)
# Note: Without gross_amount, we assume tax is correct as-is; no recalculation needed

# Cap outliers using IQR for rate and amount (skip tax, as per previous update)
for col in ['rate', 'amount']:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    df[col] = df[col].clip(lower=lower_bound, upper=upper_bound).astype(int)

# Save the cleaned dataset
df.to_csv('cleaned_test_dataset.csv', index=False)
print("Cleaned test dataset saved as 'cleaned_test_dataset.csv'")

# --- Data Assessment ---
print("\n### 1. Data Assessment")
print("\nShape of the dataset:", df.shape)
print("\nColumns in the dataset:", df.columns)
print("\nData types and info:")
df.info()
print("\nSummary statistics:")
print(df.describe())
print("\nMissing values in each column:")
print(df.isnull().sum())
print("\nNumber of duplicated rows:", df.duplicated().sum())

# Define numerical & categorical columns
numeric_features = [feature for feature in df.columns if df[feature].dtype != 'O']
categorical_features = [feature for feature in df.columns if df[feature].dtype == 'O']
print('\nWe have {} numerical features in test data: {}'.format(len(numeric_features), numeric_features))
print('We have {} categorical features in test data: {}'.format(len(categorical_features), categorical_features))

# Check unique values in categorical features
print('\nNumber of unique data points in categorical features in the test dataset:')
print('Number of unique data points in Product code:', df['product_code'].nunique(), df['product_code'].unique())
print('Number of unique data points in Product name:', df['product_name'].nunique(), df['product_name'].unique())
print('Number of unique data points in Unit name:', df['unit_name'].nunique(), df['unit_name'].unique())
print('Number of unique data points in Month:', df['month'].nunique(), df['month'].unique())

# --- Additional Cleaning Checks ---
# Verify one-to-one mapping between 'product_code' and 'product_name'
code_name_mapping = df.groupby('product_code')['product_name'].nunique()
inconsistent_codes = code_name_mapping[code_name_mapping > 1]
if not inconsistent_codes.empty:
    print("\nProduct codes with multiple names after fix:")
    print(inconsistent_codes)
else:
    print("\nAll product codes map to exactly one product name after fix.")

# Check for case inconsistencies or extra spaces in 'unit_name' and 'month'
print("\nUnique values in 'unit_name':", df['unit_name'].unique())
print("Unique values in 'month':", df['month'].unique())

# Validate numeric columns: Check for negative values
negative_rate = df[df['rate'] < 0]
if not negative_rate.empty:
    print("\nRows with negative rate:")
    print(negative_rate[['product_code', 'product_name', 'rate']])

# --- Product Performance Analysis ---
print("\n### Product Performance")
# Set plot style
sns.set(style="whitegrid")

# Group by product_name for performance analysis
product_performance = df.groupby('product_name').agg({
    'amount': 'sum',
    'tax': 'sum',
    'product_code': 'first'  # product_code for reference
}).reset_index()

# Sort by amount to find top-performing products
top_products = product_performance.sort_values(by='amount', ascending=False).head(10)
print("\nTop 10 Products by Total Amount:")
print(top_products[['product_name', 'product_code', 'amount']])

# --- Univariate EDA ---
print("\n### Univariate EDA")
# Distribution Plots for Numeric Columns
numeric_cols = ['rate', 'amount', 'tax']
plt.figure(figsize=(15, 5))
for i, col in enumerate(numeric_cols, 1):
    plt.subplot(1, 3, i)
    sns.histplot(df[col], kde=True)
    plt.title(f'Distribution of {col}')
    plt.xlabel(col)
    plt.ylabel('Count')
plt.tight_layout()
plt.savefig('test_distribution_plots.png')
plt.close()

# Count Plots for Categorical Columns
categorical_cols = ['unit_name', 'month']
plt.figure(figsize=(15, 5))
for i, col in enumerate(categorical_cols, 1):
    plt.subplot(1, 2, i)
    sns.countplot(data=df, x=col, order=df[col].value_counts().index)
    plt.title(f'Count Plot of {col}')
    plt.xlabel(col)
    plt.ylabel('Count')
    plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig('test_count_plots.png')
plt.close()

# --- Bivariate EDA ---
print("\n### Bivariate EDA")
# Scatter Plot: Rate vs. Amount (since quantity and gross_amount are not available)
plt.figure(figsize=(10, 6))
sns.scatterplot(data=product_performance, x='amount', y='tax', hue='amount', size='amount')
plt.title('Scatter Plot: Amount vs. Tax by Product')
plt.xlabel('Total Amount')
plt.ylabel('Total Tax')
plt.savefig('test_scatter_amount_vs_tax.png')
plt.close()

# --- Multivariate EDA ---
print("\n### Multivariate EDA")
# Correlation Matrix
plt.figure(figsize=(10, 8))
corr_matrix = df[numeric_cols].corr()
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1)
plt.title('Correlation Matrix of Numeric Columns')
plt.savefig('test_correlation_matrix.png')
plt.close()

# Variance Inflation Factor (VIF) for Numeric Columns
vif_data = df[numeric_cols].dropna()
vif_results = pd.DataFrame()
vif_results['Feature'] = numeric_cols
vif_results['VIF'] = [variance_inflation_factor(vif_data.values, i) for i in range(vif_data.shape[1])]
print("\nVariance Inflation Factor (VIF) for Numeric Columns:")
print(vif_results)

# --- Outlier Detection ---
print("\n### Outlier Detection")
# Box Plots for Numeric Columns
plt.figure(figsize=(15, 5))
for i, col in enumerate(numeric_cols, 1):
    plt.subplot(1, 3, i)
    sns.boxplot(y=df[col])
    plt.title(f'Box Plot of {col}')
plt.tight_layout()
plt.savefig('test_box_plots.png')
plt.close()

# IQR-Based Outlier Detection
outliers_summary = {}
for col in numeric_cols:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    outliers = df[(df[col] < lower_bound) | (df[col] > upper_bound)]
    outliers_summary[col] = {
        'num_outliers': len(outliers),
        'lower_bound': lower_bound,
        'upper_bound': upper_bound
    }
print("\nOutlier Summary (IQR Method):")
for col, info in outliers_summary.items():
    print(f"{col}: {info['num_outliers']} outliers (lower: {info['lower_bound']:.2f}, upper: {info['upper_bound']:.2f})")

# Visualize Outliers in Amount
gross_outliers = product_performance[
    (product_performance['amount'] < outliers_summary['amount']['lower_bound']) |
    (product_performance['amount'] > outliers_summary['amount']['upper_bound'])
]
plt.figure(figsize=(10, 6))
sns.scatterplot(data=product_performance, x='amount', y='tax', color='blue', label='Normal')
sns.scatterplot(data=gross_outliers, x='amount', y='tax', color='red', label='Outliers')
plt.title('Scatter Plot with Outliers in Amount')
plt.xlabel('Total Amount')
plt.ylabel('Total Tax')
plt.legend()
plt.savefig('test_outliers_amount.png')
plt.close()

# --- Save Top Products for Stocking Recommendations ---
top_products.to_csv('test_top_products.csv', index=False)
print("\nTop products saved to 'test_top_products.csv' for stocking recommendations.")

Cleaned test dataset saved as 'cleaned_test_dataset.csv'

### 1. Data Assessment

Shape of the dataset: (4987, 9)

Columns in the dataset: Index(['product_code', 'product_name', 'unit_name', 'quantity', 'rate',
       'amount', 'tax', 'gross_amount', 'month'],
      dtype='object')

Data types and info:
<class 'pandas.core.frame.DataFrame'>
Index: 4987 entries, 0 to 4987
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   product_code  4987 non-null   int64 
 1   product_name  4987 non-null   object
 2   unit_name     4987 non-null   object
 3   quantity      4987 non-null   int32 
 4   rate          4987 non-null   int32 
 5   amount        4987 non-null   int32 
 6   tax           4987 non-null   int32 
 7   gross_amount  4987 non-null   int32 
 8   month         4987 non-null   object
dtypes: int32(5), int64(1), object(3)
memory usage: 292.2+ KB

Summary statistics:
       product_code     quantity          rate   