In [58]:
import pandas as pd
import numpy as np

# Load the raw sales CSV and apply the structural clean-up in one pass:
#   * lowercase every column name so later code can use one spelling,
#   * parse `orderdate` into real datetimes,
#   * fill the address fields that may be missing with explicit placeholders,
#   * drop `addressline2`, which is mostly empty.
df = (
    pd.read_csv("sales_data_sample.csv", encoding="ISO-8859-1")
    .rename(columns=str.lower)
    .assign(orderdate=lambda frame: pd.to_datetime(frame["orderdate"]))
    .fillna({"state": "Unknown", "postalcode": "00000", "territory": "Not Assigned"})
    .drop(columns=["addressline2"])
)

# Remove exact duplicate rows and report how many were dropped.
before_dup = len(df)
df = df.drop_duplicates()
after_dup = len(df)
print(f"Removed {before_dup - after_dup} duplicate rows.")

# Handling Outliers for QUANTITYORDERED, SALES, and MSRP
def cap_outliers(series, factor=1.5):
    """Clip values lying outside the IQR fences (winsorise).

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``, where
    ``Q1``/``Q3`` are the 25th/75th percentiles of ``series`` and
    ``IQR = Q3 - Q1``. Values beyond a fence are replaced by that fence;
    everything else is left untouched.

    Parameters
    ----------
    series : pandas.Series
        Numeric data to cap. It is not modified.
    factor : float, default 1.5
        Width of the fences in IQR units. The default reproduces the
        conventional 1.5 * IQR rule.

    Returns
    -------
    pandas.Series
        A new series with the same index as ``series``.

    Raises
    ------
    ValueError
        If ``factor`` is negative (the lower fence would exceed the upper).
    """
    if factor < 0:
        raise ValueError("factor must be non-negative")

    q1 = series.quantile(0.25)
    q3 = series.quantile(0.75)
    spread = factor * (q3 - q1)
    return series.clip(lower=q1 - spread, upper=q3 + spread)

# Coerce the measure columns to numeric BEFORE capping. `cap_outliers` relies
# on quantiles, which need numeric data, so coercing afterwards could never
# rescue a column that was read as text. Unparseable entries become NaN.
for column in ("quantityordered", "priceeach", "sales", "msrp"):
    df[column] = pd.to_numeric(df[column], errors="coerce")

# Cap outliers for QUANTITYORDERED, SALES, and MSRP using the IQR fences.
for column in ("quantityordered", "sales", "msrp"):
    df[column] = cap_outliers(df[column])

# Standardizing categorical data
df['state'] = df['state'].str.upper()
df['territory'] = df['territory'].str.upper()
df['postalcode'] = df['postalcode'].astype(str)

# Display cleaned data info
df.info()
print("Data Cleaning Completed Successfully!")



Removed 0 duplicate rows.
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2823 entries, 0 to 2822
Data columns (total 24 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   ordernumber       2823 non-null   int64         
 1   quantityordered   2823 non-null   int64         
 2   priceeach         2823 non-null   float64       
 3   orderlinenumber   2823 non-null   int64         
 4   sales             2823 non-null   float64       
 5   orderdate         2823 non-null   datetime64[ns]
 6   status            2823 non-null   object        
 7   qtr_id            2823 non-null   int64         
 8   month_id          2823 non-null   int64         
 9   year_id           2823 non-null   int64         
 10  productline       2823 non-null   object        
 11  msrp              2823 non-null   int64         
 12  productcode       2823 non-null   object        
 13  customername      2823 non-null   object        
 14

In [56]:
import pandas as pd
import numpy as np
from scipy import stats

# Read the sales CSV and lowercase its column names in one step.
# NOTE(review): this rebinds the global `df` to freshly loaded data, so any
# cleaning applied to `df` by an earlier cell is not reflected from here on -
# confirm that summarising the raw values is intended.
df = pd.read_csv("sales_data_sample.csv", encoding="ISO-8859-1").rename(columns=str.lower)

# Columns offered in the statistics menu, in display order.
numeric_columns = [
    "ordernumber",
    "quantityordered",
    "priceeach",
    "orderlinenumber",
    "sales",
    "qtr_id",
    "month_id",
    "year_id",
    "msrp",
]

# Menu key ("1", "2", ...) -> column name, plus a final entry for leaving the menu.
column_mapping = {
    **dict(zip(map(str, range(1, len(numeric_columns) + 1)), numeric_columns)),
    "10": "exit",
}

def show_stats(data=None, columns=None):
    """Interactively print summary statistics for a user-selected column.

    Shows a numbered menu, reads the selection with ``input()``, prints the
    mean, median, mode, standard deviation, skewness and kurtosis of the
    chosen column, then asks whether to analyse another one. The loop ends
    when the user picks the exit entry or answers "no"/"n".

    Parameters
    ----------
    data : pandas.DataFrame, optional
        Frame to analyse. Defaults to the module-level ``df``.
    columns : sequence of str, optional
        Column names to offer. When given, the menu is numbered from 1 and an
        exit entry is appended. Defaults to the module-level
        ``column_mapping`` (whose ``"exit"`` value marks the exit entry).

    Returns
    -------
    None
    """
    frame = df if data is None else data

    # Build the menu and remember which key(s) mean "exit", instead of
    # hard-coding the key so the menu and the exit check cannot drift apart.
    if columns is None:
        menu = dict(column_mapping)
        exit_keys = {key for key, label in menu.items() if label == "exit"}
    else:
        menu = {str(number): name for number, name in enumerate(columns, start=1)}
        exit_key = str(len(menu) + 1)
        menu[exit_key] = "exit"
        exit_keys = {exit_key}

    while True:
        # Display column options
        print("\nSelect a column for statistical analysis:")
        for key, value in menu.items():
            print(f"{key}. {value}")

        # User input for column selection
        choice = input("Enter the number corresponding to the column: ").strip()

        if choice in exit_keys:  # Exit condition
            print("Exiting program. Goodbye!")
            break

        if choice not in menu:
            print("Invalid input! Please enter a valid number.")
            continue

        column_name = menu[choice]
        values = frame[column_name]

        # mode() returns an empty Series when the column has no non-null
        # values; indexing it would raise, so report "N/A" instead.
        modes = values.mode()
        mode_text = f"{modes.iloc[0]:.2f}" if not modes.empty else "N/A"

        # Compute statistics
        print(f"\nSummary Statistics for '{column_name}':")
        print(f"Mean: {values.mean():.2f}")
        print(f"Median: {values.median():.2f}")
        print(f"Mode: {mode_text}")
        print(f"Standard Deviation: {values.std():.2f}")
        print(f"Skewness: {values.skew():.2f}")
        print(f"Kurtosis: {values.kurtosis():.2f}")

        # Ask user if they want to continue
        while True:
            more_data = input("\nDo you want to analyze another column? (yes/no): ").strip().lower()
            if more_data in ["yes", "y"]:
                break  # Continue to next iteration of main loop
            elif more_data in ["no", "n"]:
                print("Exiting program. Goodbye!")
                return  # Exit function
            else:
                print("Invalid input! Please enter 'yes' (or 'y') or 'no' (or 'n').")

# Start the interactive statistics menu; this blocks on input() until the
# user chooses the exit entry or answers "no".
show_stats()



Select a column for statistical analysis:
1. ordernumber
2. quantityordered
3. priceeach
4. orderlinenumber
5. sales
6. qtr_id
7. month_id
8. year_id
9. msrp
10. exit


Enter the number corresponding to the column:  4



Summary Statistics for 'orderlinenumber':
Mean: 6.47
Median: 6.00
Mode: 1.00
Standard Deviation: 4.23
Skewness: 0.59
Kurtosis: -0.56



Do you want to analyze another column? (yes/no):  y



Select a column for statistical analysis:
1. ordernumber
2. quantityordered
3. priceeach
4. orderlinenumber
5. sales
6. qtr_id
7. month_id
8. year_id
9. msrp
10. exit


Enter the number corresponding to the column:  5



Summary Statistics for 'sales':
Mean: 3553.89
Median: 3184.80
Mode: 3003.00
Standard Deviation: 1841.87
Skewness: 1.16
Kurtosis: 1.79



Do you want to analyze another column? (yes/no):  n


Exiting program. Goodbye!


In [72]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Assuming df is your DataFrame
def plot_graph(graph_type, column_x=None, column_y=None):
    """Draw one chart from the module-level ``df`` and show it.

    Parameters
    ----------
    graph_type : str
        One of ``"line"``, ``"area"``, ``"box"``, ``"pie"`` or ``"heatmap"``.
    column_x : str, optional
        Column for the x axis (``line``/``area``/``box``) or the column whose
        value counts form the slices (``pie``). Unused for ``heatmap``.
    column_y : str, optional
        Column for the y axis (``line``/``area``/``box``). Unused otherwise.

    Raises
    ------
    ValueError
        If ``graph_type`` is not supported, or a column required by the
        chosen chart was not supplied.
    """
    xy_charts = {"line", "area", "box"}
    supported = xy_charts | {"pie", "heatmap"}

    # Validate before creating the figure so a bad call does not leave an
    # empty, titled chart behind.
    if graph_type not in supported:
        raise ValueError(
            f"Unsupported graph_type {graph_type!r}; expected one of {sorted(supported)}"
        )
    if graph_type in xy_charts and (column_x is None or column_y is None):
        raise ValueError(f"'{graph_type}' chart requires both column_x and column_y")
    if graph_type == "pie" and column_x is None:
        raise ValueError("'pie' chart requires column_x")

    plt.figure(figsize=(10, 5))

    if graph_type == "line":
        sns.lineplot(x=df[column_x], y=df[column_y])
    elif graph_type == "area":
        # fill_between joins points in the order given, so raw rows (many per
        # x value, not ordered by x) would produce a self-overlapping shape.
        # Collapse to one value per x, sorted by x. The mean is used to match
        # the series seaborn's lineplot draws by default.
        # NOTE(review): assumes df[column_x] sorts meaningfully (e.g. real
        # datetimes); date strings would be ordered lexicographically.
        per_x = df.groupby(column_x)[column_y].mean()
        plt.fill_between(per_x.index, per_x.to_numpy(), alpha=0.5)
        plt.xlabel(column_x)
        plt.ylabel(column_y)
    elif graph_type == "box":
        sns.boxplot(x=df[column_x], y=df[column_y])
    elif graph_type == "pie":
        df[column_x].value_counts().plot.pie(autopct="%1.1f%%")
    elif graph_type == "heatmap":
        numeric_df = df.select_dtypes(include=['number'])  # Select only numeric columns
        sns.heatmap(numeric_df.corr(), annot=True, cmap="coolwarm", fmt=".2f")

    plt.title(f"{graph_type.capitalize()} Chart")
    plt.xticks(rotation=45)
    plt.show()

def _run_graph_menu(options):
    """Show one "Select a graph type" sub-menu and plot the chosen chart.

    ``options`` is a list of ``(label, plot_graph_args)`` pairs. The entries
    are numbered from 1 and a final "Back" entry is appended. A valid choice
    calls ``plot_graph(*plot_graph_args)``; "Back" returns silently; anything
    else prints "Invalid choice!". In every case control returns to the caller
    after a single prompt.
    """
    actions = {str(number): args for number, (_, args) in enumerate(options, start=1)}
    back_key = str(len(options) + 1)

    print("Select a graph type:")
    for number, (label, _) in enumerate(options, start=1):
        print(f"{number}. {label}")
    print(f"{back_key}. Back")

    graph_choice = input(f"Enter your choice (1-{back_key}): ").strip()
    if graph_choice == back_key:
        return
    if graph_choice in actions:
        plot_graph(*actions[graph_choice])
    else:
        print("Invalid choice!")


def analysis_chatbot():
    """Run the interactive analysis menu until the user chooses Exit.

    Each pass prints the list of analysis types and reads a choice with
    ``input()``. Choices 1-3 open a sub-menu of chart types for that
    analysis, 4 draws the correlation heatmap directly, 5 leaves the loop,
    and any other input prints an error and re-prompts. Surrounding
    whitespace in the typed answers is ignored.
    """
    # Top-level choice -> chart options offered for that analysis.
    graph_menus = {
        "1": [
            ("Line Chart", ("line", "orderdate", "sales")),
            ("Area Chart", ("area", "orderdate", "sales")),
        ],
        "2": [("Box Plot", ("box", "month_id", "sales"))],
        "3": [("Pie Chart", ("pie", "productline"))],
    }

    while True:
        print("Select an analysis type:")
        print("1. Sales Trends (Time-based)")
        print("2. Seasonal Analysis")
        print("3. Top Performing Products")
        print("4. Correlation Analysis")
        print("5. Exit")
        # Strip so that answers such as " 5" are accepted, matching how the
        # statistics menu treats its input.
        choice = input("Enter your choice (1-5): ").strip()

        if choice in graph_menus:
            _run_graph_menu(graph_menus[choice])
        elif choice == "4":
            print("Generating Heatmap for Correlation Analysis...")
            plot_graph("heatmap")
        elif choice == "5":
            print("Exiting...")
            break
        else:
            print("Invalid choice. Please select a valid option.")

# Start the interactive charting menu; this blocks on input() until the user
# selects "5" (Exit).
analysis_chatbot()


Select an analysis type:
1. Sales Trends (Time-based)
2. Seasonal Analysis
3. Top Performing Products
4. Correlation Analysis
5. Exit


Enter your choice (1-5):  5


Exiting...
