In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset into `df`; everything below depends on it.
try:
    df = pd.read_csv('dataset.csv')  # Replace with your actual dataset path
except FileNotFoundError:
    print("File not found. Please check the file path.")
    # Re-raise instead of continuing: without `df` the very next statement
    # would fail with an unrelated-looking NameError.
    raise
else:
    print("Dataset loaded successfully!")

# Display the first few rows to inspect the data
print(df.head())

# Explore the dataset structure (column dtypes and non-null counts)
print("\nDataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
df.info()

# Clean the dataset: drop every row that contains at least one missing value.
# (df.fillna() is the alternative when the rows should be kept instead.)
df = df.dropna()

# Task 2: Basic Data Analysis

# Summary statistics for the numerical columns
print("\nBasic Statistics of the dataset:")
summary_stats = df.describe()
print(summary_stats)

# Group rows by the categorical 'region' column and average 'sales' per group
print("\nAverage Sales per Region:")
sales_grouped_by_region = df.groupby('region')['sales']
mean_sales_by_region = sales_grouped_by_region.mean()
print(mean_sales_by_region)

# Task 3: Data Visualization

# Line chart: show the 'sales' trend over time (requires a 'date' column).
# Convert 'date' in place first: later plots reuse the converted column, and a
# parsing failure should happen before an empty figure is opened.
df['date'] = pd.to_datetime(df['date'])  # Ensure 'date' is in datetime format

# plt.plot joins points in row order, so the rows are put in chronological
# order for plotting; otherwise an unsorted file draws a line that doubles
# back on itself. A stable sort keeps same-date rows in their original order,
# and `df` itself is left in its original row order.
sales_by_date = df.sort_values('date', kind='mergesort')

plt.figure(figsize=(10, 6))
plt.plot(sales_by_date['date'], sales_by_date['sales'], marker='o', color='b')
plt.title('Sales Trend Over Time')
plt.xlabel('Date')
plt.ylabel('Sales')
plt.xticks(rotation=45)  # Slant the date labels so they do not overlap
plt.grid(True)
plt.show()

# Bar chart: compare the mean of 'sales' across the 'region' categories
bar_fig, bar_ax = plt.subplots(figsize=(8, 6))
region_means = df.groupby('region')['sales'].mean()
region_means.plot(kind='bar', color='skyblue', ax=bar_ax)
bar_ax.set_title('Average Sales per Region')
bar_ax.set_xlabel('Region')
bar_ax.set_ylabel('Average Sales')
plt.xticks(rotation=45)  # Slant the region labels
plt.show()

# Histogram: distribution of the 'sales' column, split into 20 bins
hist_fig, hist_ax = plt.subplots(figsize=(8, 6))
sales_values = df['sales']
sales_values.hist(bins=20, color='lightcoral', ax=hist_ax)
hist_ax.set_title('Distribution of Sales')
hist_ax.set_xlabel('Sales')
hist_ax.set_ylabel('Frequency')
plt.show()

# Scatter plot: each row's 'sales' value plotted against its 'date'
scatter_fig, scatter_ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(data=df, x='date', y='sales', color='red', ax=scatter_ax)
scatter_ax.set_title('Sales vs Date')
scatter_ax.set_xlabel('Date')
scatter_ax.set_ylabel('Sales')
plt.xticks(rotation=45)  # Slant the date labels
plt.show()
