# Data Analysis & Visualization
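This notebook demonstrates loading, cleaning, analyzing, and visualizing a dataset using **pandas**, **matplotlib**, and **seaborn**. The dataset is a small synthetic table that Step 0 generates and saves as `data.csv`, so the results are illustrative only.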

In [None]:
# Step 0: Create a dummy dataset for demonstration
import pandas as pd

# Column-by-column definition of the demo table (10 rows, 4 columns).
# Each column contains two None entries, which become missing values in the
# resulting frame and give the cleaning step something to fill.
dummy_data = dict(
    col_A=[1, 2, None, 4, 5, 6, None, 8, 9, 10],
    col_B=[1.1, 2.2, 3.3, None, 5.5, 6.6, 7.7, 8.8, None, 10.10],
    col_C=['A', 'B', 'A', 'C', None, 'B', 'A', 'C', 'B', None],
    col_D=['X', None, 'Y', 'X', 'Z', 'Y', 'X', None, 'Z', 'Y'],
)

# Build the frame, then persist it without the index column so the file
# holds only the four data columns.
df_dummy = pd.DataFrame(dummy_data)
df_dummy.to_csv('data.csv', index=False)

print("Dummy 'data.csv' file created.")

In [None]:
# Step 1: Load the dataset with error handling
# Path is relative to the notebook's working directory.
file_path = 'data.csv'

try:
    df = pd.read_csv(file_path)
except FileNotFoundError:
    print(f"Error: The file '{file_path}' was not found.")
    # Re-raise: without a loaded frame `df` would be undefined (or left over
    # from an earlier run), so later cells must not be allowed to continue.
    raise
except Exception as e:
    print(f"An error occurred while loading the file: {e}")
    # Same reasoning as above: report the problem, then stop execution here.
    raise
else:
    # Only reached when read_csv returned without raising.
    print("Dataset loaded successfully.")

In [None]:
# Step 2: Explore and clean the dataset
# Inspect the frame first so the "before" state is visible in the output.
print("\nFirst 5 rows:")
display(df.head())

print("\nData types and info:")
df.info()  # writes its report straight to the output; nothing to display()

print("\nMissing values before handling:")
display(df.isnull().sum())

# Handle missing values
# Columns are split by dtype: numeric columns are filled with their mean,
# object/category columns with their mode (most frequent value).
numerical_cols = df.select_dtypes(include=['number']).columns
categorical_cols = df.select_dtypes(include=['object', 'category']).columns

for col in numerical_cols:
    df[col] = df[col].fillna(df[col].mean())

for col in categorical_cols:
    col_modes = df[col].mode()
    # mode() yields an empty Series when the column has no non-null values;
    # taking its first element would raise, so such a column is left as-is
    # and will still show up in the "after" count below.
    if col_modes.empty:
        continue
    # When several values tie, mode() returns all of them; the first is used.
    df[col] = df[col].fillna(col_modes.iloc[0])

print("\nMissing values after handling:")
display(df.isnull().sum())

In [None]:
# Step 3: Basic Data Analysis
# Summary table produced by describe() on the frame.
print("Descriptive statistics for numerical columns:")
display(df.describe())

# Group by a categorical column
# Key column to group on and value column to average; both names are reused
# for the headline printed below.
categorical_col, numerical_col = 'col_C', 'col_A'

# Group the rows by the key column, then average the value column per group.
by_category = df.groupby(categorical_col)
grouped_mean = by_category[numerical_col].mean()

print(f"\nMean of '{numerical_col}' grouped by '{categorical_col}':")
display(grouped_mean)

In [None]:
# Step 4: Data Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Apply the seaborn 'whitegrid' style before any figure is created.
sns.set_style('whitegrid')

# 1. Line Chart
# col_A against the row index, with a circle marker on every point.
line_fig, line_ax = plt.subplots(figsize=(10, 6))
line_ax.plot(df.index, df['col_A'], marker='o')
line_ax.set(
    title='Line Chart of col_A over Index',
    xlabel='Index',
    ylabel='col_A Value',
)
plt.show()

# 2. Bar Chart of grouped mean
# One bar per group in grouped_mean, drawn on an explicitly created Axes.
bar_fig, bar_ax = plt.subplots(figsize=(8, 5))
grouped_mean.plot(kind='bar', color=['skyblue', 'orange', 'green'], ax=bar_ax)
bar_ax.set(
    title=f'Mean of {numerical_col} grouped by {categorical_col}',
    xlabel=categorical_col,
    ylabel=f'Mean {numerical_col}',
)
# Keep the category labels horizontal.
bar_ax.tick_params(axis='x', labelrotation=0)
plt.show()

# 3. Histogram of col_B
# Five bins with a kernel-density curve overlaid (kde=True).
hist_fig, hist_ax = plt.subplots(figsize=(8, 5))
sns.histplot(df['col_B'], bins=5, kde=True, color='purple', ax=hist_ax)
hist_ax.set(
    title='Histogram of col_B',
    xlabel='col_B Value',
    ylabel='Frequency',
)
plt.show()

# 4. Scatter Plot of col_A vs col_B
# Points are coloured by col_C using the 'Set1' palette.
scatter_fig, scatter_ax = plt.subplots(figsize=(8, 5))
sns.scatterplot(
    data=df,
    x='col_A',
    y='col_B',
    hue='col_C',
    palette='Set1',
    s=100,
    ax=scatter_ax,
)
scatter_ax.set(
    title='Scatter Plot of col_A vs col_B',
    xlabel='col_A',
    ylabel='col_B',
)
scatter_ax.legend(title='col_C')
plt.show()

## Step 5: Summary of Findings
- Missing values were imputed: numerical columns with the column mean, categorical columns with the column mode (most frequent value).
- Descriptive statistics show the ranges and means of the numerical columns (`col_A`, `col_B`).
- The grouped analysis shows how the mean of `col_A` differs across the categories of `col_C`.
- The visualizations (line, bar, histogram, scatter) give insight into trends, distributions, and relationships in the data.