<a href="https://colab.research.google.com/github/JoremBlue/numpy-pandas-data-analysis/blob/main/numpy_pandas_data_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np

# --- 1-D array ---------------------------------------------------------
# Build a small one-dimensional array from a plain Python list.
data = np.array([1, 2, 3, 4, 5])
print("Original Array:", data)
# NumPy renders arrays without commas, so this prints:
# Original Array: [1 2 3 4 5]


# --- Reshaping ---------------------------------------------------------
# np.arange(1, 7) yields the six integers 1..6 (one more element than
# `data`, so they fill a 2x3 grid exactly); reshaping lays them out
# row by row as 2 rows x 3 columns:
# [[1 2 3]
#  [4 5 6]]
data_matrix = np.reshape(np.arange(1, 7), (2, 3))
print("2x3 Matrix:\n", data_matrix)


# --- Matrix multiplication ---------------------------------------------
# Multiply two 2x2 matrices. Each entry of the product is the dot product
# of a row of matrix_a with a column of matrix_b:
#   result[0][0] = 1*2 + 2*1 = 4
#   result[0][1] = 1*0 + 2*2 = 4
#   result[1][0] = 3*2 + 4*1 = 10
#   result[1][1] = 3*0 + 4*2 = 8
matrix_a = np.array([[1, 2], [3, 4]])
matrix_b = np.array([[2, 0], [1, 2]])
result = matrix_a @ matrix_b
print("Matrix Multiplication Result:\n", result)
# Expected output:
# Matrix Multiplication Result:
#  [[ 4  4]
#  [10  8]]


Original Array: [1 2 3 4 5]
2x3 Matrix:
 [[1 2 3]
 [4 5 6]]
Matrix Multiplication Result:
 [[ 4  4]
 [10  8]]


In [None]:
import pandas as pd
import os

# Path of the CSV this cell works on. It is used for both the existence
# check and the read below so the two can never drift apart.
dummy_data_csv_path = 'sample_data.csv'

# Make the cell self-contained: if no CSV is present, write a small demo
# dataset so the rest of the cell has something to load.
if not os.path.exists(dummy_data_csv_path):
    print(f"Creating a dummy '{dummy_data_csv_path}' file.")
    dummy_data = {'Category': ['A', 'B', 'A', 'C', 'B'],
                  'Sales': [100, 150, 110, 200, 160],
                  'Value': [10, 15, None, 20, 16]}  # None -> missing value for the fillna demo
    dummy_df_to_save = pd.DataFrame(dummy_data)
    dummy_df_to_save.to_csv(dummy_data_csv_path, index=False)

# Load the CSV into a DataFrame.
# A failed read is deliberately NOT caught: swallowing FileNotFoundError
# would leave `df` undefined (NameError on the next line) or, on a re-run,
# silently reuse a stale `df` from an earlier execution.
df = pd.read_csv(dummy_data_csv_path)

# Inspect the first few rows (missing values still show as NaN here).
print(df.head())

# Handle missing values.
# Alternatives are forward-fill (ffill, which depends on row order) or the
# column mean; a constant 0 keeps the example simple.
# Only 'Value' is filled, so gaps in other columns (e.g. 'Category') are not
# silently turned into 0. A new frame is assigned instead of using inplace=True.
if 'Value' in df.columns:
    df = df.fillna({'Value': 0})

# Grouping: average sales per category, guarded so a CSV with a different
# schema produces a readable message instead of a KeyError.
if 'Category' in df.columns and 'Sales' in df.columns:
    avg_sales = df.groupby('Category')['Sales'].mean()
    print("Average Sales by Category:\n", avg_sales)
else:
    print("Cannot compute average sales by category: Required columns ('Category' or 'Sales') not found.")

  Category  Sales  Value
0        A    100   10.0
1        B    150   15.0
2        A    110    NaN
3        C    200   20.0
4        B    160   16.0
Average Sales by Category:
 Category
A    105.0
B    155.0
C    200.0
Name: Sales, dtype: float64
