# Python Data Analysis Tutorial for Engineers

## Goals
- Learn NumPy for numerical computing
- Learn Pandas for data handling
- Learn Matplotlib for visualization

---
## Setup
Install the required libraries:
```bash
pip install numpy pandas matplotlib
```
---

## 1. NumPy Basics
Key concepts:
- Arrays vs Lists
- Array creation (`np.array`, `np.arange`, `np.linspace`)
- Indexing, slicing
- Vectorized operations
- Aggregations (`mean`, `sum`, `std`)


In [None]:
# Benchmark: doubling every element of a Python list vs. a NumPy array.
# List:  slower for math-heavy operations (the loop is executed in Python).
# Array: much faster — the operation runs in optimized C code (vectorized).

import time
import numpy as np

# Prepare data
N_ELEMENTS = 10_000_000                # 10 million elements
lst = list(range(N_ELEMENTS))          # Python list
arr = np.array(lst)                    # NumPy array with the same elements

# Time the list comprehension (a Python-level loop over every element).
# time.perf_counter() is the clock intended for measuring short intervals;
# time.time() follows the wall clock, which can be adjusted while we measure.
start = time.perf_counter()
lst_result = [x * 2 for x in lst]
end = time.perf_counter()
print(f"List comprehension time: {end - start:.2f} seconds")

# Time the NumPy vectorized operation (a single expression, no Python loop).
start = time.perf_counter()
arr_result = arr * 2
end = time.perf_counter()
print(f"Numpy vectorized time: {end - start:.2f} seconds")

In [None]:
import numpy as np

# Two ways to build a 1-D array
a = np.arange(1, 6)                      # integers 1, 2, 3, 4, 5
b = np.linspace(start=0, stop=1, num=5)  # 0.0, 0.25, 0.5, 0.75, 1.0

In [None]:
import numpy as np

# Two 3-component vectors used throughout this cell
v1, v2 = np.array([1, 2, 3]), np.array([4, 5, 6])

for label, vec in (("v1:", v1), ("v2:", v2)):
    print(label, vec)

# Element-wise sum and difference
total = v1 + v2
difference = v1 - v2
print("v1 + v2 =", total)
print("v1 - v2 =", difference)

# Multiplying by a scalar scales every component
doubled = v1 * 2
print("2 * v1 =", doubled)

# Dot product: 1*4 + 2*5 + 3*6
dot_product = v1.dot(v2)
print("v1 · v2 =", dot_product)

# Norm (magnitude) of v1
magnitude = np.linalg.norm(v1)
print("‖v1‖ =", magnitude)

In [None]:
import numpy as np

# Two 3x3 integer matrices: A counts up from 1, B counts down from 9
A = np.arange(1, 10).reshape(3, 3)       # rows: [1 2 3], [4 5 6], [7 8 9]
B = np.arange(9, 0, -1).reshape(3, 3)    # rows: [9 8 7], [6 5 4], [3 2 1]

# Compute every result first, then report them in order
elementwise_sum = A + B
elementwise_product = A * B      # multiplies matching entries
matrix_product = A @ B           # row-by-column matrix multiplication
transposed = A.T                 # rows become columns

print("Matrix A:\n", A)
print("Matrix B:\n", B)

print("A + B:\n", elementwise_sum)
print("A * B (element-wise):\n", elementwise_product)

print("A @ B:\n", matrix_product)

print("A.T:\n", transposed)

In [None]:
import numpy as np  # imported here so this cell also runs on its own

# 3x2 matrix
C = np.array([[1, 2],
              [3, 4],
              [5, 6]])

# 2x3 matrix
D = np.array([[7, 8, 9],
              [10, 11, 12]])

print("Matrix C (3x2):\n", C)
print("Matrix D (2x3):\n", D)

# Matrix multiplication needs matching inner dimensions:
# (3x2) @ (2x3) → (3x3)
print("C @ D:\n", C @ D)

# Swapping the operands changes the result shape:
# (2x3) @ (3x2) → (2x2)
print("D @ C:\n", D @ C)

In [None]:
import numpy as np

# np.array: build an array from existing Python data
arr1 = np.array([1, 2, 3, 4])          # from a list → 1-D
arr2 = np.array([[1, 2], [3, 4]])      # from nested lists → 2-D

print(arr1)
print(arr2)   # show the 2-D result as well, not just the 1-D one

# np.arange: from 0 up to (but not including) 10, step 2
arr = np.arange(0, 10, 2)
print(arr)

# np.linspace: 5 evenly spaced numbers from 0 to 1, both endpoints included
arr = np.linspace(0, 1, 5)
print(arr)


In [None]:
import numpy as np

a = np.arange(1, 10)   # the integers 1 through 9

print("Array:", a)

# Each entry pairs the label to print with the value computed from `a`.
report = [
    # Basic statistics
    ("Mean:", a.mean()),                    # average
    ("Standard Deviation:", a.std()),       # spread of data
    ("Variance:", a.var()),                 # std^2
    # Min and Max
    ("Minimum:", a.min()),
    ("Maximum:", a.max()),
    # Percentiles (useful in engineering & data science)
    ("25th percentile (Q1):", np.percentile(a, 25)),
    ("50th percentile (Median):", np.median(a)),
    ("75th percentile (Q3):", np.percentile(a, 75)),
    # Sum and Product
    ("Sum:", a.sum()),
    ("Product:", a.prod()),
    # Cumulative sums and products
    ("Cumulative sum:", a.cumsum()),
    ("Cumulative product:", a.cumprod()),
]

for label, value in report:
    print(label, value)

### Exercise
- Create an array of the numbers 0–99 (hint: `np.arange(100)`)
- Reshape it into a 10×10 matrix (hint: `.reshape(10, 10)`)
- Compute the row sums (`axis=1`) and the column means (`axis=0`)

## 2. Pandas for Data Handling

In [None]:
import numpy as np
import pandas as pd

# Create DataFrame
data = {
    "Name": ["Alice", "Bob", "Charlie", "David"],
    "Age": [24, 27, 22, 32],
    "Salary": [50000, 54000, 58000, 62000],
    "Department": ["HR", "Engineering", "Engineering", "HR"]
}
df = pd.DataFrame(data)

print("=== DataFrame ===")
print(df)

# ----------------------------
# 1. Basic Exploration
# ----------------------------
print("\n=== First rows ===")
print(df.head())

print("\n=== Summary Statistics ===")
print(df.describe())   # numeric stats

print("\n=== Info ===")
# info() prints its report (data types + non-null counts) itself and returns
# None, so it is called directly; wrapping it in print() adds a stray "None".
df.info()

# ----------------------------
# 2. Column Selection
# ----------------------------
print("\n=== Select one column (Series) ===")
print(df["Name"])

print("\n=== Select multiple columns ===")
print(df[["Name", "Salary"]])

# ----------------------------
# 3. Filtering rows
# ----------------------------
print("\n=== Salary > 55,000 ===")
print(df[df["Salary"] > 55000])

# Combine conditions with & (and) / | (or); each condition needs parentheses.
print("\n=== Age between 25 and 30 ===")
print(df[(df["Age"] >= 25) & (df["Age"] <= 30)])

# ----------------------------
# 4. Creating new columns
# ----------------------------
df["Bonus"] = df["Salary"] * 0.1   # 10% of salary, computed for every row
print("\n=== With Bonus column ===")
print(df)

# ----------------------------
# 5. Sorting
# ----------------------------
# sort_values returns a new, sorted DataFrame; df itself is not reordered.
print("\n=== Sorted by Salary descending ===")
print(df.sort_values(by="Salary", ascending=False))

# ----------------------------
# 6. Grouping and Aggregation
# ----------------------------
print("\n=== Average Salary by Department ===")
print(df.groupby("Department")["Salary"].mean())

print("\n=== Count employees per Department ===")
print(df.groupby("Department")["Name"].count())

# ----------------------------
# 7. Handling Missing Data
# ----------------------------
df2 = df.copy()   # work on a copy so df keeps its complete data
# Salary is an integer column, and integers cannot hold NaN. Cast to float
# explicitly first: newer pandas versions deprecate the silent upcast that
# assigning a missing value into an integer column would otherwise trigger.
df2["Salary"] = df2["Salary"].astype("float64")
df2.loc[2, "Salary"] = np.nan   # introduce a missing value
print("\n=== DataFrame with missing value ===")
print(df2)

print("\n=== Fill missing Salary with average ===")
# mean() skips NaN, so the average is taken over the remaining salaries.
df2["Salary"] = df2["Salary"].fillna(df2["Salary"].mean())
print(df2)

# ----------------------------
# 8. Exporting
# ----------------------------
# index=False leaves the row index out of the file.
df.to_csv("employees.csv", index=False)
print("\nDataFrame exported to employees.csv")


### Exercise
- Load the Iris dataset:
```python
url = 'https://raw.githubusercontent.com/mwaskom/seaborn-data/master/iris.csv'
iris = pd.read_csv(url)
```
- Find the mean of each numeric column (the `species` column is text, so use `iris.mean(numeric_only=True)`)
- Filter the rows where `sepal_length > 5`

## 3. Visualization with Matplotlib

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Example data for sine plot: 100 evenly spaced x values from 0 to 10
x = np.linspace(0, 10, 100)
y = np.sin(x)

# Line plot; the label passed here is the text shown by plt.legend()
plt.plot(x, y, label="sin(x)")
plt.title("Sine Function")
plt.xlabel("x")
plt.ylabel("y")
plt.legend()
plt.show()

# Example DataFrame. "Department" is included so that the bar-chart exercise
# that follows (average salary per department) can be done with this frame,
# which replaces any earlier `df`.
data = {
    "Name": ["Alice", "Bob", "Charlie", "David"],
    "Age": [24, 27, 22, 32],
    "Salary": [50000, 54000, 58000, 62000],
    "Department": ["HR", "Engineering", "Engineering", "HR"]
}
df = pd.DataFrame(data)

# Scatter plot: one point per employee, Age on x and Salary on y
plt.scatter(df["Age"], df["Salary"])
plt.title("Age vs Salary")
plt.xlabel("Age")
plt.ylabel("Salary")
plt.show()


### Exercise
- Plot a bar chart of the average salary per department (group a DataFrame that has a `Department` column, such as the one built in Section 2)
- Plot a histogram of the ages