# NumPy library

NumPy is a library for the Python programming language that adds support for large, multi-dimensional arrays and matrices, along with a large collection of high-level mathematical functions to operate on these arrays.

Creating Arrays

In [None]:
# 📦 Importing Numpy Library
import numpy as np

In [None]:
# 🧱 Build a one-dimensional NumPy array and inspect it

# np.array turns a Python sequence into an ndarray
a = np.array([1, 2, 3])

# One value per output line: the array itself, its number of dimensions
# (a.ndim) and the data type of its elements (a.dtype)
print(a, a.ndim, a.dtype, sep="\n")

In [None]:
# 🧱 Build a two-dimensional NumPy array and inspect its properties

b = np.array([
    [1.1, 2.4, 3.3],
    [4.1, 5.2, 6.5],
])

# Printed one per line:
#   b        -> the array itself
#   b.ndim   -> number of dimensions (2 for a 2D array)
#   b.shape  -> (rows, columns)
#   b.dtype  -> data type of the elements
print(b, b.ndim, b.shape, b.dtype, sep="\n")


Array Indexing and Slicing

In [None]:
# 🧪 2D integer array: element access, slicing and in-place assignment

# Only the nested list is passed to np.array: its second positional
# parameter is `dtype`, so a stray "\n" there raises a TypeError.
c = np.array([[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]])
print(c, "\n")

# Single element: row index 1, column index 2
print(c[1, 2], "\n")

# ":" selects every element along that axis
print("Second row", c[1, :], "\n")

# Negative indices count from the end, so -2 is the second-to-last row
print("Second to Last Row", c[-2, :], "\n")

print("Last column", c[:, -1], "\n")

# Slice 1:3 covers column indices 1 and 2 (the stop index is excluded)
print("The middle 2 elements in row 2", c[1, 1:3], "\n")

# Overwrite the element at row 0, column 2 (currently 3) with 20
c[0, 2] = 20
print("After change", "\n", c)

Create Zeros and Ones And Identity Arrays

In [None]:
# ⚪ Arrays pre-filled with zeros, ones, a constant value, or the identity pattern

# np.zeros(shape) -> array filled with zeros; shape is a (rows, columns) tuple
a = np.zeros(shape=(2, 3))
print("a", "\n", a, "\n")

# np.ones(shape) -> array filled with ones
b = np.ones(shape=(2, 3))
print("b", "\n", b, "\n")

# np.full(shape, fill_value) -> every element equals fill_value (a string here)
c = np.full(shape=(2, 3), fill_value='Kiwilytics')
print("c", "\n", c, "\n")

# np.identity(n) -> square n x n array with ones on the main diagonal
s = np.identity(n=3)
print("s", "\n", s, "\n")

# Element data type of the zeros array
print(a.dtype)


Create Array with Random Numbers

In [None]:
# 🎲 Arrays of random numbers

# np.random.rand(rows, columns): uniform random floats in [0, 1)
d = np.random.rand(2, 3)
print(d, "\n")

# np.random.randint(low, high, size): random integers from low up to,
# but not including, high -- here the values 1..5
e = np.random.randint(low=1, high=6, size=(2, 3))
print(e)

Sync Arrays

In [None]:
# 🧪 Two names bound to the same array affect one another

x1 = np.array([[1, 2, 3], [4, 5, 6]])
print(x1, "\n")

# Plain assignment does not copy: x2 is a second name for the very same
# array object, so a write through x2 is visible through x1 as well
x2 = x1
x2[0, 1] = 10
print("x1", "\n", x1, "\n\n", "x2", "\n", x2)

In [None]:
# 🧪 Two independent arrays that do not affect one another

y1 = np.array([[1, 2, 3], [4, 5, 6]])
print(y1, "\n")

# copy() allocates a separate array, so writing to y2 leaves y1 untouched
y2 = y1.copy()
y2[0, 1] = 10
print("y1", "\n", y1, "\n\n", "y2", "\n", y2)

Arithmetic Operations

In [None]:
# 🧪 Arithmetic operations on arrays

arr = np.array([
    [1, 2, 3, 4],
    [5, 6, 7, 8],
])
print("arr", "\n", arr, "\n")

# A scalar operand is applied to every element of the array
print("arr+2", "\n", arr + 2, "\n")
print("arr-2", "\n", arr - 2, "\n")
print("arr*2", "\n", arr * 2, "\n")
print("arr/2", "\n", arr / 2, "\n")    # true division
print("arr**2", "\n", arr ** 2, "\n")  # element-wise square


Useful NumPy Functions for Data Work

In [None]:
# 🧪 Extend an array by repeating its elements

arr = np.array([
    [1, 2, 3, 4],
    [5, 6, 7, 8],
])
print("arr", "\n", arr, "\n")

# Without an axis, np.repeat flattens the array and repeats each element
arr1 = np.repeat(arr, repeats=2)
print("arr1", "\n", arr1, "\n")

# With an axis, the shape is kept and the repetition runs along that axis
arr2 = np.repeat(arr, repeats=2, axis=0)  # axis 0: every row is duplicated
print("arr2", "\n", arr2, "\n")

arr3 = np.repeat(arr, repeats=2, axis=1)  # axis 1: every column is duplicated
print("arr3", "\n", arr3, "\n")

In [None]:
# 🧪 Build a bigger array out of smaller ones by stacking

a = np.array([1, 2, 3, 4])
print("a", a)
b = np.array([5, 6, 7, 8])
print("b", b)

# np.vstack places the inputs on top of each other, in the order given
print("vertical", "\n", np.vstack((a, b, a, b)), "\n")

# np.hstack places the inputs side by side, in the order given
print("horizontal", np.hstack((b, a, b, a)))

Concatenate Arrays

In [None]:
# 🧪 Build a bigger array by concatenating smaller ones

k = np.array([
    [1, 2, 3, 4],
    [5, 6, 7, 8],
])
print("K", k, '\n')

k1 = np.array([
    [9, 10, 11, 12],
    [13, 14, 15, 16],
])
print("K1", k1, '\n')

print("Concatenate")
# axis=0 joins along rows (k1 goes underneath k), like vertical stacking
print(np.concatenate((k, k1), axis=0), "\n")
# axis=1 joins along columns (k1 goes beside k), like horizontal stacking
print(np.concatenate((k, k1), axis=1), "\n")

# The stacking helpers give the same two results, for comparison
print(np.vstack((k, k1)), "\n")
print(np.hstack((k, k1)), "\n")

Array Manipulation (Generate, Reshape and Transpose Arrays)

In [None]:
# np.arange(start, stop, step): values from start up to, but excluding,
# stop, spaced by step
x = np.arange(1, 11, 2)
print(x, "\n")

# np.linspace(start, stop, num): num evenly spaced samples
y = np.linspace(start=1, stop=100, num=11)
print(y, "\n")

arr = np.array([
    [1, 2, 3, 4, 5, 6, 7],
    [8, 9, 10, 11, 12, 13, 14],
])
print("arr", "\n", arr, "\n")

# reshape must keep the element count: 2*7 = 14 before, 7*2 = 14 after
print(arr.reshape((7, 2)))

# np.transpose swaps the axes: rows become columns and columns become rows
print("transposed", "\n", np.transpose(arr))

# arr.flat[:] gives every element as a one-dimensional array
print("flattened", "\n", arr.flat[:])


Append Array to Another

In [None]:
# Append values to an existing array (np.append returns a new array)

u = np.array([
    [1, 2, 3, 4],
    [5, 6, 7, 8],
])
print("u", "\n", u, "\n")

# axis=0: the new row goes underneath the existing rows
u1 = np.append(arr=u, values=[[9, 10, 11, 12]], axis=0)
print("u1", "\n", u1, "\n")

# axis=1: one extra value per row is added as a new last column
u2 = np.append(arr=u, values=[[5], [9]], axis=1)
print("u2", "\n", u2, "\n")

Insert and Delete from Arrays

In [None]:
# Insert into and delete from an array (both functions return new arrays)

r = np.array([
    [1, 2, 3, 4],
    [5, 6, 7, 8],
])
print("r", "\n", r, "\n")

# Insert a row before row index 2, i.e. after the two existing rows
r1 = np.insert(r, obj=2, values=[[11, 12, 13, 14]], axis=0)
print("r1", "\n", r1, "\n")

# Insert a column before column index 3 (one value per row)
r2 = np.insert(r, obj=3, values=[10, 11], axis=1)
print("r2", "\n", r2, "\n")

# Remove the column at index 1
r3 = np.delete(r, obj=1, axis=1)
print("r3", "\n", r3, "\n")


Select Array Subset

In [None]:
# 5 x 5 array holding 1..25, used to practise selecting a sub-block
a = np.array([
    [1, 2, 3, 4, 5],
    [6, 7, 8, 9, 10],
    [11, 12, 13, 14, 15],
    [16, 17, 18, 19, 20],
    [21, 22, 23, 24, 25],
])
print(a)

In [None]:
# Pick out only 11, 12, 16, 17: row indices 2-3 and the first two columns
# (slice stop indices are excluded)
b = a[2:4, :2]
print(b)


# 🐼 Pandas Essentials: Practical Guide

### Pandas Series and DataFrames Basics

In [None]:
import numpy as np
import pandas as pd

# Series: monthly revenue figures; the np.nan entry is a missing month
monthly_revenue = pd.Series(
    data=[30000, 50000, 45000, np.nan, 60000],
    name="Revenue",
)

# DataFrame: employees at different firms, built from a dict that maps
# each column name to the list of that column's values
data = dict(
    Employee=['Youssef', 'Toka', 'Abdelrahman', 'Maher'],
    Company=['Kiwilytics', 'PwC', 'EY', 'KPMG'],
    Experience_Years=[5, 7, 3, 9],
)

df = pd.DataFrame(data)

print("🔹 DataFrame Example:\n", df)

### 🔍 Exploring and Inspecting Your DataFrame

In [None]:
# Top of the frame: head() returns the first 5 rows by default
print("🔹 First 5 Rows:", df.head(), sep="\n")

# Bottom of the frame: tail() returns the last 5 rows by default
print("\n🔹 Last 5 Rows:", df.tail(), sep="\n")



In [None]:
# Structural summary; info() writes its report straight to the output
print()
print("📋 DataFrame Info:")
df.info()

# Descriptive statistics; as the last expression of the cell, the result
# is rendered by the notebook itself
print()
print("📊 Summary Statistics:")
df.describe()


In [None]:
# Number of missing values in each column
print("\n🚨 Missing Values Per Column:", df.isna().sum(), sep="\n")

# Boolean frame of the same shape: True wherever a value is missing
print("\n🧼 Matrix of Missing Values:", df.isna(), sep="\n")


### Simulate a missing value

In [None]:
# Simulate a missing experience value

# An integer column cannot hold NaN, so cast it to float explicitly first
# instead of relying on implicit upcasting during the assignment (recent
# pandas versions deprecate that implicit upcast).
df['Experience_Years'] = df['Experience_Years'].astype('float64')
df.loc[1, 'Experience_Years'] = np.nan
print(df, "\n")

# Re-run missing value check
print("\n🚨 After introducing missing data:")
print(df.isna().sum())

# Drop rows with missing data. dropna() returns a new frame; df itself
# still contains the row with the missing value.
print("\n🧹 After dropping missing rows:")
print(df.dropna())


In [None]:
# Rebuild the employee DataFrame from scratch so it is back in its clean state

data = dict(
    Employee=['Youssef', 'Toka', 'Abdelrahman', 'Maher'],
    Company=['Kiwilytics', 'PwC', 'EY', 'KPMG'],
    Experience_Years=[5, 7, 3, 9],
)

df = pd.DataFrame(data)

# Optional preview: label and frame on consecutive lines
print("✅ DataFrame Reset:", df, sep="\n")

### Selecting and Filtering DataFrames

In [None]:
# Selecting columns
# A single label returns a Series; a list of labels returns a DataFrame.
print("🔸 Company Column:\n", df['Company'], "\n")
print("🔸 Name & Experience:\n", df[['Employee', 'Experience_Years']], "\n")


# Selecting rows
# loc looks a row up by its index label, iloc by its integer position.
print("🔸 Row by label (loc):\n", df.loc[2], "\n")
print("🔸 Row by position (iloc):\n", df.iloc[2], "\n")

# Filtering: Employees with more than 5 years
# The comparison yields a boolean Series that keeps only the matching rows.
print("🔸 Experienced Employees:\n", df[df['Experience_Years'] > 5], "\n")

# Filter: Working at Kiwilytics
print("🔸 Kiwilytics Team:\n", df[df['Company'] == 'Kiwilytics'], "\n")

### Modifying DataFrames and Adding Columns

In [None]:
# Upper-case every company name; the Company column is overwritten in place
df["Company"] = df["Company"].str.upper()
print("🔹 Standardized Company Names:\n", df, "\n")

# New boolean column: True for employees with at least 8 years of experience
df["Is_Very_Senior"] = df["Experience_Years"] >= 8
print("🔹 With Seniority Flag:\n", df, "\n")

### Handling Missing Data

In [None]:
# Project budgets per client; both np.nan and None mark a missing budget
df_projects = pd.DataFrame(dict(
    Client=['Kiwilytics', 'PwC', 'EY', 'KPMG', 'McKinsey'],
    Budget_kUSD=[120, 200, np.nan, 150, None],
))
print("🔸 Original Budgets:\n", df_projects, "\n")

# mean() skips missing values, so this is the average of the known budgets
mean_budget = df_projects["Budget_kUSD"].mean()
print(mean_budget)

# Replace every missing budget with that mean
df_projects["Budget_kUSD"] = df_projects["Budget_kUSD"].fillna(value=mean_budget)

print("🔹 After Filling Missing Budgets:\n", df_projects, "\n")

### Grouping & Aggregating DataFrames

In [None]:
# Revenue per company per quarter (one row per company/quarter pair)
df_revenue = pd.DataFrame(dict(
    Company=['Kiwilytics', 'PwC', 'EY', 'KPMG', 'Kiwilytics', 'PwC'],
    Quarter=['Q1', 'Q1', 'Q1', 'Q1', 'Q2', 'Q2'],
    Revenue_kUSD=[120, 200, 180, 150, 130, 210],
))

print(df_revenue)

# Group the rows by company and add up each company's revenue
grouped = df_revenue.groupby(by="Company")["Revenue_kUSD"].sum()

print("📊 Total Revenue by Company:\n", grouped, "\n")


### Merging DataFrames

In [None]:
# Project managers, keyed by ManagerID
df_managers = pd.DataFrame(dict(
    ManagerID=[1, 2, 3],
    Name=['Ibrahim', 'Maher', 'Abdelrahman'],
))

print(df_managers)

# Projects, each pointing at its manager through ManagerID
df_projects = pd.DataFrame(dict(
    ProjectID=[101, 102, 103],
    ManagerID=[1, 2, 2],
    Client=['Kiwilytics', 'PwC', 'EY'],
))
print(df_projects)

# Inner join on ManagerID: only IDs present in both frames are kept, so
# manager 3, who has no project, does not appear in the result
merged_df = pd.merge(left=df_managers, right=df_projects, on="ManagerID", how="inner")
print("🔗 Merged Managers & Projects:\n", merged_df, "\n")

## Real-World Data Wrangling Problem (with Pandas)

🎯 Problem Statement: Unclean Orders Dataset

In this exercise, you’re stepping into the shoes of a Data Engineer at Kiwilytics working for a fast-growing eCommerce company. Your job is to clean and prepare the raw orders data collected from the online store, customer service, and shipping vendors.

The dataset was exported from the operations system and handed over to the data team. Unfortunately, like most real-world datasets, it’s far from clean. Your mission is to explore, clean, and transform this data so it’s ready for downstream analytics and reporting.

📊 About the Dataset: orders_with_issues.csv

This dataset contains a number of customer order records from an online retail platform. Each row represents a single customer order, including key details about the order date, shipping information, and cost. The data is intentionally messy to simulate what real-world data often looks like when handed to a data team.


💡 Your Objective:

By the end of this exercise, you will:

- Load the dataset into a DataFrame

- Inspect the raw data

- Clean invalid entries and handle missing values

- Engineer new features such as delivery time and shipping type

- Standardize the format for reporting

- Export a clean version

### Loading & Exploring the dataset

In [None]:
#Import Libraries
import numpy as np
import pandas as pd

# Load data

df = pd.read_csv("orders_with_issues.csv")

# Preview
print("🔍 Sample Data:")
print(df.head())


# Summary
# DataFrame.info() prints its report itself and returns None, so it is
# called directly; wrapping it in print() would add a stray "None" line.
print("\n🧾 Column Info:")
df.info()


# Check missing values
# sum must be called -- printing `df.isna().sum` without parentheses shows
# the bound method object instead of the per-column counts.
print("\n🚨 Missing Values:")
print(df.isna().sum())


# Check unique shipping companies and how many orders each one has
print("\n📦 Unique Shipping Companies:")
print(df['ShippingCompany'].value_counts())


### Cleaning Dates and Invalid Numeric Entries

In [None]:

# Convert dates; errors='coerce' turns values that cannot be parsed into
# NaT instead of raising an exception

df['OrderDate'] = pd.to_datetime(df['OrderDate'], errors='coerce')
df['ShippedDate'] = pd.to_datetime(df['ShippedDate'], errors='coerce')

# Clean shipping cost: non-numeric entries become NaN, negative costs are
# treated as invalid, and every gap is then filled with the median of the
# remaining costs
df['ShippingCost'] = pd.to_numeric(df['ShippingCost'], errors='coerce')
df.loc[df['ShippingCost'] < 0, 'ShippingCost'] = np.nan
df['ShippingCost'] = df['ShippingCost'].fillna(df['ShippingCost'].median())

# Drop rows where both dates are missing. The trailing .copy() makes the
# filtered result an independent frame, so column assignments made to it
# afterwards do not raise SettingWithCopyWarning.
df = df[~(df['OrderDate'].isna() & df['ShippedDate'].isna())].copy()

print(len(df))






### Handling Nulls, Fixing Fields & Standardization

In [None]:
# Handle nulls
# Series.ffill() is the supported spelling of forward fill; the
# fillna(method='ffill') form is deprecated in recent pandas releases.
# NOTE(review): forward-filling copies the previous row's OrderID into the
# gap, which produces duplicate IDs -- confirm this is the intended rule.
df['OrderID'] = df['OrderID'].ffill()
df['CustomerID'] = df['CustomerID'].fillna("Unknown")
df['ShipCity'] = df['ShipCity'].fillna("Unspecified")

# Standardize names: trim surrounding whitespace; country and city are
# also converted to Title Case
df['ShipCountry'] = df['ShipCountry'].str.strip().str.title()
df['ShipCity'] = df['ShipCity'].str.strip().str.title()
df['ShippingCompany'] = df['ShippingCompany'].str.strip()

# Fix specific cases: any company name containing "Kiwilytics" is replaced
# by one canonical name; na=False makes missing names count as "no match"
df.loc[df['ShippingCompany'].str.contains("Kiwilytics", na=False), 'ShippingCompany'] = "Kiwilytics Goods Shipping LLC."

### Feature Engineering

In [None]:
# Whole days elapsed between placing the order and shipping it; the
# result is missing wherever either date is missing
df["DeliveryDays"] = (
    df["ShippedDate"] - df["OrderDate"]
).dt.days

# Flags
def get_status(x, late_after_days=15):
    """Classify a delivery duration as "Unknown", "Late" or "OnTime".

    Parameters
    ----------
    x : number or missing value
        Delivery duration in days.
    late_after_days : number, default 15
        A duration strictly greater than this threshold counts as late.

    Returns
    -------
    str
        "Unknown" when x is missing (pd.isna(x) is true), "Late" when
        x > late_after_days, otherwise "OnTime".
    """
    # The missing-value check comes first: a comparison with NaN is always
    # False, so a missing duration would otherwise be reported as "OnTime".
    if pd.isna(x):
        return "Unknown"
    return "Late" if x > late_after_days else "OnTime"

# Label every order by running get_status on its DeliveryDays value
df["DeliveryStatus"] = df["DeliveryDays"].apply(func=get_status)



# Domestic vs International


def check_domestic(country, domestic=None):
    """Return "Yes" when country is a domestic country, otherwise "No".

    Parameters
    ----------
    country : object
        Country value to classify. Any value that is not a member of the
        domestic collection -- including a missing value -- yields "No".
    domestic : collection, optional
        Countries treated as domestic. When omitted, the module-level
        ``domestic_countries`` list is used, so existing one-argument
        calls keep working.

    Returns
    -------
    str
        "Yes" or "No".
    """
    if domestic is None:
        # Fall back to the notebook-level list, looked up at call time.
        domestic = domestic_countries
    return "Yes" if country in domestic else "No"

# Countries that count as domestic; check_domestic reads this list
domestic_countries = ["Germany"]

# "Yes"/"No" flag per order, derived from the destination country
df["IsDomestic"] = df["ShipCountry"].apply(func=check_domestic)

# Last expression of the cell: the notebook renders the first rows
df.head()
 


### Grouping

In [73]:
# Grouping example: mean shipping cost for each shipping company
avg_shipping_by_company = df.groupby(by="ShippingCompany")["ShippingCost"].mean()
print("📊 Avg Shipping Cost by Company:", avg_shipping_by_company, sep="\n")

📊 Avg Shipping Cost by Company:
ShippingCompany
Aramex International              255.897451
DHL Express                       228.160192
FedEx Logistics                   229.931321
Kiwilytics Goods Shipping LLC.    244.036829
UPS Worldwide                     259.419333
Name: ShippingCost, dtype: float64


### Export, Reporting & Wrap-up

In [79]:
# Export cleaned file
# The file name ends in ".csv"; a closing parenthesis placed inside the
# string literal would become part of the name on disk.
# index=False leaves the row index out of the file.
df.to_csv("cleaned_orders_final.csv", index=False)

# Final summary
print("\n✅ Final Dataset Snapshot:")
print(df.head())

print("\n📈 Delivery Status Breakdown:")
print(df['DeliveryStatus'].value_counts())


print("\n🌎 Orders by Country:")
print(df['ShipCountry'].value_counts())

print("\n🌎 Orders by City:")
print(df['ShipCity'].value_counts())

# value_counts() sorts by frequency, so head(3) gives the three most used
print("\n📦 Top 3 Shipping Companies:")

print(df['ShippingCompany'].value_counts().head(3))



✅ Final Dataset Snapshot:
   OrderID CustomerID  OrderDate ShippedDate  ShippingCost ShipCountry  \
0   1000.0       C001 2025-05-17  2025-07-30        234.09     Germany   
1   1001.0       C002 2025-01-26  2025-07-30        320.61      Canada   
2   1002.0       C003 2025-03-08  2025-07-30        165.17      Canada   
3   1003.0       C004 2025-03-24  2025-07-30         12.55     Germany   
4   1004.0       C005 2025-04-15  2025-07-30        186.36      Canada   

    ShipCity                 ShippingCompany  DeliveryDays DeliveryStatus  \
0    Hamburg  Kiwilytics Goods Shipping LLC.          74.0           Late   
1   Montreal                   UPS Worldwide         185.0           Late   
2  Vancouver                 FedEx Logistics         144.0           Late   
3     Munich            Aramex International         128.0           Late   
4  Vancouver                 FedEx Logistics         106.0           Late   

  IsDomestic  
0        Yes  
1         No  
2         No  
3    

💡 Key Takeaways:

Exploratory Data Analysis

- You learned how to inspect raw data, identify data quality issues, and understand column types, missing values, and distributions.

Data Cleaning Techniques
You practiced handling:
- Invalid or missing dates
- Non-numeric and negative values in cost fields
- Nulls in critical columns like OrderID, CustomerID, and ShipCity
- Inconsistent formatting and standardization

Feature Engineering

You created new features like:
- DeliveryDays: the number of days between order and shipment
- DeliveryStatus: a flag for late vs on-time shipments
- IsDomestic: to distinguish local vs international orders

Exporting Cleaned Data

You prepared a final, cleaned dataset ready for analytics, reporting, or loading into a database.