# **Pandas Library**

---





# **Part 1 : Intro & Creating Series Objects**


---



In [None]:
# Importing required libraries
# We need Pandas for data handling and Numpy for numerical operations
import pandas as pd
import numpy as np

In [None]:
# Build a Series straight from a Python list
# A Series is a one-dimensional labeled array; with no index given, pandas labels the items 0..n-1
Ls = [1, 2, 4, 9, 87]
data = pd.Series(data=Ls)
data

In [None]:
# Accessing the underlying values of the Series
# .values returns them as a NumPy array, without the index labels
data.values

In [None]:
# Indexing like a regular Python list
# With the default 0..n-1 index, the integer labels coincide with the positions.
# A notebook cell only displays its last expression, so both lookups are
# returned together as a tuple instead of silently discarding the first one.
data[0], data[4]   # first element (1) and last element (87)

In [None]:
# Custom Indexing
# Replace the default numeric labels with descriptive string labels, one per element of Ls
data = pd.Series(
    Ls,
    index=[
        "First Element",
        "Second Element",
        "Third Element",
        "Forth Element",
        "Fifth Element",
    ],
)
data

In [None]:
# Accessing an element by its custom label
# "First Element" exists in the index, so this returns its value (1).
# A label that is NOT in the index (e.g. data["Sixth Element"]) would raise a KeyError.
data["First Element"]

In [None]:
# Build a Series from a dictionary
# Each key becomes an index label and each value becomes the matching data entry
My_dict = dict(
    Name="Alex",
    Age=29,
    Job="Junior Manger",
    Salary=25000,
)
d1 = pd.Series(data=My_dict)
d1

In [None]:
# Label-based slicing with string index labels
# Unlike ordinary Python slices, a label slice INCLUDES its end label ('Job' is part of the result)
d1.loc['Name':'Job']

In [None]:
# Position-based slicing with integers
# Works like a Python list slice: the end position is EXCLUDED, so this returns the first two entries
d1.iloc[:2]

# **Part 2 : Creating DataFrames**

---







In [None]:
# Import essential libraries
# pandas: for data manipulation and analysis
# numpy: for efficient numerical computations
import pandas as pd
import numpy as np

In [None]:
# Build a DataFrame out of two Series that share the same index labels

# Population per city
population_dict = dict(
    Baghdad=7655695,
    Tunis=633894,
    Algeria=3569126,
    Riyadh=7678935,
    Cairo=9568232,
)
population = pd.Series(data=population_dict)

# Area (in km²) per city
area_dict = dict(
    Baghdad=204,
    Tunis=212,
    Algeria=1190,
    Riyadh=1973,
    Cairo=3085,
)
area = pd.Series(data=area_dict)

# Each dictionary key names a column; the Series are aligned on their shared city labels
data_1 = pd.DataFrame(dict(Population=population, Area=area))

# Show the combined table
data_1


In [None]:
# Build the same table directly from a dictionary of lists

# Every key names a column and its list holds that column's values, top to bottom.
# The row labels are supplied explicitly through `index`.

data_2 = pd.DataFrame(
    data=dict(
        Population=[7655695, 633894, 3569126, 7678935, 9568232],
        Area=[204, 212, 1190, 1973, 3085],
    ),
    index=['Baghdad', 'Tunis', 'Algeria', 'Riyadh', 'Cairo'],
)

# Show the table
data_2


In [None]:
# Accessing the index (row labels) of the DataFrame
# .index is an attribute, not a method, so it is read without parentheses
data_1.index

In [None]:
# Accessing the column labels of the DataFrame
# Like .index, .columns is an attribute and returns an Index object holding the column names
data_1.columns

In [None]:
# Accessing a specific column by its label
# Selecting a single column returns a Series that keeps the DataFrame's row labels
data_1['Area']

In [None]:
# To read one specific value, give .loc the row label and the column label together:
#   data.loc['Row Name', 'Column Name']
# A single .loc lookup replaces the chained form data['Column Name']['Row Name'],
# which first extracts the whole column and then indexes into that intermediate Series.
data_1.loc['Cairo', 'Area']

In [None]:
# Creating a DataFrame from a single Series
# 'area' has no name of its own, so the label passed in `columns` names its single column 'Area'
pd.DataFrame(area, columns=['Area'])

In [None]:
# Wrap a single Series in a dictionary to turn it into a one-column DataFrame
# The dictionary key supplies the column name; the Series supplies the data and the row labels
pd.DataFrame(dict(Area=area))


In [None]:
# Build a DataFrame from a list of dictionaries
# Every dictionary is one row; its keys are matched to the column names
lst = [dict(a=0, b=0), dict(a=1, b=2), dict(a=2, b=4)]
pd.DataFrame(data=lst, index=['First', 'Second', 'Third'])

In [None]:
# Build a DataFrame from rows generated by a list comprehension
# For each i in 0..2 the row stores i under 'a' and twice i under 'b'
d2 = [dict(a=i, b=2 * i) for i in range(3)]
pd.DataFrame(data=d2, index=['First', 'Second', 'Third'])

In [None]:
# Build a DataFrame from rows that do not share the same keys
# Wherever a row lacks a key, pandas puts NaN (Not a Number) in that cell
mis_lst = [dict(a=1, b=2), dict(b=3, c=4)]
pd.DataFrame(data=mis_lst, index=['First', 'Second'])

In [None]:
# Creating a DataFrame from a 2D NumPy array

# Draw random integers between 1 and 20 (inclusive; the upper bound 21 is exclusive)
# directly as a matrix with 10 rows and 3 columns.
# A seeded generator makes the cell reproducible: every "Restart & Run All"
# yields the same numbers instead of a fresh random table.
ze = np.random.default_rng(42).integers(1, 21, size=(10, 3))

# Convert the NumPy array into a DataFrame with named columns
pd.DataFrame(ze, columns=['C1', 'C2', 'C3'])


Unnamed: 0,C1,C2,C3
0,3,16,3
1,17,18,16
2,1,19,2
3,14,11,5
4,8,6,3
5,8,11,3
6,9,14,14
7,7,8,20
8,5,12,6
9,11,14,8


# **Part 3 : Indexing & Selection**






In [None]:
# Loading real-world ramen rating data from Kaggle
# Source: https://www.kaggle.com/datasets/residentmario/ramen-ratings
# Each row represents a review of a specific ramen product

# The path is relative, so it is resolved against the notebook's working directory
df = pd.read_csv('ramen-ratings.csv')  # Make sure the CSV is in the same directory
df


In [None]:
# Display the first N rows of the DataFrame using .head(N)
# If no value is passed, it returns the first 5 rows by default
# Previewing a few rows is cheaper to read than displaying the whole DataFrame
df.head()


In [None]:
# Selecting the 'Country' column from the DataFrame with bracket notation
# This returns a Series containing the country value of every row
df['Country']

In [None]:
# Alternative way to select the 'Country' column using dot notation
# Only works if the column name is a valid Python identifier (no spaces or special
# characters except underscores) and does not clash with an existing DataFrame attribute or method
df.Country

In [None]:
# Position-based selection with .iloc[row_position, column_position]
# Giving only the row position 0 selects the first row with all of its columns
df.iloc[0]

In [None]:
# Selecting the first column (index 0) across all rows using iloc
# The ':' in the row slot means "every row"
df.iloc[:, 0]

In [None]:
# First 3 rows with every column, selected by position
# Same result as df.head(3); the end position 3 is excluded
df.iloc[:3]

In [None]:
# Selecting the last column across all rows using iloc and negative indexing
# As with Python lists, -1 counts from the end
df.iloc[:, -1]

In [None]:
# Selecting the second-to-last column across all rows using iloc
# -2 is the position just before the last column
df.iloc[:, -2]

In [None]:
# Preview the first 5 rows again to recall the column layout before the fancy-indexing examples
df.head()

In [None]:
# Fancy indexing: pass a list of row positions to pick exactly those rows (all columns are kept)
df.iloc[[1, 5, 50, 280, 1000, 1500, 2500]]

In [None]:
# Fancy indexing on rows combined with a stepped slice on columns
# The column slice 0:4:2 walks positions 0..3 in steps of 2, i.e. columns 0 and 2
df.iloc[[1, 5, 50, 280, 1000, 1500, 2500], 0:4:2]

In [None]:
# Last 10 rows with every column: a negative start position counts from the end
df.iloc[-10:]

In [None]:
# Label-based selection using .loc[row_labels, column_labels]
# Selecting all rows for the columns: Brand, Style, and Country
# The columns come back in the order given in the list
df.loc[:, ['Brand', 'Style', 'Country']]

In [None]:
# Using iloc for implicit (position-based) indexing: selects the rows at positions 1 to 3
# The end position 4 is excluded, as in a regular Python slice
df.iloc[1:4]

In [None]:
# Using loc for label-based indexing: selects rows from label 1 to label 4 (inclusive)
# Unlike iloc, the end label is part of the result, so this returns one more row than df.iloc[1:4]
# (assumes df still has the default integer index assigned by read_csv)
df.loc[1:4]

In [None]:
# Using set_index() to set the 'Review #' column as the new index of the DataFrame
# The result is stored in d_1; df itself is not reassigned here and keeps its original index
d_1 = df.set_index("Review #")
d_1

# **Part 4 : Conditional Selection & Assigning data**

In [None]:
# Display the first 5 rows of the DataFrame for a quick preview
# The questions below filter on the 'Country', 'Stars', 'Style' and 'Top Ten' columns
df.head()

In [None]:
# Question 1: Filter ramen dishes served in Taiwan

# The comparison builds a boolean Series (True where Country is 'Taiwan');
# .loc keeps only the rows where it is True, with every column.

# Equivalent plain boolean indexing:
# df[df['Country'] == 'Taiwan']

df.loc[df.Country == 'Taiwan']

In [None]:
# Question 2: Filter dishes with ratings higher than the average (assumed to be 2.5)

# Remove the 'Unrated' entries and convert 'Stars' to float in one chain.
# .assign() returns a new DataFrame, so the conversion never writes into a
# filtered slice of the original frame (which triggers SettingWithCopyWarning).
# The cell is safe to re-run: on a second run no row equals 'Unrated' and the
# float conversion changes nothing.
df = (
    df.loc[df['Stars'] != 'Unrated']
      .assign(Stars=lambda frame: frame['Stars'].astype(float))
)

# Filter dishes with rating greater than 2.5
df.loc[df['Stars'] > 2.5]

In [None]:
# Question 3: Filter Taiwanese dishes with ratings above average (2.5)

# Two boolean conditions joined with the element-wise AND operator (&);
# each condition needs its own parentheses because & binds tighter than the comparisons
df.loc[(df.Stars > 2.5) & (df.Country == 'Taiwan')]

In [None]:
# Question 4: For high-rated dishes from Taiwan, show only the 'Brand' and 'Style' columns
# The row condition goes in the first slot of .loc, the list of column labels in the second
df.loc[(df.Stars > 2.5) & (df.Country == 'Taiwan'), ['Brand', 'Style']]

In [None]:
# Question 5: Filter dishes served in either Finland or Sweden

# Use .isin() to check if 'Country' value exists in the provided list
# It returns a boolean Series that is True for the rows whose country is in the list
# Alternative method using OR condition:
# df.loc[(df['Country'] == 'Finland') | (df['Country'] == 'Sweden')]

df.loc[df['Country'].isin(['Finland', 'Sweden'])]

In [None]:
# Question 6: Filter rows where the 'Top Ten' column has non-null values

# isnull()      → returns True for missing values (NaN)
# notnull()     → returns True for existing (non-null) values
# ~             → element-wise logical NOT operator (inverts every boolean in the Series)

# Alternative approach using .notnull():
# df.loc[df['Top Ten'].notnull()]

# Inverting isnull() keeps exactly the rows where 'Top Ten' holds a value
df.loc[~df['Top Ten'].isnull()]

In [None]:
# Assigning data to a column in the DataFrame

# Syntax:
# df['Column_Name'] = value_or_series

In [None]:
# Create a new boolean column indicating whether the dish style is 'Cup'
# At this point df may be a filtered subset of the loaded data (the 'Unrated' rows
# were removed earlier), so the column is added with .assign(), which returns a new
# DataFrame instead of writing into a slice (avoids SettingWithCopyWarning).
df = df.assign(Is_Style_Cup=df['Style'] == 'Cup')
df

In [None]:
# Replace the 'Review #' column with ascending integers starting from 1

# The new values must match the number of rows in the DataFrame, so the upper
# bound is derived from len(df). A hard-coded range(1, 2578) raises a ValueError
# whenever df does not hold exactly 2577 rows (e.g. if the 'Unrated' filter cell
# was skipped or the dataset changes).
df['Review #'] = range(1, len(df) + 1)
df

# **Part 5 : Handling Missing Values**

In [None]:
# Load real-world dataset: Ramen Ratings from Kaggle
# Source: https://www.kaggle.com/datasets/residentmario/ramen-ratings
# Reloading from disk discards the filtering and column changes made to df in Part 4

df = pd.read_csv('ramen-ratings.csv')  # Read the dataset into a pandas DataFrame
df  # Display the full DataFrame to get an initial look at the data

In [None]:
# Count the missing (NaN) values in each column
# isnull() marks every missing cell with True; sum() then adds them up per column
# (summing down the rows, axis=0, is the default)

df.isnull().sum()

In [None]:
# Percentage of missing (NaN) values in each column
# The per-column NaN count is divided by the total number of rows, len(df),
# and the resulting fraction is scaled to a percentage

(df.isnull().sum() / len(df)) * 100


In [None]:
# Overview of the dropna() method used to remove missing (NaN) values from a DataFrame:
# -------------------------------------------------------------------------------------

# axis (default=0):
# - axis=0 or 'index'   → drop rows with missing values.
# - axis=1 or 'columns' → drop columns with missing values.

# how (default='any'):
# - 'any'  → drop if *any* value is missing.
# - 'all'  → drop only if *all* values are missing.

# thresh:
# - Minimum number of non-null values required to keep a row/column.
# - Example: thresh=3 keeps only rows/columns with at least 3 non-null values.

# subset:
# - Specify a list of column names to restrict the null check to specific columns.

# inplace (default=False):
# - True  → modify the DataFrame directly.
# - False → return a new DataFrame with changes.

# -------------------------------------------------------------------------------------


In [None]:
# Drop every row that has at least one missing (NaN) value
# axis='index' is the named form of axis=0; the result is a new DataFrame and df is left untouched

df.dropna(axis='index')

In [None]:
# Drop every column that has at least one missing (NaN) value
# axis='columns' is the named form of axis=1; the result is a new DataFrame and df is left untouched

df.dropna(axis='columns')

In [None]:
# Keep the result: store a new DataFrame without the columns that contain any missing values
# dropna() returns a new object, so df itself still holds all of its columns
# Alternative options:
# - df = df.dropna(axis=1)       → overwrite the original DataFrame
# - df.dropna(axis=0, inplace=True) → drop rows in-place (modifies the original DataFrame)

df_1 = df.dropna(axis=1)
df_1  # Display the cleaned DataFrame, which contains only the columns without missing values

In [None]:
# Drop only the rows where the 'Top Ten' column contains missing (NaN) values
# Other columns may still contain NaNs — this targets one specific column
# The result is only displayed, not stored, so df is unchanged

df.dropna(subset=['Top Ten'])

In [None]:
# Overview of the fillna() method used to fill missing (NaN) values in a DataFrame:
# -----------------------------------------------------------------------------------

# value:
# - A specific value (scalar, dict, Series, or DataFrame) to replace missing values.

# method:
# - Use a method to propagate existing non-null values:
#   ▪ 'ffill' (forward fill): use the previous non-null value.
#   ▪ 'bfill' (backward fill): use the next non-null value.
# - Note: this parameter is deprecated since pandas 2.1; call df.ffill() / df.bfill() instead.

# axis:
# - axis=0 or 'index'   → fill column-wise (i.e., move down the rows).
# - axis=1 or 'columns' → fill row-wise (i.e., move across the columns).

# inplace (default=False):
# - True  → modifies the original DataFrame in place.
# - False → returns a new DataFrame with the changes.

# -----------------------------------------------------------------------------------


In [None]:
# Fill all missing (NaN) values in the DataFrame with 0
# Useful when 0 is a meaningful default value (e.g., for numeric columns)
# Note: the fill applies to every column, and df_1 now replaces the df_1 created earlier with dropna()

df_1 = df.fillna(0)
df_1  # Display the updated DataFrame with NaNs replaced by 0

In [None]:
# Filter and display rows where the 'Style' column has missing (NaN) values
# The check runs on the original df: df_1 was produced by fillna(0), so it no
# longer contains any NaN and the same filter on it would always come back empty.

df.loc[df['Style'].isnull()]

In [None]:
# Fill missing values with custom values per column:
# - 'Style' column → replace NaNs with the string 'Not Available'
# - 'Top Ten' column → replace NaNs with -1 (to indicate missing rank or score)
# Passing a dict maps each column name to its own fill value; columns not listed are left unchanged

df_2 = df.fillna({"Style": "Not Available", "Top Ten": -1})
df_2  # Display the updated DataFrame with specified values filled

In [None]:
# Look up the rows at positions 2152 and 2442 (all columns) to check the values written by fillna

df_2.iloc[[2152, 2442]]

In [None]:
# Count the frequency of each unique value in the 'Style' column
# Useful for understanding the distribution of different noodle styles
# value_counts() sorts the result by count, most frequent first, and skips NaN by default

df['Style'].value_counts()

In [None]:
# Fill the missing values in the 'Style' column with the string 'Pack'
# The dict restricts the fill to that one column; a bare df.fillna('Pack') would
# also write the string 'Pack' into every other column that has NaNs (e.g. 'Top Ten').

df = df.fillna({'Style': 'Pack'})

# Recount the frequency of each unique value in the 'Style' column after filling

df['Style'].value_counts()

In [None]:
# Replace the string 'Unrated' in the 'Stars' column with NaN
# This allows us to treat unrated entries as missing values during analysis
# The result is assigned back to the column, so df itself is updated

df['Stars'] = df['Stars'].replace("Unrated", np.nan)

In [None]:
# How many entries of the 'Stars' column are missing (NaN)?
# Confirms how many 'Unrated' entries were turned into NaN

df['Stars'].isnull().sum()

In [None]:
# Return a Series of all non-missing values in the 'Stars' column
# Note: This does NOT modify the original DataFrame — the result is only displayed, not stored

df.Stars.dropna()

In [None]:
# Fill missing values in the 'Stars' column with 2.5 (the value this notebook assumes as the average rating)
# The filled column is assigned back to df. Calling fillna(..., inplace=True) on df["Stars"]
# acts on an intermediate Series: it emits a chained-assignment warning in pandas 2.x and
# leaves df unchanged once Copy-on-Write is enabled.

df["Stars"] = df["Stars"].fillna(2.5)


In [None]:
# Convert the 'Stars' column from object/string to float
# This enables numerical operations like averaging, plotting, etc.
# The conversion relies on 'Unrated' having been replaced earlier; that string cannot be parsed as a float

df["Stars"] = df["Stars"].astype(float)

# Count the frequency of each unique star rating (after conversion to float)

df["Stars"].value_counts()