In [2]:
from operator import index

import numpy as np
import pandas as pd

The Series object is a one-dimensional data structure, resembling a list, in which each value has an associated index.

In [3]:
# Series built from a plain Python list: pandas assigns the default integer index 0..3
data = [10, 20, 30, 40]
series = pd.Series(data=data)
print("Series from a list:\n", series)

# Same values, but labelled explicitly with 'a', 'b', 'c', 'd' instead of 0..3
data_with_index = pd.Series(data=data, index=list("abcd"))  # ['a', 'b', 'c', 'd']
print("\nSeries with custom index:\n", data_with_index)

# Series built from a dict: the keys become the index labels, the values become the data
data_dict = dict(apple=2, banana=3, orange=5)
series_dict = pd.Series(data=data_dict)
print("\nSeries from dictionary:\n", series_dict)


Series from a list:
 0    10
1    20
2    30
3    40
dtype: int64

Series with custom index:
 a    10
b    20
c    30
d    40
dtype: int64

Series from dictionary:
 apple     2
banana    3
orange    5
dtype: int64


We can index a `Series` object just like a NumPy array or a dictionary, and perform arithmetic operations on it.

In [4]:
# Accessing a single value by its index label (dictionary-style lookup)
print("Value at index 'b':", data_with_index['b'])

# Fancy indexing: passing a list of labels returns a Series with just those entries
print("\nValues at multiple indices:\n", data_with_index[['a', 'c']])

# Arithmetic operations: two Series carrying the same labels 'a'..'d'
series1 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
series2 = pd.Series([10, 20, 30, 40], index=['a', 'b', 'c', 'd'])

# Addition is element-wise: values with the same label are added together
print("\nAddition of two Series:\n", series1 + series2)

# Scalar operations are applied to every element.
# NOTE(review): this multiplies `series` (the default-indexed Series from the earlier
# cell), not series1/series2 defined above -- confirm that is intended.
print("\nSeries multiplied by 2:\n", series * 2)


Value at index 'b': 20

Values at multiple indices:
 a    10
c    30
dtype: int64

Addition of two Series:
 a    11
b    22
c    33
d    44
dtype: int64

Series multiplied by 2:
 0    20
1    40
2    60
3    80
dtype: int64


A `DataFrame` object is a two-dimensional data structure, similar to a table. It can be created from various sources, such as dictionaries, lists of dictionaries, `NumPy` arrays, etc.

In [5]:
# Creating DataFrame from dictionary: each key is a column name, each list holds that column's values
data = {
    'Name': ['Alice', 'Bob', 'Charlie'],
    'Age': [24, 27, 22],
    'City': ['New York', 'San Francisco', 'Los Angeles']
}
df = pd.DataFrame(data)
print("DataFrame from dictionary:\n", df)

# Creating DataFrame from list of dictionaries: each dictionary is one row, its keys are the column names
data_list = [
    {'Name': 'Alice', 'Age': 24, 'City': 'New York'},
    {'Name': 'Bob', 'Age': 27, 'City': 'San Francisco'},
    {'Name': 'Charlie', 'Age': 22, 'City': 'Los Angeles'}
]
df_from_list = pd.DataFrame(data_list)
print("\nDataFrame from list of dictionaries:\n", df_from_list)

# Creating DataFrame from NumPy array: a 3x2 array becomes 3 rows and 2 named columns.
# `np` comes from the notebook's import cell, so the cell no longer imports it itself.
array_data = np.array([[1, 2], [3, 4], [5, 6]])
df_from_array = pd.DataFrame(array_data, columns=['Column1', 'Column2'])
print("\nDataFrame from NumPy array:\n", df_from_array)


DataFrame from dictionary:
       Name  Age           City
0    Alice   24       New York
1      Bob   27  San Francisco
2  Charlie   22    Los Angeles

DataFrame from list of dictionaries:
       Name  Age           City
0    Alice   24       New York
1      Bob   27  San Francisco
2  Charlie   22    Los Angeles

DataFrame from NumPy array:
    Column1  Column2
0        1        2
1        3        4
2        5        6


We can refer to `DataFrame` columns and rows through `.loc[]`, `.iloc[]`, and directly through column names.


In [7]:
# Selecting a single column by name returns a Series
print("Column 'Age':\n", df['Age'])

# Selecting multiple columns with a list of names returns a DataFrame
print("\nColumns 'Name' and 'City':\n", df[['Name', 'City']])

# Row selection by label: .loc[1] looks up the row whose index label is 1
print("\nRow selection by label:\n", df.loc[1])

# Row selection by integer position: .iloc[1] takes the second row (positions start at 0).
# `df` has the default 0..n-1 index, so label 1 and position 1 are the same row and
# this prints the same result as .loc[1] above.
print("\nRow selection by integer position:\n", df.iloc[1])

# Boolean-mask filtering (not fancy indexing): keep only the rows where the condition is True
print("\nSelecting rows where Age > 23:\n", df[df['Age'] > 23])


Column 'Age':
 0    24
1    27
2    22
Name: Age, dtype: int64

Columns 'Name' and 'City':
       Name           City
0    Alice       New York
1      Bob  San Francisco
2  Charlie    Los Angeles

Row selection by label:
 Name              Bob
Age                27
City    San Francisco
Name: 1, dtype: object

Row selection by integer position:
 Name              Bob
Age                27
City    San Francisco
Name: 1, dtype: object

Selecting rows where Age > 23:
     Name  Age           City
0  Alice   24       New York
1    Bob   27  San Francisco


Extra explanation:
- `loc` - access based on **labels** (label-based)
- `loc` works on the basis of **row and column labels**. We use it if we want to refer to rows or columns based on their names.
- `loc` allows you to select data by specifying labels, that is, column names and row index labels.
- It supports both single labels and ranges of labels (a label range includes its end label).

- `iloc` - access based on **numeric indexes** (integer-based)
- `iloc` works on the basis of numeric **positions** in the DataFrame, i.e. numeric indexes.
- It is useful when you want to select data, not looking at the names, but at their positions (e.g. 1st row, 2nd row, etc.).
- `iloc` supports **numeric row and column indexes** (counting from zero).

In [None]:
# Rebuild the small people table, this time with string row labels "a", "b", "c",
# so that label-based and position-based selection are clearly different things.
data_list = [
    dict(Name='Alice', Age=24, City='New York'),
    dict(Name='Bob', Age=27, City='San Francisco'),
    dict(Name='Charlie', Age=22, City='Los Angeles'),
]
df_from_list = pd.DataFrame(data=data_list, index=list("abc"))  # index ['a', 'b', 'c']

# .loc selects by label; a label slice such as "a":"b" includes both end labels
print("loc")
print("\nRow selection by label:\n", df_from_list.loc["a"])
print("\nRow selection by labels:\n", df_from_list.loc["a":"b"])
print("\nRow selection by labels and columns:\n", df_from_list.loc["a":"b", "Name":"Age"])

# .iloc selects by 0-based position; a slice such as 1:3 excludes the stop position
print("\n\niloc")
print("\nRow selection by index:\n", df_from_list.iloc[1])
print("\nRow selection by indexes:\n", df_from_list.iloc[1:3])
print("\nRow selection by indexes and columns:\n", df_from_list.iloc[1:3, 0:3])

Pandas allows you to perform arithmetic and other operations on all table cells.

In [28]:
# Adding a new column by assigning a list (one value per existing row).
# This modifies `df`, the DataFrame created in the earlier cell.
df['Salary'] = [50000, 60000, 52000]
print("DataFrame with new column 'Salary':\n", df)

# Modifying values in a column based on condition:
# .loc[row_mask, column] targets 'Salary' only in the rows where City is 'New York'
df.loc[df['City'] == 'New York', 'Salary'] += 5000
print("\nUpdated DataFrame with modified 'Salary':\n", df)

# Operations on DataFrame columns: 'Bonus' is 10% of each row's 'Salary'
df['Bonus'] = df['Salary'] * 0.1
print("\nDataFrame with calculated 'Bonus' column:\n", df)

# Dropping a column: the result of drop() is assigned back to `df`,
# so from here on `df` has 'Salary' but no 'Bonus'
df = df.drop(columns=['Bonus'])
print("\nDataFrame after dropping 'Bonus' column:\n", df)


DataFrame with new column 'Salary':
       Name  Age           City  Salary
0    Alice   24       New York   50000
1      Bob   27  San Francisco   60000
2  Charlie   22    Los Angeles   52000

Updated DataFrame with modified 'Salary':
       Name  Age           City  Salary
0    Alice   24       New York   55000
1      Bob   27  San Francisco   60000
2  Charlie   22    Los Angeles   52000

DataFrame with calculated 'Bonus' column:
       Name  Age           City  Salary   Bonus
0    Alice   24       New York   55000  5500.0
1      Bob   27  San Francisco   60000  6000.0
2  Charlie   22    Los Angeles   52000  5200.0

DataFrame after dropping 'Bonus' column:
       Name  Age           City  Salary
0    Alice   24       New York   55000
1      Bob   27  San Francisco   60000
2  Charlie   22    Los Angeles   52000


Pandas allows detecting, removing and filling in missing data.

In [29]:
# Adding NaN values for demonstration.
# Work on a copy so the original `df` is left untouched.
# (`np` comes from the notebook's import cell.)
df_with_nan = df.copy()
# 'Salary' holds integers and NaN is a float value. Cast the column to float explicitly
# before inserting NaN: relying on the implicit int -> float upcast during assignment is
# deprecated in recent pandas versions.
df_with_nan['Salary'] = df_with_nan['Salary'].astype('float64')
df_with_nan.loc[1, 'Salary'] = np.nan
df_with_nan.loc[2, 'City'] = np.nan
print("DataFrame with NaN values:\n", df_with_nan)

# Detecting NaN values: isna() returns a same-shaped table of True/False flags
print("\nDetecting NaN values:\n", df_with_nan.isna())

# Dropping rows with NaN values: any row containing at least one NaN is removed
print("\nDataFrame after dropping rows with NaN:\n", df_with_nan.dropna())

# Filling NaN values: the dict maps each column name to its replacement value.
# NOTE(review): the Salary fill value is the mean of the ORIGINAL `df`, which still
# contains the salary that was blanked out above. With genuinely missing data you would
# compute it from the incomplete frame instead (df_with_nan['Salary'].mean()).
df_filled = df_with_nan.fillna({'Salary': df['Salary'].mean(), 'City': 'Unknown'})
print("\nDataFrame after filling NaN values:\n", df_filled)


DataFrame with NaN values:
       Name  Age           City   Salary
0    Alice   24       New York  55000.0
1      Bob   27  San Francisco      NaN
2  Charlie   22            NaN  52000.0

Detecting NaN values:
     Name    Age   City  Salary
0  False  False  False   False
1  False  False  False    True
2  False  False   True   False

DataFrame after dropping rows with NaN:
     Name  Age      City   Salary
0  Alice   24  New York  55000.0

DataFrame after filling NaN values:
       Name  Age           City        Salary
0    Alice   24       New York  55000.000000
1      Bob   27  San Francisco  55666.666667
2  Charlie   22        Unknown  52000.000000


Aggregations allow quick conclusions to be drawn by grouping data.

In [30]:
# Toy dataset for aggregation: three categories with two observations each
data = dict(
    Category=['A', 'A', 'B', 'B', 'C', 'C'],
    Values=[10, 15, 10, 20, 30, 25],
)
df_group = pd.DataFrame(data=data)

# One aggregate per group: the total of 'Values' within each 'Category'
grouped = df_group.groupby(by='Category').sum()
print("Sum of values by Category:\n", grouped)

# Several aggregates at once: a list of function names gives one sub-column per function
grouped_multiple = df_group.groupby(by='Category').aggregate(['sum', 'mean'])
print("\nMultiple aggregations by Category:\n", grouped_multiple)


Sum of values by Category:
           Values
Category        
A             25
B             30
C             55

Multiple aggregations by Category:
          Values      
            sum  mean
Category             
A            25  12.5
B            30  15.0
C            55  27.5


Pandas allows you to combine data from different sources through methods such as `concat` and `merge`.

In [31]:
# Row-wise concatenation: stack two frames that share the columns 'A' and 'B'
df1 = pd.DataFrame(dict(A=['A0', 'A1', 'A2'], B=['B0', 'B1', 'B2']))
df2 = pd.DataFrame(dict(A=['A3', 'A4', 'A5'], B=['B3', 'B4', 'B5']))
# ignore_index=True renumbers the combined rows 0..5 instead of keeping 0..2 twice
df_concat = pd.concat(objs=[df1, df2], ignore_index=True)
print("Concatenated DataFrame:\n", df_concat)

# Outer merge on 'Key': keys found on only one side (K2, K3) are kept,
# and the column coming from the other side is filled with NaN
left = pd.DataFrame(dict(Key=['K0', 'K1', 'K2'], A=['A0', 'A1', 'A2']))
right = pd.DataFrame(dict(Key=['K0', 'K1', 'K3'], B=['B0', 'B1', 'B3']))
df_merge = left.merge(right, how='outer', on='Key')
print("\nMerged DataFrame:\n", df_merge)


Concatenated DataFrame:
     A   B
0  A0  B0
1  A1  B1
2  A2  B2
3  A3  B3
4  A4  B4
5  A5  B5

Merged DataFrame:
   Key    A    B
0  K0   A0   B0
1  K1   A1   B1
2  K2   A2  NaN
3  K3  NaN   B3


In [35]:
# Load the comma-separated Titanic passenger data (path is relative to the notebook's folder)
titanic_df = pd.read_csv("../_example_data/titanic.csv", sep=',')

Analyze basic statistics of a dataset using `describe()` and `info()`.

In [None]:
# Basic information about the dataset.
# info() prints its report itself, so it is called as a plain statement rather than
# being wrapped in print().
print("Information about the Titanic dataset:")
titanic_df.info()

# Basic statistics.
# describe() is the last expression of the cell, so the notebook displays the table it returns.
print("Basic numeric statistics of the Titanic dataset:")
titanic_df.describe()

Data filtering allows passengers to be selected according to specific criteria, such as surviving passengers or those traveling first class.

In [37]:
# Survivors: rows whose 'Survived' value equals 1
survived_passengers = titanic_df.loc[titanic_df['Survived'] == 1]
print("Passengers who survived:\n", survived_passengers.head())

# First-class travellers: rows whose 'Pclass' value equals 1
first_class_passengers = titanic_df.loc[titanic_df['Pclass'] == 1]
print("\nPassengers traveling in first class:\n", first_class_passengers.head())

# Female survivors under the age of 30: all three conditions must hold for a row.
# Each condition is wrapped in parentheses because & binds tighter than == and <.
young_female_survivors = titanic_df.loc[
    (titanic_df['Survived'] == 1)
    & (titanic_df['Sex'] == 'female')
    & (titanic_df['Age'] < 30)
]
print("\nYoung female survivors:\n", young_female_survivors.head())


Passengers who survived:
     PassengerId  Survived  Pclass  \
1           893         1       3   
4           896         1       3   
6           898         1       3   
8           900         1       3   
12          904         1       1   

                                             Name     Sex   Age  SibSp  Parch  \
1                Wilkes, Mrs. James (Ellen Needs)  female  47.0      1      0   
4    Hirvonen, Mrs. Alexander (Helga E Lindqvist)  female  22.0      1      1   
6                            Connolly, Miss. Kate  female  30.0      0      0   
8       Abrahim, Mrs. Joseph (Sophie Halaut Easu)  female  18.0      0      0   
12  Snyder, Mrs. John Pillsbury (Nelle Stevenson)  female  23.0      1      0   

     Ticket     Fare Cabin Embarked  
1    363272   7.0000   NaN        S  
4   3101298  12.2875   NaN        S  
6    330972   7.6292   NaN        Q  
8      2657   7.2292   NaN        C  
12    21228  82.2667   B45        S  

Passengers traveling in first class