In [None]:
import pandas as pd # type: ignore

# Column-oriented sample data: each key becomes a column label and each list
# holds that column's values, one entry per row.
data_dict = dict(
    Name=["John", "Anna", "Peter"],
    Age=[28, 24, 33],
    City=["New York", "Los Angeles", "Berlin"],
)

# Build the DataFrame straight from the mapping.
df = pd.DataFrame(data=data_dict)

print(df)

"""
    Name  Age         City
0   John   28     New York
1   Anna   24  Los Angeles
2  Peter   33       Berlin
"""

In [None]:
# Quick structural overview of df.
print(df.iloc[:2])   # First two rows (what df.head(2) returns)
print(df.iloc[-2:])  # Last two rows (what df.tail(2) returns)
print(df.shape)      # Dimensions as (rows, columns): (3, 3)
print(df.columns)    # Column labels: Index(['Name', 'Age', 'City'], dtype='object')
print(df.dtypes)     # Data type of each column:
# Name    object
# Age      int64
# City    object
# dtype: object

In [None]:
# Label each person "Yes" when their age is below 30 and "No" otherwise:
# compare the whole column at once, then translate the booleans to labels.
under_thirty = df["Age"].lt(30)
df["IsYouthful"] = under_thirty.map({True: "Yes", False: "No"})
print(df)

"""
    Name  Age         City IsYouthful
0   John   28     New York        Yes
1   Anna   24  Los Angeles        Yes
2  Peter   33       Berlin         No
"""

In [None]:
# A one-row frame with the same four columns as df.
df2 = pd.DataFrame(
    {
        "Name": ["Megan"],
        "Age": [34],
        "City": ["San Francisco"],
        "IsYouthful": ["No"],
    }
)

# Stack the frames row-wise; ignore_index=True renumbers the result 0..n-1
# instead of keeping each frame's own row labels.
frames_to_stack = [df, df2]
df_concatenated = pd.concat(frames_to_stack, axis=0, ignore_index=True)

print(df_concatenated)

"""
    Name  Age           City IsYouthful
0   John   28       New York        Yes
1   Anna   24    Los Angeles        Yes
2  Peter   33         Berlin         No
3  Megan   34  San Francisco         No (reset index cause ignore_index=True)
"""

In [None]:
# Column selection by label. The labels must exist in df; the former
# placeholder names ('column_name', 'col1', 'col2') are not columns of df and
# raised KeyError when the cell was run.
print(df['Name'])           # select a single column -> Series
print(df[['Name', 'Age']])  # select multiple columns -> DataFrame

# df.iloc[row_selection, column_selection] selects by integer position.
# Positions are 0-based, so row 1 is the second row and column 0 the first column.
print(df.iloc[1, 0])    # value in the second row and the first column
print(df.iloc[:2, :2])  # the first two rows and the first two columns

# Real-world data from the Seaborn library

In [None]:
import pandas as pd
import seaborn as sns

# Fetch the Titanic passenger table through seaborn's dataset loader
titanic = sns.load_dataset('titanic')

# Preview the top 3 rows
print(titanic.head(3))

"""
   survived  pclass     sex   age  ...  deck  embark_town  alive  alone
0         0       3    male  22.0  ...   NaN  Southampton     no  False
1         1       1  female  38.0  ...     C    Cherbourg    yes  False
2         1       3  female  26.0  ...   NaN  Southampton    yes   True

[3 rows x 15 columns]
"""

# Deriving a column from an existing one
# "IsChild" is "Yes" for passengers younger than 18 and "No" otherwise.
# A missing age fails the `< 18` comparison, so it is labelled "No" as well.
is_under_18 = titanic["age"].lt(18)
titanic["IsChild"] = is_under_18.map({True: "Yes", False: "No"})
print("\nDataFrame after adding the 'IsChild' column:")
print(titanic.head(5))

# Concatenating DataFrames
# Describe one extra passenger, one single-element list per column
extra_passenger = {
    "survived": [1],
    "pclass": [3],
    "sex": ["male"],
    "age": [32],
    "sibsp": [0],
    "parch": [0],
    "fare": [7.75],
    "embarked": ["Q"],
    "class": ["Third"],
    "who": ["man"],
    "adult_male": [True],
    "deck": [None],
    "embark_town": ["Queenstown"],
    "alive": ["yes"],
    "alone": [True],
    "IsChild": ["No"],
}
# Build the one-row frame and drop every column that holds only missing values
# (here "deck"); this is done to avoid the FutureWarning that concat raises
# about all-NA columns.
new_data = pd.DataFrame(extra_passenger).dropna(axis="columns", how="all")
# Append the new row below the original rows and renumber the index
titanic_concat = pd.concat([titanic, new_data], axis=0, ignore_index=True)
print("\nConcatenated DataFrame:")
print(titanic_concat.tail())

"less than 7" means < 7 (7 itself is excluded).

"not more than 7" means ≤ 7 (7 itself is included).

# descriptive statistical analysis

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns

# Load Titanic dataset
titanic_df = sns.load_dataset('titanic')

# Keep one handle on the age column and derive every statistic from it
age_col = titanic_df['age']

mean_age = age_col.mean()
median_age = age_col.median()
# mode() returns a Series (several values can tie), so take its first entry
mode_age = age_col.mode().iloc[0]

print(f"Mean age: {mean_age}") # Mean age: 29.69911764705882
print(f"Median age: {median_age}") # Median age: 28.0
print(f"Mode age: {mode_age}") # Mode age: 24.0
# Standard deviation
# ddof=0 requests the population standard deviation, which is what NumPy's
# std computes by default (pandas' own default would be ddof=1).
std_dev_age = age_col.std(ddof=0)

print(f"Standard deviation of age: {std_dev_age}") # Standard deviation of age: 14.516321150817316

In [None]:
# Create a Series
sample_values = [1, 2, 2, 3, 4, 4, 4, 5]
data = pd.Series(data=sample_values)

# Mode: the most frequent value(s).
# Series.mode() returns a Series rather than a scalar, because several values
# can be tied for most frequent; in that case all of them are returned.
# On a DataFrame, mode() returns a DataFrame with the mode(s) of each column.
mode_value = data.mode()
print(mode_value)  # Output: 0    4
                   #         dtype: int64

# Quartiles and percentiles

In [None]:
# Quartiles and percentiles
# Using Numpy
# np.percentile is given the ages with the missing entries removed first
known_ages = titanic_df['age'].dropna()
Q1_age_np = np.percentile(known_ages, 25)
Q3_age_np = np.percentile(known_ages, 75)

print(f"First quartile of age (Numpy): {Q1_age_np}")
print(f"Third quartile of age (Numpy): {Q3_age_np}")

# Output:
# First quartile of age (Numpy): 20.125 -> 25% of the passengers are younger than 20.125
# Third quartile of age (Numpy): 38.0   -> 75% of the passengers are younger than 38

# Using Pandas
# Series.quantile is called on the raw column; no dropna() is needed here
age_series = titanic_df['age']
Q1_age_pd = age_series.quantile(q=0.25)
Q3_age_pd = age_series.quantile(q=0.75)

print(f"First quartile of age (Pandas): {Q1_age_pd}")
print(f"Third quartile of age (Pandas): {Q3_age_pd}")

# Output:
# First quartile of age (Pandas): 20.125
# Third quartile of age (Pandas): 38.0

# Filtering and Sorting(Pandas)

In [None]:
import seaborn as sns
import pandas as pd

# Load dataset
titanic_df = sns.load_dataset('titanic')

# Keep only the rows whose 'survived' flag equals 1
survived_mask = titanic_df['survived'].eq(1)
survivors = titanic_df.loc[survived_mask]
print(survivors.head())

"""
   survived  pclass     sex   age  ...  deck  embark_town  alive  alone
1         1       1  female  38.0  ...     C    Cherbourg    yes  False
2         1       3  female  26.0  ...   NaN  Southampton    yes   True
3         1       1  female  35.0  ...     C  Southampton    yes  False
8         1       3  female  27.0  ...   NaN  Southampton    yes  False
9         1       2  female  14.0  ...   NaN    Cherbourg    yes  False

[5 rows x 15 columns]
"""

In [None]:
# Order the survivors from youngest to oldest (ascending order spelled out)
sorted_df = survivors.sort_values(by='age', ascending=True)
print(sorted_df.head())

"""
     survived  pclass     sex   age  ...  deck  embark_town  alive  alone
803         1       3    male  0.42  ...   NaN    Cherbourg    yes  False
755         1       2    male  0.67  ...   NaN  Southampton    yes  False
644         1       3  female  0.75  ...   NaN    Cherbourg    yes  False
469         1       3  female  0.75  ...   NaN    Cherbourg    yes  False
831         1       2    male  0.83  ...   NaN  Southampton    yes  False

[5 rows x 15 columns]
"""

In [None]:

# Two-key sort: 'pclass' descending first, then 'age' ascending within each class.
# The ascending list lines up position by position with the list of sort keys.
sorted_df = survivors.sort_values(
    by=['pclass', 'age'],
    ascending=[False, True],
)
print(sorted_df.head())

"""
     survived  pclass     sex   age  ...  deck  embark_town  alive  alone
803         1       3    male  0.42  ...   NaN    Cherbourg    yes  False
469         1       3  female  0.75  ...   NaN    Cherbourg    yes  False
644         1       3  female  0.75  ...   NaN    Cherbourg    yes  False
172         1       3  female  1.00  ...   NaN  Southampton    yes  False
381         1       3  female  1.00  ...   NaN    Cherbourg    yes  False

[5 rows x 15 columns]
"""

In [None]:
# Combine two row conditions: the passenger survived AND is female.
# Each condition is a boolean Series; `&` combines them element-wise.
is_survivor = titanic_df['survived'] == 1
is_female = titanic_df['sex'] == 'female'
female_survivors = titanic_df[is_survivor & is_female]
print(female_survivors.head())

"""
   survived  pclass     sex   age  ...  deck  embark_town  alive  alone
1         1       1  female  38.0  ...     C    Cherbourg    yes  False
2         1       3  female  26.0  ...   NaN  Southampton    yes   True
3         1       1  female  35.0  ...     C  Southampton    yes  False
8         1       3  female  27.0  ...   NaN  Southampton    yes  False
9         1       2  female  14.0  ...   NaN    Cherbourg    yes  False

[5 rows x 15 columns]
"""

# Data Cleaning and Preprocessing
### Missing Data

In [None]:
import seaborn as sns

# Load the dataset
titanic_df = sns.load_dataset('titanic')

# Flag missing entries: the result has the same shape as titanic_df and holds
# True wherever the original value is missing
missing_values = titanic_df.isna()
print(missing_values.head(10))
"""
   survived  pclass    sex    age  ...   deck  embark_town  alive  alone
0     False   False  False  False  ...   True        False  False  False
1     False   False  False  False  ...  False        False  False  False
2     False   False  False  False  ...   True        False  False  False
3     False   False  False  False  ...  False        False  False  False
4     False   False  False  False  ...   True        False  False  False
5     False   False  False   True  ...   True        False  False  False
6     False   False  False  False  ...  False        False  False  False
7     False   False  False  False  ...   True        False  False  False
8     False   False  False  False  ...   True        False  False  False
9     False   False  False  False  ...   True        False  False  False

[10 rows x 15 columns]
"""
# Summing the boolean flags column by column counts the missing entries per column
missing_values_count = missing_values.sum()
print(missing_values_count)
"""
survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64
"""

In [None]:
# Dropping rows is only appropriate when few values are missing, so that
# removing them does not distort the analysis.
# DataFrame.dropna() returns a new frame and leaves titanic_df untouched, so a
# separate .copy() followed by dropna(inplace=True) is unnecessary (its result
# was previously overwritten by this very assignment anyway).
titanic_df_copy = titanic_df.dropna()

# Check the dataframe
print(titanic_df_copy.isnull().sum())
# No column has any missing values left

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Visualise where values are missing: one heatmap cell per DataFrame cell,
# coloured by whether that entry is null
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(titanic_df.isnull(), cmap='viridis', ax=ax)
plt.show()

In [None]:
# Imputation fits when the overall size of the data should stay unchanged:
# missing entries are replaced by an estimate such as the mean, median or mode.
# Impute missing values using mean
# The filled column is assigned back instead of calling fillna(inplace=True) on
# the titanic_df['age'] selection: that chained in-place call emits a warning on
# pandas 2.x and, with Copy-on-Write enabled, leaves titanic_df unchanged.
titanic_df['age'] = titanic_df['age'].fillna(titanic_df['age'].mean())

# Check the dataframe
print(titanic_df.isnull().sum())
"""
survived         0
pclass           0
sex              0
age              0
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64
"""
# Impute missing values using backward fill
# Series.bfill() replaces fillna(method='bfill'), whose `method` argument is
# deprecated. 'age' has no missing values left after the mean imputation above,
# so this step changes nothing here and only demonstrates the call.
titanic_df['age'] = titanic_df['age'].bfill()

# Check the dataframe
print(titanic_df.isnull().sum())
# The output is the same as in the previous example