# Numpy Basic

In [None]:
import numpy as np

# 1-D case: a flat Python list becomes a one-dimensional ndarray.
py_list = [1, 2, 3, 4, 5]
np_array = np.array(py_list)
print(np_array)
# Expected output:
# [1 2 3 4 5]

# 2-D case: a list of equal-length rows becomes a two-dimensional ndarray.
py_list_2d = [
    [1, 2, 3],
    [4, 5, 6],
    [7, 8, 9],
]
np_array_2d = np.array(py_list_2d)
print(np_array_2d)
# Expected output:
# [[1 2 3]
#  [4 5 6]
#  [7 8 9]]

# A 2x3 integer matrix used to demonstrate array metadata, indexing and reshaping.
np_array = np.array([[1, 2, 3],
                     [4, 5, 6]])

# Print each attribute with its label. Expected output:
# Dimensions:  2
# Shape:  (2, 3)
# Size:  6
# Data Type:  int64   (the default integer dtype is platform-dependent)
for label, value in (
    ("Dimensions: ", np_array.ndim),
    ("Shape: ", np_array.shape),
    ("Size: ", np_array.size),
    ("Data Type: ", np_array.dtype),
):
    print(label, value)

# Indexing: element in the first row, third column (positions are 0-based).
first_row = np_array[0]
print("Indexed Value: ", first_row[2])  # Indexed Value:  3

# Slicing: the whole first row.
print("Sliced Value: ", first_row)  # Sliced Value:  [1 2 3]

# Reshape to 3 rows x 2 columns; this only works because 3 * 2 equals the
# original element count (6).
reshaped_array = np.reshape(np_array, (3, 2))
print("Reshaped Array:\n", reshaped_array)
# Reshaped Array:
#  [[1 2]
#  [3 4]
#  [5 6]]

# Two equal-length vectors; every operator below is applied element by element.
np_array1 = np.array([1, 2, 3])
np_array2 = np.array([4, 5, 6])

# Addition
elementwise_sum = np_array1 + np_array2
print(elementwise_sum)  # Output: [5 7 9]

# Subtraction
elementwise_difference = np_array1 - np_array2
print(elementwise_difference)  # Output: [-3 -3 -3]

# Multiplication
elementwise_product = np_array1 * np_array2
print(elementwise_product)  # Output: [ 4 10 18]

# Division (true division always yields floats)
elementwise_quotient = np_array1 / np_array2
print(elementwise_quotient)  # Output: [0.25 0.4  0.5 ]

# Pandas Basic

In [None]:
import pandas as pd 

# Column-oriented input: each key becomes a column, each list its values.
data_dict = {
    "Name": ["John", "Anna", "Peter"],
    "Age": [28, 24, 33],
    "City": ["New York", "Los Angeles", "Berlin"],
}

df = pd.DataFrame(data=data_dict)

print(df)

"""
    Name  Age         City
0   John   28     New York
1   Anna   24  Los Angeles
2  Peter   33       Berlin
"""
# Quick-look helpers for getting oriented in a new DataFrame.
for preview in (df.head(2), df.tail(2)):  # first two rows, then last two rows
    print(preview)
print(df.shape)    # (rows, columns): (3, 3)
print(df.columns)  # column labels: Index(['Name', 'Age', 'City'], dtype='object')
print(df.dtypes)   # per-column data types:
# Name    object
# Age      int64
# City    object
# dtype: object

### Lambda via apply function (IsYouthful)

In [None]:
# Derive a Yes/No flag from "Age": anyone younger than 30 counts as youthful.
# The lambda is applied to every value of the column, one element at a time.
label_youthful = lambda age: "Yes" if age < 30 else "No"
df["IsYouthful"] = df["Age"].apply(label_youthful)
print(df)

"""
    Name  Age         City IsYouthful
0   John   28     New York        Yes
1   Anna   24  Los Angeles        Yes
2  Peter   33       Berlin         No
"""

In [None]:
# A one-row frame with the same four columns as `df`, to be appended below it.
df2 = pd.DataFrame(
    {
        "Name": ["Megan"],
        "Age": [34],
        "City": ["San Francisco"],
        "IsYouthful": ["No"],
    }
)

# Stack the frames vertically; ignore_index=True renumbers the rows 0..n-1
# instead of keeping each frame's own labels.
frames_to_stack = [df, df2]
df_concatenated = pd.concat(frames_to_stack, ignore_index=True)

print(df_concatenated)

"""
    Name  Age           City IsYouthful
0   John   28       New York        Yes
1   Anna   24    Los Angeles        Yes
2  Peter   33         Berlin         No
3  Megan   34  San Francisco         No (reset index cause ignore_index=True)
"""

In [None]:
# Column selection with []. The labels must exist in `df`; the earlier
# placeholders ('column_name', 'col1', 'col2') raised KeyError on this frame.
print(df['Name'])           # a single label selects one column (Series)
print(df[['Name', 'Age']])  # a list of labels selects several columns (DataFrame)

# df.iloc[row_selection, column_selection] selects by integer position.
# Positions are 0-based, so index 1 is the SECOND row and index 0 the FIRST column.
print(df.iloc[1, 0])     # value in the second row, first column -> 'Anna'
print(df.iloc[:2, :2])   # first two rows and first two columns

### Basic Pandas: Titanic data from the Seaborn library

In [None]:
import pandas as pd
import seaborn as sns

# Fetch seaborn's Titanic example dataset as a Pandas DataFrame.
titanic = sns.load_dataset('titanic')

# Preview the first 3 rows.
print(titanic.head(3))

"""
   survived  pclass     sex   age  ...  deck  embark_town  alive  alone
0         0       3    male  22.0  ...   NaN  Southampton     no  False
1         1       1  female  38.0  ...     C    Cherbourg    yes  False
2         1       3  female  26.0  ...   NaN  Southampton    yes   True

[3 rows x 15 columns]
"""

# Lambda-based column derivation: "IsChild" marks passengers younger than 18.
# Note: a missing age (NaN) fails the `< 18` comparison, so such rows are
# labelled "No" rather than being left unknown.
label_child = lambda age: "Yes" if age < 18 else "No"
titanic["IsChild"] = titanic["age"].apply(label_child)
print("\nDataFrame after adding the 'IsChild' column:")
print(titanic.head(5))

# Wording reminder: "less than 7" / "under 7" means < 7,
# whereas "not more than 7" means <= 7.

# Concatenating DataFrames
# Describe one additional passenger, then wrap every value in a one-element
# list so the constructor builds a single-row frame.
new_passenger = {
    "survived": 1,
    "pclass": 3,
    "sex": "male",
    "age": 32,
    "sibsp": 0,
    "parch": 0,
    "fare": 7.75,
    "embarked": "Q",
    "class": "Third",
    "who": "man",
    "adult_male": True,
    "deck": None,
    "embark_town": "Queenstown",
    "alive": "yes",
    "alone": True,
    "IsChild": "No",
}
new_data = pd.DataFrame({column: [value] for column, value in new_passenger.items()})

# Remove columns that contain nothing but missing values (here: "deck").
# This avoids the FutureWarning pd.concat emits for all-NA columns.
new_data = new_data.dropna(axis="columns", how="all")

# Append the new row to the original frame and renumber the index.
titanic_concat = pd.concat([titanic, new_data], ignore_index=True)
print("\nConcatenated DataFrame:")
print(titanic_concat.tail())


# descriptive statistical analysis

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns

# Load Titanic dataset
titanic_df = sns.load_dataset('titanic')
age_column = titanic_df['age']

# Central tendency. Pandas skips missing values (NaN) in these reductions.
mean_age = age_column.mean()
median_age = age_column.median()
# mode() returns a Series (there can be ties), so take its first entry.
mode_age = age_column.mode().iloc[0]

print(f"Mean age: {mean_age}") # Mean age: 29.69911764705882
print(f"Median age: {median_age}") # Median age: 28.0
print(f"Mode age: {mode_age}") # Mode age: 24.0

# Standard deviation.
# ddof=0 is the population standard deviation, which is what np.std computes;
# the pandas default (ddof=1) would give the sample standard deviation instead.
std_dev_age = age_column.std(ddof=0)

print(f"Standard deviation of age: {std_dev_age}") # Standard deviation of age: 14.516321150817316

In [None]:
# Build a small Series in which the value 4 occurs most often.
sample_values = [1, 2, 2, 3, 4, 4, 4, 5]
data = pd.Series(sample_values)

# Mode: Series.mode() always returns a Series, even for a single result,
# because several values can tie for the highest count; all tied values are
# returned. DataFrame.mode() returns a DataFrame with the mode(s) of each column.
mode_value = data.mode()
print(mode_value)  # Output: 0    4
                   #         dtype: int64

### Quartiles and percentiles (四分位，百分位)

In [None]:
# Quartiles and percentiles
# --- NumPy ---
# Missing values are removed with dropna() before calling np.percentile.
age_without_na = titanic_df['age'].dropna()
Q1_age_np = np.percentile(age_without_na, 25)
Q3_age_np = np.percentile(age_without_na, 75)

print(f"First quartile of age (Numpy): {Q1_age_np}")
print(f"Third quartile of age (Numpy): {Q3_age_np}")

# Output:
# First quartile of age (Numpy): 20.125 -> 25% of passengers are younger than 20.125
# Third quartile of age (Numpy): 38.0   -> 75% of passengers are younger than 38

# --- Pandas ---
# Series.quantile is called on the raw column; no dropna() is needed here.
age_with_na = titanic_df['age']
Q1_age_pd = age_with_na.quantile(0.25)
Q3_age_pd = age_with_na.quantile(0.75)

print(f"First quartile of age (Pandas): {Q1_age_pd}")
print(f"Third quartile of age (Pandas): {Q3_age_pd}")

# Output:
# First quartile of age (Pandas): 20.125
# Third quartile of age (Pandas): 38.0

# Filtering and Sorting(Pandas)

In [None]:
import seaborn as sns
import pandas as pd

# Load dataset
titanic_df = sns.load_dataset('titanic')

# Boolean mask: True for every passenger whose 'survived' flag equals 1.
is_survivor = titanic_df['survived'] == 1
# Keep only the rows where the mask is True.
survivors = titanic_df.loc[is_survivor]
print(survivors.head())

"""
   survived  pclass     sex   age  ...  deck  embark_town  alive  alone
1         1       1  female  38.0  ...     C    Cherbourg    yes  False
2         1       3  female  26.0  ...   NaN  Southampton    yes   True
3         1       1  female  35.0  ...     C  Southampton    yes  False
8         1       3  female  27.0  ...   NaN  Southampton    yes  False
9         1       2  female  14.0  ...   NaN    Cherbourg    yes  False

[5 rows x 15 columns]
"""

In [None]:
# Order the survivors from youngest to oldest (ascending is the default,
# spelled out here for clarity).
sorted_df = survivors.sort_values(by='age', ascending=True)
print(sorted_df.head())

"""
     survived  pclass     sex   age  ...  deck  embark_town  alive  alone
803         1       3    male  0.42  ...   NaN    Cherbourg    yes  False
755         1       2    male  0.67  ...   NaN  Southampton    yes  False
644         1       3  female  0.75  ...   NaN    Cherbourg    yes  False
469         1       3  female  0.75  ...   NaN    Cherbourg    yes  False
831         1       2    male  0.83  ...   NaN  Southampton    yes  False

[5 rows x 15 columns]
"""

In [None]:

# Multi-key sort: passenger class descending (3rd class first), then age
# ascending within each class. The two lists are matched position by position.
sort_keys = ['pclass', 'age']
sort_directions = [False, True]
sorted_df = survivors.sort_values(by=sort_keys, ascending=sort_directions)
print(sorted_df.head())

"""
     survived  pclass     sex   age  ...  deck  embark_town  alive  alone
803         1       3    male  0.42  ...   NaN    Cherbourg    yes  False
469         1       3  female  0.75  ...   NaN    Cherbourg    yes  False
644         1       3  female  0.75  ...   NaN    Cherbourg    yes  False
172         1       3  female  1.00  ...   NaN  Southampton    yes  False
381         1       3  female  1.00  ...   NaN    Cherbourg    yes  False

[5 rows x 15 columns]
"""

In [None]:
# Combine two conditions with & (element-wise AND): survived AND female.
survived_mask = titanic_df['survived'] == 1
female_mask = titanic_df['sex'] == 'female'
female_survivors = titanic_df[survived_mask & female_mask]
print(female_survivors.head())

"""
   survived  pclass     sex   age  ...  deck  embark_town  alive  alone
1         1       1  female  38.0  ...     C    Cherbourg    yes  False
2         1       3  female  26.0  ...   NaN  Southampton    yes   True
3         1       1  female  35.0  ...     C  Southampton    yes  False
8         1       3  female  27.0  ...   NaN  Southampton    yes  False
9         1       2  female  14.0  ...   NaN    Cherbourg    yes  False

[5 rows x 15 columns]
"""

# Data Cleaning and Preprocessing

## Missing Data Handling

### Detection

In [None]:
import seaborn as sns

# Load the dataset
titanic_df = sns.load_dataset('titanic')

# Cell-level view: a boolean DataFrame of the same shape as the data,
# True wherever a value is missing. (isna is the same method as isnull.)
missing_values = titanic_df.isna()
print(missing_values.head(10))
"""
   survived  pclass    sex    age  ...   deck  embark_town  alive  alone
0     False   False  False  False  ...   True        False  False  False
1     False   False  False  False  ...  False        False  False  False
2     False   False  False  False  ...   True        False  False  False
3     False   False  False  False  ...  False        False  False  False
4     False   False  False  False  ...   True        False  False  False
5     False   False  False   True  ...   True        False  False  False
6     False   False  False  False  ...  False        False  False  False
7     False   False  False  False  ...   True        False  False  False
8     False   False  False  False  ...   True        False  False  False
9     False   False  False  False  ...   True        False  False  False

[10 rows x 15 columns]
"""
# Column-level view: summing the boolean frame counts the missing values per column.
missing_values_count = missing_values.sum()
print(missing_values_count)
"""
survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64
"""

import matplotlib.pyplot as plt

# Visualise the missing-value mask: one heatmap cell per DataFrame cell,
# coloured by whether the value is missing.
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(titanic_df.isnull(), cmap='viridis', ax=ax)
plt.show()

### Drop

In [None]:
# Dropping is only appropriate when few values are missing, so that removing
# them does not distort later calculations.

# Variant 1: work on an explicit copy and drop in place.
titanic_df_copy = titanic_df.copy()
titanic_df_copy.dropna(inplace=True)  # titanic_df_copy itself is modified

# Variant 2 (equivalent result): let dropna return a new frame.
titanic_df_copy = titanic_df.dropna()  # titanic_df is left untouched

# Check the dataframe
print(titanic_df_copy.isnull().sum())
# There will be no missing values in every column

print(titanic_df.shape)

# Remove only the rows whose 'age' is missing.
# The mask is kept in a local variable instead of being written into
# titanic_df as a helper column: adding a column here would change the shared
# frame that the following cells read (their documented output has 15 columns).
age_missing = titanic_df['age'].isnull()
print(age_missing.unique())

missing_values_count = age_missing.sum()
print(missing_values_count)

# dropna(subset=...) only considers the listed columns when deciding which
# rows to drop, replacing the former `mask == False` comparison.
titanic_df_removed = titanic_df.dropna(subset=['age'])
print(titanic_df_removed.shape)

### Imputation

In [None]:
# Imputation is used when the overall size of the data should not change:
# missing entries are replaced by the mean, median, mode, a neighbouring value, etc.

# Impute missing values using mean.
# The result is assigned back to the column. The previous form,
# `titanic_df['age'].fillna(..., inplace=True)`, operates on an intermediate
# Series; pandas warns about it and, with Copy-on-Write enabled, it leaves
# titanic_df unchanged.
titanic_df['age'] = titanic_df['age'].fillna(titanic_df['age'].mean())

# Check the dataframe
print(titanic_df.isnull().sum())
"""
survived         0
pclass           0
sex              0
age              0
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64
"""
# Impute missing values using backward fill (each gap takes the next valid value).
# Series.bfill() replaces the deprecated `fillna(method='bfill')` spelling.
# Note: 'age' was already filled with the mean above, so this call has nothing
# left to fill in this cell; it is shown for the syntax only.
titanic_df['age'] = titanic_df['age'].bfill()

# Check the dataframe
print(titanic_df.isnull().sum())
# The output is the same as in the previous example

## Categorical Data Encoding 类型特征编码

In [None]:
import pandas as pd
import seaborn as sns

# Load Titanic dataset
titanic_df = sns.load_dataset('titanic')

# List the distinct categories of the two columns that are encoded below.
# Expected output:
# ['male' 'female']
# ['Southampton' 'Cherbourg' 'Queenstown' nan]
for categorical_column in ('sex', 'embark_town'):
    print(titanic_df[categorical_column].unique())

### factorize(Label)

In [None]:
# Label Encoding for 'sex'
# pd.factorize returns two items: the integer codes (one per row) and the
# array of unique values those codes refer to. Only the codes are stored.
sex_codes, sex_categories = pd.factorize(titanic_df['sex'])
titanic_df['sex_encoded'] = sex_codes
print(titanic_df[['sex', 'sex_encoded']].head())
"""
      sex  sex_encoded
0    male            0
1  female            1
2  female            1
3  female            1
4    male            0
"""
# Codes follow order of first appearance: 'male' occurs first and gets 0,
# 'female' gets 1.

# To choose the labels yourself, map each category to the desired code:
# 'yes' -> 1 and 'no' -> 0.
# Series.map is used instead of Series.replace because replacing strings with
# integers relies on implicit dtype downcasting, which pandas has deprecated.
# Unlike replace, map turns any value missing from the dict into NaN
# (this assumes 'alive' only holds 'yes'/'no', as the outputs above suggest).
titanic_df['alive_encoded'] = titanic_df['alive'].map({'yes': 1, 'no': 0})


### get_dummies(One-Hot)

In [None]:
# One-Hot Encoding for 'embark_town'
# get_dummies creates one binary column (True/False, i.e. 1/0) per category
# of 'embark_town'; this suits categories that have no inherent order.
embark_town_column = titanic_df['embark_town']
encoded_df = pd.get_dummies(embark_town_column, prefix='town')

# Attach the indicator columns to the right of the original frame (axis=1 = columns).
titanic_df = pd.concat(
    [titanic_df, encoded_df],
    axis=1,
)
print(titanic_df.head())
"""
   survived  pclass     sex  ...  town_Cherbourg  town_Queenstown  town_Southampton
0         0       3    male  ...           False            False              True
1         1       1  female  ...            True            False             False
2         1       3  female  ...           False            False              True
3         1       1  female  ...           False            False              True
4         0       3    male  ...           False            False              True
"""

## Scaling(numerical -> diff ranges)

### Standard Scaler(Normal distribution)

In [None]:
import numpy as np
import seaborn as sns
from sklearn.preprocessing import StandardScaler

# Load the dataset and drop rows with missing values
titanic_df = sns.load_dataset('titanic').dropna()

# Initialize the StandardScaler
std_scaler = StandardScaler()

# Fit and transform the 'age' column.
# The scaler expects 2-D input, so reshape(-1, 1) turns the 1-D column into
# a single-column 2-D array.
titanic_df['age_std'] = std_scaler.fit_transform(np.array(titanic_df['age']).reshape(-1, 1))
# The line below does the same thing; selecting with a list of labels
# ([['age']]) yields a one-column DataFrame, which is already 2-D.
titanic_df['age_std'] = std_scaler.fit_transform(titanic_df[['age']])

# Check the original and the transformed 'age' column side by side.
# Several columns must be selected with a LIST of labels: df[['a', 'b']].
# The former df['age', 'age_std'] passed a tuple as one label and raised KeyError.
print(titanic_df[['age', 'age_std']].head())
"""
     age   age_std
1   38.0  0.152082
3   35.0 -0.039875
6   54.0  1.175852
10   4.0 -2.023430
11  58.0  1.431795
"""

In [None]:
# Check if data is normally distributed?
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
from scipy.stats import shapiro, kstest, anderson

# All checks below look at the same column, so grab it once.
ages = titanic_df['age']

# Shapiro-Wilk Test (small sample)
stat, p = shapiro(ages)
print(f'Shapiro-Wilk test p-value: {p}')

# Kolmogorov-Smirnov Test (larger sample): compare against a normal
# distribution whose mean and standard deviation are estimated from the data.
normal_params = (ages.mean(), ages.std())
stat, p = kstest(ages, 'norm', args=normal_params)
print(f'Kolmogorov-Smirnov test p-value: {p}')

# Anderson-Darling Test (powerful); only the test statistic is reported here.
result = anderson(ages)
print(f'Anderson-Darling test statistic: {result.statistic}')

# Histogram with a kernel density estimate overlaid
sns.histplot(data=ages, kde=True)
plt.show()

# QQ Plot: sample quantiles against theoretical normal quantiles
stats.probplot(ages, dist="norm", plot=plt)
plt.show()


### Min-Max Scaler(No specific Shape/Distribution)归一化

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Initialize the MinMaxScaler
min_max_scaler = MinMaxScaler()

# The scaler needs 2-D input: turn the 'fare' column into an (n, 1) array,
# then fit and transform it in one step. The scaled values overwrite 'fare'.
fare_as_column_vector = titanic_df['fare'].to_numpy().reshape(-1, 1)
titanic_df['fare'] = min_max_scaler.fit_transform(fare_as_column_vector)

# Check the transformed 'fare' column
print(titanic_df['fare'].head())
"""
1     0.139136
3     0.103644
6     0.101229
10    0.032596
11    0.051822
Name: fare, dtype: float64
"""

### Robust Scaler(四分位IQR 抵御异常值影响)

In [None]:
from sklearn.preprocessing import RobustScaler

# Initialize the RobustScaler
robust_scaler = RobustScaler()

# Same 2-D reshaping as for the other scalers; the scaled values overwrite 'fare'.
fare_as_column_vector = titanic_df['fare'].to_numpy().reshape(-1, 1)
titanic_df['fare'] = robust_scaler.fit_transform(fare_as_column_vector)

# Check the transformed 'fare' column
print(titanic_df['fare'].head())
"""
1     0.236871
3    -0.064677
6    -0.085199
10   -0.668325
11   -0.504975
Name: fare, dtype: float64
"""
# Original author's observation: the values, outliers included, now appear as
# small positive and negative numbers.
## Outlier Detection

### Z-score

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns

# Load the dataset
titanic_df = sns.load_dataset('titanic')

# Absolute Z-score of each age: distance from the mean in units of the
# (sample) standard deviation. Rows with a missing age get a NaN score.
age_values = titanic_df['age']
age_deviation = age_values - age_values.mean()
titanic_df['age_zscore'] = (age_deviation / age_values.std()).abs()

# Rows whose absolute Z-score exceeds the threshold of 3 are flagged as outliers.
exceeds_threshold = titanic_df['age_zscore'] > 3
outliers_zscore = titanic_df.loc[exceeds_threshold]
print("Outliers detected by the Z-score method:")
print(outliers_zscore)
"""
     survived  pclass   sex   age  ...  embark_town  alive  alone age_zscore
630         1       1  male  80.0  ...  Southampton    yes   True   3.462699
851         0       3  male  74.0  ...  Southampton     no   True   3.049660

[2 rows x 16 columns]
"""

### IQR(25%~75%)

In [None]:
# Interquartile range: the spread of the middle 50% of the ages.
age_series = titanic_df['age']
Q1 = age_series.quantile(0.25)
Q3 = age_series.quantile(0.75)
IQR = Q3 - Q1

# Fences: anything further than 1.5 * IQR outside the quartiles is an outlier.
iqr_margin = 1.5 * IQR
lower_bound = Q1 - iqr_margin
upper_bound = Q3 + iqr_margin

# Select the rows below the lower fence or above the upper fence.
# Rows with a missing age satisfy neither comparison and are not selected.
below_lower = titanic_df['age'] < lower_bound
above_upper = titanic_df['age'] > upper_bound
outliers_iqr = titanic_df[below_lower | above_upper]
print(outliers_iqr)
"""
     survived  pclass   sex   age  ...  embark_town  alive  alone age_zscore
33          0       2  male  66.0  ...  Southampton     no   True   2.498943
54          0       1  male  65.0  ...    Cherbourg     no  False   2.430103
96          0       1  male  71.0  ...    Cherbourg     no   True   2.843141
116         0       3  male  70.5  ...   Queenstown     no   True   2.808721
280         0       3  male  65.0  ...   Queenstown     no   True   2.430103
456         0       1  male  65.0  ...  Southampton     no   True   2.430103
493         0       1  male  71.0  ...    Cherbourg     no   True   2.843141
630         1       1  male  80.0  ...  Southampton    yes   True   3.462699
672         0       2  male  70.0  ...  Southampton     no   True   2.774301
745         0       1  male  70.0  ...  Southampton     no  False   2.774301
851         0       3  male  74.0  ...  Southampton     no   True   3.049660

[11 rows x 16 columns]
"""

### Drop Or Replace?

In [None]:
# Each strategy below starts from the same unmodified titanic_df and stores
# its result under its own name. Previously the strategies were applied one
# after another to titanic_df itself: the outlier rows were dropped first, so
# every later "replace" step had no rows left to act on, and the assignments
# ran on a filtered slice of the original frame.

# Outlier masks, computed once. Rows with a missing age have a NaN Z-score and
# satisfy neither mask, so they are never treated as outliers.
is_zscore_outlier = titanic_df['age_zscore'] > 3
is_iqr_outlier = (titanic_df['age'] < lower_bound) | (titanic_df['age'] > upper_bound)

# --- Option 1: drop the outlier rows ---
# Negating the outlier mask (instead of testing `age_zscore <= 3`) keeps the
# rows with a missing age; NaN fails a `<=` test and would be dropped silently.
# Using the Z-score method
titanic_dropped_zscore = titanic_df[~is_zscore_outlier]

# Using the IQR method
titanic_dropped_iqr = titanic_df[~is_iqr_outlier]

# --- Option 2: keep the rows but replace the outlying ages ---
# using mean
titanic_mean_replaced = titanic_df.copy()
titanic_mean_replaced.loc[is_zscore_outlier, 'age'] = titanic_df['age'].mean()

# using median
# Handle outliers in the 'age' column detected by the Z-score method by replacing them with median.
# .copy() makes titanic_clean an independent frame rather than a second name
# for titanic_df.
titanic_clean = titanic_df.copy()
titanic_clean.loc[is_zscore_outlier, 'age'] = titanic_df['age'].median()

# Print cleaned data
print("\nData after handling outliers detected by Z-score method:")
print(titanic_clean)

# Same median replacement, this time for the outliers found by the IQR method.
titanic_iqr_median_replaced = titanic_df.copy()
titanic_iqr_median_replaced.loc[is_iqr_outlier, 'age'] = titanic_df['age'].median()