# Numpy Basic

In [None]:
import numpy as np

# Build a plain Python list to convert
py_list = [1, 2, 3, 4, 5]

# np.array() copies the list into a 1-D NumPy array
np_array = np.array(py_list)

print(np_array) # Output: [1 2 3 4 5]

# A nested (list-of-lists) Python structure
py_list_2d = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]

# Nested lists become a 2-D array (here 3 rows x 3 columns)
np_array_2d = np.array(py_list_2d)

print(np_array_2d)
# Output:
# [[1 2 3]
#  [4 5 6]
#  [7 8 9]]

# NOTE: np_array is rebound here from the 1-D array above to a new 2 x 3 array
np_array = np.array([[1, 2, 3], [4, 5, 6]])

# print() separates its arguments with a space, hence the double space in the output
print("Dimensions: ", np_array.ndim) # Dimensions:  2
print("Shape: ", np_array.shape)     # Shape:  (2, 3)
print("Size: ", np_array.size)       # Size:  6
print("Data Type: ", np_array.dtype) # Data Type:  int64 (presumably platform-dependent; may be int32 on some builds)

# Indexing is 0-based: [0, 2] is the first row, third column
print("Indexed Value: ", np_array[0, 2]) # Indexed Value:  3

# Slicing: row 0, every column
print("Sliced Value: ", np_array[0,:]) # Sliced Value:  [1 2 3]

# Reshape to 3 rows x 2 columns; valid only because 3 * 2 equals the original size (6)
reshaped_array = np_array.reshape(3, 2)
print("Reshaped Array:\n", reshaped_array)
# Reshaped Array:
#  [[1 2]
#  [3 4]
#  [5 6]]

# Arithmetic operators on arrays of equal shape work element by element
np_array1 = np.array([1, 2, 3])
np_array2 = np.array([4, 5, 6])

# Addition
print(np_array1 + np_array2) # Output: [5 7 9]

# Subtraction
print(np_array1 - np_array2) # Output: [-3 -3 -3]

# Multiplication (element-wise, not a dot product)
print(np_array1 * np_array2) # Output: [ 4 10 18]

# Division (true division, so the result is a float array)
print(np_array1 / np_array2) # Output: [0.25 0.4  0.5 ]

# Pandas Basic

In [None]:
import pandas as pd 

data_dict = {"Name": ["John", "Anna", "Peter"],
             "Age": [28, 24, 33],
             "City": ["New York", "Los Angeles", "Berlin"]}

df = pd.DataFrame(data_dict)

print(df)

"""
    Name  Age         City
0   John   28     New York
1   Anna   24  Los Angeles
2  Peter   33       Berlin
"""
print(df.head(2))  # Print first two rows
print(df.tail(2))  # Print last two rows
print(df.shape)    # Print dimensions of the df (rows, columns): (3, 3)
print(df.columns)  # Print column labels: Index(['Name', 'Age', 'City'], dtype='object')
print(df.dtypes)   # Print data types of each column:
# Name    object
# Age      int64
# City    object
# dtype: object

### Lambda & apply function (IsYouthful)

In [None]:
df["IsYouthful"] = df["Age"].apply(lambda age: "Yes" if age < 30 else "No")
print(df)

"""
    Name  Age         City IsYouthful
0   John   28     New York        Yes
1   Anna   24  Los Angeles        Yes
2  Peter   33       Berlin         No
"""

In [None]:
df2 = pd.DataFrame({"Name": ["Megan"], "Age": [34], "City": ["San Francisco"], "IsYouthful": ["No"]})

df_concatenated = pd.concat([df, df2], ignore_index=True)

print(df_concatenated)

"""
    Name  Age           City IsYouthful
0   John   28       New York        Yes
1   Anna   24    Los Angeles        Yes
2  Peter   33         Berlin         No
3  Megan   34  San Francisco         No (reset index cause ignore_index=True)
"""

In [None]:
# Select a single column by label -> returns a Series.
# (The label must exist in df; the frame built above has 'Name', 'Age', 'City', 'IsYouthful'.)
print(df['Name'])
# Select several columns by passing a LIST of labels -> returns a DataFrame
print(df[['Name', 'Age']])

# df.iloc[row_selection, column_selection] selects by 0-based integer position
print(df.iloc[1, 0])    # second row, first column (positions are 0-based) -> 'Anna'
print(df.iloc[:2, :2])  # first two rows and first two columns

### basic Pandas|Titanic data from Seaborn lib

In [None]:
import pandas as pd
import seaborn as sns

# Load the titanic dataset into a Pandas DataFrame
titanic = sns.load_dataset('titanic')

# Look at the first 3 rows of the DataFrame
print(titanic.head(3))

"""
   survived  pclass     sex   age  ...  deck  embark_town  alive  alone
0         0       3    male  22.0  ...   NaN  Southampton     no  False
1         1       1  female  38.0  ...     C    Cherbourg    yes  False
2         1       3  female  26.0  ...   NaN  Southampton    yes   True

[3 rows x 15 columns]
"""

# Using lambda for DataFrame manipulation
# Create a new column, "IsChild", to mark the passengers who are under 18.
# 'age' contains missing values, and `NaN < 18` evaluates to False, so a plain
# "Yes" if age < 18 else "No" would silently label every passenger of unknown
# age as "No". Keep unknown ages as missing instead.
titanic["IsChild"] = titanic["age"].apply(
    lambda age: pd.NA if pd.isna(age) else ("Yes" if age < 18 else "No")
)
print("\nDataFrame after adding the 'IsChild' column:")
print(titanic.head(5))

# less than 7，under 7, 表示＜7
# not more than 7，表示≤7

# Concatenating DataFrames
# Create a new DataFrame
new_data = pd.DataFrame({"survived": [1],
                         "pclass": [3],
                         "sex": ["male"],
                         "age": [32],
                         "sibsp": [0],
                         "parch": [0],
                         "fare": [7.75],
                         "embarked": ["Q"],
                         "class": ["Third"],
                         "who": ["man"],
                         "adult_male": [True],
                         "deck": [None],
                         "embark_town": ["Queenstown"],
                         "alive": ["yes"],
                         "alone": [True],
                         "IsChild": ["No"]})
# Drop columns with all-NA values
# To Handle the Future warning of concat function (since it will not support NA)
new_data = new_data.dropna(axis=1, how='all')
# Concatenate the new data to the original DataFrame
titanic_concat = pd.concat([titanic, new_data], ignore_index=True)
print("\nConcatenated DataFrame:")
print(titanic_concat.tail())


# descriptive statistical analysis

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns

# Load Titanic dataset
titanic_df = sns.load_dataset('titanic')

mean_age = titanic_df['age'].mean()
median_age = titanic_df['age'].median()
mode_age = titanic_df['age'].mode()[0] #因为返回的是一个Series所以需要index[0]

print(f"Mean age: {mean_age}") # Mean age: 29.69911764705882
print(f"Median age: {median_age}") # Median age: 28.0
print(f"Mode age: {mode_age}") # Mode age: 24.0
# Standard deviation
std_dev_age = np.std(titanic_df['age'])

print(f"Standard deviation of age: {std_dev_age}") # Standard deviation of age: 14.516321150817316

In [None]:
# Create a Series in which 4 is the most frequent value
data = pd.Series([1, 2, 2, 3, 4, 4, 4, 5])

# Mode: Series.mode() returns a Series rather than a single number,
# because several values can be tied for the highest count; in that case
# every tied value is returned. Index with [0] to take the first mode.
mode_value = data.mode()
print(mode_value)  # Output: 0    4
                   #         dtype: int64

### 四分位，百分位

In [None]:
# Quartiles and percentiles
# Using Numpy
Q1_age_np = np.percentile(titanic_df['age'].dropna(), 25) # dropna is being used to drop NA values
Q3_age_np = np.percentile(titanic_df['age'].dropna(), 75) # Numpy的Percentile需要DropNA一下

print(f"First quartile of age (Numpy): {Q1_age_np}")
print(f"Third quartile of age (Numpy): {Q3_age_np}")

# Output:
# First quartile of age (Numpy): 20.125 ->有25%的人的年龄低于20.125岁
# Third quartile of age (Numpy): 38.0   ->有75%的人的年龄低于38岁

# Using Pandas
Q1_age_pd = titanic_df['age'].quantile(0.25)
Q3_age_pd = titanic_df['age'].quantile(0.75) # pandas的Quantile不需要DropNA

print(f"First quartile of age (Pandas): {Q1_age_pd}")
print(f"Third quartile of age (Pandas): {Q3_age_pd}")

# Output:
# First quartile of age (Pandas): 20.125
# Third quartile of age (Pandas): 38.0

# Filtering and Sorting(Pandas)

In [None]:
import seaborn as sns
import pandas as pd

# Load dataset
titanic_df = sns.load_dataset('titanic')

# Filter passengers who survived
survivors = titanic_df[titanic_df['survived'] == 1]
print(survivors.head())

"""
   survived  pclass     sex   age  ...  deck  embark_town  alive  alone
1         1       1  female  38.0  ...     C    Cherbourg    yes  False
2         1       3  female  26.0  ...   NaN  Southampton    yes   True
3         1       1  female  35.0  ...     C  Southampton    yes  False
8         1       3  female  27.0  ...   NaN  Southampton    yes  False
9         1       2  female  14.0  ...   NaN    Cherbourg    yes  False

[5 rows x 15 columns]
"""

In [None]:
# Sort survivors by age
sorted_df = survivors.sort_values('age') # ascending 升序
print(sorted_df.head())

"""
     survived  pclass     sex   age  ...  deck  embark_town  alive  alone
803         1       3    male  0.42  ...   NaN    Cherbourg    yes  False
755         1       2    male  0.67  ...   NaN  Southampton    yes  False
644         1       3  female  0.75  ...   NaN    Cherbourg    yes  False
469         1       3  female  0.75  ...   NaN    Cherbourg    yes  False
831         1       2    male  0.83  ...   NaN  Southampton    yes  False

[5 rows x 15 columns]
"""

In [None]:

# Sort survivors by class and age
sorted_df = survivors.sort_values(['pclass', 'age'], ascending=[False, True])
print(sorted_df.head())

"""
     survived  pclass     sex   age  ...  deck  embark_town  alive  alone
803         1       3    male  0.42  ...   NaN    Cherbourg    yes  False
469         1       3  female  0.75  ...   NaN    Cherbourg    yes  False
644         1       3  female  0.75  ...   NaN    Cherbourg    yes  False
172         1       3  female  1.00  ...   NaN  Southampton    yes  False
381         1       3  female  1.00  ...   NaN    Cherbourg    yes  False

[5 rows x 15 columns]
"""

In [None]:
# Filter female passengers who survived
female_survivors = titanic_df[
    (titanic_df['survived'] == 1) & (titanic_df['sex'] == 'female')
]
print(female_survivors.head())

"""
   survived  pclass     sex   age  ...  deck  embark_town  alive  alone
1         1       1  female  38.0  ...     C    Cherbourg    yes  False
2         1       3  female  26.0  ...   NaN  Southampton    yes   True
3         1       1  female  35.0  ...     C  Southampton    yes  False
8         1       3  female  27.0  ...   NaN  Southampton    yes  False
9         1       2  female  14.0  ...   NaN    Cherbourg    yes  False

[5 rows x 15 columns]
"""

# Data Cleaning and Preprocessing

## Missing Data Handling

### Detection

In [None]:
import seaborn as sns

# Load the dataset
titanic_df = sns.load_dataset('titanic')

# Detect missing values 
missing_values = titanic_df.isnull() # return boolean dataframe, same size as original data
print(missing_values.head(10))
"""
   survived  pclass    sex    age  ...   deck  embark_town  alive  alone
0     False   False  False  False  ...   True        False  False  False
1     False   False  False  False  ...  False        False  False  False
2     False   False  False  False  ...   True        False  False  False
3     False   False  False  False  ...  False        False  False  False
4     False   False  False  False  ...   True        False  False  False
5     False   False  False   True  ...   True        False  False  False
6     False   False  False  False  ...  False        False  False  False
7     False   False  False  False  ...   True        False  False  False
8     False   False  False  False  ...   True        False  False  False
9     False   False  False  False  ...   True        False  False  False

[10 rows x 15 columns]
"""
missing_values_count = titanic_df.isnull().sum()
print(missing_values_count)
"""
survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64
"""

import matplotlib.pyplot as plt

# Detected missing values visualized
plt.figure(figsize=(10,6))
sns.heatmap(titanic_df.isnull(), cmap='viridis')
plt.show()

### Drop

In [None]:
# Drop的前提是缺失数据少，不影响我们的计算
# Copy the original dataset
titanic_df_copy = titanic_df.copy()

# Drop rows with missing values
titanic_df_copy.dropna(inplace=True) # titanic_df_copy is now modified
titanic_df_copy = titanic_df.dropna(inplace=False) # Original df remains the same; you get new_df with modifications

# Check the dataframe
print(titanic_df_copy.isnull().sum())
# There will be no missing values in every column

print(titanic_df.shape)
# TODO: Clean the dataset to remove rows with missing 'age' data
titanic_df['age_missing'] = titanic_df['age'].isnull()
print(titanic_df['age_missing'].unique()) 

missing_values_count = titanic_df['age'].isnull().sum()
print(missing_values_count)

titanic_df_removed = titanic_df[titanic_df['age_missing'] == False]
print(titanic_df_removed.shape)

### Imputation

In [None]:
# Imputation keeps the dataset size unchanged: missing entries are replaced
# with a summary value (mean, median, mode, ...) instead of dropping rows.
# Impute missing values using mean.
# Assign the result back to the column rather than calling
# fillna(..., inplace=True) on titanic_df['age']: an in-place fill on a selected
# column is chained assignment, which pandas deprecates and which leaves the
# DataFrame unchanged under Copy-on-Write.
titanic_df['age'] = titanic_df['age'].fillna(titanic_df['age'].mean())

# Check the dataframe
print(titanic_df.isnull().sum())
"""
survived         0
pclass           0
sex              0
age              0
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64
"""
# Impute missing values using backward fill (each NaN takes the next valid value).
# Series.bfill() replaces the deprecated fillna(method='bfill'); the result is
# assigned back instead of filling in place on a column selection.
# NOTE: 'age' was already mean-imputed earlier in this cell, so no NaNs are left
# for bfill to fill here; run it on freshly loaded data to see its effect.
titanic_df['age'] = titanic_df['age'].bfill()

# Check the dataframe
print(titanic_df.isnull().sum())
# The output is the same as in the previous example

## Categorical Data Encoding 类型特征编码

In [None]:
import pandas as pd
import seaborn as sns

# Load Titanic dataset
titanic_df = sns.load_dataset('titanic')
# Display unique categories in 'sex' and 'embark_town'
print(titanic_df['sex'].unique()) # Output: ['male' 'female']
print(titanic_df['embark_town'].unique()) # Output: ['Southampton' 'Cherbourg' 'Queenstown' nan]

### factorize(Label)

In [None]:
# Label Encoding for 'sex'
titanic_df['sex_encoded'] = pd.factorize(titanic_df['sex'])[0]
# factorize返回2个item，index0是numerical encoded label，index1是unique value array
# 下面显示了index0
print(titanic_df[['sex', 'sex_encoded']].head())
"""
      sex  sex_encoded
0    male            0
1  female            1
2  female            1
3  female            1
4    male            0
"""
# 因为male最先出现，所以被factorize函数label为0，female被label为1

# To choose the numeric labels yourself, map each category explicitly:
# 'yes' -> 1 and 'no' -> 0.
# Series.map is used instead of replace(): replacing strings with ints relies on
# silent object -> int downcasting, which recent pandas versions deprecate.
titanic_df['alive_encoded'] = titanic_df['alive'].map({'yes': 1, 'no': 0})


### get_dummies(One-Hot)

In [None]:
# One-Hot Encoding for 'embark_town'
encoded_df = pd.get_dummies(titanic_df['embark_town'], prefix='town')
# get_dummies为变量embark_town的每个类别，创建了一个binary column(TF=10)，适用于没有排序性的类别

titanic_df = pd.concat([titanic_df, encoded_df], axis=1)
print(titanic_df.head())
"""
   survived  pclass     sex  ...  town_Cherbourg  town_Queenstown  town_Southampton
0         0       3    male  ...           False            False              True
1         1       1  female  ...            True            False             False
2         1       3  female  ...           False            False              True
3         1       1  female  ...           False            False              True
4         0       3    male  ...           False            False              True
"""

## Scaling(numerical -> diff ranges)

### Standard Scaler(Normal distribution)

In [None]:
import numpy as np
import seaborn as sns
from sklearn.preprocessing import StandardScaler

# Load the dataset and drop rows with missing values
titanic_df = sns.load_dataset('titanic').dropna()

# Initialize the StandardScaler
std_scaler = StandardScaler()

# Fit and transform the 'age' column.
# Scalers expect 2-D input of shape (n_samples, n_features). Selecting with a
# list, titanic_df[['age']], yields a one-column DataFrame, which is equivalent
# to np.array(titanic_df['age']).reshape(-1, 1).
# ravel() flattens the (n, 1) result back to 1-D before storing it as a column.
titanic_df['age_std'] = std_scaler.fit_transform(titanic_df[['age']]).ravel()

# Compare the original and the standardized column.
# Selecting several columns needs a list of labels: df[['a', 'b']].
# df['a', 'b'] looks up the single tuple key ('a', 'b') and raises KeyError.
print(titanic_df[['age', 'age_std']].head())
"""
     age   age_std
1   38.0  0.152082
3   35.0 -0.039875
6   54.0  1.175852
10   4.0 -2.023430
11  58.0  1.431795
"""

In [None]:
# Check if data is normally distributed?
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
from scipy.stats import shapiro, kstest, anderson

# Shapiro-Wilk Test(small sample)
stat, p = shapiro(titanic_df['age'])
print(f'Shapiro-Wilk test p-value: {p}')

# Kolmogorov-Smirnov Test(larger sample)
stat, p = kstest(titanic_df['age'], 'norm', args=(titanic_df['age'].mean(), titanic_df['age'].std()))
print(f'Kolmogorov-Smirnov test p-value: {p}')

# Anderson-Darling Test(powerful)
result = anderson(titanic_df['age'])
print(f'Anderson-Darling test statistic: {result.statistic}')

# Histogram
sns.histplot(data=titanic_df['age'], kde=True)
plt.show()

# QQ Plot
stats.probplot(titanic_df['age'], dist="norm", plot=plt)
plt.show()


### Min-Max Scaler(No specific Shape/Distribution)归一化

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Initialize the MinMaxScaler
min_max_scaler = MinMaxScaler()

# Fit and transform the 'fare' column
titanic_df['fare'] = min_max_scaler.fit_transform(np.array(titanic_df['fare']).reshape(-1, 1))

# Check the transformed 'fare' column
print(titanic_df['fare'].head())
"""
1     0.139136
3     0.103644
6     0.101229
10    0.032596
11    0.051822
Name: fare, dtype: float64
"""

### Robust Scaler(四分位IQR 抵御异常值影响)

In [None]:
from sklearn.preprocessing import RobustScaler

# Initialize the RobustScaler
robust_scaler = RobustScaler()

# Fit and transform the 'fare' column
titanic_df['fare'] = robust_scaler.fit_transform(np.array(titanic_df['fare']).reshape(-1, 1))

# Check the transformed 'fare' column
print(titanic_df['fare'].head())
"""
1     0.236871
3    -0.064677
6    -0.085199
10   -0.668325
11   -0.504975
Name: fare, dtype: float64
"""
# 异常值变成了small positive and nagative values

## Outlier Detection

### Z-score

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns

# Load the dataset
titanic_df = sns.load_dataset('titanic')

# Calculate Z-scores
titanic_df['age_zscore'] = np.abs((titanic_df.age - titanic_df.age.mean()) / titanic_df.age.std())

# Get rows of outliers according to the Z-score method (using a threshold of 3)
outliers_zscore = titanic_df[(titanic_df['age_zscore'] > 3)]
print("Outliers detected by the Z-score method:")
print(outliers_zscore)
"""
     survived  pclass   sex   age  ...  embark_town  alive  alone age_zscore
630         1       1  male  80.0  ...  Southampton    yes   True   3.462699
851         0       3  male  74.0  ...  Southampton     no   True   3.049660

[2 rows x 16 columns]
"""

### IQR(25%~75%)

In [None]:
# Calculate IQR
Q1 = titanic_df['age'].quantile(0.25)
Q3 = titanic_df['age'].quantile(0.75)
IQR = Q3 - Q1

# Define Bounds
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Get rows of outliers according to IQR method
outliers_iqr = titanic_df[(titanic_df['age'] < lower_bound) | (titanic_df['age'] > upper_bound)]
print(outliers_iqr)
"""
     survived  pclass   sex   age  ...  embark_town  alive  alone age_zscore
33          0       2  male  66.0  ...  Southampton     no   True   2.498943
54          0       1  male  65.0  ...    Cherbourg     no  False   2.430103
96          0       1  male  71.0  ...    Cherbourg     no   True   2.843141
116         0       3  male  70.5  ...   Queenstown     no   True   2.808721
280         0       3  male  65.0  ...   Queenstown     no   True   2.430103
456         0       1  male  65.0  ...  Southampton     no   True   2.430103
493         0       1  male  71.0  ...    Cherbourg     no   True   2.843141
630         1       1  male  80.0  ...  Southampton    yes   True   3.462699
672         0       2  male  70.0  ...  Southampton     no   True   2.774301
745         0       1  male  70.0  ...  Southampton     no  False   2.774301
851         0       3  male  74.0  ...  Southampton     no   True   3.049660

[11 rows x 16 columns]
"""

### Drop Or Replace?

In [None]:
# The strategies below are ALTERNATIVES. Each one starts from the same input
# (titanic_df with its 'age_zscore' column, plus lower_bound / upper_bound from
# the IQR cell) and writes to its own copy. Applying them one after another on
# the same frame would make the later ones no-ops, because the outliers would
# already be gone.

# Boolean masks of the rows flagged by each detection method.
# Comparisons against NaN are False, so rows with a missing age are never
# flagged as outliers (and are therefore not dropped by accident).
is_outlier_zscore = titanic_df['age_zscore'] > 3
is_outlier_iqr = (titanic_df['age'] < lower_bound) | (titanic_df['age'] > upper_bound)

# --- Option 1: drop the outlier rows ---
# Using the Z-score method
titanic_dropped_zscore = titanic_df[~is_outlier_zscore].copy()

# Using the IQR method
titanic_dropped_iqr = titanic_df[~is_outlier_iqr].copy()

# --- Option 2: replace the outlier values ---
# using mean
titanic_mean_replaced = titanic_df.copy()
titanic_mean_replaced.loc[is_outlier_zscore, 'age'] = titanic_df['age'].mean()

# using median
# Handle outliers in the 'age' column detected by the Z-score method by replacing them with median.
# .copy() gives an independent frame; plain assignment would only alias titanic_df.
titanic_clean = titanic_df.copy()
titanic_clean.loc[is_outlier_zscore, 'age'] = titanic_df['age'].median()

# Print cleaned data
print("\nData after handling outliers detected by Z-score method:")
print(titanic_clean)

# Same median replacement, for the outliers detected by the IQR method
titanic_iqr_replaced = titanic_df.copy()
titanic_iqr_replaced.loc[is_outlier_iqr, 'age'] = titanic_df['age'].median()

## Correlation Coefficient

In [None]:
import seaborn as sns

# Load the dataset
titanic_df = sns.load_dataset('titanic')

# Calculate and print the correlation matrix
corr_matrix = titanic_df.corr(numeric_only=True)
# corr function returns a DataFrame with the correlation coefficients between all pairs of numeric columns in titanic_df.

print(corr_matrix)
"""
            survived    pclass       age  ...      fare  adult_male     alone
survived    1.000000 -0.338481 -0.077221  ...  0.257307   -0.557080 -0.203367
pclass     -0.338481  1.000000 -0.369226  ... -0.549500    0.094035  0.135207
age        -0.077221 -0.369226  1.000000  ...  0.096067    0.280328  0.198270
sibsp      -0.035322  0.083081 -0.308247  ...  0.159651   -0.253586 -0.584471
parch       0.081629  0.018443 -0.189119  ...  0.216225   -0.349943 -0.583398
fare        0.257307 -0.549500  0.096067  ...  1.000000   -0.182024 -0.271832
adult_male -0.557080  0.094035  0.280328  ... -0.182024    1.000000  0.404744
alone      -0.203367  0.135207  0.198270  ... -0.271832    0.404744  1.000000

[8 rows x 8 columns]
"""
# fare vs. pclass is about -0.55: as the passenger class number decreases
# (3rd class -> 1st class), the ticket fare tends to increase.
# Look up the same pair the comment describes.
print(corr_matrix.loc['fare', 'pclass'])

# If 'fare' and 'pclass' are highly correlated
clean_df = titanic_df.drop('fare', axis=1)
# dropping the fare column. The axis=1 parameter indicates that we want to drop a column (for dropping a row, we would have used axis=0). 

### Visualization

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset
titanic_df = sns.load_dataset('titanic')

# Calculate the correlation matrix
corr_matrix = titanic_df.corr(numeric_only=True)

# Create a heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap='coolwarm')

# Add a title
plt.title('Heatmap of the Correlation Matrix')

# Show the plot
plt.show()


## Feature Engineering
Feature engineering derives new columns that let us test hypotheses such as: larger families might have a lower chance of survival due to difficulties keeping the family together during the sinking, or certain age groups might have a higher or lower survival rate.

In [None]:
# family_size, by adding sibsp (number of siblings/spouses aboard) and parch (number of parents/children aboard)
# Load the data
import seaborn as sns

titanic_df = sns.load_dataset('titanic')

# Create a new feature, 'family_size'
titanic_df['family_size'] = titanic_df['sibsp'] + titanic_df['parch'] + 1 # plus one (the passenger themself)
print(titanic_df.head())
"""
   survived  pclass     sex   age  ...  embark_town  alive  alone family_size
0         0       3    male  22.0  ...  Southampton     no  False           2
1         1       1  female  38.0  ...    Cherbourg    yes  False           2
2         1       3  female  26.0  ...  Southampton    yes   True           1
3         1       1  female  35.0  ...  Southampton    yes  False           2
4         0       3    male  35.0  ...  Southampton     no   True           1

[5 rows x 16 columns]
"""

# Create a new feature, 'is_alone': 1 when family_size is 1 (the passenger
# themself only), 0 when family_size > 1.
# This must operate on titanic_df, the frame that 'family_size' was added to
# earlier in this cell.
titanic_df['is_alone'] = (titanic_df['family_size'] <= 1).astype(int)
# Print the first 5 rows of the dataset
print(titanic_df[['sibsp', 'parch', 'family_size', 'is_alone']].head())


### Continuous -> categorical(Pandas cut())

In [None]:
# Import pandas
import pandas as pd

# Define the bin edges
age_bins = [0, 12, 18, 30, 45, 100]

# Define the bin labels
age_labels = ['Child', 'Teenager', 'Young Adult', 'Middle Age', 'Senior']

# 标签一定比bin的edge少一个！！！Bin labels must be one fewer than the number of bin edges

# Create the age group feature
titanic_df['age_group'] = pd.cut(titanic_df['age'], bins=age_bins, labels=age_labels)

# Show the first few rows of the data
print(titanic_df.head())
"""
   survived  pclass     sex   age  ...  alive  alone  family_size    age_group
0         0       3    male  22.0  ...     no  False            2  Young Adult
1         1       1  female  38.0  ...    yes  False            2   Middle Age
2         1       3  female  26.0  ...    yes   True            1  Young Adult
3         1       1  female  35.0  ...    yes  False            2   Middle Age
4         0       3    male  35.0  ...     no   True            1   Middle Age

[5 rows x 17 columns]
"""
# Check the distribution of the 'age_group' column
print(titanic_df['age_group'].value_counts())
"""
age_group
Young Adult    270
Middle Age     202
Senior         103
Teenager        70
Child           69
Name: count, dtype: int64
"""

# Numpy,Pandas Advanced

In [None]:
from sklearn.datasets import fetch_california_housing
import pandas as pd

dataset = fetch_california_housing()

# Two-step construction: features first, then the target as an extra column
df = pd.DataFrame(data=dataset.data, columns=dataset.feature_names)
df["MedHouseValue"] = dataset.target
# The two lines above can also be written as a single statement.
# np.c_ stacks its arguments as columns (np.r_ concatenates along rows), so the
# target becomes the last column next to the features.
# The loaded object is named `dataset`, so that is the name that must be used.
df = pd.DataFrame(data=np.c_[dataset.data, dataset.target],
                  columns=list(dataset.feature_names) + ['MedHouseValue'])
print(np.c_[np.array([1,2,3]),np.array([4,5,6])])
"""
[[1 4]
 [2 5]
 [3 6]]
 """

print(np.r_[np.array([1,2,3]),np.array([4,5,6])])
"""
[1 2 3 4 5 6]
"""

print(df.head())
"""
   MedInc  HouseAge  AveRooms  ...  Latitude  Longitude  MedHouseValue
0  8.3252      41.0  6.984127  ...     37.88    -122.23          4.526
1  8.3014      21.0  6.238137  ...     37.86    -122.22          3.585
2  7.2574      52.0  8.288136  ...     37.85    -122.24          3.521
3  5.6431      52.0  5.817352  ...     37.85    -122.25          3.413
4  3.8462      52.0  6.281853  ...     37.85    -122.25          3.422

[5 rows x 9 columns]
"""

print("Size of the dataframe: ", df.shape) # Output: (20640, 9)
print("\nStatistical Summary for the dataset:")
print(df.describe())
"""
             MedInc      HouseAge  ...     Longitude  MedHouseValue
count  20640.000000  20640.000000  ...  20640.000000   20640.000000
mean       3.870671     28.639486  ...   -119.569704       2.068558
std        1.899822     12.585558  ...      2.003532       1.153956
min        0.499900      1.000000  ...   -124.350000       0.149990
25%        2.563400     18.000000  ...   -121.800000       1.196000
50%        3.534800     29.000000  ...   -118.490000       1.797000
75%        4.743250     37.000000  ...   -118.010000       2.647250
max       15.000100     52.000000  ...   -114.310000       5.000010

[8 rows x 9 columns]
"""

MedInc: This is the median income for households within a block (scaled and capped at 15 for higher median 
incomes and at 0.5 for lower median incomes).

HouseAge: This is the median house age within a block.

AveRooms: This is the average number of rooms in the houses within a block.

AveBedrms: This is the average number of bedrooms in the houses within a block.

Population: This is the total population within a block.

AveOccup: This is the average house occupancy, computed as the total population within a block divided by the number of households.

Latitude and Longitude: These are the geographic coordinates of the block groups.

MedHouseValue: This is the median house value for households within a block (measured in 100,000s).

In [None]:
print("\nChecking for missing values in the dataset:")
print(df.isnull().sum())
"""
MedInc           0
HouseAge         0
AveRooms         0
AveBedrms        0
Population       0
AveOccup         0
Latitude         0
Longitude        0
MedHouseValue    0
dtype: int64
"""

## Matrix

### +，-，*，/

In [None]:
import numpy as np
# Example of a random 3 x 3 integer matrix with entries in [0, 255).
# It is not used by the rest of this cell.
matrix_a = np.random.randint(255, size=(3, 3))

A = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
print(A)
"""
[[1 2 3]
 [4 5 6]
 [7 8 9]]
"""
B = np.array([[9, 8, 7], [6, 5, 4], [3, 2, 1]])
print(B)
"""
[[9 8 7]
 [6 5 4]
 [3 2 1]]
"""

# Element-wise addition
C = A + B
print(C)
"""
[[10 10 10]
 [10 10 10]
 [10 10 10]]
"""
# Element-wise subtraction
D = A - B
print(D)
"""
[[-8 -6 -4]
 [-2  0  2]
 [ 4  6  8]]
"""

E = A * B
print(E)
# Careful: E_ij = A_ij * B_ij -> element-wise multiplication!
# This is NOT the matrix product from linear algebra.
"""
[[ 9 16 21]
 [24 25 24]
 [21 16  9]]
"""
# Matrix multiplication with @
F = A @ B

# Matrix multiplication with np.dot()
G = np.dot(A, B)

# Both spellings compute the same matrix product
print(np.array_equal(F, G))  # True

# NOTE: F is reused from here on and now holds the inverse of E.
F = np.linalg.inv(E)  # Finding the inverse of matrix E
# Only invertible (non-singular) matrices have an inverse
print(F)
"""
[[-0.73611111  0.88888889 -0.65277778]
 [ 1.33333333 -1.66666667  1.33333333]
 [-0.65277778  0.88888889 -0.73611111]]
"""

# E times its inverse is the identity matrix, but only up to floating-point
# rounding: the printed off-diagonal entries may be tiny values instead of 0.
print(np.dot(E, F))
"""
[[1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]]
"""
# Compare with a tolerance rather than exact equality
print(np.allclose(np.dot(E, F), np.eye(3)))  # True

### Singular and Degenerate 奇异矩阵和退化矩阵只能求伪逆

In [None]:
# A is singular: its rows are linearly dependent (row 1 + row 3 == 2 * row 2),
# so it has no true inverse.
# NOTE(review): depending on floating-point rounding, np.linalg.inv(A) may raise
# LinAlgError or may return huge, meaningless values instead — confirm on your setup.
# The Moore-Penrose pseudo-inverse is defined even for singular matrices.
FP = np.linalg.pinv(A)  # Finding the pseudo-inverse matrix of A
print(FP)
"""
[[-6.38888889e-01 -1.66666667e-01  3.05555556e-01]
 [-5.55555556e-02  1.26893721e-16  5.55555556e-02]
 [ 5.27777778e-01  1.66666667e-01 -1.94444444e-01]]
"""

### Transpose转置

In [None]:
G = np.transpose(A)
print(G)
"""
[[1 4 7]
 [2 5 8]
 [3 6 9]]
"""

H = A.T
print(H)
"""
[[1 4 7]
 [2 5 8]
 [3 6 9]]
"""

### Stack堆叠

In [None]:
# Assuming these are two features from our dataset
feature_1 = np.array([[123], [456], [789]])
feature_2 = np.array([[321], [654], [987]])

# Combine the two features into one matrix
data_features = np.hstack((feature_1, feature_2))
print(data_features)
"""
[[123 321]
 [456 654]
 [789 987]]
"""

### Normalize 正规化

In [None]:
normalized_data_features = data_features / np.linalg.norm(data_features) # L2 norm
print(normalized_data_features)
"""
[[0.08022761 0.2093745 ]
 [0.2974292  0.42657609]
 [0.51463079 0.64377768]]
"""

# 确保正规化后的值在0和1之间
normalized_data_features_minmax = (data_features - np.min(data_features)) / (np.max(data_features) - np.min(data_features))
print(normalized_data_features_minmax)
"""
[[0.         0.22916667]
 [0.38541667 0.61458333]
 [0.77083333 1.        ]]
"""

### weights权重点乘

In [None]:
# Simulate the weights of features
weights = np.array([0.4, 0.6])

# Calculate the weighted sum of features
weighted_sum_features = np.dot(data_features, weights)
print(weighted_sum_features)
# Output: [241.8 574.8 907.8]

## GroupBy & Apply

### split

In [None]:
import pandas as pd

# Create a simple dataframe
data = {'Company': ['GOOG', 'GOOG', 'MSFT', 'MSFT', 'FB', 'FB'],
       'Person': ['Sam', 'Charlie', 'Amy', 'Vanessa', 'Carl', 'Sarah'],
       'Sales': [200, 120, 340, 124, 243, 350]}
df = pd.DataFrame(data)

# Apply groupby
df_grouped = df.groupby('Company')
# Note: groupby() does not return a DataFrame. It returns a GroupBy object that
# exposes aggregation methods such as sum(), max(), mean(), ...
# sum() adds the numeric 'Sales' and concatenates the 'Person' strings.
print(df_grouped.sum())
"""
             Person  Sales
Company
FB        CarlSarah    593
GOOG     SamCharlie    320
MSFT     AmyVanessa    464
"""

# Iterating over a GroupBy object yields (group key, sub-DataFrame) pairs,
# so the group can be printed directly without calling get_group() again.
for key, group in df_grouped:
    print("\nGroup Key: {}".format(key))
    print(group, "\n")
"""
Group Key: FB
  Company Person  Sales
4      FB   Carl    243
5      FB  Sarah    350


Group Key: GOOG
  Company   Person  Sales
0    GOOG      Sam    200
1    GOOG  Charlie    120


Group Key: MSFT
  Company   Person  Sales
2    MSFT      Amy    340
3    MSFT  Vanessa    124
"""

# Highest sale per company: select the column, then aggregate.
# This is the built-in equivalent of apply(lambda x: x['Sales'].max()).
print(df_grouped['Sales'].max())
"""
Company
FB      350
GOOG    200
MSFT    340
Name: Sales, dtype: int64
"""

### apply

In [None]:
import numpy as np
import pandas as pd

# Create a dataframe
df = pd.DataFrame({'A': ['foo', 'bar', 'foo', 'bar','foo', 'bar', 'foo', 'foo'],
                   'B': ['one', 'one', 'two', 'three','two', 'two', 'one', 'three'],
                   'C': np.random.randn(8),
                   'D': np.random.randn(8)})

# Define a function
def get_sum(row):
    """Return the sum of all values held in ``row``.

    Intended for ``DataFrame.apply(get_sum, axis=1)``, which calls it once per
    row with that row's values; any object exposing ``.sum()`` works.
    """
    total = row.sum()
    return total

# Apply the function 
df['sum'] = df[['C', 'D']].apply(get_sum, axis=1)

print(df)
"""
     A      B         C         D       sum
0  foo    one -0.343200  0.184665 -0.158535
1  bar    one  0.058870  1.835614  1.894484
2  foo    two  0.801743 -0.184409  0.617333
3  bar  three  0.935406  0.124109  1.059515
4  foo    two  0.782074  0.583470  1.365544
5  bar    two  0.138934  0.710407  0.849341
6  foo    one  0.364633  1.147963  1.512596
7  foo  three -1.364677  1.719538  0.354861
"""

### EXAMPLE

In [None]:
from sklearn.datasets import fetch_california_housing
import pandas as pd

# Fetch the dataset (as_frame=True is requested so data.data can be used as a DataFrame)
data = fetch_california_housing(as_frame=True)

# create a DataFrame
housing_df = pd.DataFrame(data=data.data, columns=data.feature_names)

# Add a 'RoomPerHousehold' feature to the dataframe
housing_df['RoomPerHousehold'] = housing_df['AveRooms'] / housing_df['AveOccup']

# Perform log transformation on the 'MedInc' feature.
# The transform is applied to housing_df (the original indexed an unrelated
# `df`, which has no 'MedInc' column) and stored in a NEW column so the raw
# 'MedInc' values remain available for the income bins defined further down,
# which are expressed on the raw scale.
housing_df['MedInc_log'] = np.log(housing_df['MedInc'])

# Group by 'RoomPerHousehold' and get the median of the log-transformed income for each group
med_house_val = housing_df.groupby('RoomPerHousehold').apply(lambda x: x['MedInc_log'].median())

# Print the result
print(med_house_val)

# Define income category (bins are on the raw 'MedInc' scale)
housing_df['income_cat'] = pd.cut(housing_df['MedInc'],
                               bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
                               labels=[1, 2, 3, 4, 5])

# Group by income category and calculate the average population.
# observed=False states the current default for categorical groupers explicitly.
average_population = housing_df.groupby('income_cat', observed=False).apply(lambda x: x['Population'].mean())

print(average_population)
"""
income_cat
1    1105.806569
2    1418.232336
3    1448.062465
4    1488.974718
5    1389.890347
dtype: float64
"""

# NOTE(review): pd.cut intervals are right-closed and exclude the lowest edge
# by default, so rows with HouseAge <= 7 fall outside every bin and get NaN
# for 'Age_cat' — confirm that dropping them from the chart is intended.
housing_df['Age_cat'] = pd.cut(housing_df['HouseAge'],
                               bins=[7, 14, 21, 28, 35, np.inf],
                               labels=[1, 2, 3, 4, 5])

# Group by Age_cat and calculate the average number of bedrooms
average_value = housing_df.groupby('Age_cat', observed=False).apply(lambda x: x['AveBedrms'].mean())

# Plot the result as a bar chart
average_value.plot(kind='bar', title='Average Number of Bedrooms by House Age Category')

## Optimization优化

### Speed up computation by vectorization向量化

In [None]:
import numpy as np
import time

# Define a large array
large_array = np.random.rand(10**6)

# Built-in sum(): iterates over the ndarray one element at a time in Python.
# time.perf_counter() is the clock intended for measuring short durations,
# and the result is printed outside the timed region so that console I/O is
# not counted as computation time.
start = time.perf_counter()
builtin_total = sum(large_array)
builtin_elapsed = time.perf_counter() - start
print("Built-in sum:", builtin_total)
print("Time to calculate the sum with Python's built-in sum():", builtin_elapsed)
# Recorded with the original time.time()-based measurement: 0.0417783260345459

# Numpy way: np.sum() is vectorized
start = time.perf_counter()
numpy_total = np.sum(large_array)
numpy_elapsed = time.perf_counter() - start
print("Numpy sum:", numpy_total)
print("Time to calculate the sum with np.sum():", numpy_elapsed)
# Recorded with the original time.time()-based measurement: 0.00037097930908203125

# Define a large array
large_array = np.random.rand(50000)

# Same comparison, timed with pandas Timestamps (the difference is a Timedelta)
# Compute the sum using Python's built-in function, time it, and print the result
python_start_time = pd.Timestamp.now()
python_sum = sum(large_array)
python_end_time = pd.Timestamp.now()
print(f"Python Sum: {python_sum}, Time Taken: {python_end_time - python_start_time}")
"""
Python Sum: 25041.96793358683, Time Taken: 0 days 00:00:00.020172
"""

# Compute the sum using Numpy's built-in function, time it, and print the result
numpy_start_time = pd.Timestamp.now()
numpy_sum = np.sum(large_array)
numpy_end_time = pd.Timestamp.now()
print(f"Numpy Sum: {numpy_sum}, Time Taken: {numpy_end_time - numpy_start_time}")
"""
Numpy Sum: 25041.967933586937, Time Taken: 0 days 00:00:00.000167
"""

### Categorical数据比int和Float节省内存

In [None]:
# NOTE(review): this cell does not build `df`; it assumes a housing-style
# frame that also has a 'Type' column — confirm which frame is expected here.
df['Type'] = pd.Categorical(df['Type'])
df['MedInc'] = df['MedInc'].astype('category')

# float32 takes less memory than float64 and int32 less than int64;
# `downcast` asks to_numeric for the smallest dtype that can hold the values.
# Downcast data type for 'AveBedrms' column
df['AveBedrms'] = pd.to_numeric(df['AveBedrms'], downcast='float')
df['Population'] = df['Population'].astype('int32')
df_new = df.astype({
    'AveBedrms': 'float32',
    'AveRooms': 'float32',
    'AveOccup': 'float32',
    'Latitude': 'float32',
    'Longitude': 'float32',
    'MedInc': 'float32',
    'Population': 'int32',
    'HouseAge': 'int32'}, 
    copy=True) # copy=True returns a new DataFrame

# Regular way: one named intermediate per step.
# dropna() returns a new frame; the filtered slice is not mutated in place,
# which avoids pandas' SettingWithCopyWarning.
df_copy = df[df['Population'] > 1000]
df_copy = df_copy.dropna()

# Optimized way
# Method chaining: several steps in a single expression.
# The result must be assigned: dropna(inplace=True) on the temporary frame
# returns None and the cleaned data would be thrown away.
df_copy = df[df['Population'] > 1000].dropna()

### EXAMPLE

In [None]:
import pandas as pd
from sklearn import datasets
import numpy as np

# Load the California Housing dataset
california = datasets.fetch_california_housing()
df = pd.DataFrame(data=np.c_[california['data'], california['target']], columns=california['feature_names'] + ['target'])

def memory_usage_pandas(df):
    """Return the memory used by ``df`` in megabytes (bytes / 1024**2).

    ``deep=True`` asks pandas to also count the contents of object columns.
    """
    # Named total_bytes so the builtin `bytes` is not shadowed
    total_bytes = df.memory_usage(deep=True).sum()
    return total_bytes / 1024**2  # Convert bytes to megabytes

original_memory = memory_usage_pandas(df)

# Optimize memory usage in Pandas
# California Housing dataset does not have any Categorical features, so we will use downcasting
df['AveBedrms'] = pd.to_numeric(df['AveBedrms'], downcast='float')
df['AveRooms'] = pd.to_numeric(df['AveRooms'], downcast='float')
optimized_memory = memory_usage_pandas(df)

print(f'Original memory usage: {original_memory} MB')
print(f'Optimized memory usage: {optimized_memory} MB')
print(f'Memory saved: {original_memory - optimized_memory} MB')

# Plain arithmetic plus round() for display; pd.to_numeric(..., downcast=...)
# is meant for converting data, not for shortening a printed scalar.
reduced_percent = (original_memory - optimized_memory) / original_memory * 100
print("The memory reduced: ", round(reduced_percent, 2), "%")


## 跨学科数据例


In [None]:
import pandas as pd

# DNA sequences for several genes
data = {
    "Gene": ["Gene A", "Gene B", "Gene C", "Gene D"],
    "Sequence": ["ATCGTACGA", "CGATCGATG", "TAGCTAG", "CGTAGCTA"],
    "Discovery_Date": pd.date_range('02/15/1935', periods=4),
    "Popularity_on_Social_Network": np.array([1250, 500, 800, 2000])
}

df_genes = pd.DataFrame(data)
print(df_genes)
"""
     Gene Discovery_Date  Popularity_on_Social_Network
0  Gene A     1935-02-15                          1250
1  Gene B     1935-02-16                           500
2  Gene C     1935-02-17                           800
3  Gene D     1935-02-18                          2000
"""
# Sequence length via the vectorized string accessor
df_genes['Length'] = df_genes['Sequence'].str.len()
print(df_genes)
"""
     Gene   Sequence  Length
0  Gene A  ATCGTACGA       9
1  Gene B  CGATCGATG       9
2  Gene C    TAGCTAG       7
3  Gene D   CGTAGCTA       8
"""

data_bio = {
    "Gene": ["Gene M", "Gene N", "Gene O", "Gene P"],
    "Sequence": ["TGCCGTA", "AATGCGT", "CGTACGT", "GGCTATG"]
}
df_bio = pd.DataFrame(data_bio)

# lowercase the DNA Sequence
df_bio['Sequence'] = df_bio['Sequence'].str.lower()

# Reverse the DNA Sequence (string slicing through the .str accessor)
df_bio['Reverse_Sequence'] = df_bio['Sequence'].str[::-1]
# NOTE: the recorded output below shows upper-case sequences, so it does not
# reflect the lower-casing step above.
"""
     Gene Sequence Reverse_Sequence
0  Gene M  TGCCGTA          ATGCCGT
1  Gene N  AATGCGT          TGCGTAA
2  Gene O  CGTACGT          TGCATGC
3  Gene P  GGCTATG          GTATCGG
     Gene Sequence Reverse_Sequence
0  Gene M  TGCCGTA          ATGCCGT
1  Gene N  AATGCGT          TGCGTAA
2  Gene O  CGTACGT          TGCATGC
3  Gene P  GGCTATG          GTATCGG
"""
# Length of each reversed sequence, taken from the 'Reverse_Sequence' column
# computed above instead of reversing every sequence a second time
df_bio['Reverse_Sequence_Length'] = df_bio['Reverse_Sequence'].str.len()

# Suppose we need to sort the data based on its popularity on the social network
df_gene_info_sorted = df_genes.sort_values(by='Popularity_on_Social_Network', ascending=False)

# Print the DataFrame after sorting
print(df_gene_info_sorted)
"""
     Gene Discovery_Date  Popularity_on_Social_Network
3  Gene D     1935-02-18                          2000
0  Gene A     1935-02-15                          1250
2  Gene C     1935-02-17                           800
1  Gene B     1935-02-16                           500
"""

In [None]:
import numpy as np

# Simulated star catalogue: four stars observed on four consecutive days
data = {
    "Star_ID": np.arange(1, 5),
    "Right_Ascension": [204.85, 63.70, 305.29, 45.2],
    "Declination": [-29.72, 38.03, -14.78, 7.8],
    "Magnitude": [2.04, 1.25, 3.17, 1.9],
    "Observation_Date": pd.date_range('01/01/2020', periods=4)
}

df_stars = pd.DataFrame(data)
print(df_stars)
"""
   Star_ID  Right_Ascension  Declination  Magnitude Observation_Date
0        1           204.85       -29.72       2.04       2020-01-01
1        2            63.70        38.03       1.25       2020-01-02
2        3           305.29       -14.78       3.17       2020-01-03
3        4            45.20         7.80       1.90       2020-01-04
"""
# Keep only observations made strictly after the cut-off date
filter_date = pd.to_datetime('2020-01-02')
observed_later = df_stars['Observation_Date'].gt(filter_date)
filtered_stars = df_stars[observed_later]
print(filtered_stars)
"""
   Star_ID  Right_Ascension  Declination  Magnitude Observation_Date
2        3           305.29       -14.78       3.17       2020-01-03
3        4            45.20         7.80       1.90       2020-01-04
"""

In [None]:
# Simulated social-network activity: friend and post counts per person
data = {
    "Person": ["Alice", "Bob", "Charlie", "Dave"],
    "Friends": [10, 5, 8, 2],
    "Posts": [100, 50, 80, 200]
}

df_social = pd.DataFrame(data)
print(df_social)
"""
    Person  Friends  Posts
0    Alice       10    100
1      Bob        5     50
2  Charlie        8     80
3     Dave        2    200
"""
# Derived feature: element-wise ratio of posts to friends
posts_per_friend = df_social['Posts'].div(df_social['Friends'])
df_social['Posts_per_Friend'] = posts_per_friend
print(df_social)
"""
    Person  Friends  Posts  Posts_per_Friend
0    Alice       10    100              10.0
1      Bob        5     50              10.0
2  Charlie        8     80              10.0
3     Dave        2    200             100.0
"""

# Model Develop and Evaluation

## Supervised 

### Exploration(Wine-quality)

In [None]:
import datasets
import pandas as pd

# Loading Dataset
# Load the 'red' and 'white' splits of the wine-quality dataset and wrap each
# one in a pandas DataFrame so the usual exploration methods are available.
red_wine = datasets.load_dataset('codesignal/wine-quality', split='red')
white_wine = datasets.load_dataset('codesignal/wine-quality', split='white')
red_wine = pd.DataFrame(red_wine)
white_wine = pd.DataFrame(white_wine)

# Checking the shape of the dataset
print("Red Wine Dataset Shape: ", red_wine.shape) # Red Wine Dataset Shape:  (1599, 12)
print("White Wine Dataset Shape: ", white_wine.shape) # White Wine Dataset Shape:  (4898, 12)

# Check Red Wine Dataset data types
print("Red Wine Dataset Data Types:")
print(red_wine.dtypes)
"""
Red Wine Dataset Data Types:
fixed acidity           float64
volatile acidity        float64
citric acid             float64
residual sugar          float64
chlorides               float64
free sulfur dioxide     float64
total sulfur dioxide    float64
density                 float64
pH                      float64
sulphates               float64
alcohol                 float64
quality                 float64
dtype: object
"""

# Check White Wine Dataset data types
print("\nWhite Wine Dataset Data Types:")
print(white_wine.dtypes)
"""
the structure is the same as in the red wine dataset
"""

# Describing Red Wine Dataset
# describe() reports count, mean, std, min, quartiles and max per column
print("Red Wine Dataset Description:")
print(red_wine.describe())
"""
Red Wine Dataset Description:
       fixed acidity  volatile acidity  ...      alcohol      quality
count    1599.000000       1599.000000  ...  1599.000000  1599.000000
mean        8.319637          0.527821  ...    10.422983     5.636023
std         1.741096          0.179060  ...     1.065668     0.807569
min         4.600000          0.120000  ...     8.400000     3.000000
25%         7.100000          0.390000  ...     9.500000     5.000000
50%         7.900000          0.520000  ...    10.200000     6.000000
75%         9.200000          0.640000  ...    11.100000     6.000000
max        15.900000          1.580000  ...    14.900000     8.000000

[8 rows x 12 columns]
"""

# Unique values
# nunique() counts the distinct values in each column
print("\nUnique values in Red Wine Dataset:")
print(red_wine.nunique())
"""
Unique values in Red Wine Dataset:
fixed acidity            96
volatile acidity        143
citric acid              80
residual sugar           91
chlorides               153
free sulfur dioxide      60
total sulfur dioxide    144
density                 436
pH                       89
sulphates                96
alcohol                  65
quality                   6
dtype: int64
"""

# Describing White Wine Dataset
print("\nWhite Wine Dataset Description:")
print(white_wine.describe())
"""
White Wine Dataset Description:
       fixed acidity  volatile acidity  ...      alcohol      quality
count    4898.000000       4898.000000  ...  4898.000000  4898.000000
mean        6.854788          0.278241  ...    10.514267     5.877909
std         0.843868          0.100795  ...     1.230621     0.885639
min         3.800000          0.080000  ...     8.000000     3.000000
25%         6.300000          0.210000  ...     9.500000     5.000000
50%         6.800000          0.260000  ...    10.400000     6.000000
75%         7.300000          0.320000  ...    11.400000     6.000000
max        14.200000          1.100000  ...    14.200000     9.000000

[8 rows x 12 columns]
"""

# Unique values
print("\nUnique values in White Wine Dataset:")
print(white_wine.nunique())
"""
Unique values in White Wine Dataset:
fixed acidity            68
volatile acidity        125
citric acid              87
residual sugar          310
chlorides               160
free sulfur dioxide     132
total sulfur dioxide    251
density                 890
pH                      103
sulphates                79
alcohol                 103
quality                   7
dtype: int64
"""
# Check missing values in Red Wine Dataset
# isnull().sum() gives the number of missing entries per column
print("Missing values in Red Wine Dataset:")
print(red_wine.isnull().sum()) # There are no null values in all columns


# Check missing values in White Wine Dataset
print("\nMissing values in White Wine Dataset:")
print(white_wine.isnull().sum()) # There are no null values in all columns

In [None]:
import matplotlib.pyplot as plt

# One quality histogram per wine type, drawn and shown in turn (red first, then white).
# alpha controls opacity: 1 is fully opaque, 0 is fully transparent.
for wine_frame, bar_color, plot_title in (
    (red_wine, 'red', 'Quality Distribution for Red Wine'),
    (white_wine, 'skyblue', 'Quality Distribution for White Wine'),
):
    plt.hist(wine_frame.quality, bins=10, color=bar_color, alpha=0.7)
    plt.xlabel('Quality')
    plt.ylabel('Count')
    plt.title(plot_title)
    plt.show()

### Gradient Descent

In [None]:
def gradient_descent(x, y, theta, alpha, iterations):
    """
    Fit linear-model parameters by batch gradient descent on the
    half mean-squared-error cost J = sum((x @ theta - y)**2) / (2*m).

    x -- input dataset, array-like or DataFrame of shape (m, n); no intercept
         column is added here, so include a column of ones in x if one is needed
    y -- target dataset; must have the same shape as np.dot(x, theta)
    theta -- initial parameters
    alpha -- learning rate
    iterations -- the number of times to execute the algorithm

    Returns a tuple (theta, theta_list, cost_list):
    theta      -- the parameters after the last update
    theta_list -- the initial theta followed by theta after every update
                  (iterations + 1 entries)
    cost_list  -- the cost at each iteration as a Python float, evaluated
                  BEFORE that iteration's update (iterations entries)

    Raises ValueError if the predictions and y have different shapes
    (for example a 1-D y combined with a column-vector theta).
    """

    # Work on plain float arrays so DataFrame/Series inputs behave like ndarrays
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    theta = np.asarray(theta, dtype=float)

    m = y.size # number of data points
    cost_list = [] # list to store the cost function value at each iteration
    theta_list = [theta] # list to store the values of theta at each iteration
    
    for i in range(iterations):
        # calculate our prediction based on our current theta
        prediction = np.dot(x, theta)

        # Without this check a (m, 1) prediction minus a (m,) target would
        # silently broadcast into an (m, m) "error" matrix.
        if prediction.shape != y.shape:
            raise ValueError(
                "shape mismatch: predictions have shape {} but y has shape {}".format(
                    prediction.shape, y.shape))
        
        # compute the error between our prediction and the actual values
        error = prediction - y
        
        # calculate the cost function and store it as a plain scalar
        cost = float(np.sum(error ** 2)) / (2 * m)
        cost_list.append(cost)
        
        # gradient step: move theta against the gradient of the cost
        theta = theta - (alpha * (1/m) * np.dot(x.T, error))
        
        # append the updated theta to the theta_list
        theta_list.append(theta)
    
    # return the final values of theta, list of all theta, and list of all costs, respectively 
    return theta, theta_list, cost_list

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split 
import matplotlib.pyplot as plt
import datasets

# Load Wine Quality Dataset
red_wine = datasets.load_dataset('codesignal/wine-quality', split='red')
red_wine = pd.DataFrame(red_wine)

# Only consider the 'alcohol' column as a predictive feature for now
x = pd.DataFrame(red_wine['alcohol'])
y = red_wine['quality']

# Splitting datasets into training and testing datasets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)
# With a fixed random_state the samples are shuffled the same way on every
# run, which makes the split (and therefore the results) reproducible.

# We set our parameters to start at 0 (one parameter per feature column)
theta = np.zeros(x_train.shape[1]).reshape(-1, 1)

# Define the number of iterations and alpha value
alpha = 0.0001
iters = 1000

# Set the default figure size BEFORE the first plot; rcParams only affects
# figures created afterwards, so setting it after plt.plot() left the first
# chart at the old size.
plt.rcParams["figure.figsize"] = (10, 6)

# Applying Gradient Descent
y_train = np.array(y_train).reshape(-1, 1)
g, theta_list, cost_list = gradient_descent(x_train, y_train, theta, alpha, iters)
# g is the final theta obtained by iterating on the training data

print(cost_list)
plt.plot(range(1, iters + 1), cost_list, color='blue')
plt.grid()
plt.xlabel('Number of iterations')
plt.ylabel('Cost (J)')
plt.title('Convergence of gradient descent')
plt.show()

y_test = np.array(y_test).reshape(-1, 1)
g_test, theta_test_list, cost_test_list = gradient_descent(x_test, y_test, g, alpha, iters)
# Start from g on the test data and watch how the cost decreases.
# NOTE(review): this call keeps updating theta on the test set, so the curve
# shows further fitting rather than a held-out evaluation of g — confirm
# that this is the intended demonstration.

plt.plot(range(1, iters + 1), cost_test_list, color='brown')
plt.grid()
plt.xlabel('Number of iterations')
plt.ylabel('Cost (J)')
plt.title('Convergence of gradient descent on the test dataset')
plt.show()

### Linear Regression

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
import pandas as pd

# Load the wine dataset
import datasets
red_wine = datasets.load_dataset('codesignal/wine-quality', split='red')
red_wine = pd.DataFrame(red_wine) 

# Select features and target variable.
# Every column except the target is used as a feature. Previously this
# selection was overwritten on the very next line by a two-column subset,
# which left it as dead code.
features = red_wine.drop('quality', axis=1)
# Alternative: train on a hand-picked subset of features instead
# features = red_wine[['fixed acidity', 'volatile acidity']]
target = red_wine['quality']

# Split the dataset into a training set and a testing set
features_train, features_test, target_train, target_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Instantiate and fit the model
model = LinearRegression()
model.fit(features_train, target_train)

# Predict the test features
predictions = model.predict(features_test)

# Evaluate the model
mse = metrics.mean_squared_error(target_test, predictions)
print('Mean Squared Error:', mse) # Mean Squared Error: 0.39002514396395416

# R-squared: share of the target's variance explained by the model
r2_score = metrics.r2_score(target_test, predictions)
print('R-squared:', r2_score) # R-squared: 0.4031803412796231

In [None]:
import matplotlib.pyplot as plt

# Scatter of the true test targets against the model's predictions
plt.scatter(target_test, predictions, color='blue')
# Dashed diagonal across the range of the targets: the ideal prediction line (zero error)
lowest, highest = target_test.min(), target_test.max()
plt.plot([lowest, highest], [lowest, highest], 'k--', lw=2)
plt.xlabel('Actual')
plt.ylabel('Predicted')
plt.title('Actual vs Predicted')
plt.show()

### Logistic Regression

In [None]:
# Import necessary libraries
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Load the wine dataset
import datasets
import pandas as pd
red_wine = datasets.load_dataset('codesignal/wine-quality', split='red')
red_wine = pd.DataFrame(red_wine)

# Convert the quality ratings to binary (Good - 1 and Not Good - 0)
red_wine['quality'] = red_wine['quality'].apply(lambda quality : 1 if quality >= 7 else 0)

# Split the dataset into features and target variable
X = red_wine.drop('quality', axis=1)
y = red_wine['quality']

# Split the dataset into a training set and a test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

# Create a Logistic Regression object
lr = LogisticRegression(max_iter=1000)

# Train the model using the training sets
lr.fit(X_train, y_train)

# Print the learned parameters
print("Model Coefficients: ", lr.coef_[0])
print("Intercept: ", lr.intercept_[0])
"""
[[-0.02641816 -3.24280912 -0.04024957  0.07795443 -1.26020881  0.02151089
  -0.01866486 -1.04040183 -2.50766981  2.00156001  0.9266963 ]]
[-1.77875604]
"""
# log-odds of the positive class = beta_0 + beta_1 * X_1 + ... + beta_n * X_n
# coef_ = beta_1, beta_2, ... , beta_n
# intercept_ = beta_0

# Make predictions on the test dataset
y_pred = lr.predict(X_test)
# Predicted probability of the positive class (label 1 = good)
y_proba = lr.predict_proba(X_test)[:, 1]

# Import metrics module for accuracy calculation
from sklearn import metrics

# Model accuracy
print("Accuracy: ", metrics.accuracy_score(y_test, y_pred))
# Accuracy:  0.8875

# Model Precision
print("Precision: ", metrics.precision_score(y_test, y_pred))
# Precision:  0.5172413793103449

# Model Recall
print("Recall: ", metrics.recall_score(y_test, y_pred))
# Recall:  0.2727272727272727

# Model F1-Score
print("F1 Score: ", metrics.f1_score(y_test, y_pred))
# F1 Score:  0.3571428571428571

# Model AUC
# ROC AUC ranks samples by score, so it must be given the predicted
# probabilities; hard 0/1 labels collapse the ROC curve to a single point.
print("AUC: ", metrics.roc_auc_score(y_test, y_proba))
# (0.6198930481283422 was recorded when AUC was computed from the hard labels)


### MAE/MSE/RMSE/R2

In [None]:
from sklearn import metrics
import numpy as np

# Toy example: `actual` holds the real wine qualities and `fitted` holds the
# values a linear regression model predicted for them.
actual = np.array([3.5, 2.9, 2.6])
fitted = np.array([3.6, 2.7, 2.4])

# Every sklearn metric below takes the true values first, then the predictions.

# MAE: mean of the absolute errors
mae = metrics.mean_absolute_error(actual, fitted)
print(f"Mean Absolute Error (MAE): {mae}")
# Mean Absolute Error (MAE): 0.16666666666666666 = (0.1+0.2+0.2)/3

# MSE: mean of the squared errors
mse = metrics.mean_squared_error(actual, fitted)
print(f"Mean Squared Error (MSE): {mse}")
# Mean Squared Error (MSE): 0.029999999999999995

# RMSE: square root of the MSE, which brings it back to the target's units
rmse = np.sqrt(mse)
print(f"Root Mean Squared Error (RMSE): {rmse}")
# Root Mean Squared Error (RMSE): 0.1732050807568877

# R-squared, via r2_score
r2 = metrics.r2_score(actual, fitted)
print(f"R-squared: {r2}")
# R-squared: 0.7857142857142857

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
import numpy as np

# Load the Red Wine Quality Data
# Import `datasets` here: this cell used the name without importing it, and an
# earlier cell binds the same name to sklearn's datasets module
# (`from sklearn import datasets`), which has no load_dataset().
import datasets
wine = datasets.load_dataset('codesignal/wine-quality', split='red')
wine = pd.DataFrame(wine)

# Separate Features and Target
X = wine.drop('quality', axis=1)
Y = wine['quality']

# Split the data into training and test data
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

# Train the Model
model = LinearRegression()
model.fit(X_train, Y_train)

# Make prediction
Y_pred = model.predict(X_test)

# Calculate metrics (true values first, predictions second)
mae = metrics.mean_absolute_error(Y_test, Y_pred)
mse = metrics.mean_squared_error(Y_test, Y_pred)
rmse = np.sqrt(mse)
r2 = metrics.r2_score(Y_test, Y_pred)

# Print metrics
print(f"Mean Absolute Error (MAE): {mae}")
# Mean Absolute Error (MAE): 0.4696330928661111
print(f"Mean Squared Error (MSE): {mse}")
# Mean Squared Error (MSE): 0.384471197820124
print(f"Root Mean Squared Error (RMSE): {rmse}")
# Root Mean Squared Error (RMSE): 0.6200574149384265
print(f"R-squared: {r2}")
# R-squared: 0.32838876395802286

### Accuracy/Precision/Recall/F1 Score/AUC-ROC

In [None]:
from sklearn import metrics

# Let y_test be a numpy array with the actual wine quality classes ('good' or 'not good') for the test dataset
y_test = np.array(['not good', 'good', 'good', 'not good', 'good'])

# And let pred be a numpy array with the predicted classes by our model for the test dataset
pred = np.array(['not good', 'good', 'not good', 'good', 'good'])

# For calculating Accuracy
accuracy = metrics.accuracy_score(y_test, pred)
print(f"Accuracy: {accuracy}")

# For calculating Precision, use the precision_score function
# Note: pos_label defaults to 1, so with string labels the positive class
# must be named explicitly; here it is 'good'.
precision = metrics.precision_score(y_test, pred, pos_label="good")
print(f"Precision: {precision}")

# For calculating Recall
recall = metrics.recall_score(y_test, pred, pos_label="good")
print(f"Recall: {recall}")

# For calculating F1 Score
f1 = metrics.f1_score(y_test, pred, pos_label="good")
print(f"F1 Score: {f1}")

# For computing AUC-ROC, we need the probabilities of the positive class ('good'), let's assume y_proba as an array of these probabilities 
y_proba = np.array([0.1, 0.7, 0.3, 0.8, 0.7])
# roc_auc_score has no pos_label argument: with string labels it treats the
# label that sorts last ('not good') as positive. Passing a boolean target
# makes 'good' the positive class, matching what y_proba describes.
auc_roc = metrics.roc_auc_score(y_test == 'good', y_proba)
print(f"AUC-ROC: {auc_roc}")

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import datasets

# Load the Red Wine Quality Data
wine = datasets.load_dataset('codesignal/wine-quality', split='red')
wine = pd.DataFrame(wine)

# Transform the quality column into binary labels:
# wines at or above the 75th-percentile quality are 'good'
wine['quality'] = np.where(wine["quality"] >= wine["quality"].quantile(0.75), 
                            'good', 'not good')

# Transform the labels into numerical form
le = LabelEncoder()
wine['quality'] = le.fit_transform(wine['quality'])
# LabelEncoder numbers the classes in sorted order ('good' -> 0,
# 'not good' -> 1), so sklearn's default positive label (1) would be
# 'not good'. Look up the code of 'good' and pass it explicitly below.
good_label = int(le.transform(['good'])[0])

# Separate Features and Target
X = wine.drop('quality', axis=1)
Y = wine['quality']

# Split into training and test set
X_train, X_test, Y_train, Y_test = train_test_split(X, Y,
                                                    test_size=0.3,
                                                    random_state=42)

# Normalize features (fit the scaler on the training split only, then reuse it on the test split)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
 
# Fit the logistic regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, Y_train)

# Make predictions
Y_pred = model.predict(X_test)

# Compute evaluation metrics with 'good' as the positive class
print(f"Accuracy: {metrics.accuracy_score(Y_test, Y_pred)}")
print(f"Precision: {metrics.precision_score(Y_test, Y_pred, pos_label=good_label)}")
print(f"Recall: {metrics.recall_score(Y_test, Y_pred, pos_label=good_label)}")
print(f"F1-Score: {metrics.f1_score(Y_test, Y_pred, pos_label=good_label)}")

# Probabilities of the positive class ('good'): pick the predict_proba column
# that corresponds to good_label in model.classes_
good_column = list(model.classes_).index(good_label)
proba_pred = model.predict_proba(X_test)[:, good_column]
print(f"AUC-ROC: {metrics.roc_auc_score(Y_test == good_label, proba_pred)}")

### k-fold Cross-Validation 

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

# NOTE(review): X and y are not defined in this cell; they are whatever the
# previously executed cells left in the kernel — confirm the intended
# feature matrix and target before relying on these scores.
clf = LinearRegression()
# clf represents an instance of a machine learning model you've already constructed (e.g., clf = LinearRegression())
scores = cross_val_score(clf, X, y, cv=5) # k=5: one score per fold (R-squared by default for a regressor)

# Show the per-fold scores and their mean
print("Cross-validation scores:", scores)
print("Mean cross-validation score:", scores.mean())

### Correlation

In [None]:
import pandas as pd
import datasets

# Import the dataset
red_wine = datasets.load_dataset('codesignal/wine-quality', split='red')
red_wine_df = pd.DataFrame(red_wine)

# Compute the correlation matrix
corr = red_wine_df.corr(method='pearson', min_periods=10) # method = pearson, kendall, spearman
# min_periods=10: a pair of features needs at least 10 observations (after
# missing values are excluded) for its correlation to be computed

# Print the correlation matrix
print(corr)
"""
                      fixed acidity  volatile acidity  ...   alcohol   quality
fixed acidity              1.000000         -0.256131  ... -0.061668  0.124052
volatile acidity          -0.256131          1.000000  ... -0.202288 -0.390558
citric acid                0.671703         -0.552496  ...  0.109903  0.226373
residual sugar             0.114777          0.001918  ...  0.042075  0.013732
chlorides                  0.093705          0.061298  ... -0.221141 -0.128907
free sulfur dioxide       -0.153794         -0.010504  ... -0.069408 -0.050656
total sulfur dioxide      -0.113181          0.076470  ... -0.205654 -0.185100
density                    0.668047          0.022026  ... -0.496180 -0.174919
pH                        -0.682978          0.234937  ...  0.205633 -0.057731
sulphates                  0.183006         -0.260987  ...  0.093595  0.251397
alcohol                   -0.061668         -0.202288  ...  1.000000  0.476166
quality                    0.124052         -0.390558  ...  0.476166  1.000000

[12 rows x 12 columns]
"""
import seaborn as sns
import matplotlib.pyplot as plt

# Draw the heatmap
sns.heatmap(corr, annot=True, fmt=".2f", cmap='coolwarm', vmin=-1, vmax=1)
# annot writes the value in each cell, cmap selects the colour map, and
# vmin/vmax pin the colour scale to the full correlation range [-1, 1]

plt.title('Correlation heatmap for Red Wine features')
plt.show()

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import datasets

# Load the dataset
red_wine = datasets.load_dataset('codesignal/wine-quality', split='red')
red_wine_df = pd.DataFrame(red_wine)

# Compute the correlation matrix
corr = red_wine_df.corr()

# Print the correlation matrix with a precision of 2 decimal places
print(corr.round(2))

# Focus only on 'quality': take the 'quality' column of the matrix computed
# above (`corr`; the name `corr_matrix` was never defined) and drop the
# trivial self-correlation row.
corr_quality = corr[['quality']].drop('quality')
# Keep only the features whose absolute correlation with quality exceeds 0.3
corr_quality = corr_quality[corr_quality['quality'].abs() > 0.3]

sns.heatmap(corr_quality, annot=True, fmt=".2f")
plt.title('Correlation heatmap for Wine Quality and most related features')
plt.show()

# Create a heatmap
sns.heatmap(corr, annot=True, fmt=".2f")
plt.title('Correlation heatmap for the Red Wine Dataset')
plt.show()

# Let's look at two features: 'pH' and 'fixed acidity'
# Calculate 'pH' against 'fixed acidity' to find out its correlation
feature_corr = red_wine_df['pH'].corr(red_wine_df['fixed acidity'])
# Calculate 'density' against 'quality' to find out its correlation.
# Stored under its own name: reusing feature_corr made the pH message below
# report the density/quality value instead.
density_quality_corr = red_wine_df['quality'].corr(red_wine_df['density'])

# Print the correlations
print("\nThe correlation of 'fixed acidity' with 'pH' is %.3f\n" % (feature_corr))
print("The correlation of 'density' with 'quality' is %.3f\n" % (density_quality_corr))

# Display the correlation graphically via a scatter plot
plt.figure(figsize=(8,4))
sns.scatterplot(x='fixed acidity', y='pH', data=red_wine_df, hue='quality')
plt.title('Scatterplot showing correlation between \'pH\' and \'Fixed Acidity\'')
plt.show()

### Overfitting & Underfitting

In [None]:
# Exemplify overfitting and underfitting 
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline

# Synthetic data: 40 points on a saturating curve plus Gaussian noise (seeded for reproducibility)
np.random.seed(0)
x = np.random.rand(40, 1) ** 2
y =  (10 - 1. / (x.ravel() + 0.1)) + np.random.randn(40)


def PolynomialRegression(degree=2, **kwargs):
    """Return a pipeline that expands the input into polynomial features of
    the given degree and then fits a LinearRegression on them.

    Extra keyword arguments are forwarded to LinearRegression.
    """
    # make_pipeline chains the two steps into a single estimator; the simplest
    # example of the idea is make_pipeline(StandardScaler(), LogisticRegression())
    expander = PolynomialFeatures(degree)
    regressor = LinearRegression(**kwargs)
    return make_pipeline(expander, regressor)


# Evaluation grid: 500 evenly spaced points as a single-column matrix
X_test = np.linspace(-0.1, 1.1, 500)[:, None]
plt.figure(figsize=(12, 6))
plt.scatter(x.ravel(), y, color='black')
axis = plt.axis()
# Degree 1 underfits, degree 3 is a reasonable fit, degree 30 overfits
for degree in (1, 3, 30):
    poly_model = PolynomialRegression(degree)
    poly_model.fit(x, y)
    y_test = poly_model.predict(X_test)
    plt.plot(X_test.ravel(), y_test, label='degree={0}'.format(degree))
plt.xlim(-0.1, 1.0)
plt.ylim(-2, 12)
plt.legend(loc='best');

# Show the plot
plt.show()

### Hyperparameter tuning

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

# Load the dataset
import datasets
red_wine = datasets.load_dataset('codesignal/wine-quality', split='red')
red_wine = pd.DataFrame(red_wine)

# Separate features and target
X = red_wine.drop(columns='quality')
# Binary target: quality in (0, 6.5] -> 'bad', (6.5, 10] -> 'good'
y = pd.cut(red_wine['quality'], bins=[0, 6.5, 10], labels=['bad', 'good'])

# Split the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Standardize the features INSIDE the pipeline.
# Fitting the scaler on the full data set before splitting lets statistics of
# the test rows (and of every validation fold) leak into training. As a
# pipeline step the scaler is re-fitted on the training part of each CV fold.
scaler = StandardScaler()

# Create a Logistic Regression model
logistic = LogisticRegression(solver='saga', tol=0.01)
pipe = make_pipeline(scaler, logistic)

# Set up the grid (make_pipeline names each step after its lower-cased class name)
param_grid = {
    'logisticregression__C': np.logspace(-2, 2, 5),
    'logisticregression__penalty': ['l1', 'l2'],
}

# Initiate Grid search with cross-validation
grid = GridSearchCV(pipe, param_grid=param_grid, cv=5, verbose=0)
# GridSearchCV tries every combination in param_grid and keeps the model with
# the best 5-fold cross-validation score

grid.fit(X_train, y_train)

# Print the best parameters
print('Best parameters: ', grid.best_params_)
# Recorded before the scaler was moved into the pipeline:
# Best parameters:  {'logisticregression__C': 0.09999999999999999, 'logisticregression__penalty': 'l2'}

## Unsupervised 8 lessons 37 practices