In [1]:
import pandas as pd
import matplotlib
path_data = "https://raw.githubusercontent.com/mlee-pnu/IDS/main/FDS08/"
matplotlib.use('Agg')
%matplotlib inline
import matplotlib.pyplot as plots
import plotly.express as px
import plotly.graph_objects as go
plots.style.use('fivethirtyeight')
import numpy as np

# 8.1. Applying a Function to a Column

In [2]:
def cut_off_at_100(x):
    """The smaller of x and 100"""
    return min(x, 100)

In [3]:
cut_off_at_100(17)

17

In [4]:
cut_off_at_100(117)

100

In [5]:
cut_off_at_100(100)

100

In [6]:
ages = pd.DataFrame(
    {
        "Person": np.array(["A", "B", "C", "D", "E", "F"]),
        "Age": np.array([17, 117, 52, 100, 6, 101]),
    }
)
ages

Unnamed: 0,Person,Age
0,A,17
1,B,117
2,C,52
3,D,100
4,E,6
5,F,101


In [7]:
ages['Age'].apply(cut_off_at_100)

0     17
1    100
2     52
3    100
4      6
5    100
Name: Age, dtype: int64

In [8]:
ages['Cut Off Age'] = ages['Age'].apply(cut_off_at_100)
ages

Unnamed: 0,Person,Age,Cut Off Age
0,A,17,17
1,B,117,100
2,C,52,52
3,D,100,100
4,E,6,6
5,F,101,100


## 8.1.2. Functions as Values

In [9]:
cut_off_at_100

<function __main__.cut_off_at_100(x)>

In [10]:
cut_off = cut_off_at_100

In [11]:
cut_off

<function __main__.cut_off_at_100(x)>

## 8.1.3. Example: Prediction

In [12]:
# Data on heights of parents and their adult children
path_data = "https://raw.githubusercontent.com/mlee-pnu/IDS/main/FDS08/"
family_heights = pd.read_csv(path_data + 'family_heights.csv')
family_heights = family_heights.drop(family_heights.columns[3], axis=1)
family_heights

Unnamed: 0,family,father,mother,children,childNum,sex,childHeight
0,1,78.5,67.0,4,1,male,73.2
1,1,78.5,67.0,4,2,female,69.2
2,1,78.5,67.0,4,3,female,69.0
3,1,78.5,67.0,4,4,female,69.0
4,2,75.5,66.5,4,1,male,73.5
...,...,...,...,...,...,...,...
929,203,62.0,66.0,3,1,male,64.0
930,203,62.0,66.0,3,2,female,62.0
931,203,62.0,66.0,3,3,female,61.0
932,204,62.5,63.0,2,1,male,66.5


In [13]:
# Extract midparentHeight and childHeight from galton
parent_averages = (family_heights['father'] + 1.08 * family_heights['mother']) / 2
child_height = family_heights['childHeight']
heights = pd.DataFrame({'Parent Average': parent_averages,
                        'Child':child_height})
heights

Unnamed: 0,Parent Average,Child
0,75.43,73.2
1,75.43,69.2
2,75.43,69.0
3,75.43,69.0
4,73.66,73.5
...,...,...
929,66.64,64.0
930,66.64,62.0
931,66.64,61.0
932,65.27,66.5


In [14]:
# Scatter of Child vs. MidParent
fig = px.scatter(heights,
                 x='Parent Average',
                 y='Child',
                 color_discrete_sequence=['rgb(2, 21 ,51)'],
                 opacity=0.7)
# Set layout
fig.update_layout(title='Scatter of Child vs. Parent Average')

fig.show()

In [15]:
# Add shape to fig
fig.add_shape(type='line', x0=67.5, y0=50, x1=67.5, y1=85, line_color='red')
fig.add_shape(type='line', x0=68.5, y0=50, x1=68.5, y1=85, line_color='red')

# Add marker to fig
fig.add_trace(
    go.Scatter(
        mode='markers',
        x=[68],
        y=[66.24],
        marker=dict(
            color='Gold',
            size=12,
        ),
        showlegend=False
    )
)

fig.show()

In [16]:
filter = (67.5 <= heights['Parent Average']) & (heights['Parent Average'] <= 68.5)
close_to_68 = heights.loc[filter]
close_to_68

Unnamed: 0,Parent Average,Child
233,68.44,62.0
396,67.94,71.2
397,67.94,67.0
516,68.33,62.5
517,68.23,73.0
...,...,...
885,67.60,69.0
886,67.60,68.0
887,67.60,67.7
888,67.60,64.5


In [17]:
np.mean(close_to_68.Child)

66.24045801526718

In [18]:
# Define predict_child for predicting any value of the midparent height
def predict_child(p_avg, heights):
    """Predict the height of a child whose parents have a midparent height of mpht.

    The prediction is the average height of the children whose midparent height is
    in the range mpht plus or minus 0.5.
    """
    ## First, define a filter that selects rows where the midparent height ('Parent Average' column) is within ±0.5 units of the given midparent height (p_avg).

    filter = (p_avg - 0.5 <= heights['Parent Average']) & (heights['Parent Average'] <= p_avg + 0.5)

    ## Then, apply this filter to the DataFrame heights using the loc function, which selects the rows satisfying the filter conditions.

    close_points = heights.loc[filter]

    ## Finally, calculates the average height of children in this subset
    return

In [19]:
predict_child(68, heights)

In [20]:
predict_child(66, heights)

In [21]:
# Apply predict_child to all the midparent heights
heights_with_predictions = heights.copy()
heights_with_predictions['Prediction'] = heights['Parent Average'].apply(predict_child, heights=heights)
heights_with_predictions

Unnamed: 0,Parent Average,Child,Prediction
0,75.43,73.2,
1,75.43,69.2,
2,75.43,69.0,
3,75.43,69.0,
4,73.66,73.5,
...,...,...,...
929,66.64,64.0,
930,66.64,62.0,
931,66.64,61.0,
932,65.27,66.5,


In [22]:
# Draw the original scatter plot along with the predicted values
fig = go.Figure()

# Add trace Child
fig.add_trace(go.Scatter(
    mode='markers',
    x=heights_with_predictions['Parent Average'],
    y=heights_with_predictions['Child'],
    name='Child',
    marker_color='rgba(2, 21 ,51, .8)')) # rgb + opacity

# Add trace Prediction
fig.add_trace(go.Scatter(
    mode='markers',
    x=heights_with_predictions['Parent Average'],
    y=heights_with_predictions['Prediction'],
    name='Predication',
    marker_color='rgba(250, 211, 102, .8)')) # rgb + opacity

# Set layout
fig.update_layout(title="Prediction of Children's heights",
                  xaxis_title='Parent Average',
                  yaxis_title='')

fig.show()

In [23]:
predict_child(65, heights)