In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import linear_model

### Check logistic curve:

In [None]:
# Draw the logistic function y = 1 / (1 + exp(-(w x + b))) for a few
# (w, b) pairs on shared axes, to show how slope and intercept move the curve.
low, high = -4, 4
xplot = np.linspace(start=low, stop=high)

# Each entry is (w, b, line color); curves are drawn in this order.
curve_settings = [(1, 0, 'black'), (1, 1, 'red'), (2, 0, 'green')]

for w, b, curve_color in curve_settings:
    yplot = 1 / (1 + np.exp(-(w * xplot + b)))
    plt.plot(xplot, yplot, '-', color=curve_color,
             label=f'$y = \\frac{{1}}{{1 + e^{{-(w x + b)}}}}$ for w={w}, b={b}')
    # Mark the midpoint: w x + b = 0 at x = -b / w, where the curve equals 1/2.
    plt.plot(-b / w, 1/2, '.', color=curve_color)

plt.title('Logistic/sigmoid function')
plt.legend()
plt.show(block=False)

### Toy lecture example:

In [None]:
# Toy data set: four 1-D points; x = 0 appears once with each label.
X = np.array([[-1], [0], [0], [1]])
y = np.array([0, 0, 1, 1])
N = len(y)

# Large C means weak regularization in scikit-learn's LogisticRegression.
model = linear_model.LogisticRegression(C=1000)
# First try commenting out next three lines and setting b and w by eye.
# Also try varying C, above.
model.fit(X, y)
b = model.intercept_
w = model.coef_[0]
print(f'intercept={b}, slope={w}, training score={model.score(X, y)}')
print(f'predictions for X={X} and y={y} are y_hat={model.predict(X)}')

# Scatter the raw (x_i, y_i) pairs and set up the axes. The y-range is
# wider than [0, 1]; low and high are reused from the earlier sigmoid cell.
margin = 0.1
plt.plot(X, y, 'o', color='black', label=r'data $\{(x_i, y_i)\}$')
plt.xlim(low, high)
plt.ylim(-(1 + margin), 2 + margin)
plt.xlabel('x')
plt.title('Toy logistic regression for (-1, 0), (0, 0), (0, 1), (1, 1)')

# Overlay the fitted sigmoid evaluated on an even grid across the x-range.
xplot = np.linspace(start=low, stop=high)
yplot = 1 / (1 + np.exp(-(w * xplot + b)))
plt.plot(xplot, yplot, label=r'logistic curve $\hat{P}(y = 1)$')

# For every distinct x, compute the fraction of its observations with y = 1.
x_values, x_counts = np.unique(X, return_counts=True)
n_x_values = x_values.shape[0]
success_proportion_per_x_value = np.array([
    np.sum(y[X[:, 0] == x_value]) / x_count
    for x_value, x_count in zip(x_values, x_counts)
])

# Fitted P(y_i = 1) per training point (computed here but not plotted below).
probs = model.predict_proba(X)[:, 1] # column 1 is P(y_i = 1); column 0 is P(y_i = 0)
plt.plot(x_values, success_proportion_per_x_value, '.', color='red',
         label='sample proportions')

plt.legend()
plt.savefig('toyLogistic.png')
plt.show(block=False)

### 1D x real data example
This example uses data on the proportions of girls at various ages who have
reached menarche (onset of menstruation).

In [None]:
# Load the aggregated menarche data and display it as the cell's output.
df_raw = pd.read_csv('http://www.stat.wisc.edu/~jgillett/451/data/menarche.csv')
df_raw
# How to read the table: the first row says "0 out of 376 girls with
# average age 9.21 have reached menarche." The tenth row says "29 out
# of 93 girls with average age 12.33 have reached menarche." The last
# row says "1049 out of 1049 girls with average age 17.58 have reached
# menarche."

In [None]:
# Load the per-girl version of the data and display it.
# I made a second data file called menarche_cases.csv from
# menarche.csv that gives one line for each girl in the study,
# indicating her age and whether (1) or not (0) she has reached
# menarche. For example, for the tenth row of menarche.csv, I made
# 29 rows "12.33,1" and 64 = 93 - 29 rows "12.33,0" in menarche_cases.csv.
df = pd.read_csv('http://www.stat.wisc.edu/~jgillett/451/data/menarche_cases.csv')
df

In [None]:
# Features: ages as a 1-D array, plus an independent (nrows, 1) column
# copy, since scikit-learn estimators expect a 2-D feature matrix.
x = df['age'].to_numpy()
nrows = x.shape[0]
X = x.reshape(nrows, 1).copy()

# Labels: 1 if the girl has reached menarche, 0 otherwise.
y = df['reached_menarche'].to_numpy()

# Fit with default settings; fit() returns the estimator itself.
model = linear_model.LogisticRegression().fit(X, y)
b = model.intercept_
w = model.coef_[0]
print(f'intercept={b}, slope={w}, training score={model.score(X, y)}')

In [None]:
# Plot the menarche observations, the fitted logistic curve, and the
# observed proportion of successes at each distinct age.
# Uses X, y, w, b, and model from the fitting cell.

# plot data
low = 8
high = 20
plt.plot(X, y, '.', color='black', label='data (many duplicates)')
plt.xlim(low, high)
margin = 0.1
plt.ylim(0 - margin, 1 + margin)
plt.title('Proportions of girls who have reached menarche')
plt.xlabel('age')
plt.ylabel('proportion')

# plot curve: evaluate the fitted sigmoid on an even grid over [low, high]
xplot = np.linspace(start=low, stop=high)
yplot = 1 / (1 + np.exp(-(w * xplot + b)))
plt.plot(xplot, yplot, label='logistic curve')

# find and plot sample proportions: for each distinct age, the fraction
# of girls at that age with y = 1
x_values, x_counts = np.unique(X, return_counts=True)
n_x_values = x_values.shape[0]
success_proportion_per_x_value = np.zeros(n_x_values)
for i in np.arange(n_x_values):
    success_proportion_per_x_value[i] = np.sum(y[X[:, 0] == x_values[i]]) / x_counts[i]

# predict_proba orders its columns like model.classes_ (here 0, then 1),
# so the fitted probability of reaching menarche is column 1, not column 0.
probs = model.predict_proba(X)[:, 1] # column 1 is P(y_i = 1); column 0 is P(y_i = 0)
plt.plot(x_values, success_proportion_per_x_value, '.', color='red',
         label='sample proportions')

plt.legend(loc='center right')
plt.show(block=False)

### Add 2D x example to show linear decision boundary.

In [None]:
# ...