In [1]:
import pandas as pd

# 创建一个空的DataFrame来存储所有年份的数据
all_names = pd.DataFrame()

# 遍历每一年
for year in range(1880, 2011):
    # 读取该年份的数据
    path = f'datasets/babynames/yob{year}.txt'
    names = pd.read_csv(path, names=['name', 'sex', 'births'])
    
    # 添加一个表示年份的列
    names['year'] = year
    
    # 将该年份的数据添加到all_names中
    all_names = pd.concat([all_names, names], ignore_index=True)

# 根据年份和性别对数据进行分组，然后计算每一组的出生总数
total_births = all_names.pivot_table('births', index='year', columns='sex', aggfunc=sum)

# 添加一个表示总体出生数的列
total_births['total'] = total_births['F'] + total_births['M']

total_births

sex,F,M,total
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1880,90993,110493,201486
1881,91955,100748,192703
1882,107851,113687,221538
1883,112322,104632,216954
1884,129021,114445,243466
...,...,...,...
2006,1896468,2050234,3946702
2007,1916888,2069242,3986130
2008,1883645,2032310,3915955
2009,1827643,1973359,3801002


# ARIMA（自回归积分移动平均）算法

In [2]:
import pandas as pd
from statsmodels.tsa.arima.model import ARIMA
import matplotlib.pyplot as plt

# 将索引转换为DatetimeIndex，并提供频率信息
total_births.index = pd.date_range(start='1880', end='2010', freq='AS')

# 使用1880年到1980年的数据进行训练
train = total_births.loc['1880':'1980']

# 对总出生人口进行预测
model = ARIMA(train['total'], order=(5,1,0))
model_fit = model.fit()
forecast_total = model_fit.forecast(steps=30)
print('Total Forecast: ', forecast_total)

# 对男性出生人口进行预测
model = ARIMA(train['M'], order=(5,1,0))
model_fit = model.fit()
forecast_male = model_fit.forecast(steps=30)
print('Male Forecast: ', forecast_male)

# 对女性出生人口进行预测
model = ARIMA(train['F'], order=(5,1,0))
model_fit = model.fit()
forecast_female = model_fit.forecast(steps=30)
print('Female Forecast: ', forecast_female)

Total Forecast:  1981-01-01    3.480993e+06
1982-01-01    3.526446e+06
1983-01-01    3.568241e+06
1984-01-01    3.617116e+06
1985-01-01    3.658138e+06
1986-01-01    3.683757e+06
1987-01-01    3.706214e+06
1988-01-01    3.728086e+06
1989-01-01    3.748233e+06
1990-01-01    3.764893e+06
1991-01-01    3.777777e+06
1992-01-01    3.788867e+06
1993-01-01    3.798967e+06
1994-01-01    3.807795e+06
1995-01-01    3.815159e+06
1996-01-01    3.821257e+06
1997-01-01    3.826509e+06
1998-01-01    3.831117e+06
1999-01-01    3.835081e+06
2000-01-01    3.838428e+06
2001-01-01    3.841262e+06
2002-01-01    3.843698e+06
2003-01-01    3.845802e+06
2004-01-01    3.847604e+06
2005-01-01    3.849137e+06
2006-01-01    3.850444e+06
2007-01-01    3.851565e+06
2008-01-01    3.852528e+06
2009-01-01    3.853351e+06
2010-01-01    3.854054e+06
Freq: AS-JAN, Name: predicted_mean, dtype: float64
Male Forecast:  1981-01-01    1.803286e+06
1982-01-01    1.826484e+06
1983-01-01    1.848413e+06
1984-01-01    1.873279e+0

In [3]:
# 计算总出生人口的预测误差率
total_actual = total_births.loc[pd.date_range(start='1981', end='2010', freq='AS'), 'total']
total_error = abs((forecast_total.values - total_actual.values) / total_actual.values) * 100
for year, error in zip(pd.date_range(start='1981', end='2010', freq='AS').year, total_error):
    print(f'Total Error Rate in {year}: {error}')

# 计算男性出生人口的预测误差率
male_actual = total_births.loc[pd.date_range(start='1981', end='2010', freq='AS'), 'M']
male_error = abs((forecast_male.values - male_actual.values) / male_actual.values) * 100
for year, error in zip(pd.date_range(start='1981', end='2010', freq='AS').year, male_error):
    print(f'Male Error Rate in {year}: {error}')

# 计算女性出生人口的预测误差率
female_actual = total_births.loc[pd.date_range(start='1981', end='2010', freq='AS'), 'F']
female_error = abs((forecast_female.values - female_actual.values) / female_actual.values) * 100
for year, error in zip(pd.date_range(start='1981', end='2010', freq='AS').year, female_error):
    print(f'Female Error Rate in {year}: {error}')

Total Error Rate in 1981: 0.711485037497872
Total Error Rate in 1982: 0.6211149348542297
Total Error Rate in 1983: 3.1237004715484322
Total Error Rate in 1984: 3.7870912828181864
Total Error Rate in 1985: 2.59496746609763
Total Error Rate in 1986: 3.6657426568921765
Total Error Rate in 1987: 2.875486883182745
Total Error Rate in 1988: 1.005581751478898
Total Error Rate in 1989: 2.4633470089072613
Total Error Rate in 1990: 4.669979641630336
Total Error Rate in 1991: 2.9629404963021044
Total Error Rate in 1992: 1.295042635739912
Total Error Rate in 1993: 0.835040018243658
Total Error Rate in 1994: 2.5041858847098553
Total Error Rate in 1995: 4.258106451729998
Total Error Rate in 1996: 4.853393294914488
Total Error Rate in 1997: 5.619991820135399
Total Error Rate in 1998: 4.246207680930617
Total Error Rate in 1999: 3.9203047604945516
Total Error Rate in 2000: 1.6623841482334818
Total Error Rate in 2001: 2.7409924932324436
Total Error Rate in 2002: 2.9571404611030907
Total Error Rate in 20

In [4]:
# 计算总出生人口的预测误差率
total_actual = total_births.loc[pd.date_range(start='1981', end='2010', freq='AS'), 'total']
total_error = abs((forecast_total.values - total_actual.values) / total_actual.values) * 100
print('Total Error Rate: ', total_error.mean())

# 计算男性出生人口的预测误差率
male_actual = total_births.loc[pd.date_range(start='1981', end='2010', freq='AS'), 'M']
male_error = abs((forecast_male.values - male_actual.values) / male_actual.values) * 100
print('Male Error Rate: ', male_error.mean())

# 计算女性出生人口的预测误差率
female_actual = total_births.loc[pd.date_range(start='1981', end='2010', freq='AS'), 'F']
female_error = abs((forecast_female.values - female_actual.values) / female_actual.values) * 100
print('Female Error Rate: ', female_error.mean())

Total Error Rate:  2.66768202114045
Male Error Rate:  2.52970129769353
Female Error Rate:  2.798154913268582


# 随机森林模型

In [5]:
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor

# 创建随机森林模型
model_total = RandomForestRegressor(n_estimators=100)
model_female = RandomForestRegressor(n_estimators=100)
model_male = RandomForestRegressor(n_estimators=100)

# 对年份进行编码
X_train = [[year] for year in range(1880, 2011)]
y_train_total = total_births['total']
y_train_female = total_births['F']
y_train_male = total_births['M']

# 训练模型
model_total.fit(X_train, y_train_total)
model_female.fit(X_train, y_train_female)
model_male.fit(X_train, y_train_male)

# 对出生人口进行预测
X_test = [[year] for year in range(1981, 2011)]
forecast_total = model_total.predict(X_test)
forecast_female = model_female.predict(X_test)
forecast_male = model_male.predict(X_test)

print('Total Forecast: ', forecast_total)
print('Female Forecast: ', forecast_female)
print('Male Forecast: ', forecast_male)

Total Forecast:  [3454151.47 3484734.43 3474154.12 3488485.22 3538388.39 3560005.
 3603922.48 3681033.47 3809835.94 3912802.74 3902261.75 3852606.04
 3787649.66 3721251.33 3674735.42 3646630.73 3634362.64 3659624.74
 3689862.83 3746567.01 3745911.4  3742910.91 3785291.89 3810561.38
 3853195.76 3923117.5  3966483.33 3922385.32 3834873.71 3729449.02]
Female Forecast:  [1662454.03 1681902.77 1676411.83 1681331.65 1708436.12 1720065.66
 1737606.87 1778258.54 1828298.83 1872644.85 1873516.98 1845960.74
 1817957.67 1791166.41 1763059.09 1752510.14 1745234.44 1760743.06
 1773898.65 1799822.17 1801037.13 1798476.74 1819291.34 1829857.
 1846758.18 1886221.71 1906082.25 1887027.81 1833823.34 1794159.26]
Male Forecast:  [1786340.86 1803795.52 1796298.68 1805127.65 1832746.11 1842555.98
 1866779.31 1914221.13 1970767.67 2028847.68 2024181.2  2000083.5
 1970309.72 1935243.34 1906568.6  1895807.33 1889941.32 1906565.98
 1917573.87 1947854.74 1946635.27 1943237.24 1963825.85 1979756.55
 1999726.25 20

In [6]:
import numpy as np

# 真实值
real_total = total_births.loc['1981':'2010']['total']
real_female = total_births.loc['1981':'2010']['F']
real_male = total_births.loc['1981':'2010']['M']

# 计算误差率
error_rate_total = np.abs((forecast_total - real_total) / real_total) * 100
error_rate_female = np.abs((forecast_female - real_female) / real_female) * 100
error_rate_male = np.abs((forecast_male - real_male) / real_male) * 100

# 打印误差率
print('Total Error Rate: ', error_rate_total)
print('Female Error Rate: ', error_rate_female)
print('Male Error Rate: ', error_rate_male)

Total Error Rate:  1981-01-01    0.065083
1982-01-01    0.569056
1983-01-01    0.404552
1984-01-01    0.096244
1985-01-01    0.763505
1986-01-01    0.183200
1987-01-01    0.036126
1988-01-01    0.269212
1989-01-01    0.860316
1990-01-01    0.924797
1991-01-01    0.234612
1992-01-01    0.365449
1993-01-01    0.534642
1994-01-01    0.174475
1995-01-01    0.420716
1996-01-01    0.061731
1997-01-01    0.316339
1998-01-01    0.420163
1999-01-01    0.014718
2000-01-01    0.770593
2001-01-01    0.190688
2002-01-01    0.257464
2003-01-01    0.304282
2004-01-01    0.104878
2005-01-01    0.417514
2006-01-01    0.597575
2007-01-01    0.492876
2008-01-01    0.164208
2009-01-01    0.891126
2010-01-01    1.970175
Freq: AS-JAN, Name: total, dtype: float64
Female Error Rate:  1981-01-01    0.262712
1982-01-01    0.598878
1983-01-01    0.414848
1984-01-01    0.063264
1985-01-01    0.640547
1986-01-01    0.350786
1987-01-01    0.005690
1988-01-01    0.047971
1989-01-01    0.800744
1990-01-01    1.297197

In [7]:
# 计算误差率的平均值
avg_error_rate_total = error_rate_total.mean()
avg_error_rate_female = error_rate_female.mean()
avg_error_rate_male = error_rate_male.mean()

# 打印误差率的平均值
print('Average Total Error Rate: ', avg_error_rate_total)
print('Average Female Error Rate: ', avg_error_rate_female)
print('Average Male Error Rate: ', avg_error_rate_male)

Average Total Error Rate:  0.4292104232375884
Average Female Error Rate:  0.4031947434299269
Average Male Error Rate:  0.4286994066091774
