In [43]:
import numpy as np
import pandas as pd

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [44]:
df=pd.read_csv('/data/jxwang_data/data/california-house-prices/train.csv')
df.info()
df.sample(3)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47439 entries, 0 to 47438
Data columns (total 41 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Id                           47439 non-null  int64  
 1   Address                      47439 non-null  object 
 2   Sold Price                   47439 non-null  float64
 3   Summary                      47085 non-null  object 
 4   Type                         47439 non-null  object 
 5   Year built                   46394 non-null  float64
 6   Heating                      40587 non-null  object 
 7   Cooling                      26745 non-null  object 
 8   Parking                      46065 non-null  object 
 9   Lot                          33258 non-null  float64
 10  Bedrooms                     44567 non-null  object 
 11  Bathrooms                    43974 non-null  float64
 12  Full bathrooms               39574 non-null  float64
 13  Total interior l

Unnamed: 0,Id,Address,Sold Price,Summary,Type,Year built,Heating,Cooling,Parking,Lot,...,Parking features,Tax assessed value,Annual tax amount,Listed On,Listed Price,Last Sold On,Last Sold Price,City,Zip,State
3517,3517,2371 Santa Ana S,465000.0,Looking for a recently built (2008) home with ...,SingleFamily,2008.0,Central,Central Air,"Garage, Garage Faces Front, Garage - Two Door",3604.0,...,"Garage, Garage Faces Front, Garage - Two Door",350262.0,4582.0,2020-01-02,450000.0,,,Los Angeles,90059,CA
34699,34699,1776 Alameda Diablo,4298000.0,"1776 Alameda Diablo, Diablo, CA 94528 is a sin...",SingleFamily,1952.0,"Forced air, Gas",Central,"Garage, Garage - Attached, Covered",22651.2,...,"Garage, Garage - Attached, Covered",3077500.0,35494.0,2020-09-27,4298000.0,2007-09-18,3200000.0,Diablo,94528,CA
46651,46651,3346 El Encanto Ct,170000.0,Charming two bedroom condo with two bathrooms ...,Condo,1972.0,,,2 Car Garage,,...,2 Car Garage,137363.0,2045.0,2020-11-08,185000.0,2015-06-29,125000.0,Bakersfield,93301,CA


In [45]:
df_train=df.copy()
df_test=pd.read_csv('/data/jxwang_data/data/california-house-prices/test.csv')
for field in ['Listed On', 'Last Sold On']:
    df_train[field]=pd.to_datetime(df[field])
    df_test[field]=pd.to_datetime(df_test[field])
    # field转为日期形式

In [46]:
cate_cols = []
num_cols = []
date_cols = []
dtypes = df_train.dtypes
for col, dtype in dtypes.items():
    if dtype=='object':
        cate_cols.append(col)
    elif dtype.name.startswith('datetime'):
        date_cols.append(col)
    else:
        num_cols.append(col)
# 这里其实没啥，就是把所有特征分门别类的放置

In [47]:
id_col = 'Id'
target_col = 'Sold Price'

for col in [id_col, target_col]:
    num_cols.remove(col)
    # 把ID和最后的预测删掉

print(cate_cols, num_cols, date_cols)

['Address', 'Summary', 'Type', 'Heating', 'Cooling', 'Parking', 'Bedrooms', 'Region', 'Elementary School', 'Middle School', 'High School', 'Flooring', 'Heating features', 'Cooling features', 'Appliances included', 'Laundry features', 'Parking features', 'City', 'State'] ['Year built', 'Lot', 'Bathrooms', 'Full bathrooms', 'Total interior livable area', 'Total spaces', 'Garage spaces', 'Elementary School Score', 'Elementary School Distance', 'Middle School Score', 'Middle School Distance', 'High School Score', 'High School Distance', 'Tax assessed value', 'Annual tax amount', 'Listed Price', 'Last Sold Price', 'Zip'] ['Listed On', 'Last Sold On']


In [48]:
from sklearn.base import BaseEstimator, TransformerMixin
# BaseEstimator有点像nn.module，是sklearn里面所有估计器的一个基类
from pandas.api.types import is_string_dtype, is_numeric_dtype

class Num_Features(BaseEstimator, TransformerMixin):
    def __init__(self, cols = [], fillna = False, addna = False):
        self.fillna = fillna
        self.cols = cols
        self.addna = addna
        self.na_cols = []
        self.imputers = {}
    def fit(self, X, y=None):
        for col in self.cols:
            if self.fillna:
                self.imputers[col] = X[col].median()
            if self.addna and X[col].isnull().sum():
                self.na_cols.append(col)
        print(self.na_cols, self.imputers)
        return self
    def transform(self, X, y=None):
        df = X.loc[:, self.cols]
        for col in self.imputers:
            df[col].fillna(self.imputers[col], inplace=True)
        for col in self.na_cols:
            df[col+'_na'] = pd.isnull(df[col])
        return df

In [49]:
class Imputer(BaseEstimator, TransformerMixin):
    def __init__(self, strategy, fill_value):
        self.strategy = strategy
        self.fill_value = fill_value
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        for col, content in X.items():
            X[col].fillna(self.fill_value, inplace=True)
        return X

In [50]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, LabelBinarizer, StandardScaler
from sklearn.impute import SimpleImputer
num_pipeline = Pipeline([
    ('select_num', Num_Features(cols=num_cols, fillna='median', addna=True)),
])
X_num = num_pipeline.fit_transform(df_train) # 上面继承了之后，下面就可以用这个fit_transform
# 其实Num_Features这个class干的事情就是，把有缺失值的地方填上median，并且告诉你哪些地方有缺失值，填了些啥
# 然后给了一个[col+'_na']这样后缀的列，告诉你哪些样本，是na

['Year built', 'Lot', 'Bathrooms', 'Full bathrooms', 'Total interior livable area', 'Total spaces', 'Garage spaces', 'Elementary School Score', 'Elementary School Distance', 'Middle School Score', 'Middle School Distance', 'High School Score', 'High School Distance', 'Tax assessed value', 'Annual tax amount', 'Last Sold Price'] {'Year built': 1967.0, 'Lot': 6502.0, 'Bathrooms': 2.0, 'Full bathrooms': 2.0, 'Total interior livable area': 1566.0, 'Total spaces': 1.0, 'Garage spaces': 1.0, 'Elementary School Score': 6.0, 'Elementary School Distance': 0.5, 'Middle School Score': 5.0, 'Middle School Distance': 1.0, 'High School Score': 6.0, 'High School Distance': 1.3, 'Tax assessed value': 547524.0, 'Annual tax amount': 7129.0, 'Listed Price': 949000.0, 'Last Sold Price': 598000.0, 'Zip': 94114.0}


In [63]:
class CatEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, cols, max_n_cat=7, onehot_cols=[], orders={}):
        self.cols = cols
        self.onehot_cols=onehot_cols
        self.cats = {}
        self.max_n_cat = max_n_cat
        self.orders = orders
    def fit(self, X, y=None):
        df_cat =  X.loc[:, self.cols]
        for n,c in df_cat.items():
            df_cat[n].fillna('NAN', inplace=True)
            df_cat[n] = c.astype('category').cat.as_ordered()
            if n in self.orders:
                df_cat[n].cat.set_categories(self.orders[n], ordered=True, inplace=True)
            cats_count = len(df_cat[n].cat.categories)
            if cats_count<=2 or cats_count>self.max_n_cat:
                self.cats[n] = df_cat[n].cat.categories
                if n in self.onehot_cols:
                    self.onehot_cols.remove(n)
            elif n not in self.onehot_cols:
                self.onehot_cols.append(n)

        print(self.onehot_cols)
        return self
    def transform(self, df, y=None):
        X = df.loc[:, self.cols]
        for col in self.cats:
            X[col].fillna('NAN', inplace=True)
            X.loc[:,col] = pd.Categorical(X[col], categories=self.cats[col], ordered=True)
            X.loc[:,col] = X[col].astype('category').cat.codes

        # for n,c in X.items():
        #     if n in self.cats:
        #         X[n] = pd.Categorical(c, categories=self.cats[n], ordered=True)
        #         X[n] = X[n].cat.codes + 1
        #     else:
        #         X[n] = c.astype('category').cat.as_ordered()
        if len(self.onehot_cols):
            df_1h = pd.get_dummies(X[self.onehot_cols], dummy_na=True)
            df_drop=X.drop(self.onehot_cols,axis=1)
            return pd.concat([df_drop, df_1h], axis=1)

        return X

In [64]:
cat_pipeline = Pipeline([
    ('cat_encoder', CatEncoder(cols=cate_cols))
])
X_cate = cat_pipeline.fit_transform(df_train)

[]


In [65]:
def add_datepart(df, field_name, prefix=None, drop=True, time=False):
    field = df[field_name]
    if prefix is None:
        prefix = re.sub('[Dd]ate$', '', field_name)
    attr = ['Year', 'Month', 'Week', 'Day', 'Dayofweek', 'Dayofyear', 'Is_month_end', 'Is_month_start', 'Is_quarter_end', 'Is_quarter_start', 'Is_year_end', 'Is_year_start']
    if time: attr = attr + ['Hour', 'Minute', 'Second']
    # Pandas removed `dt.week` in v1.1.10
    week = field.dt.isocalendar().week.astype(field.dt.day.dtype) if hasattr(field.dt, 'isocalendar') else field.dt.week
    for n in attr: df[prefix + n] = getattr(field.dt, n.lower()) if n != 'Week' else week
    mask = ~field.isna()
    df[prefix + 'Elapsed'] = np.where(mask,field.values.astype(np.int64) // 10 ** 9,np.nan)
    if drop: df.drop(field_name, axis=1, inplace=True)
    return df

In [66]:
import re
class Datepart(BaseEstimator, TransformerMixin):
    def __init__(self, cols, time=False):
        self.cols = cols
        self.time = time
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        df_dates = X.loc[:, self.cols]
        for col in self.cols:
            add_datepart(df_dates, col, time=False)
        return df_dates
    
date_pipeline = Pipeline([
    ('datepart', Datepart(cols=date_cols)),
    ('imputer', Imputer(strategy="constant", fill_value=-1)),
])

In [67]:
X_date = date_pipeline.fit_transform(df_train)

In [69]:
y_train = np.log(df_train[target_col])
X_train = pd.concat([X_num, X_cate, X_date], axis=1)
X_train.shape, y_train.shape
# 上面一堆操作其实就是对数字变量进行fillna，用中值，类别变量on-hot，日期变量做统一的标准化

Unnamed: 0,Year built,Lot,Bathrooms,Full bathrooms,Total interior livable area,Total spaces,Garage spaces,Elementary School Score,Elementary School Distance,Middle School Score,...,Last Sold OnDay,Last Sold OnDayofweek,Last Sold OnDayofyear,Last Sold OnIs_month_end,Last Sold OnIs_month_start,Last Sold OnIs_quarter_end,Last Sold OnIs_quarter_start,Last Sold OnIs_year_end,Last Sold OnIs_year_start,Last Sold OnElapsed
0,1969.0,1.0,0.0,2.0,1.0,0.0,0.0,7.0,0.4,5.0,...,-1.0,-1.0,-1.0,False,False,False,False,False,False,-1.000000e+00
1,1926.0,4047.0,2.0,2.0,872.0,1.0,1.0,3.0,0.8,2.0,...,30.0,4.0,242.0,False,False,False,False,False,False,1.567123e+09
2,1958.0,9147.0,3.0,1.0,1152.0,0.0,0.0,6.0,0.5,5.0,...,-1.0,-1.0,-1.0,False,False,False,False,False,False,-1.000000e+00
3,1947.0,6502.0,3.0,3.0,2612.0,0.0,0.0,9.0,0.2,7.0,...,30.0,1.0,243.0,False,False,False,False,False,False,1.472515e+09
4,1967.0,6502.0,2.0,2.0,1566.0,1.0,1.0,6.0,8.5,5.0,...,27.0,0.0,179.0,False,False,False,False,False,False,1.466986e+09
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47434,1965.0,20908.8,2.0,2.0,1432.0,2.0,2.0,3.0,2.4,5.0,...,-1.0,-1.0,-1.0,False,False,False,False,False,False,-1.000000e+00
47435,1999.0,6502.0,2.0,2.0,1560.0,0.0,0.0,5.0,2.4,6.0,...,-1.0,-1.0,-1.0,False,False,False,False,False,False,-1.000000e+00
47436,1919.0,6756.0,2.0,2.0,1860.0,0.0,0.0,7.0,0.6,3.0,...,1.0,4.0,121.0,False,True,False,False,False,False,1.430438e+09
47437,2017.0,5945.0,3.0,2.0,2125.0,4.0,4.0,7.0,0.5,7.0,...,24.0,0.0,237.0,False,False,False,False,False,False,1.598227e+09


In [70]:
X_train

Unnamed: 0,Year built,Lot,Bathrooms,Full bathrooms,Total interior livable area,Total spaces,Garage spaces,Elementary School Score,Elementary School Distance,Middle School Score,...,Last Sold OnDay,Last Sold OnDayofweek,Last Sold OnDayofyear,Last Sold OnIs_month_end,Last Sold OnIs_month_start,Last Sold OnIs_quarter_end,Last Sold OnIs_quarter_start,Last Sold OnIs_year_end,Last Sold OnIs_year_start,Last Sold OnElapsed
0,1969.0,1.0,0.0,2.0,1.0,0.0,0.0,7.0,0.4,5.0,...,-1.0,-1.0,-1.0,False,False,False,False,False,False,-1.000000e+00
1,1926.0,4047.0,2.0,2.0,872.0,1.0,1.0,3.0,0.8,2.0,...,30.0,4.0,242.0,False,False,False,False,False,False,1.567123e+09
2,1958.0,9147.0,3.0,1.0,1152.0,0.0,0.0,6.0,0.5,5.0,...,-1.0,-1.0,-1.0,False,False,False,False,False,False,-1.000000e+00
3,1947.0,6502.0,3.0,3.0,2612.0,0.0,0.0,9.0,0.2,7.0,...,30.0,1.0,243.0,False,False,False,False,False,False,1.472515e+09
4,1967.0,6502.0,2.0,2.0,1566.0,1.0,1.0,6.0,8.5,5.0,...,27.0,0.0,179.0,False,False,False,False,False,False,1.466986e+09
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47434,1965.0,20908.8,2.0,2.0,1432.0,2.0,2.0,3.0,2.4,5.0,...,-1.0,-1.0,-1.0,False,False,False,False,False,False,-1.000000e+00
47435,1999.0,6502.0,2.0,2.0,1560.0,0.0,0.0,5.0,2.4,6.0,...,-1.0,-1.0,-1.0,False,False,False,False,False,False,-1.000000e+00
47436,1919.0,6756.0,2.0,2.0,1860.0,0.0,0.0,7.0,0.6,3.0,...,1.0,4.0,121.0,False,True,False,False,False,False,1.430438e+09
47437,2017.0,5945.0,3.0,2.0,2125.0,4.0,4.0,7.0,0.5,7.0,...,24.0,0.0,237.0,False,False,False,False,False,False,1.598227e+09


接下来就是激动人心的调参时刻

In [71]:
from sklearn.model_selection import ParameterGrid
from sklearn.ensemble import RandomForestRegressor
# 这里用了oob去更好的调参
# 感觉oob其实就有点点像k-fold
model = RandomForestRegressor(oob_score=True, random_state=3, n_jobs=-1)
params ={
    'n_estimators': [200, 300, 400, 500, 600, 700],# [300, 400, 500, 600, 700],
    'min_samples_leaf': [2],# [1, 2, 3, 5, 10, 25],
    'max_features': [0.5],# [None, 0.5, 'sqrt', 'log2'],
    'max_depth': [13],# [5, 6, 7, 8, 10, 15, 20],
    'min_samples_split': [2]# [2, 3, 4]
}

best_score = 0
for g in ParameterGrid(params):
    print(g)
    model.set_params(**g)
    model.fit(X_train, y_train)
    if model.oob_score_ > best_score:
        best_score = model.oob_score_
        best_grid = g
        print('oob:', best_score, best_grid)

{'max_depth': 13, 'max_features': 0.5, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 200}
oob: 0.9408908036288887 {'max_depth': 13, 'max_features': 0.5, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 200}


In [72]:
# 正式的模型
from sklearn.ensemble import RandomForestRegressor
m = RandomForestRegressor(n_jobs=-1, n_estimators=200, oob_score=True, max_depth=17, min_samples_leaf=4, min_samples_split=2, max_features=0.5)
m.fit(X_train, y_train)
m.oob_score_

0.9409949847051191

接下来做特征的筛选，现在特征还是之前的全量特征（79）

In [73]:
def rf_feat_importance(m, df):
    return pd.DataFrame({'cols':df.columns, 'imp':m.feature_importances_}).sort_values('imp', ascending=False)

fi = rf_feat_importance(m, X_train)
fi[:50]

Unnamed: 0,cols,imp
15,Listed Price,0.695555
13,Tax assessed value,0.101928
14,Annual tax amount,0.083738
16,Last Sold Price,0.025286
4,Total interior livable area,0.021632
17,Zip,0.016947
65,Listed OnElapsed,0.007448
2,Bathrooms,0.004254
7,Elementary School Score,0.004138
78,Last Sold OnElapsed,0.003425


In [76]:
del_cols = []
keep_cols = ['Listed Price', 'Tax assessed value', 'Annual tax amount', 'Last Sold Price', 'Total interior livable area', 'Zip']
# 保留重要性前6个的
# 当然这个列表也是可以自定义的

Threshold = 0.0009 
to_keep = fi[fi.imp > Threshold].cols
to_keep = [col for _, col in to_keep.items()]
to_keep # 用上面那个重要性的阈值，筛出来，生成一个我们要保留的列表 # 第一步 留了25个

['Listed Price',
 'Tax assessed value',
 'Annual tax amount',
 'Last Sold Price',
 'Total interior livable area',
 'Zip',
 'Listed OnElapsed',
 'Bathrooms',
 'Elementary School Score',
 'Last Sold OnElapsed',
 'Listed OnYear',
 'Year built',
 'Lot',
 'Type',
 'Last Sold OnYear',
 'Middle School Score',
 'Parking',
 'Bedrooms',
 'High School Distance',
 'Summary',
 'High School',
 'Address',
 'Elementary School Distance',
 'Elementary School',
 'Last Sold OnDayofyear']

In [81]:
# 下面两个循环，其实就是，如果你自定义了要删除的fea，那就删掉，如果没有自定义的，那就添加进去
for col in del_cols:
    if col in to_keep:
        to_keep.remove(col)
for col in keep_cols:
    if col not in to_keep:
        to_keep.append(col)
print(to_keep)
df_keep = X_train[to_keep].copy()

['Listed Price', 'Tax assessed value', 'Annual tax amount', 'Last Sold Price', 'Total interior livable area', 'Zip', 'Listed OnElapsed', 'Bathrooms', 'Elementary School Score', 'Last Sold OnElapsed', 'Listed OnYear', 'Year built', 'Lot', 'Type', 'Last Sold OnYear', 'Middle School Score', 'Parking', 'Bedrooms', 'High School Distance', 'Summary', 'High School', 'Address', 'Elementary School Distance', 'Elementary School', 'Last Sold OnDayofyear']


In [83]:
m1 = RandomForestRegressor(n_jobs=-1, random_state=3, n_estimators=300, oob_score=True, max_depth=13, min_samples_leaf=2, min_samples_split=2, max_features=0.5)
m1.fit(df_keep, y_train)
print(m1.oob_score_)
# 得到一个oob的基线水平
# 不过这里的模型应该是基于这些特征又调参了一次的
# oob对于分类默认是acc，对于回归默认是r2 score，这玩意最好也是1

0.941188402919227


In [84]:
cols = to_keep
scores = []
feats = []
for col in cols:
    tmp = to_keep.copy()
    if col in keep_cols:
        continue
    tmp.remove(col)
    df_tmp = X_train[tmp].copy()
    m1 = RandomForestRegressor(n_jobs=-1, random_state=3, n_estimators=30, oob_score=True, max_depth=13, min_samples_leaf=2, min_samples_split=2, max_features=0.5)
    m1.fit(df_tmp, y_train)
    scores.append(m1.oob_score_)
    feats.append(col)
#     print(col, m1.oob_score_)

to_del = sorted(zip(scores, feats), reverse=True)
to_del

[(0.9377127142393253, 'Elementary School Distance'),
 (0.9376087927105013, 'Bedrooms'),
 (0.9373582809545971, 'Elementary School'),
 (0.9373029414958909, 'Type'),
 (0.9372611693799537, 'Lot'),
 (0.9372424874767573, 'Middle School Score'),
 (0.9372395546718241, 'Address'),
 (0.9372275791233796, 'Last Sold OnElapsed'),
 (0.937218927289895, 'Last Sold OnYear'),
 (0.9372087857382156, 'Elementary School Score'),
 (0.9371580646442932, 'Bathrooms'),
 (0.9371115539132985, 'Last Sold OnDayofyear'),
 (0.9370337608816615, 'Year built'),
 (0.9370204663332201, 'High School Distance'),
 (0.936879075310085, 'Parking'),
 (0.9367790537603086, 'Summary'),
 (0.9365369609077454, 'Listed OnYear'),
 (0.9364896257631361, 'Listed OnElapsed'),
 (0.9361110827041546, 'High School')]

oob其实是这样，删掉某个元素，变化最小的，那这个元素肯定对于整体性能没啥作用，所以我们就可以删掉了，当然这里因为涉及到实际意义，所以删除也不是完全按照oob来的

In [86]:
# 最好提交的特征，18个
to_keep_final=['Listed Price', 'Tax assessed value', 'Last Sold Price', 'Zip', 'Total interior livable area', 'Elementary School Score', 
'Listed OnElapsed', 'Last Sold OnElapsed',
'Full bathrooms', 'Year built', 
'Listed OnYear', 
'Lot', 
'Parking','Type', 'Middle School Score', 'High School Distance', 'Elementary School Distance', 'Bedrooms']
X_train_final = X_train[to_keep_final].copy()

In [88]:
# 2nd pass grid search to determine the final parameters
# 再来调一次
from sklearn.model_selection import ParameterGrid
model = RandomForestRegressor(oob_score=True, random_state=3, n_jobs=-1, max_features=0.5)
params ={
    'n_estimators': [500],# [400, 500, 600, 700, 900, 1000, 1100],
    'min_samples_leaf': [2],# [1, 2, 3, 5, 10, 25],
    'max_features': [0.5],# [0.5, 'sqrt', 'log2'],
    'max_depth': [10],# [5, 6, 7, 8],
    'min_samples_split': [2]# [2, 3, 4]
}

best_score = 0
for g in ParameterGrid(params):
    model.set_params(**g)
    model.fit(X_train_final, y_train)
    if model.oob_score_ > best_score:
        best_score = model.oob_score_
        best_grid = g
        print('best oob:', best_score, best_grid)
        # 这里也可以发现，最好的oob相较于之前其实是下降了一些的，但是我们想要模型有更好的泛化能力，所以一些些下降我们是可以接受的

best oob: 0.9394796224669343 {'max_depth': 10, 'max_features': 0.5, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 500}


In [89]:
# 最好成绩的超参数
model_final = RandomForestRegressor(n_jobs=-1, n_estimators=550, max_depth=17, min_samples_leaf=4, min_samples_split=2, max_features=0.45)
model_final.fit(X_train_final, y_train)

In [90]:
X_test_num = num_pipeline.transform(df_test)
X_test_cate = cat_pipeline.transform(df_test)
X_test_date = date_pipeline.transform(df_test)
df_t = pd.concat([X_test_num, X_test_cate, X_test_date], axis=1)
df_t = df_t[to_keep_final]

In [91]:
pred=model_final.predict(df_t)
df_pred=pd.DataFrame({'Id':df_test['Id'],'Sold Price': np.exp(pred)})
print(df_pred.head())

      Id    Sold Price
0  47439  8.139330e+05
1  47440  4.940148e+05
2  47441  8.317103e+05
3  47442  7.987447e+05
4  47443  1.062152e+06


再上面就是常见的预测啥的了，没啥特殊的