In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Задача: разработать сервис, который будет предсказывать стоимость домов, основываясь на истории предложений.

In [2]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

from ast import literal_eval
from sklearn.preprocessing import (StandardScaler, MinMaxScaler)
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import (RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor, StackingRegressor)
from sklearn.tree import DecisionTreeRegressor
from catboost import CatBoostRegressor
from xgboost import XGBRegressor

from sklearn.metrics import f1_score
from sklearn.metrics import (mean_squared_error, mean_absolute_error)
from sklearn.metrics import mean_absolute_percentage_error

In [3]:
# fix the RANDOM_SEED value so that the experiments are reproducible:
RANDOM_SEED = 42

In [4]:
data = pd.read_csv('/kaggle/input/diplomdata/data.csv')
data.head()

In [5]:
data.columns

In [6]:
data.info()

Посмотрю на количество пустых значений в столбцах

In [7]:
nan_count=100*data.isna().sum().sort_values(ascending=False)/data.shape[0]
fig=px.bar(x=nan_count.index,y=nan_count.values, labels={"y": "Nan ammount (%)","x": "Feature"})
fig.show()

Самое простое решение для пропущенных значений – отбросить строки или весь столбец. Я использую 80% в качестве значения и отброшу  столбцы, в которых отсутствуют значения, превышающие этот порог.

In [8]:
threshold = 0.8
# Удаление столбцов с коэффициентом пропущенных значений выше порога
data = data[data.columns[data.isnull().mean() < threshold]]

In [9]:
data.info()

# EDA

В дальнейшем, я заккоментирую некоторые строчки кода, которые помогали мне анализировать информацию, так как out в них очень большой по обьему и не нужен для общего понимания хода анализа

# status

In [10]:
data.status.isna().sum()

In [11]:
data.status.value_counts().head(30)

In [12]:
# data.status.unique()

In [13]:
data['status'] = data['status'].apply(lambda x: str(x).lower()) # приведу данные к нижнему регистру

In [14]:
def status_cod(status):
    if 'for sale'in status: return 'for sale'
    elif 'active' in status: return 'active'
    elif 'foreclosure' in status: return 'foreclosure'
    elif 'pending' in status: return 'pending'
    elif 'continue' in status: return 'active'
    elif 'new' in status: return 'new construction'
    elif status == 'nan': return 'No info'
    else: 
        return 'other'
    
data['status'] = data['status'].apply(lambda x: status_cod(x))
data['status'].value_counts().head(10)

# propertyType

In [15]:
data['propertyType'].value_counts()

In [16]:
data['propertyType'] = data['propertyType'].apply(lambda x: str(x).lower()) # приведу данные к нижнему регистру

In [17]:
data['propertyType'].value_counts().head(30)

In [18]:
def property_cod(status):
    if 'single' in status: return 'home'
    elif 'condo' in status: return 'condo'
    elif 'land' in status: return 'land'
    elif 'townhouse' in status: return 'townhouse'
    elif status == 'nan': return 'No info'
    else: 
        return 'other'
    
data['propertyType'] = data['propertyType'].apply(lambda x: property_cod(x))
data['propertyType'].value_counts()

# street

In [19]:
data.street.isna().sum()

In [20]:
data.street.value_counts().head(30)

Сначала приведу данные к нижнему регистру и заменю два пустых значения самыми распостраненными

In [21]:
data['street'] = data['street'].apply(lambda x: str(x).lower())

In [22]:
data['street'].fillna(data['street'].value_counts().head(1), inplace=True)

In [23]:
data.street.value_counts().head(30)

In [24]:
def street_cod(status):
    if 'address not disclosed' in status: return 'unknown address'
    elif 'undisclosed address ' in status: return 'unknown address'
    elif '(undisclosed address)' in status: return 'unknown address'
    elif 'unknown address' in status: return 'unknown address'
    elif 'st' in status: return 'street'
    elif 'street' in status: return 'street'
    elif 'ave' in status: return 'avenue'
    elif 'avenue' in status: return 'avenue'
    elif 'rd' in status: return 'road'
    elif 'road' in status: return 'road'
    elif 'ln' in status: return 'lane'
    elif 'lane' in status: return 'lane'
    elif  'drive' in status: return 'drive'
    elif 'dr' in status: return 'drive'
    elif 'boulevard' in status: return 'boulevard'
    elif 'blvd' in status: return 'boulevard'
    elif 'trl' in status: return 'trail'
    elif 'trail' in status: return 'trail'
    elif 'tr' in status: return 'trail'
    elif 'hwy' in status: return 'way'
    elif 'way' in status: return 'way'
    elif 'circle' in status: return 'circle'
    elif 'cir'  in status: return 'circle'
    elif 'court' in status: return 'court'
    elif 'ct' in status: return 'court'
    elif 'place' in status: return 'place'
    elif 'pl' in status: return 'place'
    else: 
        return 'other'
   
    
data['street'] = data['street'].apply(lambda x: street_cod(x))
data['street'].value_counts()

# baths

In [25]:
data.baths.isna().sum()

In [26]:
data['baths'] = data['baths'].apply(lambda x: str(x).lower()) # приведу данные к нижнему регистру

In [27]:
data['baths'].value_counts().head(30)

In [28]:
data['baths'] = data['baths'].apply(lambda x: x.replace(',','.'))

In [29]:
# data.baths.unique()

In [30]:
  def baths_cod(status):
    if status in ['nan','sq. ft. ','~','-- baths','—','0','0 / 0','0.00', '0.0'] : return 'No info'
    elif status in ['1.000','bathrooms: 1', '1','1.0','1.5','1 ba','1.500','1.250','1.5 baths','1.5+','1.75 baths','1.1 baths','1.75 ba','1.25 baths','0.5 baths','1.5 ba'] : return '1'
    elif status in ['2 baths','2','bathrooms: 2','1.750','2 ba','2.0','2.1 baths','2.5 baths','2.500','2.5','2.000','2.250','2.750','2.5+','2.5 ba','2.75 baths','2.25 baths',
    '2.2 baths','2.25 ba','1-2 baths'] : return '2'
    elif status in ['3.5', '3 baths', '3', '3.0','bathrooms: 3','3 ba','3.5 baths','3.000','3.500','3.5+','3.5 ba','3.1 baths', '3.25 baths','3.2 baths','1-0 / 1-0 / 1'] : return '3'
    elif status in ['4 baths','4.0','4.5','bathrooms: 4','4','4.000','4.5 baths','4 ba','4.5+','4.5 ba','4.75 baths','4.25 baths','1 / 1 / 1 / 1','1 / 1-0 / 1-0 / 1-0'] : return '4'
    elif status in ['bathrooms: 5','5','5.5 baths','5 baths','5.0','5.5','5.5+','5 ba','5.000','5.2 baths', '5.5 ba','5.25 baths', 'bathrooms: semimod', '3-1 / 2-2'] : return '5'
    elif status in [ '6 baths','6.0', 'bathrooms: 6','6','6 ba','6.5','6.5+', '6.5 baths','2-1 / 2-1 / 1-1 / 1-1','6.75 baths']: return '6'
    elif status in ['7 baths','7.0','7.5+','bathrooms: 7','7','7 ba','7.500+', '7.5', '7.5 baths'] : return '7'
    elif status in ['8 baths','8.0','8','bathrooms: 8','8.5+','8 ba','8.5','8.5 baths'] : return '8'
    #elif status in ['9 baths','9','9.0','bathrooms: 9', '9 ba','9.5','9.5+'] : return '9'
    else: 
        return '9 and more'
    
data['baths'] = data['baths'].apply(lambda x: baths_cod(x))
data['baths'].value_counts()

# homeFacts

In [31]:
data['homeFacts']

In [32]:
data['homeFacts'][1]

На мой взгляд, самыми информативными здесь являются колонки с датой постройки и информацией о парковке. Информация о площади и цене в датасете уже есть. В колонках отопление и охлаждение много пропусков и сложно понять, как правильно обобщить эти данные, но если останется время поиграть можно. Пока не буду. 

In [33]:
type(data['homeFacts'])

In [34]:
type(data['homeFacts'][1])

Самое очевидное - попробовать вытащить информацию о годе постройки здания

In [35]:
ser = pd.Series(data['homeFacts']) 

ser.head(10) 

In [36]:
ser[0][35:39]

In [37]:
year = []
for j in range(0,377185):
    year.append(ser[j][35:39])

In [38]:
year[:10]

In [39]:
data['year'] = year

In [40]:
data['year'].value_counts().head(30)

In [41]:
data['year'].unique()

In [42]:
def year_cod(status):
    if "', '" in status: return '0'
    elif 'one,' in status: return '0'
    elif 'No D' in status: return '0'
    elif "1', " in status: return '0'
    else: 
        return status
    
data['year'] = data['year'].apply(lambda x: year_cod(x))
data['year'].value_counts()

Теперь проанализирую  данные о парковке

In [43]:
from ast import literal_eval
def funk(x):
    homefact_dict = literal_eval(x)
    homefact_lst = homefact_dict['atAGlanceFacts'][4].get('factValue')
    if homefact_lst:
        return str(homefact_lst)
    else:
        return np.nan
Parking = list(data[:].homeFacts.apply(funk))
Parking[:10]

In [44]:
data['Parking'] = Parking  

In [45]:
data['Parking'] = data['Parking'].apply(lambda x: str(x).lower()) # приведу данные к нижнему регистру

In [46]:
data['Parking'].value_counts().head(30)

In [47]:
def parking_cod(status):
    if '1' in status: return 'one'
    elif '2' in status: return 'two'
    elif '3' in status: return 'three'
    elif '4' in status: return 'four'
    elif 'attached' in status: return 'attached'
    elif 'detached' in status: return 'detached'
    elif 'carport' in status: return 'carport'
    elif 'no data' in status: return 'no info'
    elif '5' in status: return 'five'
    elif 'parking' in status: return 'parking'
    elif 'on street' in status: return 'on street'
    elif 'off street' in status: return 'off street'
    elif 'none' in status: return 'zero'
    elif '0' in status: return 'zero'
    elif 'garage' in status: return 'one'
    elif 'nan' in status: return 'no info'
    else: 
        return 'other'
    
data['Parking'] = data['Parking'].apply(lambda x: parking_cod(x))
data['Parking'].value_counts()

In [48]:
#print (data['Parking'].unique().tolist())

# fireplace

In [49]:
data.fireplace.isna().sum()

In [50]:
data['fireplace'] = data['fireplace'].apply(lambda x: str(x).lower()) # приведу данные к нижнему регистру

In [51]:
data['fireplace'].value_counts().head(40)

In [52]:
def fireplace_cod(status):
    if '1' in status: return 'one'
    elif '2' in status: return 'two'
    elif '3' in status: return 'three'
    elif '4' in status: return 'four'
    elif '5' in status: return 'yes'
    elif 'one' in status: return 'one'
    elif 'gas' in status: return 'gas'
    elif 'yes' in status: return 'yes'
    elif 'wood' in status: return 'wood'
    elif 'nan' in status: return 'No info'
    elif 'not applicable' in status: return 'zero'
    elif 'no' in status: return 'zero'
    elif '0' in status: return 'zero'
    elif 'woodburning' in status: return 'wood'
    else: 
        return 'other'
    
data['fireplace'] = data['fireplace'].apply(lambda x: fireplace_cod(x))
data['fireplace'].value_counts()

# city

In [53]:
data.city.isna().sum()

In [54]:
data['city'].value_counts().head(30)

так как пропусков не много, заменю их самым распространенным значением

In [55]:
data['city'] = data['city'].replace(np.nan, 'Houston')

In [56]:
data.city.isna().sum()

# schools

In [57]:
data['schools']

In [58]:
type(data['schools'])

In [59]:
data['schools'][1]

здесь наиболее интересными данными мне кажутся рейтинг школы и расстояние до школы. Попробую вычислить средний рейтинг и среднее расстояние.

In [60]:
def funk(x):
    Distance = literal_eval(x)
    Distance_list = Distance[0]['data'].get('Distance')
    if Distance_list:
        return Distance_list
    else:
        return '0'
    
Distance = list(data[:].schools.apply(funk))
Distance[:10]

In [61]:
data['Distance'] = Distance

In [62]:
data['Distance'][:10]

In [63]:
average_distance_school = []

In [64]:
for x in range(0,377185):
   
    l = data['Distance'][x]
    l1=[]
    for x in l:
        x =str(x).replace('mi','').replace('[','').replace(']','')
        l1.append(x)
    floatlist = [float(elem) for elem in l1]
    average_distance_school.append(sum(floatlist)/len(floatlist))
    
       

In [65]:
average_distance_school[:10]

In [66]:
len(average_distance_school)

In [67]:
data['average_distance_school'] = average_distance_school
data['average_distance_school'] = data['average_distance_school'].apply(lambda x: round(x,2))

In [68]:
data['average_distance_school'].value_counts().head(30)

In [69]:
data.average_distance_school.isna().sum()

In [70]:
def funk(x):
    rating = literal_eval(x)
    rating_list = rating[0].get('rating')
    if rating_list:
        return rating_list
    else:
        return '0'
    
rating_school = list(data[:].schools.apply(funk))
rating_school[:10]

In [71]:
data['rating_school'] = rating_school

In [72]:
data['rating_school'][:10]

In [73]:
 average_rating_school = []

In [74]:
for x in range(0,377185):
   
    m = data['rating_school'][x]
    m1=[]
    for x in m:
        x =str(x).replace('1/10','1').replace('2/10','2').replace('4/10','4').replace('5/10','5').replace('6/10','6').replace('7/10','7').replace('8/10','8').replace('9/10','9').replace('10/10','10').replace('None/10','0').replace('NA','0').replace('NR','0').replace('3/10','3').replace('[','').replace(']','').replace('','0')
        m1.append(x)
    floatlist = [float(elem) for elem in m1]
    average_rating_school.append((sum(floatlist)/len(floatlist))/10)

In [75]:
data['average_rating_school'] = average_rating_school
data['average_rating_school'] = data['average_rating_school'].apply(lambda x: round(x,2))

In [76]:
data['average_rating_school'].value_counts()

In [77]:
#data.nlargest(10, ['average_rating_school'])

# sqft

In [78]:
data.sqft.isna().sum()

In [79]:
data['sqft'].value_counts().head(30)

In [80]:
data['sqft'] = data['sqft'].apply(lambda x: str(x).replace('sqft','').replace(',','.'))

In [81]:
data['sqft'] = data['sqft'].apply(lambda x: str(x).replace('.',''))

In [82]:
data['sqft'] = data['sqft'].apply(lambda x: str(x).replace('--','0'))

In [83]:
data['sqft'].value_counts().head(30)

In [84]:
data['sqft'] = data['sqft'].apply(lambda x: str(x).replace('Total interior livable area: ','').replace('nan','0').replace('456602479', '4566').replace('610-840 ', '840'))
data['sqft'].value_counts()

In [85]:
#print (data['sqft'].unique().tolist())

# zipcod

In [86]:
data['zipcode'].isna().sum()

In [87]:
data['zipcode'].value_counts().head(30)

некоторые индексы состоят из двух частей, разделенных дефисом. Чтобы перевести колонку в числовой тип, оставлю везде только первую часть кода

In [88]:
data['zipcode'] = [x[:5] for x in data['zipcode']] 

Заменю пустые значения самым распространенным

In [89]:
data['zipcode'] = data['zipcode'].apply(lambda x: str(x).replace('--','32137'))

In [90]:
#print (data['zipcode'].unique().tolist())

Пропусков нет, столбец с данными почтового индекса, оставим

 # beds

In [91]:
data['beds'].isna().sum()

In [92]:
data['beds'].value_counts().head(30)

In [93]:
def beds_cod(status):
    if '1' in status: return 'one'
    elif '2' in status: return 'two'
    elif '3' in status: return 'three'
    elif '4' in status: return 'four'
    elif '5' in status: return 'five'
    elif '6' in status: return 'six'
    elif '7' in status: return 'seven'
    elif '8' in status: return 'eight'
    elif '9' in status: return 'nine'
    elif 'nan' in status: return 'No info'
    elif 'Baths' in status: return 'No info'
    elif '--' in status: return 'No info'
    elif '0' in status: return 'zero'
    elif '0.0' in status: return 'zero'
    elif 'acres' in status: return 'one'
    else: 
        return 'more 9'
    
data['beds'] = data['beds'].astype(str).apply(lambda x:beds_cod(x))
data['beds'].value_counts()

# state

In [94]:
data['state'].isna().sum()

In [95]:
data['state'].value_counts().head(30)

# stories

In [96]:
data['stories'].isna().sum()

In [97]:
data['stories'].value_counts().head(30)

In [98]:
data['stories'] = data['stories'].apply(lambda x: str(x).lower()) # приведу данные к нижнему регистру

In [99]:
def stories_cod(status):
    if '1' in status: return 'one'
    elif '2' in status: return 'two'
    elif '3' in status: return 'three'
    elif '4' in status: return 'four'
    elif '5' in status: return 'five'
    elif '6' in status: return 'six'
    elif 'nan' in status: return 'No info'
    elif 'one' in status: return 'one'
    elif 'two' in status: return 'two'
    elif 'three' in status: return 'three'
    elif 'four' in status: return 'four'
    elif 'five' in status: return 'five'
    elif 'six' in status: return 'six'
    elif '0' in status: return 'zero'
    elif '0.0' in status: return 'zero'
    else: 
        return 'other'
    
data['stories'] = data['stories'].astype(str).apply(lambda x:stories_cod(x))
data['stories'].value_counts()

# MlsId

In [100]:
data['MlsId'].value_counts().head(30)

Опять данные об адресе. Удалим

In [101]:
data.drop('MlsId', axis=1, inplace=True)

# target

In [102]:
data['target'].value_counts()

In [103]:
#print (data['target'].unique().tolist())

In [104]:
data['target'] = data['target'].apply(lambda x: str(x).replace('$',''))
data['target'] = data['target'].apply(lambda x: str(x).replace('+',''))
data['target'] = data['target'].apply(lambda x: str(x).replace(',',''))
data['target'] = data['target'].apply(lambda x: str(x).replace('/mo','000'))
data['target'] = data['target'].apply(lambda x: str(x).replace('1,215 - 1,437','1300000'))
data['target'] = data['target'].apply(lambda x: str(x).replace('nan','225000'))
data['target'] = data['target'].apply(lambda x: str(x).replace('1215 - 1437000','1437000'))

In [105]:
data['target'].value_counts()

Удалю ненужные столбцы

In [106]:
data.drop('schools', axis=1, inplace=True)
data.drop('homeFacts', axis=1, inplace=True)
data.drop('Distance', axis=1, inplace=True)
data.drop('rating_school', axis=1, inplace=True)

In [107]:
data.head()

In [108]:
data.info()

In [109]:
df = data.copy()

Несколько столбцов можно перевести из  object в int

In [110]:
df['average_rating_school'] = df['average_rating_school'].astype(int)
df['year'] = df['year'].astype(int)
df['target'] = df['target'].astype(int)
df['sqft'] = df['sqft'].astype(int)
df['zipcode'] = df['zipcode'].astype(int)

In [111]:
df.info()

Кодирую оставшиеся категориальные значения. Для столбцов city и state, содержащих очень много различных значений я пробовала One Hot Encoding, но при этом ноутбук вылетает из-за превышения размера памяти. 

In [112]:
df.status = df.status.astype('category').cat.codes
df.propertyType = df.propertyType.astype('category').cat.codes
df.street = df.street.astype('category').cat.codes
df.baths = df.baths.astype('category').cat.codes
df.fireplace = df.fireplace.astype('category').cat.codes
df.city = df.city.astype('category').cat.codes
df.sqft = df.sqft.astype('category').cat.codes
#df.zipcode = df.zipcode.astype('category').cat.codes
df.beds = df.beds.astype('category').cat.codes
df.stories = df.stories.astype('category').cat.codes
df.state = df.state.astype('category').cat.codes
df.Parking = df.Parking.astype('category').cat.codes

In [113]:
df.head()

In [114]:
#Выгрузим датасет для работы на локальной машине 
df.to_csv("/kaggle/working/df.csv")

# ML models

In [115]:
X = df.drop('target',axis = 1)
y = df['target']

Разбиваю данные на test и train

In [116]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [117]:
# пробовала стандартизировать числовые признаки. На бустинг моделях метрики остались такими же,  в RandomForest чуть хуже, а в ExtraTrees чуть лучше.
# Метрики StackingRegressor лучше со стандартизацией, поэтому буду ее использовать
num_cols=[]

num_cols += ['sqft','average_distance_school', 'year', 'average_rating_school', 'zipcode']

scaler = MinMaxScaler()

X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])

## Model 1:  "Наивная" модель

В качестве нивной модели буду использовать линейную регрессию

In [118]:

naiv_model = LinearRegression()
naiv_model.fit(X_train, y_train)
naiv_predict = naiv_model.predict(X_test)
 

print(f"Точность наивной модели по метрике MAE: {(mean_absolute_error(y_test, naiv_predict)):0.2f}")
print(f"Точность наивной модели по метрике MAPE: {(mean_absolute_percentage_error(y_test, naiv_predict)):0.2f}%")
print(f"Точность наивной модели по метрике RMSE: {(np.sqrt(mean_squared_error(y_test, naiv_predict))):0.2f}")

# # попробую логарифмировние 
naiv_model_log = LinearRegression()
naiv_model_log.fit(X_train, np.log(y_train))
naiv_predict_log = np.exp(naiv_model_log.predict(X_test))


print(f"Точность наивной модели по метрике log MAE: {(mean_absolute_error(y_test, naiv_predict_log)):0.2f}")
print(f"Точность наивной модели по метрике log MAPE: {(mean_absolute_percentage_error(y_test, naiv_predict_log)):0.2f}%")
print(f"Точность наивной модели по метрике log RMSE: {(np.sqrt(mean_squared_error(y_test, naiv_predict_log))):0.2f}")

В дальнейшем, модели с логарифмированием целевой переменной показали лучший результат, поэтому out без логарифмирования показывать не буду

# Model 2 : Random Forest

In [119]:
#rf = RandomForestRegressor(random_state=RANDOM_SEED, n_jobs=-1, verbose=1)
#rf.fit(X_train, y_train)
#predict_rf = rf.predict(X_test)

#print(f"The MAPE mertics of the Random Forest model is: {(mean_absolute_percentage_error(y_test, predict_rf)):0.2f}%.")


rf_log = RandomForestRegressor(random_state=RANDOM_SEED, n_jobs=-1, verbose=1)
rf_log.fit(X_train, np.log(y_train))
predict_rf_log = np.exp(rf_log.predict(X_test))
predict_rf_x = np.exp(rf_log.predict(X_train))

print(f"The MAE train Random Forest model is : {(mean_absolute_error(y_train, predict_rf_x)):0.2f}.")
print(f"The MAE test  Random Forest model is : {(mean_absolute_error(y_test, predict_rf_log)):0.2f}.")

print(f"The MAPE train Random Forest model is : {(mean_absolute_percentage_error(y_train, predict_rf_x)):0.2f}%.")
print(f"The MAPE test  Random Forest model is : {(mean_absolute_percentage_error(y_test, predict_rf_log)):0.2f}%.")

print(f"The RMSE train Random Forest model is : {(np.sqrt(mean_squared_error(y_train, predict_rf_x))):0.2f}.")
print(f"The RMSE test Random Forest model is : {(np.sqrt(mean_squared_error(y_test, predict_rf_log))):0.2f}.")

The MAE mertic for the log Random Forest model is : 202091.55

The MAPE mertic for the log Random Forest model is : 3.45%.

The RMSE mertic for the log Random Forest model is : 1389585.58.

# Model 3 : ExtraTreesRegressor

In [120]:
#etr = ExtraTreesRegressor(random_state=RANDOM_SEED, n_jobs=-1, verbose=1)
#etr.fit(X_train, y_train)
#predict_etr = etr.predict(X_test)

#print(f"The MAPE mertics of the ExtraTrees model is: {(mean_absolute_percentage_error(y_test, predict_etr)):0.2f}%.")


etr_log = ExtraTreesRegressor(random_state=RANDOM_SEED, n_jobs=-1, verbose=1)
etr_log.fit(X_train, np.log(y_train))
predict_etr_log = np.exp(etr_log.predict(X_test))
predict_etr_x = np.exp(etr_log.predict(X_train))

print(f"The MAE train ExtraTrees model is : {(mean_absolute_error(y_train, predict_etr_x)):0.2f}.")
print(f"The MAE test ExtraTrees model is : {(mean_absolute_error(y_test, predict_etr_log)):0.2f}.")

print(f"The MAPE train ExtraTrees model is : {(mean_absolute_percentage_error(y_train, predict_etr_x)):0.2f}%.")
print(f"The MAPE test ExtraTrees model is : {(mean_absolute_percentage_error(y_test, predict_etr_log)):0.2f}%.")

print(f"The RMSE train ExtraTrees model is : {(np.sqrt(mean_squared_error(y_train, predict_etr_x))):0.2f}.")
print(f"The RMSE test ExtraTrees model is : {(np.sqrt(mean_squared_error(y_test, predict_etr_log))):0.2f}.")

The MAE mertic for the log ExtraTrees model is : 208563.76.

The MAPE mertic for the log ExtraTrees model is : 4.08%.

The RMSE mertic for the log ExtraTrees model is : 1358773.31.

# Model 4 : CatBoost

In [121]:
#cat_boost = CatBoostRegressor(iterations = 10000,
#                          random_seed = RANDOM_SEED,
#                          eval_metric='MAPE',
#                          custom_metric=['R2', 'MAE'],
#                          silent=True,
#                         )
#cat_boost.fit(X_train, y_train,
#          eval_set=(X_test, y_test),
#          verbose_eval=0,
#          use_best_model=True,
#          )

#cat_boost.save_model('catboost_single_model_baseline.model')


#predict_cat_boost = cat_boost.predict(X_test)


#print(f"The MAPE mertic for the default CatBoost model: {(mean_absolute_percentage_error(y_test, predict_cat_boost)):0.2f}%")

The MAPE mertic for the default CatBoost model: 3.16%

In [122]:
log_cat_boost = CatBoostRegressor(iterations = 10000,
                          random_seed = RANDOM_SEED,
                          eval_metric='MAPE',
                          custom_metric=['R2', 'MAE'],
                          silent=True,
                         )
log_cat_boost.fit(X_train, np.log(y_train),
          eval_set=(X_test, np.log(y_test)),
          verbose_eval=0,
          use_best_model=True,
          )

log_cat_boost.save_model('catboost_single_model_baseline.model')


log_predict_cat_boost = log_cat_boost.predict(X_test)
predict_cat_boost_x = np.exp(log_cat_boost.predict(X_train))

print(f"The MAE train CatBoost model: {(mean_absolute_error(y_train, predict_cat_boost_x)):0.2f}.")
print(f"The MAE test CatBoost model: {(mean_absolute_error(y_test, log_predict_cat_boost)):0.2f}")

print(f"The MAPE train CatBoost model: {(mean_absolute_percentage_error(y_train, predict_cat_boost_x)):0.2f}%.")
print(f"The MAPE test CatBoost model: {(mean_absolute_percentage_error(y_test, log_predict_cat_boost)):0.2f}%")

print(f"The RMSE train CatBoost model: {(np.sqrt(mean_squared_error(y_train, predict_cat_boost_x))):0.2f}.")
print(f"The RMSE test CatBoost model: {(np.sqrt(mean_squared_error(y_test, log_predict_cat_boost))):0.2f}")

The MAE mertic for the CatBoost model: 644162.03

The MAPE mertic for the CatBoost model: 1.00%

The RMSE mertic for the CatBoost model: 2050654.88

# Model 5: GradientBoosting

In [123]:
gb = GradientBoostingRegressor(random_state=42)
gb.fit(X_train, np.log(y_train))
predict_gb = gb.predict(X_test)
predict_gb_x = gb.predict(X_train)

print(f"The MAE train GradientBoosting model: {(mean_absolute_error(y_train, predict_gb_x)):0.2f}.")
print(f"The MAE test GradientBoosting model: {(mean_absolute_error(y_test, predict_gb)):0.2f}")

print(f"The MAPE train GradientBoosting model: {(mean_absolute_percentage_error(y_train, predict_gb_x)):0.2f}%.")
print(f"The MAPE test GradientBoosting model: {(mean_absolute_percentage_error(y_test, predict_gb)):0.2f}%")

print(f"The RMSE train GradientBoosting model: {(np.sqrt(mean_squared_error(y_train, predict_gb_x))):0.2f}.")
print(f"The RMSE test GradientBoosting model: {(np.sqrt(mean_squared_error(y_test, predict_gb))):0.2f}")

The MAE mertic for GradientBoosting model: 644162.03

The MAPE mertic for GradientBoosting model: 1.00%

The RMSE mertic for GradientBoosting model: 2050654.99

# Model 6 : StackingRegressor

Попробую комбинацию из моделей, показавших лучшие метрики

In [124]:
estimators = [
     ('cat_boost', CatBoostRegressor(iterations = 10000,
                          random_seed = RANDOM_SEED,
                          eval_metric='MAPE',
                          custom_metric=['R2', 'MAE'],
                          silent=True
                         )),
     ('gb', GradientBoostingRegressor(random_state=RANDOM_SEED))
]

sr_log = StackingRegressor(
    estimators=estimators,
    final_estimator=RandomForestRegressor(random_state=RANDOM_SEED, n_jobs=-1, verbose=1)
)

# # For training, fit() is used
sr_log.fit(X_train, np.log(y_train))

# # For MAPE metric (or any other), we need the predictions of the model
y_pred = np.exp(sr_log.predict(X_test))

print(f"The MAE mertic for the default StackingRegressor model: {(mean_absolute_error(y_test, y_pred)):0.2f}")
print(f"The MAPE mertic for the default StackingRegressor model: {(mean_absolute_percentage_error(y_test, y_pred)):0.2f}%")
print(f"The RMSE mertic for the default StackingRegressor model: {(np.sqrt(mean_squared_error(y_test, y_pred))):0.2f}")

The MAE mertic for the default StackingRegressor model: 246650.54

The MAPE mertic for the default StackingRegressor model: 2.27%

The RMSE mertic for the default StackingRegressor model: 1438371.60

# Итог

В ходе выполнения проекта:

Загружены данные, произведена их обработка;

Заполнены пропуски;

Очищены nan;

Созданы новые признаки на основе имеющихся;

Построены различные модели классического ML для решения задачи регрессии;

Для каждой построенной модели получены соответствующие метрики;

Построена сводная модель, которая улучшила значение показателей и предотвратила переобучение.


В результате финальные метрики:

MAE  246650.54

MAPE 2.27%

RMSE  1438371.60