In [10]:
import warnings

warnings.filterwarnings('ignore')

In [11]:
import pandas as pd

In [12]:
import insolver
from insolver.frame import InsolverDataFrame
from insolver.transforms import InsolverTransform
from insolver.transforms import DatetimeTransforms
from insolver.feature_engineering import DataPreprocessing
from insolver.model_tools import download_dataset

# Load dataset

Let's load the dataset with `pd.read_csv()` and wrap it in an `InsolverDataFrame`.

In [13]:
# Data source: https://www.kaggle.com/dgomonov/new-york-city-airbnb-open-data
# The CSV is read from the local `datasets/` folder right after the download
# call and wrapped in an InsolverDataFrame.
download_dataset('AB_NYC_2019')
csv_path = "datasets/AB_NYC_2019.csv"
dataset = InsolverDataFrame(pd.read_csv(csv_path))
dataset.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,,,1,365
3,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194
4,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.1,1,0


Information about a loaded dataset.

In [14]:
# Print the frame's class, index range, and each column's non-null count and dtype.
dataset.info()

<class 'insolver.frame.frame.InsolverDataFrame'>
RangeIndex: 48895 entries, 0 to 48894
Data columns (total 16 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              48895 non-null  int64  
 1   name                            48879 non-null  object 
 2   host_id                         48895 non-null  int64  
 3   host_name                       48874 non-null  object 
 4   neighbourhood_group             48895 non-null  object 
 5   neighbourhood                   48895 non-null  object 
 6   latitude                        48895 non-null  float64
 7   longitude                       48895 non-null  float64
 8   room_type                       48895 non-null  object 
 9   price                           48895 non-null  int64  
 10  minimum_nights                  48895 non-null  int64  
 11  number_of_reviews               48895 non-null  int64  
 12  last_review          

Transform the date column using the `DatetimeTransforms` class.

In [15]:
# Apply the datetime transform to the review-date column. The call updates
# `dataset` itself: a `last_review_unix` column shows up in the frame afterwards.
date_columns = ['last_review']
transform = DatetimeTransforms(date_columns)
transform(dataset)

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365,last_review_unix
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365,1.539907e+09
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355,1.558397e+09
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.94190,Private room,150,3,0,,,1,365,
3,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194,1.562285e+09
4,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.10,1,0,1.542586e+09
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48890,36484665,Charming one bedroom - newly renovated rowhouse,8232441,Sabrina,Brooklyn,Bedford-Stuyvesant,40.67853,-73.94995,Private room,70,2,0,,,2,9,
48891,36485057,Affordable room in Bushwick/East Williamsburg,6570630,Marisol,Brooklyn,Bushwick,40.70184,-73.93317,Private room,40,4,0,,,2,36,
48892,36485431,Sunny Studio at Historical Neighborhood,23492952,Ilgar & Aysel,Manhattan,Harlem,40.81475,-73.94867,Entire home/apt,115,10,0,,,1,27,
48893,36485609,43rd St. Time Square-cozy single bed,30985759,Taz,Manhattan,Hell's Kitchen,40.75751,-73.99112,Shared room,55,1,0,,,6,2,


Deleting useless columns.

In [16]:
# Remove the identifier/free-text columns and the raw `last_review` string,
# which the transform above has already converted into `last_review_unix`.
# `errors='ignore'` keeps this cell re-runnable: `dataset` is mutated in place,
# so a second execution would otherwise raise KeyError on the missing labels.
dataset.drop(
    columns=['id', 'name', 'host_id', 'host_name', 'last_review'],
    inplace=True,
    errors='ignore',
)

In [17]:
# Check which columns remain and how many non-null values each one has.
dataset.info()

<class 'insolver.frame.frame.InsolverDataFrame'>
RangeIndex: 48895 entries, 0 to 48894
Data columns (total 12 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   neighbourhood_group             48895 non-null  object 
 1   neighbourhood                   48895 non-null  object 
 2   latitude                        48895 non-null  float64
 3   longitude                       48895 non-null  float64
 4   room_type                       48895 non-null  object 
 5   price                           48895 non-null  int64  
 6   minimum_nights                  48895 non-null  int64  
 7   number_of_reviews               48895 non-null  int64  
 8   reviews_per_month               38843 non-null  float64
 9   calculated_host_listings_count  48895 non-null  int64  
 10  availability_365                48895 non-null  int64  
 11  last_review_unix                38843 non-null  float64
dtypes: float64(4), int64(

In [18]:
# Count the missing values in each column (`isna` is the pandas alias of `isnull`).
dataset.isna().sum()

neighbourhood_group                   0
neighbourhood                         0
latitude                              0
longitude                             0
room_type                             0
price                                 0
minimum_nights                        0
number_of_reviews                     0
reviews_per_month                 10052
calculated_host_listings_count        0
availability_365                      0
last_review_unix                  10052
dtype: int64

# Automated Data Preprocessing

The `DataPreprocessing` class allows you to preprocess data automatically. 
By default, it applies the `AutoFillNA`, `OneHotEncoder` and `Normalization` transformations to the data.  
Any feature engineering method used in this class can be disabled through its corresponding parameter. 
You can also enable `dimensionality reduction`, `sampling`, `smoothing` and `feature selection`, and tune them through the corresponding parameters of this class. 

**Default settings** 

You can call this class without initializing any parameters and it will use the default settings.

In [19]:
# Run the pipeline with every option left at its default value.
preprocessor = DataPreprocessing()
new_dataset = preprocessor.preprocess(df=dataset, target='price')

new_dataset.head()

Unnamed: 0,latitude,longitude,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365,last_review_unix,neighbourhood_group_Bronx,neighbourhood_group_Brooklyn,...,neighbourhood_Willowbrook,neighbourhood_Windsor Terrace,neighbourhood_Woodhaven,neighbourhood_Woodlawn,neighbourhood_Woodrow,neighbourhood_Woodside,room_type_Entire home/apt,room_type_Private room,room_type_Shared room,price
0,-1.493849,-0.437652,-0.293996,-0.320414,-0.676551,-0.034716,1.91625,-0.083459,-0.151071,1.196705,...,-0.004522,-0.056757,-0.042462,-0.015001,-0.004522,-0.069494,-1.040134,1.090893,-0.155887,149
1,0.452436,-0.684639,-0.293996,0.487665,-0.564771,-0.156104,1.840275,0.479481,-0.151071,-0.835628,...,-0.004522,-0.056757,-0.042462,-0.015001,-0.004522,-0.069494,0.961415,-0.91668,-0.155887,225
2,1.468399,0.222497,-0.196484,-0.522433,-0.341211,-0.186451,1.91625,0.47422,-0.151071,-0.835628,...,-0.004522,-0.056757,-0.042462,-0.015001,-0.004522,-0.069494,-1.040134,1.090893,-0.155887,150
3,-0.803398,-0.16445,-0.293996,5.538156,2.236302,-0.186451,0.617065,0.597856,-0.151071,1.196705,...,-0.004522,-0.056757,-0.042462,-0.015001,-0.004522,-0.069494,0.961415,-0.91668,-0.155887,89
4,1.27566,0.177216,0.144807,-0.320414,-0.748879,-0.186451,-0.856865,-0.001912,-0.151071,-0.835628,...,-0.004522,-0.056757,-0.042462,-0.015001,-0.004522,-0.069494,0.961415,-0.91668,-0.155887,80


## Columns types

You can pass lists of column names through the `categorical_columns` and `numerical_columns` parameters.

In [20]:
# Name the categorical columns explicitly; nothing is passed for the numerical ones.
categorical = ['neighbourhood_group', 'neighbourhood', 'room_type']
preprocessor = DataPreprocessing(categorical_columns=categorical)
new_dataset = preprocessor.preprocess(df=dataset, target='price')

new_dataset.head()

Unnamed: 0,latitude,longitude,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365,last_review_unix,neighbourhood_group_Bronx,neighbourhood_group_Brooklyn,...,neighbourhood_Willowbrook,neighbourhood_Windsor Terrace,neighbourhood_Woodhaven,neighbourhood_Woodlawn,neighbourhood_Woodrow,neighbourhood_Woodside,room_type_Entire home/apt,room_type_Private room,room_type_Shared room,price
0,-1.493849,-0.437652,-0.293996,-0.320414,-0.676551,-0.034716,1.91625,-0.083459,-0.151071,1.196705,...,-0.004522,-0.056757,-0.042462,-0.015001,-0.004522,-0.069494,-1.040134,1.090893,-0.155887,149
1,0.452436,-0.684639,-0.293996,0.487665,-0.564771,-0.156104,1.840275,0.479481,-0.151071,-0.835628,...,-0.004522,-0.056757,-0.042462,-0.015001,-0.004522,-0.069494,0.961415,-0.91668,-0.155887,225
2,1.468399,0.222497,-0.196484,-0.522433,-0.341211,-0.186451,1.91625,0.47422,-0.151071,-0.835628,...,-0.004522,-0.056757,-0.042462,-0.015001,-0.004522,-0.069494,-1.040134,1.090893,-0.155887,150
3,-0.803398,-0.16445,-0.293996,5.538156,2.236302,-0.186451,0.617065,0.597856,-0.151071,1.196705,...,-0.004522,-0.056757,-0.042462,-0.015001,-0.004522,-0.069494,0.961415,-0.91668,-0.155887,89
4,1.27566,0.177216,0.144807,-0.320414,-0.748879,-0.186451,-0.856865,-0.001912,-0.151071,-0.835628,...,-0.004522,-0.056757,-0.042462,-0.015001,-0.004522,-0.069494,0.961415,-0.91668,-0.155887,80


If some of the dataset's columns are not listed, a warning is issued.

In [21]:
# Pass both column lists explicitly. `latitude` and `longitude` appear in
# neither list.
numerical = [
    'price',
    'minimum_nights',
    'number_of_reviews',
    'reviews_per_month',
    'calculated_host_listings_count',
    'availability_365',
    'last_review_unix',
]
categorical = ['neighbourhood_group', 'neighbourhood', 'room_type']
preprocessor = DataPreprocessing(numerical_columns=numerical, categorical_columns=categorical)
new_dataset = preprocessor.preprocess(df=dataset, target='price')

new_dataset.head()

Unnamed: 0,latitude,longitude,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365,last_review_unix,neighbourhood_group_Bronx,neighbourhood_group_Brooklyn,...,neighbourhood_Willowbrook,neighbourhood_Windsor Terrace,neighbourhood_Woodhaven,neighbourhood_Woodlawn,neighbourhood_Woodrow,neighbourhood_Woodside,room_type_Entire home/apt,room_type_Private room,room_type_Shared room,price
0,-1.493849,-0.437652,-0.293996,-0.320414,-0.676551,-0.034716,1.91625,-0.083459,-0.151071,1.196705,...,-0.004522,-0.056757,-0.042462,-0.015001,-0.004522,-0.069494,-1.040134,1.090893,-0.155887,149
1,0.452436,-0.684639,-0.293996,0.487665,-0.564771,-0.156104,1.840275,0.479481,-0.151071,-0.835628,...,-0.004522,-0.056757,-0.042462,-0.015001,-0.004522,-0.069494,0.961415,-0.91668,-0.155887,225
2,1.468399,0.222497,-0.196484,-0.522433,-0.341211,-0.186451,1.91625,0.47422,-0.151071,-0.835628,...,-0.004522,-0.056757,-0.042462,-0.015001,-0.004522,-0.069494,-1.040134,1.090893,-0.155887,150
3,-0.803398,-0.16445,-0.293996,5.538156,2.236302,-0.186451,0.617065,0.597856,-0.151071,1.196705,...,-0.004522,-0.056757,-0.042462,-0.015001,-0.004522,-0.069494,0.961415,-0.91668,-0.155887,89
4,1.27566,0.177216,0.144807,-0.320414,-0.748879,-0.186451,-0.856865,-0.001912,-0.151071,-0.835628,...,-0.004522,-0.056757,-0.042462,-0.015001,-0.004522,-0.069494,0.961415,-0.91668,-0.155887,80


## Categorical transformation

`transform_categorical` parameter is the name of the categorical transform method; the values `one_hot_encoder` and `encoder` are supported. If True, `one_hot_encoder` will be used. If False/None, categorical columns won't be transformed. 

`transform_categorical_drop` parameter is the list of categorical columns to not transform.

In [None]:
# Switch the categorical transformation off.
preprocessor = DataPreprocessing(transform_categorical=None)
new_dataset = preprocessor.preprocess(df=dataset, target='price')

new_dataset.head()

In [None]:
# Use the default categorical transform (True) but leave `room_type` untransformed.
categorical_options = {'transform_categorical': True, 'transform_categorical_drop': ['room_type']}
new_dataset = DataPreprocessing(**categorical_options).preprocess(df=dataset, target='price')

new_dataset.head()

## Fill NA values

`fillna` parameter is a bool: if True Auto fill NA will be applied, if False/None it won't be applied.

`fillna_numerical` parameter is the name of the auto fill NA numerical method, values `median`, `mean`, `mode`, `remove` are supported.

`fillna_categorical` parameter is the name of the auto fill NA categorical method, values `frequent`, `new_category`, `imputed_column`, `remove` are supported.

In [25]:
# Disable the automatic NA filling; missing values stay in the result.
preprocessor = DataPreprocessing(fillna=None)
new_dataset = preprocessor.preprocess(df=dataset, target='price')

new_dataset.head()

Unnamed: 0,latitude,longitude,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365,last_review_unix,neighbourhood_group_Bronx,neighbourhood_group_Brooklyn,...,neighbourhood_Willowbrook,neighbourhood_Windsor Terrace,neighbourhood_Woodhaven,neighbourhood_Woodlawn,neighbourhood_Woodrow,neighbourhood_Woodside,room_type_Entire home/apt,room_type_Private room,room_type_Shared room,price
0,-1.493849,-0.437652,-0.293996,-0.320414,-0.692221,-0.034716,1.91625,0.036059,-0.151071,1.196705,...,-0.004522,-0.056757,-0.042462,-0.015001,-0.004522,-0.069494,-1.040134,1.090893,-0.155887,149
1,0.452436,-0.684639,-0.293996,0.487665,-0.591055,-0.156104,1.840275,0.553078,-0.151071,-0.835628,...,-0.004522,-0.056757,-0.042462,-0.015001,-0.004522,-0.069494,0.961415,-0.91668,-0.155887,225
2,1.468399,0.222497,-0.196484,-0.522433,,-0.186451,1.91625,,-0.151071,-0.835628,...,-0.004522,-0.056757,-0.042462,-0.015001,-0.004522,-0.069494,-1.040134,1.090893,-0.155887,150
3,-0.803398,-0.16445,-0.293996,5.538156,1.944025,-0.186451,0.617065,0.661797,-0.151071,1.196705,...,-0.004522,-0.056757,-0.042462,-0.015001,-0.004522,-0.069494,0.961415,-0.91668,-0.155887,89
4,1.27566,0.177216,0.144807,-0.320414,-0.75768,-0.186451,-0.856865,0.110955,-0.151071,-0.835628,...,-0.004522,-0.056757,-0.042462,-0.015001,-0.004522,-0.069494,0.961415,-0.91668,-0.155887,80


In [26]:
# Choose the NA-filling methods: `imputed_column` for categorical columns and
# `mode` for numerical ones.
fillna_options = {'fillna_categorical': 'imputed_column', 'fillna_numerical': 'mode'}
new_dataset = DataPreprocessing(**fillna_options).preprocess(df=dataset, target='price')

new_dataset.head()

Unnamed: 0,latitude,longitude,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365,last_review_unix,neighbourhood_group_Imputed,neighbourhood_Imputed,...,neighbourhood_Willowbrook,neighbourhood_Windsor Terrace,neighbourhood_Woodhaven,neighbourhood_Woodlawn,neighbourhood_Woodrow,neighbourhood_Woodside,room_type_Entire home/apt,room_type_Private room,room_type_Shared room,price
0,-1.493849,-0.437652,-0.293996,-0.320414,-0.555055,-0.034716,1.91625,-0.101411,0.0,0.0,...,-0.004522,-0.056757,-0.042462,-0.015001,-0.004522,-0.069494,-1.040134,1.090893,-0.155887,149
1,0.452436,-0.684639,-0.293996,0.487665,-0.448437,-0.156104,1.840275,0.456164,0.0,0.0,...,-0.004522,-0.056757,-0.042462,-0.015001,-0.004522,-0.069494,0.961415,-0.91668,-0.155887,225
2,1.468399,0.222497,-0.196484,-0.522433,-0.674216,-0.186451,1.91625,0.542145,0.0,0.0,...,-0.004522,-0.056757,-0.042462,-0.015001,-0.004522,-0.069494,-1.040134,1.090893,-0.155887,150
3,-0.803398,-0.16445,-0.293996,5.538156,2.223287,-0.186451,0.617065,0.573411,0.0,0.0,...,-0.004522,-0.056757,-0.042462,-0.015001,-0.004522,-0.069494,0.961415,-0.91668,-0.155887,89
4,1.27566,0.177216,0.144807,-0.320414,-0.624043,-0.186451,-0.856865,-0.020641,0.0,0.0,...,-0.004522,-0.056757,-0.042462,-0.015001,-0.004522,-0.069494,0.961415,-0.91668,-0.155887,80


## Normalization

`normalization` parameter is the name of the normalization method, values `standard`, `minmax`, `robust`, `normalizer`, `yeo-johnson`, `box-cox`, `log` are supported. If True 'standard' will be used. If False/None normalization won't be applied.

`normalization_drop` parameter is the list of columns to not normalize. 

In [27]:
# Skip normalization so the features keep their original scale.
preprocessor = DataPreprocessing(normalization=None)
new_dataset = preprocessor.preprocess(df=dataset, target='price')

new_dataset.head()

Unnamed: 0,latitude,longitude,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365,last_review_unix,neighbourhood_group_Bronx,neighbourhood_group_Brooklyn,...,neighbourhood_Willowbrook,neighbourhood_Windsor Terrace,neighbourhood_Woodhaven,neighbourhood_Woodlawn,neighbourhood_Woodrow,neighbourhood_Woodside,room_type_Entire home/apt,room_type_Private room,room_type_Shared room,price
0,40.64749,-73.97237,1,9,0.21,6,365,1539907000.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,149
1,40.75362,-73.98377,1,45,0.38,2,355,1558397000.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,225
2,40.80902,-73.9419,3,0,0.72,1,365,1558224000.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,150
3,40.68514,-73.95976,1,270,4.64,1,194,1562285000.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,89
4,40.79851,-73.94399,10,9,0.1,1,0,1542586000.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,80


In [28]:
# Min-max normalization for every column except `last_review_unix`.
normalization_options = {'normalization': 'minmax', 'normalization_drop': ['last_review_unix']}
new_dataset = DataPreprocessing(**normalization_options).preprocess(df=dataset, target='price')

new_dataset.head()

Unnamed: 0,latitude,longitude,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365,neighbourhood_group_Bronx,neighbourhood_group_Brooklyn,neighbourhood_group_Manhattan,...,neighbourhood_Windsor Terrace,neighbourhood_Woodhaven,neighbourhood_Woodlawn,neighbourhood_Woodrow,neighbourhood_Woodside,room_type_Entire home/apt,room_type_Private room,room_type_Shared room,last_review_unix,price
0,0.357393,0.511921,0.0,0.014308,0.003419,0.015337,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1539907000.0,149
1,0.614199,0.490469,0.0,0.071542,0.006326,0.003067,0.972603,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1558397000.0,225
2,0.748252,0.569257,0.001601,0.0,0.012139,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1558224000.0,150
3,0.448496,0.535649,0.0,0.429253,0.079159,0.0,0.531507,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1562285000.0,89
4,0.72282,0.565324,0.007206,0.014308,0.001539,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1542586000.0,80


## Feature Selection

`feature_selection` parameter is the name of the feature selection method, values `random_forest`, `mutual_inf`, `chi2`, `f_statistic`, `lasso` and `elasticnet` are supported. If True `random_forest` will be used. If False/None feature selection won't be applied.

`feat_select_task` parameter is the name of the feature selection task, values `reg`, `class`, `multiclass`, `multiclass_multioutput` are supported. If `feature_selection` is True or a str, this parameter must be set.

`feat_select_threshold` parameter is the feature selection threshold, values `mean`, `median` are supported or the threshold can be numeric.

In [29]:
# Feature selection with the default method (True) for a regression task.
selector = DataPreprocessing(feature_selection=True, feat_select_task='reg')
new_dataset = selector.preprocess(df=dataset, target='price')

new_dataset.head()

Unnamed: 0,latitude,longitude,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365,last_review_unix,neighbourhood_group_Manhattan,neighbourhood_Astoria,...,neighbourhood_Clinton Hill,neighbourhood_East Harlem,neighbourhood_Greenwich Village,neighbourhood_Lower East Side,neighbourhood_Midtown,neighbourhood_Tribeca,neighbourhood_Upper East Side,neighbourhood_Upper West Side,room_type_Entire home/apt,price
0,-1.493849,-0.437652,-0.293996,-0.320414,-0.676551,-0.034716,1.91625,-0.083459,-0.891833,-0.136938,...,-0.108798,-0.152902,-0.0899,-0.137788,-0.180636,-0.060276,-0.195388,-0.204949,-1.040134,149
1,0.452436,-0.684639,-0.293996,0.487665,-0.564771,-0.156104,1.840275,0.479481,1.121286,-0.136938,...,-0.108798,-0.152902,-0.0899,-0.137788,5.535996,-0.060276,-0.195388,-0.204949,0.961415,225
2,1.468399,0.222497,-0.196484,-0.522433,-0.341211,-0.186451,1.91625,0.47422,1.121286,-0.136938,...,-0.108798,-0.152902,-0.0899,-0.137788,-0.180636,-0.060276,-0.195388,-0.204949,-1.040134,150
3,-0.803398,-0.16445,-0.293996,5.538156,2.236302,-0.186451,0.617065,0.597856,-0.891833,-0.136938,...,9.191342,-0.152902,-0.0899,-0.137788,-0.180636,-0.060276,-0.195388,-0.204949,0.961415,89
4,1.27566,0.177216,0.144807,-0.320414,-0.748879,-0.186451,-0.856865,-0.001912,1.121286,-0.136938,...,-0.108798,6.540145,-0.0899,-0.137788,-0.180636,-0.060276,-0.195388,-0.204949,0.961415,80


In [30]:
# Mutual-information feature selection for a regression task, using the
# `mean` threshold.
selection_options = {
    'feature_selection': 'mutual_inf',
    'feat_select_task': 'reg',
    'feat_select_threshold': 'mean',
}
new_dataset = DataPreprocessing(**selection_options).preprocess(df=dataset, target='price')

new_dataset.head()

Unnamed: 0,latitude,longitude,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365,last_review_unix,neighbourhood_group_Bronx,neighbourhood_group_Brooklyn,...,neighbourhood_Midtown,neighbourhood_Murray Hill,neighbourhood_Theater District,neighbourhood_Upper East Side,neighbourhood_Upper West Side,neighbourhood_West Village,room_type_Entire home/apt,room_type_Private room,room_type_Shared room,price
0,-1.493849,-0.437652,-0.293996,-0.320414,-0.676551,-0.034716,1.91625,-0.083459,-0.151071,1.196705,...,-0.180636,-0.100093,-0.076974,-0.195388,-0.204949,-0.126324,-1.040134,1.090893,-0.155887,149
1,0.452436,-0.684639,-0.293996,0.487665,-0.564771,-0.156104,1.840275,0.479481,-0.151071,-0.835628,...,5.535996,-0.100093,-0.076974,-0.195388,-0.204949,-0.126324,0.961415,-0.91668,-0.155887,225
2,1.468399,0.222497,-0.196484,-0.522433,-0.341211,-0.186451,1.91625,0.47422,-0.151071,-0.835628,...,-0.180636,-0.100093,-0.076974,-0.195388,-0.204949,-0.126324,-1.040134,1.090893,-0.155887,150
3,-0.803398,-0.16445,-0.293996,5.538156,2.236302,-0.186451,0.617065,0.597856,-0.151071,1.196705,...,-0.180636,-0.100093,-0.076974,-0.195388,-0.204949,-0.126324,0.961415,-0.91668,-0.155887,89
4,1.27566,0.177216,0.144807,-0.320414,-0.748879,-0.186451,-0.856865,-0.001912,-0.151071,-0.835628,...,-0.180636,-0.100093,-0.076974,-0.195388,-0.204949,-0.126324,0.961415,-0.91668,-0.155887,80


    The following specified methods can be used for each individual task:
    - for the classification problem Mutual information, F statistics, chi-squared test, Random Forest, Lasso or ElasticNet can be used;
    - for the regression problem Mutual information, F statistics, Random Forest, Lasso or ElasticNet can be used;
    - for the multiclass classification Random Forest, Lasso or ElasticNet can be used;
    - for the multiclass multioutput classification Random Forest can be used.

## Dimensionality Reduction

`dim_red` parameter is the name of the dimensionality reduction method, values `pca`, `svd`, `lda`, `t_sne`, `isomap`, `lle`, `fa`, `nmf` are supported. If True `pca` will be used. If False/None dimensionality reduction won't be applied.

`dim_red_n_components` parameter is the dimensionality reduction n_components parameter value. If None n_components will be calculated by the model or will be set to the default value = 2.

`dim_red_n_neighbors` is the dimensionality reduction n_neighbors (or perplexity in the `t_sne`) parameter value. If None it will be set to the default value = 5 (for the `t_sne` = 30).

In [31]:
# Dimensionality reduction with the default method (True).
reducer = DataPreprocessing(dim_red=True)
new_dataset = reducer.preprocess(df=dataset, target='price')

new_dataset.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,228,229,230,231,232,233,234,235,236,price
0,-2.190708,1.308096,-0.30873,0.45854,-0.636779,0.34082,1.657023,-1.428328,0.149499,-0.588726,...,0.050217,-0.087921,-6.594826e-15,3.453451e-14,-1.40014e-14,-3.829246e-15,2.988533e-14,-2.245325e-15,2.179277e-14,149
1,2.163053,0.268435,0.890202,-0.299714,1.140298,0.316012,0.25815,0.314884,0.783325,0.013087,...,-0.130753,0.013856,-2.466296e-15,-2.085704e-14,-1.01816e-14,-1.054167e-14,-1.208257e-14,5.345662e-15,-2.85369e-14,225
2,1.655111,-1.502866,-0.410857,0.764132,-1.510103,1.462377,0.767409,1.105174,0.330775,0.335647,...,0.098268,0.107731,3.567789e-15,4.569725e-15,-5.88006e-16,-9.427913e-16,2.348613e-15,-6.461522e-16,-2.948509e-15,150
3,-1.750935,0.678189,1.763665,0.531608,2.995725,2.960885,-2.324265,0.154205,-1.145428,0.638751,...,0.077872,0.030745,5.136324e-16,-3.891473e-15,2.487202e-16,-4.940109e-15,4.125313e-15,2.006196e-17,-1.931875e-15,89
4,2.267672,-0.323253,-0.5377,-0.456349,-0.200454,-0.080353,-1.096058,0.882836,0.627538,0.040246,...,-0.027325,-0.030379,-1.79615e-15,-1.209008e-15,-2.163986e-15,-3.031495e-15,2.208141e-15,-4.617624e-15,9.956773e-16,80


In [32]:
# SVD-based dimensionality reduction with 10 components requested.
reducer = DataPreprocessing(dim_red='svd', dim_red_n_components=10)
new_dataset = reducer.preprocess(df=dataset, target='price')

new_dataset.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,price
0,-2.19221,1.302802,-0.354626,0.425611,0.55568,0.272563,1.585992,0.213666,-0.720737,1.193763,149
1,2.163697,0.268403,0.883395,-0.314877,-1.099801,0.293433,0.508527,1.082885,-0.794702,1.352689,225
2,1.656083,-1.501518,-0.364907,0.750817,1.446799,1.537057,0.956836,0.56083,0.285593,0.477327,150
3,-1.749942,0.672461,1.759121,0.451185,-3.009787,3.02466,-2.153386,-0.34936,-0.055516,1.027727,89
4,2.267798,-0.326196,-0.518138,-0.443911,0.308912,-0.016664,-0.906361,-0.11074,-0.023358,0.241196,80


## Sampling

`sampling` parameter is the name of the sampling method, values `simple`, `systematic`, `cluster`, `stratified` are supported. If True `simple` will be used. If False/None sampling won't be applied. 

`sampling_n` parameter is the sampling n value. If None it will be set to the default value depending on the method.

`sampling_n_clusters` parameter is the sampling number of clusters value.

In [33]:
# Cluster sampling with `sampling_n=2` and 8 clusters.
# NOTE(review): no seed is set in this cell; confirm whether the sampled
# clusters are reproducible across runs.
sampling_options = {'sampling': 'cluster', 'sampling_n': 2, 'sampling_n_clusters': 8}
new_dataset = DataPreprocessing(**sampling_options).preprocess(df=dataset, target='price')

new_dataset.head()

Unnamed: 0,latitude,longitude,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365,last_review_unix,neighbourhood_group_Bronx,neighbourhood_group_Brooklyn,...,neighbourhood_Windsor Terrace,neighbourhood_Woodhaven,neighbourhood_Woodlawn,neighbourhood_Woodrow,neighbourhood_Woodside,room_type_Entire home/apt,room_type_Private room,room_type_Shared room,price,cluster_id
18336,0.125457,-1.201795,5.507955,-0.47754,-0.755455,-0.186451,1.217273,-1.02783,-0.151071,-0.835628,...,-0.056757,-0.042462,-0.015001,-0.004522,-0.069494,0.961415,-0.91668,-0.155887,250,4
18337,-0.12615,-0.104003,-0.147729,-0.297967,-0.623949,-0.186451,-0.811279,-0.143962,-0.151071,1.196705,...,-0.056757,-0.042462,-0.015001,-0.004522,-0.069494,0.961415,-0.91668,-0.155887,150,4
18338,-1.051887,-0.005425,-0.293996,-0.387753,-0.702852,-0.186451,-0.856865,-0.388604,-0.151071,1.196705,...,-0.056757,-0.042462,-0.015001,-0.004522,-0.069494,-1.040134,1.090893,-0.155887,65,4
18339,-0.214542,-0.995323,-0.293996,3.517959,2.519039,-0.186451,0.88298,0.553137,-0.151071,-0.835628,...,-0.056757,-0.042462,-0.015001,-0.004522,-0.069494,0.961415,-0.91668,-0.155887,250,4
18340,-1.296892,-0.615743,-0.147729,1.161064,0.60563,-0.186451,0.784212,0.532092,-0.151071,1.196705,...,17.619112,-0.042462,-0.015001,-0.004522,-0.069494,0.961415,-0.91668,-0.155887,99,4


## Smoothing

`smoothing` parameter is the name of the smoothing method, values `moving_average`, `lowess`, `s_g_filter`, `fft` are supported. If True `moving_average` will be used. If False/None smoothing won't be applied.

`smoothing_column` parameter is the name of the column to smooth.

In [34]:
# Smooth `reviews_per_month` with a moving average; the same column is
# excluded from normalization.
smoothing_options = {
    'normalization_drop': ['reviews_per_month'],
    'smoothing': 'moving_average',
    'smoothing_column': 'reviews_per_month',
}
new_dataset = DataPreprocessing(**smoothing_options).preprocess(df=dataset, target='price')

new_dataset.head()

Unnamed: 0,latitude,longitude,minimum_nights,number_of_reviews,calculated_host_listings_count,availability_365,last_review_unix,neighbourhood_group_Bronx,neighbourhood_group_Brooklyn,neighbourhood_group_Manhattan,...,neighbourhood_Woodhaven,neighbourhood_Woodlawn,neighbourhood_Woodrow,neighbourhood_Woodside,room_type_Entire home/apt,room_type_Private room,room_type_Shared room,reviews_per_month_Moving_Average,reviews_per_month,price
0,-1.493849,-0.437652,-0.293996,-0.320414,-0.034716,1.91625,-0.083459,-0.151071,1.196705,-0.891833,...,-0.042462,-0.015001,-0.004522,-0.069494,-1.040134,1.090893,-0.155887,,0.21,149
1,0.452436,-0.684639,-0.293996,0.487665,-0.156104,1.840275,0.479481,-0.151071,-0.835628,1.121286,...,-0.042462,-0.015001,-0.004522,-0.069494,0.961415,-0.91668,-0.155887,,0.38,225
2,1.468399,0.222497,-0.196484,-0.522433,-0.186451,1.91625,0.47422,-0.151071,-0.835628,1.121286,...,-0.042462,-0.015001,-0.004522,-0.069494,-1.040134,1.090893,-0.155887,,0.72,150
3,-0.803398,-0.16445,-0.293996,5.538156,-0.186451,0.617065,0.597856,-0.151071,1.196705,-0.891833,...,-0.042462,-0.015001,-0.004522,-0.069494,0.961415,-0.91668,-0.155887,,4.64,89
4,1.27566,0.177216,0.144807,-0.320414,-0.186451,-0.856865,-0.001912,-0.151071,-0.835628,1.121286,...,-0.042462,-0.015001,-0.004522,-0.069494,0.961415,-0.91668,-0.155887,,0.1,80
