# 7 - Data Retrieval Protocol Helper
This notbook assists filling out "Data Retrieval Protocol.xlsx".
For each source of data we get datafarme which desribe the features (and dumping the results to 'data_retrieval_protocol_helper.csv').
Using this csv for cutting results from it to "Data Retrieval Protocol.xlsx".
This notbook support the following columns:
* "Feature name"
* "Source"
* "Unique count"
* "Min"
* "Max"
* "Notes"


In [1]:
# Import libraries:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

from random import sample
from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype
from pandas.api.types import is_datetime64_any_dtype

In [2]:
def drop_additional_index_columns(_df):
    # dropping additional index columns that start with "Unnamed" - dropping these columns
    columns_to_drop = [x for x in _df.columns.to_list() if x.startswith("Unnamed")]
    print("dropping coulmns: ", columns_to_drop) # [Unamed..., Unamed..]
    return _df.drop(columns=columns_to_drop, axis=1, inplace=False)

In [3]:
def get_features_info(_df, _source):
    features_info_list = []
    for feature_name in _df.columns:
        sum_of_na =  _df[feature_name].isna().sum()
        unique_value = _df[feature_name].unique().tolist()
        smaple_values = sample(unique_value,5  if len(unique_value)> 5 else len(unique_value))
        is_numeric = is_numeric_dtype(_df[feature_name])
        is_string = is_string_dtype(_df[feature_name])
        is_datetime = is_datetime64_any_dtype(_df[feature_name])
        record = {"Feature name": feature_name,
                 "Source": _source,
                 "Unique count": len(_df[feature_name].unique()),
                 "Min": _df[feature_name].min() if is_numeric else "",
                 "Max": _df[feature_name].max() if is_numeric else "",
                 "Notes": {"smaple_values": smaple_values, 
                          "sum_of_na": sum_of_na, 
                          "is_numeric": is_numeric,  
                          "is_string": is_string, 
                          "is_datetime": is_datetime}
                 }
        features_info_list.append(record)
        
    return pd.DataFrame(features_info_list)

### 1. Kaggle Source Data

In [4]:
df_kaggle_flat_file = pd.read_csv("flat_file.csv")
df_kaggle_flat_file = drop_additional_index_columns(df_kaggle_flat_file)
df_kaggle_flat_file.info(verbose=True, show_counts=True)

  interactivity=interactivity, compiler=compiler, result=result)


dropping coulmns:  ['Unnamed: 0']
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 157864 entries, 0 to 157863
Data columns (total 109 columns):
 #    Column                                 Non-Null Count   Dtype  
---   ------                                 --------------   -----  
 0    listing_id                             157864 non-null  int64  
 1    name                                   157451 non-null  object 
 2    target_start_date_period               157864 non-null  object 
 3    target_end_date_period                 157864 non-null  object 
 4    target_num_of_day_in_period            157864 non-null  int64  
 5    target_num_of_booked_days              157864 non-null  int64  
 6    booked_up_target                       157864 non-null  int64  
 7    target_avg_dollar_price_in_period      43919 non-null   float64
 8    start_date_previous_period             157864 non-null  object 
 9    end_date_previous_period               157864 non-null  object 
 10   num_of_d

In [5]:
df_kaggle_flat_file_features_info = get_features_info(df_kaggle_flat_file, "kaggle_flat_table")
df_kaggle_flat_file_features_info.head(5)

Unnamed: 0,Feature name,Source,Unique count,Min,Max,Notes
0,listing_id,kaggle_flat_table,22552,2015.0,29867352.0,"{'smaple_values': [29605459, 23740543, 1282470..."
1,name,kaggle_flat_table,21874,,,{'smaple_values': ['Room in Wilmersdorf (only ...
2,target_start_date_period,kaggle_flat_table,7,,,"{'smaple_values': ['2019-05-01', '2019-07-01',..."
3,target_end_date_period,kaggle_flat_table,8,,,"{'smaple_values': ['2019-04-30', '2019-11-06',..."
4,target_num_of_day_in_period,kaggle_flat_table,5,89.0,100.0,"{'smaple_values': [92, 89, 100, 91, 98], 'sum_..."


### 2. Climatestotravel Berlin Sources Data

In [6]:
df_climatestotravel_berlin_sunshine_hours = pd.read_csv("climatestotravel Berlin - Sunshine hours.csv")
df_climatestotravel_berlin_sunshine_hours = drop_additional_index_columns(df_climatestotravel_berlin_sunshine_hours)
display(df_climatestotravel_berlin_sunshine_hours.info(verbose=True, show_counts=True))

df_climatestotravel_berlin_average_precipitation = pd.read_csv("climatestotravel Berlin - Average precipitation.csv")
df_climatestotravel_berlin_average_precipitation = drop_additional_index_columns(df_climatestotravel_berlin_average_precipitation)
display(df_climatestotravel_berlin_average_precipitation.info(verbose=True, show_counts=True))

df_climatestotravel_berlin_average_temperatures = pd.read_csv("climatestotravel Berlin - Average temperatures.csv")
df_climatestotravel_berlin_average_temperatures = drop_additional_index_columns(df_climatestotravel_berlin_average_temperatures)
display(df_climatestotravel_berlin_average_temperatures.info(verbose=True, show_counts=True))



dropping coulmns:  []
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13 entries, 0 to 12
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Month    13 non-null     object 
 1   Average  13 non-null     float64
 2   Total    13 non-null     int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 440.0+ bytes


None

dropping coulmns:  []
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13 entries, 0 to 12
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Month        13 non-null     object 
 1   Millimeters  13 non-null     int64  
 2   Inches       13 non-null     float64
 3   Days         13 non-null     int64  
dtypes: float64(1), int64(2), object(1)
memory usage: 544.0+ bytes


None

dropping coulmns:  []
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13 entries, 0 to 12
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Month      13 non-null     object 
 1   Min (°C)   13 non-null     float64
 2   Max (°C)   13 non-null     float64
 3   Mean (°C)  13 non-null     float64
 4   Min (°F)   13 non-null     float64
 5   Max (°F)   13 non-null     float64
 6   Mean (°F)  13 non-null     float64
dtypes: float64(6), object(1)
memory usage: 856.0+ bytes


None

In [7]:
df_climatestotravel_berlin_sunshine_hours_info = get_features_info(df_climatestotravel_berlin_sunshine_hours, "climatestotravel_berlin_sunshine_hours")
display(df_climatestotravel_berlin_sunshine_hours_info.head(3))

df_climatestotravel_berlin_average_precipitation_info = get_features_info(df_climatestotravel_berlin_average_precipitation, "climatestotravel_berlin_average_precipitation_")
display(df_climatestotravel_berlin_average_precipitation_info.head(3))

df_climatestotravel_berlin_average_temperatures_info = get_features_info(df_climatestotravel_berlin_average_temperatures, "climatestotravel_berlin_average_temperaturess")
display(df_climatestotravel_berlin_average_temperatures_info.head(3))

Unnamed: 0,Feature name,Source,Unique count,Min,Max,Notes
0,Month,climatestotravel_berlin_sunshine_hours,13,,,"{'smaple_values': ['February', 'June', 'May', ..."
1,Average,climatestotravel_berlin_sunshine_hours,10,1.5,7.5,"{'smaple_values': [1.5, 3.5, 2.5, 4.7, 2.0], '..."
2,Total,climatestotravel_berlin_sunshine_hours,12,40.0,1705.0,"{'smaple_values': [40, 230, 120, 220, 55], 'su..."


Unnamed: 0,Feature name,Source,Unique count,Min,Max,Notes
0,Month,climatestotravel_berlin_average_precipitation_,13,,,"{'smaple_values': ['February', 'July', 'Year',..."
1,Millimeters,climatestotravel_berlin_average_precipitation_,7,35.0,570.0,"{'smaple_values': [55, 40, 60, 35, 70], 'sum_o..."
2,Inches,climatestotravel_berlin_average_precipitation_,7,1.4,22.4,"{'smaple_values': [1.8, 2.8, 1.4, 2.2, 1.6], '..."


Unnamed: 0,Feature name,Source,Unique count,Min,Max,Notes
0,Month,climatestotravel_berlin_average_temperaturess,13,,,"{'smaple_values': ['January', 'November', 'Jun..."
1,Min (°C),climatestotravel_berlin_average_temperaturess,11,-2.0,14.0,"{'smaple_values': [11.0, 12.0, 4.0, 2.0, 9.0],..."
2,Max (°C),climatestotravel_berlin_average_temperaturess,9,3.0,24.0,"{'smaple_values': [4.0, 3.0, 13.0, 24.0, 13.5]..."


### 3. Features Enrichment

In [8]:
df_flat_file_after_feature_enrichment = pd.read_csv("flat_file_after_feature_enrichment.csv")
df_flat_file_after_feature_enrichment = drop_additional_index_columns(df_flat_file_after_feature_enrichment)
df_flat_file_after_feature_enrichment.info(verbose=True, show_counts=True)

  interactivity=interactivity, compiler=compiler, result=result)


dropping coulmns:  ['Unnamed: 0']
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 157864 entries, 0 to 157863
Data columns (total 865 columns):
 #    Column                                                                                                             Non-Null Count   Dtype  
---   ------                                                                                                             --------------   -----  
 0    listing_id                                                                                                         157864 non-null  int64  
 1    name                                                                                                               157451 non-null  object 
 2    target_start_date_period                                                                                           157864 non-null  object 
 3    target_end_date_period                                                                                             157

In [9]:
df_flat_file_after_feature_enrichment_info = get_features_info(df_flat_file_after_feature_enrichment, "flat_file_after_feature_enrichment")
df_flat_file_after_feature_enrichment_info.head(5)

Unnamed: 0,Feature name,Source,Unique count,Min,Max,Notes
0,listing_id,flat_file_after_feature_enrichment,22552,2015.0,29867352.0,"{'smaple_values': [919931, 21694231, 21606590,..."
1,name,flat_file_after_feature_enrichment,21874,,,{'smaple_values': ['Nomad's Nest - Cosy Room i...
2,target_start_date_period,flat_file_after_feature_enrichment,7,,,"{'smaple_values': ['2019-05-01', '2019-02-01',..."
3,target_end_date_period,flat_file_after_feature_enrichment,8,,,"{'smaple_values': ['2019-11-08', '2019-05-31',..."
4,start_date_previous_period,flat_file_after_feature_enrichment,2,,,"{'smaple_values': ['2018-11-07', '2018-11-09']..."


In [10]:
kaggle_flat_file_features = list(df_kaggle_flat_file_features_info['Feature name'].unique())
df_flat_file_after_feature_enrichment_info = df_flat_file_after_feature_enrichment_info[~df_flat_file_after_feature_enrichment_info['Feature name'].isin(kaggle_flat_file_features)]

In [11]:
df_data_retrieval_protocol_helper  = pd.concat([df_kaggle_flat_file_features_info,
           df_climatestotravel_berlin_sunshine_hours_info, 
           df_climatestotravel_berlin_average_precipitation_info,
           df_climatestotravel_berlin_average_temperatures_info,
           df_flat_file_after_feature_enrichment_info])

In [12]:
df_data_retrieval_protocol_helper

Unnamed: 0,Feature name,Source,Unique count,Min,Max,Notes
0,listing_id,kaggle_flat_table,22552,2015,29867352,"{'smaple_values': [29605459, 23740543, 1282470..."
1,name,kaggle_flat_table,21874,,,{'smaple_values': ['Room in Wilmersdorf (only ...
2,target_start_date_period,kaggle_flat_table,7,,,"{'smaple_values': ['2019-05-01', '2019-07-01',..."
3,target_end_date_period,kaggle_flat_table,8,,,"{'smaple_values': ['2019-04-30', '2019-11-06',..."
4,target_num_of_day_in_period,kaggle_flat_table,5,89,100,"{'smaple_values': [92, 89, 100, 91, 98], 'sum_..."
...,...,...,...,...,...,...
860,bedrooms_cat_bedrooms_75%_to_100%,flat_file_after_feature_enrichment,2,0,1,"{'smaple_values': [0, 1], 'sum_of_na': 0, 'is_..."
861,bedrooms_cat_bedrooms_missing,flat_file_after_feature_enrichment,2,0,1,"{'smaple_values': [1, 0], 'sum_of_na': 0, 'is_..."
862,sqrt_bedrooms_cat_sqrt_bedrooms_0%_to_25%,flat_file_after_feature_enrichment,2,0,1,"{'smaple_values': [0, 1], 'sum_of_na': 0, 'is_..."
863,sqrt_bedrooms_cat_sqrt_bedrooms_75%_to_100%,flat_file_after_feature_enrichment,2,0,1,"{'smaple_values': [1, 0], 'sum_of_na': 0, 'is_..."


In [13]:
df_data_retrieval_protocol_helper.to_csv('data_retrieval_protocol_helper.csv')