In [None]:
import os

import matplotlib.pyplot as plt
import pandas as pd

### Exploratory Data Analysis

In [None]:
# Load the training set (path is relative to the notebook's working directory)
# and show the summary statistics produced by DataFrame.describe().
df = pd.read_csv("data/training_set_VU_DM.csv")
display(df.describe())

In [None]:
# Non-null entries per column: a bar far below the total row count marks a
# column with many missing values.
count_non_nan = df.count()
display(count_non_nan)

# Explicit figure/axes instead of the implicit pyplot state, plus a title and
# axis labels so the chart can be read on its own.
fig, ax = plt.subplots(figsize=(15, 8))
count_non_nan.plot.bar(width=0.5, ax=ax)
ax.set(title="Non-null values per column", xlabel="Column", ylabel="Non-null count");

In [None]:
# For each of the two columns: number of non-null entries first, then the
# number of distinct values, one number per line.
for column_name in ("srch_id", "date_time"):
    column_values = df[column_name]
    print(column_values.count())
    print(column_values.nunique())

### This means that we have almost 5 million rows, but these stem from only 199,795 unique searches, so on average about 25 hotels are shown per search.
##### There are indeed 199,795 unique searches, but be aware that the srch_id column has gaps! For example, the first ids are: 1, 4, 6, 8, 11.
##### The number of hotels shown in a search ranges from 5 to 38, and every value in that range occurs (no missing number).

In [None]:
# Non-null prop_id entries, then the number of distinct prop_id values,
# one number per line.
prop_ids = df["prop_id"]
print(prop_ids.count(), prop_ids.nunique(), sep="\n")

In [None]:
# Number of distinct values per column (DataFrame.nunique skips NaN by default).
# The following cells print and plot this Series.
count_unique = df.nunique()

In [None]:
# Show the distinct-value count of every column as plain text.
print(count_unique)

In [None]:
# Distinct values per column, all columns on one shared scale.
# Explicit figure/axes plus title and axis labels so the chart can be read on its own.
fig, ax = plt.subplots(figsize=(15, 8))
count_unique.plot.bar(width=0.5, ax=ax)
ax.set(title="Unique values per column", xlabel="Column", ylabel="Unique values");

In [None]:
# Columns left out of the second chart: ids, timestamps and continuous values.
# Presumably these have so many distinct values that they dwarf the remaining
# columns in the previous chart -- confirm against the printed counts.
high_cardinality_columns = [
    "srch_id", "date_time", "prop_id", "price_usd",
    "srch_query_affinity_score", "orig_destination_distance",
    "gross_bookings_usd", "srch_destination_id", "visitor_hist_adr_usd",
    "prop_location_score2", "comp1_rate_percent_diff",
]
less_unique = count_unique.drop(labels=high_cardinality_columns)

# Explicit figure/axes plus title and axis labels so the chart can be read on its own.
fig, ax = plt.subplots(figsize=(15, 8))
less_unique.plot.bar(width=0.5, ax=ax)
ax.set(
    title="Unique values per column (listed high-cardinality columns excluded)",
    xlabel="Column",
    ylabel="Unique values",
);

### In this dataset we count 129,113 unique hotels

In [None]:
# Load the test set and show the summary statistics produced by DataFrame.describe().
# NOTE: this rebinds `df`. When the notebook runs top to bottom, every later use
# of `df` sees the test set, not the training set that was loaded earlier.
df = pd.read_csv("data/test_set_VU_DM.csv")
display(df.describe())

In [None]:
# Lotte, Thursday 23 April
# TODO: create a smaller dataset, e.g. 1,000 searches, which is roughly 25,000 rows of data

# some handy pandas commands
# df.dtypes
# pd.isnull(df).any()
# df.query('prop_location_score2 > prop_location_score1')

def count_of_column_per_search(df, column='promotion_flag'):
    """
    Count the non-null values of one column within every search.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains a 'srch_id' column and the column to count.
    column : str, default 'promotion_flag'
        Name of the column whose non-null entries are counted per 'srch_id'.

    Returns
    -------
    pandas.DataFrame
        One row per 'srch_id' with the columns ['srch_id', 'count'], ordered
        by ascending count (ties keep ascending 'srch_id' order).

    Raises
    ------
    KeyError
        If 'srch_id' or `column` is not a column of `df`.
    """
    counts = df.groupby('srch_id')[column].count().rename('count')
    # sort_values returns a new object, so the result has to be kept and
    # returned; 'mergesort' is stable and therefore gives a deterministic
    # order for searches with equal counts.
    return counts.sort_values(kind='mergesort').reset_index()


def number_of_hotels_per_search(df, show_plot=True):
    """
    Summarise how many hotels are listed in each search.

    Prints the minimum, the maximum and the sorted distinct values of the
    per-search hotel count, and optionally draws a kernel density estimate of
    its distribution on the current matplotlib axes.
    Useful link: https://towardsdatascience.com/data-visualization-exploration-using-pandas-only-beginner-a0a52eb723d5

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with one row per (search, hotel) pair; needs the columns
        'srch_id' and 'prop_id'.
    show_plot : bool, default True
        Draw the density plot. Pass False to only print and return the counts.

    Returns
    -------
    pandas.Series
        Number of non-null 'prop_id' values per 'srch_id', named 'nr_hotels'.
    """
    nr_hotels = df.groupby('srch_id')['prop_id'].count().rename('nr_hotels')
    if nr_hotels.empty:
        # min()/max() of an empty sequence would raise; report and stop instead.
        print('No searches found')
        return nr_hotels

    # Plain ints keep the printed list readable regardless of the numpy version.
    distinct_counts = sorted(int(value) for value in nr_hotels.unique())
    print(distinct_counts[0])
    print(distinct_counts[-1])
    print(distinct_counts)

    # A density estimate needs some spread in the data; it cannot be computed
    # when every search has the same number of hotels.
    if show_plot and len(distinct_counts) > 1:
        ax = nr_hotels.plot(kind='kde')
        ax.set_xlabel('Number of hotels per search')
        ax.set_title('Distribution of hotels per search')

    return nr_hotels

# number_of_hotels_per_search(df)
def database_correlations(df, method='pearson'):
    """
    Compute the pairwise correlations of the numeric columns and plot them.

    The matrix is passed to `correlation_plot`, which saves the figure under
    the name '<method>_corr'.
    Background reading:
    https://www.datacamp.com/community/tutorials/exploratory-data-analysis-python#comments

    Parameters
    ----------
    df : pandas.DataFrame
        Data to analyse; non-numeric columns are ignored.
    method : {'pearson', 'kendall', 'spearman'}, default 'pearson'
        Correlation method handed to `DataFrame.corr`. Rank-based methods can
        be much slower than Pearson on large frames.
    """
    # Select the numeric (and boolean) columns explicitly: recent pandas
    # versions raise in corr() when the frame still holds non-numeric columns
    # (e.g. a timestamp column read as text) instead of dropping them silently.
    numeric_columns = df.select_dtypes(include=['number', 'bool'])
    correlations = numeric_columns.corr(method=method)
    correlation_plot(correlations, '{}_corr'.format(method))

def correlation_plot(df_corr, figure_name):
    """
    Draw a correlation matrix as a colour-coded image, save it and show it.

    Parameters
    ----------
    df_corr : pandas.DataFrame
        Correlation matrix of any kind (Pearson, Kendall, ...). Its column and
        index labels are used as tick labels.
    figure_name : str
        File name without extension; the figure is written to
        'plots/<figure_name>.jpg' and that path is printed.
    """
    fig = plt.figure(figsize=(19, 19))
    plt.matshow(df_corr, fignum=fig.number)
    # Take the tick labels from the matrix that is drawn, not from the global
    # `df`: the two differ as soon as `df` holds columns that are absent from
    # the correlation matrix, which shifts every label after such a column.
    plt.xticks(range(df_corr.shape[1]), df_corr.columns, fontsize=14, rotation=45)
    plt.yticks(range(df_corr.shape[0]), df_corr.index, fontsize=14)
    colorbar = plt.colorbar()
    colorbar.ax.tick_params(labelsize=14)
    plt.title('Correlation Matrix', fontsize=16)
    fig_name = 'plots/' + str(figure_name) + '.jpg'
    print(fig_name)
    # savefig does not create missing directories.
    os.makedirs(os.path.dirname(fig_name), exist_ok=True)
    plt.savefig(fig_name)
    plt.show()
    
# Plot the correlations of whatever frame `df` is bound to at this point
# (the test set when the notebook is run top to bottom).
database_correlations(df)