In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats
from scipy.stats import shapiro

#Load the CSV file downloaded from the CSO.ie database.
#index_col=False tells pandas not to use the first column as the index.
df = pd.read_csv(filepath_or_buffer='cso_data.csv', index_col=False)

In [None]:
#To get the summary information of the dataset (columns, dtypes, non-null counts)
df.info()

In [None]:
#Shows the first 5 rows of the dataset
df.head()

In [None]:
#Remove the 'Statistic Label' column
df = df.drop('Statistic Label', axis=1)

In [None]:
#Reorganizing the dataset
#VALUE is read as five consecutive blocks of 120 rows; each block is copied
#into its own new column.
#NOTE(review): assumes the blocks appear in exactly this order -- confirm against the raw file.
#Each block is re-indexed from 0, and column assignment aligns on the index,
#so only rows 0-119 receive values; any later row gets NaN in the new columns.
rows_per_block = 120
new_columns = [
    'Number of Overnight Trips by Foreign Visitors (Thousand)',
    'Percentage of Overnight Trips by Foreign Visitors (%)',
    'Number of Nights by Foreign Visitors (Thousand)',
    'Percentage of Nights by Foreign Visitors (%)',
    'Average Length of Stay of Foreign Visitors (Nights per trip)',
]
for position, column_name in enumerate(new_columns):
    first_row = position * rows_per_block
    block = df['VALUE'].iloc[first_row:first_row + rows_per_block]
    df[column_name] = block.reset_index(drop=True)


In [None]:
#Shows the first 120 rows, i.e. the rows that received values in the new columns
df.head(120)

In [None]:
#Remove the VALUE and UNIT columns and, in the same call, every row from
#position 120 onwards (only the first 120 rows are kept)
rows_to_discard = df.index[120:]
df = df.drop(index=rows_to_discard, columns=['VALUE', 'UNIT'])

In [None]:
#Requests up to 125 rows; the previous step keeps at most 120, so this shows the whole frame
df.head(125)

In [None]:
#Split "Month" on the space: the first part goes to "Year", the second part stays in "Month"
year_and_month = df['Month'].str.split(' ', expand=True)
df[['Year', 'Month']] = year_and_month

#Convert Year from text to a number: parse it as a %Y date, then keep only the year
df['Year'] = pd.to_datetime(df['Year'], format='%Y').dt.year

In [None]:
#Shows the first 5 rows to check the new "Year" and "Month" columns
df.head()

In [None]:
#Column dtypes and non-null counts of the reorganized dataset
df.info()

In [None]:
#Descriptive statistics (count, mean, std, min, quartiles, max) of the dataset
df.describe()

In [None]:
#Remove duplicated rows, reporting the shape of the dataset before and after
print(f"Set size before dropping redundant observations: {df.shape}")
df = df.drop_duplicates()
print(f"Set size after dropping redundant observations: {df.shape}")


In [None]:
#Write the modified dataset to a new CSV file, leaving the row index out
df.to_csv(path_or_buf='new_cso_data.csv', index=False)

In [None]:
#To filter the data by excluding the rows whose "Main Accommodation Type"
#is "All types of accommodation"
df_filtered = df[df['Main Accommodation Type'] != 'All types of accommodation']


def plot_distribution(values, axis_label, hist_title, qq_title):
    """Show a histogram (with KDE) and a normal Q-Q plot for one feature.

    Parameters
    ----------
    values : array-like
        Observations of the feature to inspect.
    axis_label : str
        Label of the histogram's x-axis.
    hist_title : str
        Title of the histogram figure.
    qq_title : str
        Title of the Q-Q figure.

    Returns
    -------
    None
        Both figures are shown with plt.show().
    """
    #Creates a histogram
    plt.figure(figsize=(10, 6))
    sns.histplot(values, bins=30, kde=True)
    plt.title(hist_title)
    plt.xlabel(axis_label)
    plt.ylabel('Frequency')
    plt.show()

    #Creates a Q-Q plot against the normal distribution
    plt.figure(figsize=(10, 6))
    stats.probplot(values, dist="norm", plot=plt)
    plt.title(qq_title)
    plt.show()


#To verify the distribution, creates a histogram and a Q-Q plot of
#Number of overnight trips and Number of nights stayed
feature = 'Number of Overnight Trips by Foreign Visitors (Thousand)'
data = df_filtered[feature].values
plot_distribution(
    data,
    feature,
    'Histogram of Number of Overnight Trips by Foreign Visitors',
    'Q-Q Plot: Comparing Number of Overnight Trips to Normal Distribution',
)

feature2 = 'Number of Nights by Foreign Visitors (Thousand)'
data2 = df_filtered[feature2].values
plot_distribution(
    data2,
    feature2,
    'Histogram of Number of Nights Stayed by Foreign Visitors',
    'Q-Q Plot: Comparing Number of Nights to Normal Distribution',
)


In [None]:
#To split the dataset into groups by year
by_year = df_filtered.groupby('Year')


def shapiro_by_year(column):
    """Run the Shapiro-Wilk test on `column` within every yearly group of `by_year`.

    Returns a dict that maps each year to {'Statistics': ..., 'p-value': ...}.
    """
    outcome = {}
    for yr, yearly_rows in by_year:
        w_stat, p = shapiro(yearly_rows[column])
        outcome[yr] = {'Statistics': w_stat, 'p-value': p}
    return outcome


def show_shapiro(results):
    """Print one line per year with the test statistic and the p-value."""
    for year, result in results.items():
        print(f"Year: {year}, Shapiro-Wilk Test: Statistics={result['Statistics']}, p-value={result['p-value']}")


#Shapiro-Wilk test by year to get a statistical confirmation of the distribution
print('Number of Overnight Trips by Foreign Visitors (Thousand)')
shapiro_results_overnights = shapiro_by_year('Number of Overnight Trips by Foreign Visitors (Thousand)')
show_shapiro(shapiro_results_overnights)

print('\nNumber of Nights by Foreign Visitors (Thousand)')
shapiro_results_nights = shapiro_by_year('Number of Nights by Foreign Visitors (Thousand)')
show_shapiro(shapiro_results_nights)



In [None]:
#Function to manage the outliers using the Interquartile range
def yearly_group(group):
    """Drop the rows of one yearly group that are IQR outliers.

    A row is kept only when, for each processed feature, its value lies within
    [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]. The fences of every feature are computed
    on the group exactly as it was passed in, so the result does not depend on
    the order in which the features are checked.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows to process; must contain the two processed feature columns
        and the 'Year' column.

    Returns
    -------
    pandas.DataFrame
        The rows that passed both checks, restricted to the processed
        features plus 'Year'. The input frame is not modified.
    """
    feature_to_process = ['Number of Overnight Trips by Foreign Visitors (Thousand)', 'Number of Nights by Foreign Visitors (Thousand)']

    #Positional boolean mask: True while a row is inside the fences of every feature seen so far
    keep = np.ones(len(group), dtype=bool)
    for feature in feature_to_process:
        Q1 = group[feature].quantile(0.25)
        Q3 = group[feature].quantile(0.75)
        IQR = Q3 - Q1
        lower = Q1 - 1.5 * IQR
        upper = Q3 + 1.5 * IQR
        #between() includes both bounds, the same as (>= lower) & (<= upper)
        keep &= group[feature].between(lower, upper).to_numpy()

    #To return the processed features only
    return group.loc[keep, feature_to_process + ['Year']]

#To apply the IQR method to each yearly group.
#The columns yearly_group works on -- including the grouping column 'Year' --
#are selected explicitly: recent pandas versions deprecate handing the grouping
#column to apply() implicitly, and the explicit selection keeps 'Year' available
#to the function.
iqr_columns = [
    'Number of Overnight Trips by Foreign Visitors (Thousand)',
    'Number of Nights by Foreign Visitors (Thousand)',
    'Year',
]
processed_groups = by_year[iqr_columns].apply(yearly_group).reset_index(drop=True)

print(processed_groups)

In [None]:
#To compare the std before and after the quartiles (IQR) method.
#The baseline is df_filtered, the frame the IQR method was applied to. df would
#also count the 'All types of accommodation' rows that df_filtered excludes, so
#the two figures would differ by more than the outlier removal.
#For Number of Overnight Trips
print('Number of Overnight Trips by Foreign Visitors')
orig_std = df_filtered['Number of Overnight Trips by Foreign Visitors (Thousand)'].std()
processed_std = processed_groups['Number of Overnight Trips by Foreign Visitors (Thousand)'].std()

print(f"Original std: {orig_std}")
print(f"No outliers std: {processed_std}")

#For Number of Nights
print('\nNumber of Nights by Foreign Visitors')
orig_std2 = df_filtered['Number of Nights by Foreign Visitors (Thousand)'].std()
processed_std2 = processed_groups['Number of Nights by Foreign Visitors (Thousand)'].std()

print(f"Original std: {orig_std2}")
print(f"No outliers std: {processed_std2}")