# Data Exploration

There is already a great notebook for data exploration. We condensed its information and made our own plots.
The original notebook can be found [here](https://www.kaggle.com/code/andradaolteanu/birdcall-recognition-eda-and-audio-fe).


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import plotly.express as px
import geopandas as gpd
from shapely.geometry import Point, Polygon
import re

In [None]:
# Load the Xeno-Canto metadata and derive the columns used by the plots below.
df = pd.read_csv("./input/Xeno-Canto_All-Metadata.csv")

# Split the '-'-separated date string into year / month / day columns (kept as strings).
# The .str accessor yields NaN for missing or too-short dates instead of raising.
date_parts = df['date'].str.split('-')
df['year'] = date_parts.str[0]
df['month'] = date_parts.str[1]
df['day_of_month'] = date_parts.str[2]

# Bring time into a common format.
# Anything that is not a valid HH:MM value is set to the placeholder 00:00:
# first by shape (regex), then by value (errors='coerce' catches e.g. "25:99").
time_text = df['time'].astype(str)
looks_like_hhmm = time_text.str.match("^[0-9]{2}:[0-9]{2}$")
parsed_time = pd.to_datetime(
    time_text.where(looks_like_hhmm, "00:00"), format='%H:%M', errors='coerce'
)
parsed_time = parsed_time.fillna(pd.to_datetime("00:00", format='%H:%M'))
df['time'] = parsed_time.dt.time

# Truncate the time to its hour bucket (minutes are dropped, not rounded).
df['time'] = df['time'].apply(lambda x: x.replace(minute=0))


def _length_to_seconds(value):
    """Convert a ':'-separated duration string to whole seconds.

    Handles "m:ss" as well as "h:mm:ss" (each field to the left is worth
    60x the one to its right).
    """
    seconds = 0
    for part in value.split(":"):
        seconds = seconds * 60 + int(part)
    return seconds


# Length to seconds
df['length'] = df['length'].apply(_length_to_seconds)

df.head()

In [None]:
# Plot palette: a two-colour list pairing black with the accent colour,
# and the accent colour on its own for single-series charts.
color_map = ["#000", "#DA291C"]
one_color = color_map[-1]

In [None]:
# Number of recordings per year, as a bar chart.
# The frame is sorted by 'year' first so the bars are drawn in ascending year order.
year_sorted = df.sort_values(by='year')

plt.figure(figsize=(16, 6))
ax = sns.countplot(data=year_sorted, x="year", color=one_color)

ax.set_title("Audio Files Registration per Year Made", fontsize=16)
ax.set_xlabel("")
ax.set_ylabel("Amount", fontsize=14)
# Year labels are rotated so they do not overlap.
ax.tick_params(axis="x", labelrotation=90, labelsize=13)
ax.tick_params(axis="y", labelsize=13);

In [None]:
# Number of recordings per month, as a bar chart.
# The frame is sorted by 'month' first so the bars are drawn in ascending month order.
sorted_month = df.sort_values(by='month')

plt.figure(figsize=(16, 6))
ax = sns.countplot(data=sorted_month, x="month", color=one_color)

ax.set_title("Audio Files Registration per Month Made", fontsize=16)
ax.set_xlabel("")
ax.set_ylabel("Frequency", fontsize=14)
ax.tick_params(axis="x", labelsize=13)
ax.tick_params(axis="y", labelsize=13);

In [None]:
# Number of recordings per hour-of-day bucket.
data = df['time'].value_counts().reset_index()
# Name the columns explicitly: the labels that value_counts().reset_index()
# produces differ between pandas versions.
data.columns = ['time', 'count']

# Sort chronologically so the bars run from early to late.
data = data.sort_values('time').reset_index(drop=True)

# Drop the 00:00 bucket: it is the placeholder that the loading cell assigns to
# missing/malformed times. Filter by value rather than by position so a real
# hour is never removed when no 00:00 row exists.
placeholder_time = pd.to_datetime("00:00", format="%H:%M").time()
data = data[data['time'] != placeholder_time]

plt.figure(figsize=(16, 6))
sns.barplot(x="time", y="count", data=data, color=one_color)

plt.title("When was the bird recorded?", fontsize=16)
plt.ylabel("")
plt.yticks(fontsize=13)
plt.xticks(rotation=45, fontsize=13)
plt.xlabel("");

In [None]:
# Distribution of recording durations (seconds).
# Uses the axes-level sns.histplot so the histogram is drawn into the figure
# created here. The figure-level sns.displot opens its own figure, which left
# this one empty and ignored the requested figsize.
plt.figure(figsize=(16, 6))
ax = sns.histplot(df['length'], color=one_color)

# Only the first 300 seconds of the x-axis are shown.
ax.set_xlim(0, 300)
ax.set_title("Distribution of Audio Files Duration", fontsize=16)
ax.set_ylabel("Frequency", fontsize=14)
ax.set_xlabel("Duration (s)", fontsize=14)
ax.tick_params(axis="both", labelsize=13);

In [None]:
# Share of recordings per country ('cnt' column), shown as a pie chart.
country = (
    df['cnt']
    .value_counts()
    .rename_axis('cnt')
    .reset_index(name='count')
)

fig = px.pie(
    data_frame=country,
    names='cnt',
    values='count',
    title='Recordings per Country',
)
# Put the percentage and the country label inside each slice.
fig.update_traces(textinfo='percent+label', textposition='inside')
fig.show()

In [None]:
# Number of recordings whose country ('cnt') is Switzerland.
switzerland = df.loc[df['cnt'].eq('Switzerland')]

switzerland.shape[0]

In [None]:
# Number of recordings per species ('en' column), one row per species.
species = df['en'].value_counts().reset_index()
species.columns = ['species', 'count']

species = species.sort_values('count', ascending=False)

# Box plot of the per-species recording counts.
plt.figure(figsize=(16, 6))
box = sns.boxplot(x="count", data=species, color=one_color)

# The plot covers every species (no top-100 filter is applied), so the title
# describes the distribution rather than a "Top 100" ranking.
plt.title("Distribution of Recordings per Species", fontsize=16)
plt.ylabel("Species", fontsize=14)
plt.yticks(fontsize=13)
plt.xlabel("amount of recordings", fontsize=14)
plt.xticks(fontsize=13)

plt.show()

In [None]:
# World outline (shapefile) used as the backdrop of the map.
world_map = gpd.read_file("./input/world-shape/world_shapefile.shp")

# Coordinate reference system: EPSG:4326.
# Given as an authority string; the {"init": "epsg:4326"} dict form is deprecated.
crs = "EPSG:4326"

# Lat and Long need to be of type float, not object.
# Non-numeric entries (e.g. "Not specified") in EITHER column become NaN and the
# row is dropped. df.assign returns a new frame, so df itself is not modified.
data = df.assign(
    lat=pd.to_numeric(df["lat"], errors="coerce"),
    lng=pd.to_numeric(df["lng"], errors="coerce"),
).dropna(subset=["lat", "lng"])

# Create point geometry: x = longitude, y = latitude.
geometry = gpd.points_from_xy(data["lng"], data["lat"])

# Geo Dataframe
geo_df = gpd.GeoDataFrame(data, crs=crs, geometry=geometry)

# Plot the recording locations, then the country boundaries on the same axes.
fig, ax = plt.subplots(figsize=(20, 20))
# The label gives the legend an entry to show.
geo_df.plot(ax=ax, markersize=3, color=one_color, marker='o', label="Recordings")
world_map.boundary.plot(ax=ax, linewidth=1, color='black')

plt.title("Bird Recordings Locations", fontsize=16)
plt.legend(fontsize=14)
plt.show()