In [250]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import numpy as np
from bokeh.plotting import figure, show, output_notebook, output_file
from bokeh.models import ColumnDataSource, HoverTool, WheelZoomTool, PanTool
from bokeh.transform import linear_cmap
from bokeh.palettes import Viridis256  # Use Viridis256 palette


In [251]:
# Load the raw catch table; the path is resolved relative to the kernel's working directory.
fish_file = pd.read_csv('fish_catches.csv')

In [252]:
# Discard the final row of the raw table, keeping every column.
# NOTE(review): presumably the CSV ends with a footer/summary row — confirm.
# This cell rebinds `fish_file`, so re-running it without reloading the CSV
# drops one more row each time.
fish_file = fish_file.iloc[:-1]

In [253]:
# Keep the identifying columns plus one column per year, listed 2014 down to 2006.
selected_columns = ['Species', 'Area', 'Units', 'Country']
selected_columns += list(map(str, range(2014, 2005, -1)))
data = fish_file.loc[:, selected_columns]

In [254]:
# Sum the yearly catches for every (Country, Species, Area) combination.
# The string alias 'sum' is used instead of np.sum: passing the NumPy callable
# to pandas aggregation is deprecated and emits a FutureWarning on pandas 2.x,
# while the alias yields the same result.
# In the displayed result the year columns come out in ascending order
# (2006 ... 2014), not in the descending order they are listed in here.
pivoted_data = data.pivot_table(
    index=['Country', 'Species', 'Area'],
    values=[str(year) for year in range(2014, 2005, -1)],
    aggfunc='sum',
)

In [255]:
# Inspect the pivoted table: one row per (Country, Species, Area), one column per year.
pivoted_data

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,2006,2007,2008,2009,2010,2011,2012,2013,2014
Country,Species,Area,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
BE,ANF,27,1193.0,1363.0,964.0,853.0,1031.0,1279.0,1716.0,1633.0,993.0
BE,ANF,27.4,141.0,181.0,185.0,140.0,131.0,116.0,133.0,137.0,217.0
BE,ANF,27.4.A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
BE,ANF,27.4.B,138.0,179.0,181.0,134.0,124.0,111.0,131.0,135.0,213.0
BE,ANF,27.4.C,3.0,3.0,4.0,6.0,7.0,6.0,2.0,2.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...
UK,YFT,27.8.E,0.0,0.0,0.0,0.0,12.0,21.0,0.0,0.0,0.0
UK,YFT,27.8.E.1,0.0,0.0,0.0,0.0,12.0,21.0,0.0,0.0,0.0
UK,ZGP,27,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
UK,ZGP,27.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [256]:
# Flatten the pivoted table into one record per (Country, Species, Area) row.
# 'Fish_Catches' holds that row's yearly values in the column order of
# `pivoted_data`.
country_tensors = [
    {
        'Country': country_code,
        'Species': species_code,
        'Area': area_code,
        'Fish_Catches': yearly_catches.values.tolist(),
    }
    for (country_code, species_code, area_code), yearly_catches in pivoted_data.iterrows()
]

In [257]:
from collections import defaultdict

# Nested lookup: country -> species -> area -> list of yearly catch values.
# Missing keys are created on first access at every level.
country_aggregated = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))

for tensor_data in country_tensors:
    catches_by_area = country_aggregated[tensor_data['Country']][tensor_data['Species']]
    catches_by_area[tensor_data['Area']].extend(tensor_data['Fish_Catches'])


In [258]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Tunable parameters for this cell.
num_clusters = 8   # You can adjust the number of clusters
random_seed = 42   # Fixed seed so cluster labels are reproducible across runs

# Aggregate fish catches by country: element-wise sum of the yearly vectors of
# every (Species, Area) record belonging to that country.
# NOTE(review): the Area codes in the pivoted table look hierarchical
# (e.g. 27 / 27.4 / 27.4.B), so summing every row may count nested areas more
# than once — confirm whether only one area level should be included.
country_tensors_dict = {}
for tensor_data in country_tensors:
    country = tensor_data['Country']
    fish_catches = tensor_data['Fish_Catches']
    # Start from a zero vector the first time a country is seen.
    running_totals = country_tensors_dict.get(country, [0.0] * len(fish_catches))
    country_tensors_dict[country] = [
        prev + current for prev, current in zip(running_totals, fish_catches)
    ]

# One record per country, in first-seen order; the plotting cell reads this
# list to recover the country code for each row of the feature matrix.
country_tensors_list = [
    {'Country': country, 'Fish_Catches': fish_catches}
    for country, fish_catches in country_tensors_dict.items()
]

# Feature matrix: one row per country, one column per year.
country_array = np.array([entry['Fish_Catches'] for entry in country_tensors_list])

# Standardize each year column (zero mean, unit variance across countries).
scaler = StandardScaler()
standardized_country_array = scaler.fit_transform(country_array)

# Cluster the countries using KMeans. `random_state` pins the centroid
# initialisation and `n_init` is set explicitly because its default differs
# between scikit-learn versions; without both, labels change from run to run.
kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=random_seed)
cluster_labels = kmeans.fit_predict(standardized_country_array)

# Group country codes by their assigned cluster label.
country_clusters = defaultdict(list)
for entry, label in zip(country_tensors_list, cluster_labels):
    country_clusters[label].append(entry['Country'])





In [259]:
# Show which country codes fell into each cluster label.
country_clusters

defaultdict(list,
            {4: ['BE',
              'CN',
              'EE',
              'GG',
              'GL',
              'IM',
              'JE',
              'JP',
              'LT',
              'LV',
              'LY',
              'TW'],
             0: ['DE', 'FI', 'IE', 'PL', 'PT', 'SE'],
             3: ['DK'],
             6: ['ES', 'FR', 'NL'],
             7: ['FO', 'UK'],
             2: ['IS'],
             1: ['NO'],
             5: ['RU']})

In [260]:
from bokeh.plotting import figure, show, output_notebook
from bokeh.models import ColumnDataSource, HoverTool
from bokeh.transform import factor_cmap
from bokeh.palettes import Category20

# Apply PCA to reduce the standardized features to 2 dimensions for plotting
pca = PCA(n_components=2)
reduced_features = pca.fit_transform(standardized_country_array)

# Create a DataFrame for the reduced features
pca_df = pd.DataFrame(reduced_features, columns=['x', 'y'])

# Create a DataFrame for the cluster labels
cluster_df = pd.DataFrame({'cluster_label': cluster_labels})

# Combine the reduced features and cluster labels DataFrames
source_data = pd.concat([pca_df, cluster_df], axis=1)

# Include the 'Country' column (rows are in the same order as country_tensors_list)
source_data['Country'] = [entry['Country'] for entry in country_tensors_list]

# Cluster ids are categories, not magnitudes: keep the numeric column as-is and
# add a string version for categorical colouring and the legend.
source_data['cluster_name'] = source_data['cluster_label'].astype(str)

# Create a ColumnDataSource for Bokeh
source = ColumnDataSource(data=source_data)

# One distinct colour per cluster. Category20 only provides sizes 3..20, so the
# size is clamped to that range and then cut (or cycled) to the number of
# clusters. The previous linear_cmap over a 5-colour palette made several of
# the clusters share a colour.
cluster_names = [str(label) for label in sorted(source_data['cluster_label'].unique().tolist())]
base_palette = Category20[min(max(len(cluster_names), 3), 20)]
palette = [base_palette[i % len(base_palette)] for i in range(len(cluster_names))]
color_mapper = factor_cmap('cluster_name', palette=palette, factors=cluster_names)

# Create a Bokeh figure; the hover tooltip shows the country code and its cluster
plot = figure(
    title="Cluster Visualization with Country Codes",
    x_axis_label="PC1",
    y_axis_label="PC2",
    tools=[HoverTool(tooltips=[("Country", "@Country"), ("Cluster", "@cluster_name")])],
)

# Add scatter glyphs with colored clusters
plot.scatter('x', 'y', source=source, size=10, fill_color=color_mapper, legend_field='cluster_name')

# Show the plot
output_notebook()  # For Jupyter Notebook, comment this line if using a script
show(plot)



In [261]:
# Load the country-code lookup table; the path is resolved relative to the kernel's working directory.
# NOTE(review): the displayed frame has 'Unnamed: 0' and 'Unnamed: 7' columns, which
# suggests a saved index column and a trailing delimiter in the CSV — confirm.
country_codes = pd.read_csv('country_codes.csv')

In [262]:
# Preview the lookup table (columns include Code and Description).
country_codes

Unnamed: 0,Code,Description,CodeType,Deprecated,Created,Modified,Unnamed: 7
0,ALL,All countries used when reporting survey indices,IC_Country,False,2006-10-23,2006-10-23,
1,BE,Belgium,IC_Country,False,2006-10-23,2006-10-23,
2,BG,Bulgaria,IC_Country,False,2012-08-08,2012-08-08,
3,CA,Canada,IC_Country,False,2007-06-22,2007-06-22,
4,DE,Germany,IC_Country,False,2006-10-23,2006-10-23,
5,DK,Denmark,IC_Country,False,2006-10-23,2006-10-23,
6,EE,Estonia,IC_Country,False,2006-10-23,2006-10-23,
7,ES,Spain,IC_Country,False,2006-10-23,2006-10-23,
8,FI,Finland,IC_Country,False,2006-10-23,2006-10-23,
9,FO,Faroe Islands,IC_Country,False,2006-10-23,2006-10-23,
