In [117]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import numpy as np
from bokeh.plotting import figure, show, output_notebook, output_file
from bokeh.models import ColumnDataSource, HoverTool, WheelZoomTool, PanTool
from bokeh.transform import linear_cmap
from bokeh.palettes import Viridis256  # Use Viridis256 palette


In [118]:
fish_file = pd.read_csv('fish_catches.csv')

In [119]:
fish_file = fish_file.iloc[:-1,:]

In [120]:
fish_file

Unnamed: 0,Species,Area,Units,Country,2014,2013,2012,2011,2010,2009,...,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22,Unnamed: 23,Unnamed: 24
0,ANF,27,TLW,BE,993.0,1633.0,1716.0,1279.0,1031.0,853.0,...,,,,,,,,,,
1,ANF,27.4,TLW,BE,217.0,137.0,133.0,116.0,131.0,140.0,...,,,,,,,,,,
2,ANF,27.4.A,TLW,BE,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
3,ANF,27.4.B,TLW,BE,213.0,135.0,131.0,111.0,124.0,134.0,...,,,,,,,,,,
4,ANF,27.4.C,TLW,BE,4.0,2.0,2.0,6.0,7.0,6.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49104,WHG,27.7,TLW,JE,1.0,0.0,0.0,0.0,3.0,0.0,...,,,,,,,,,,
49105,WHG,27.7.E,TLW,JE,1.0,0.0,0.0,0.0,3.0,0.0,...,,,,,,,,,,
49106,WRA,27,TLW,JE,14.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
49107,WRA,27.7,TLW,JE,14.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,


In [121]:
numerical_data = fish_file[['2014', '2013', '2012', '2011', '2010', '2009', '2008', '2007', '2006']]
numerical_data

Unnamed: 0,2014,2013,2012,2011,2010,2009,2008,2007,2006
0,993.0,1633.0,1716.0,1279.0,1031.0,853.0,964.0,1363.0,1193.0
1,217.0,137.0,133.0,116.0,131.0,140.0,185.0,181.0,141.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,213.0,135.0,131.0,111.0,124.0,134.0,181.0,179.0,138.0
4,4.0,2.0,2.0,6.0,7.0,6.0,4.0,3.0,3.0
...,...,...,...,...,...,...,...,...,...
49104,1.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0
49105,1.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0
49106,14.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
49107,14.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [122]:
scaler = StandardScaler()
scaled_numeric_data = scaler.fit_transform(numerical_data)
scaled_numeric_data

array([[ 0.04332697,  0.12343469,  0.12233605, ...,  0.02983226,
         0.0664967 ,  0.0545611 ],
       [-0.04978201, -0.05443075, -0.05137893, ..., -0.03978803,
        -0.04537357, -0.05384978],
       [-0.07581893, -0.07071923, -0.06597406, ..., -0.05632174,
        -0.0625043 , -0.06838014],
       ...,
       [-0.07413913, -0.07071923, -0.06597406, ..., -0.05632174,
        -0.0625043 , -0.06838014],
       [-0.07413913, -0.07071923, -0.06597406, ..., -0.05632174,
        -0.0625043 , -0.06838014],
       [-0.07413913, -0.07071923, -0.06597406, ..., -0.05632174,
        -0.0625043 , -0.06838014]])

In [123]:
categorical_data = fish_file[['Species', 'Area', 'Country']].fillna('')
categorical_data

Unnamed: 0,Species,Area,Country
0,ANF,27,BE
1,ANF,27.4,BE
2,ANF,27.4.A,BE
3,ANF,27.4.B,BE
4,ANF,27.4.C,BE
...,...,...,...
49104,WHG,27.7,JE
49105,WHG,27.7.E,JE
49106,WRA,27,JE
49107,WRA,27.7,JE


In [124]:
# Apply TF-IDF to categorical data
tfidf_vectorizer = TfidfVectorizer()
tfidf_categorical_data = tfidf_vectorizer.fit_transform(categorical_data.apply(lambda x: ' '.join(x), axis=1))
tfidf_categorical_data

<49109x952 sparse matrix of type '<class 'numpy.float64'>'
	with 162618 stored elements in Compressed Sparse Row format>

In [125]:
tensor_data = np.concatenate((scaled_numeric_data, tfidf_categorical_data.toarray()), axis=1)

In [126]:
# Apply PCA for dimensionality reduction
pca = PCA(n_components=3)  # You can adjust the number of components
reduced_data = pca.fit_transform(tensor_data)

In [127]:
from sklearn.cluster import KMeans


In [134]:
# Apply K-Means clustering
num_clusters = 20
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
clusters = kmeans.fit_predict(reduced_data)



In [135]:
fish_file['cluster'] = clusters

In [136]:
# Filter the data to show a maximum of 20 samples for each country
sampled_df = fish_file.groupby('Country', group_keys=False).apply(lambda group: group.sample(min(20, len(group))))

# Create a new DataFrame with PCA components
pca_df = pd.DataFrame(reduced_data, columns=['x', 'y', 'z'][:reduced_data.shape[1]])

In [137]:
import matplotlib.pyplot as plt
import seaborn as sns

In [138]:
# Create a Bokeh ColumnDataSource
source = ColumnDataSource(pca_df.join(fish_file[['cluster', 'Country', 'Species']]))

# Create a color mapper based on categorical palette
palette = Viridis256  # Use Viridis256 palette
color_mapper = linear_cmap(field_name='cluster', palette=palette, low=min(clusters), high=max(clusters))

# Create a Bokeh figure
plot = figure(title="Cluster Visualization with Country Codes", tools=[HoverTool(tooltips=[("Country", "@Country"), ("Fish Species", "@Species")]), WheelZoomTool(), PanTool()])

# Add scatter glyphs
scatter = plot.scatter('x', 'y', source=source, legend_field='Country', size=10, fill_alpha=0.6, line_color=None, fill_color=color_mapper)

# Show the plot
output_notebook()  # For Jupyter Notebook, comment this line if using a script
show(plot)