In [None]:
# import libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
from sklearn.linear_model import LinearRegression

# Data and parameters to use

In [None]:
# Load the project list and discard the two columns the analysis never uses
# ('Title' and 'ArticleURL').
df_projects = pd.read_csv('data/projectList.csv').drop(columns=['Title', 'ArticleURL'])
# Preview the first rows as the cell output
df_projects.head()

In [None]:
# Yearly aggregation of the project table
def get_projects_by_year(df_filtered):
    """Return per-year project counts and per-year sums of the flag columns.

    The language-level columns ('Family', 'Language', 'glottocode',
    'isocode', 'Speakers') are removed first and rows that become identical
    are collapsed, so each remaining distinct row is counted once.

    Parameters
    ----------
    df_filtered : pandas.DataFrame
        Table holding 'Project', 'Year', the five language-level columns
        and every flag column listed below. It is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per 'Year' with columns 'Year', 'Project' (count of non-null
        values) and one summed column per flag, in the order listed below.
    """
    language_columns = ['Family', 'Language', 'glottocode', 'isocode', 'Speakers']
    flag_columns = [
        'Cinema/TV',
        'Social media',
        'Radio',
        'Music',
        'Technology',
        'Teaches the language',
        'State initiative',
        'External organization',
        'Universities and institutes',
        'Alternative projects from the community',
        'Documentation',
        'Books and educative material',
        'Promotes use of the language',
    ]

    # Collapse rows that only differed in their language-level columns
    unique_rows = df_filtered.drop(columns=language_columns).drop_duplicates()

    # Count projects, sum every flag column
    aggregations = {'Project': 'count'}
    aggregations.update({column: 'sum' for column in flag_columns})

    # 'Year' is unique after the groupby, so no further de-duplication is needed
    return unique_rows.groupby(['Year']).agg(aggregations).reset_index()

In [None]:
# Function to get the project-level table used for the per-country plots
def get_projects_by_country(df_filtered=None):
    """Return the project table without its language-level columns.

    Despite the name, no grouping by country happens here: the result keeps
    one row per distinct combination of the remaining columns (callers group
    by 'Country' themselves). A helper column 'nLanguages' holds the number
    of distinct 'Language' values recorded for each 'Project'.

    Parameters
    ----------
    df_filtered : pandas.DataFrame, optional
        Table holding at least 'Project', 'Family', 'Language', 'glottocode',
        'isocode' and 'Speakers'. When omitted, the notebook-level
        ``df_projects`` frame is used, which matches the previous behaviour.
        The input is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of the input with 'nLanguages' added, the five language-level
        columns removed and duplicate rows dropped.
    """
    source = df_projects if df_filtered is None else df_filtered

    # Distinct languages per project, broadcast back onto every row of the project
    languages_per_project = source.groupby("Project")["Language"].transform("nunique")

    return (
        source.assign(nLanguages=languages_per_project)
        .drop(columns=['Family', 'Language', 'glottocode', 'isocode', 'Speakers'])
        .drop_duplicates()
    )

In [None]:
# Column groups used to select what gets melted and plotted.
# Origin flags:
origin_columns = [
    'State initiative',
    'External organization',
    'Universities and institutes',
    'Alternative projects from the community',
]
# Characteristic flags:
characteristics_columns = [
    "Cinema/TV",
    "Social media",
    "Radio",
    "Music",
    "Technology",
    "Teaches the language",
    "Documentation",
    "Books and educative material",
]

## Seaborn and Matplotlib configuration

In [None]:
# Use seaborn's "darkgrid" look for every figure
sns.set_style("darkgrid")

# Four-colour "husl" palette
color_palette = sns.color_palette("husl", 4)

# Pair each origin category with one palette colour, in palette order
color_dict = dict(zip(
    [
        'State initiative',
        'External organization',
        'Universities and institutes',
        'Alternative projects from the community',
    ],
    color_palette,
))

# Global text settings for axis labels and axes titles
plt.rcParams.update({
    'axes.labelsize': 12,
    'axes.labelweight': 'light',
    'axes.titlesize': 14,
    'axes.titleweight': 'light',
})

# Plots

## Projects by year and origin

In [None]:
# Yearly totals, one column per flag
df = get_projects_by_year(df_projects)

# Long format: one row per (Year, origin column) pair, so the origin can be
# mapped to the bar colour
df_melted = df.melt(id_vars='Year', value_vars=origin_columns)

# Grouped bar chart: one group per year, one bar per origin
fig, ax = plt.subplots(figsize=(16, 8))
sns.barplot(data=df_melted, x='Year', y='value', hue='variable', palette=color_dict, ax=ax)
ax.set_title('Number of Projects by Year and Origin')
ax.set_xlabel('Year')
ax.set_ylabel('Number of projects')
ax.legend(title='Origin')
fig.savefig('images/number_of_projects_by_year_and_origin.png', dpi=300, bbox_inches='tight')
plt.show()

## Top 30 languages with most projects

In [None]:
# Count the rows recorded for each language and sort by the 'Project' count,
# largest first
df_grouped = (
    df_projects.groupby('Language')
    .count()
    .reset_index()
    .sort_values(by='Project', ascending=False)
)

# Keep only the 30 languages with the most projects
df_filtered = df_grouped.head(30)

# Plot the data
fig, ax = plt.subplots(figsize=(16, 8))
ax.bar(df_filtered['Language'], df_filtered['Project'], color="#377eb8", width=0.85)
ax.set_xlabel('Language')
ax.set_ylabel('Number of Projects')
ax.tick_params(axis='x', labelrotation=90)
ax.set_title('Top 30 Languages with Most Projects')

# Tighten the x-axis limits around the bars
ax.set_xlim(-0.8, len(df_filtered) - 0.2)

# Keep only the horizontal grid lines. The vertical (x-axis) lines are switched
# off explicitly: calling grid() without a visibility argument merely toggles
# the current state, which only works while the active style has the grid on.
ax.grid(False, axis='x')

fig.savefig('images/top_30_languages_with_most_projects.png', dpi=300, bbox_inches='tight')
plt.show()

## Projects by country

In [None]:
# Project-level rows (language columns removed), then the number of rows per
# country, sorted by the 'Project' count, largest first
df_projects_country = get_projects_by_country()
df_grouped = (
    df_projects_country.groupby('Country')
    .count()
    .reset_index()
    .sort_values(by='Project', ascending=False)
)

# Plot the data
fig, ax = plt.subplots(figsize=(16, 9))
bars = ax.bar(df_grouped['Country'], df_grouped['Project'], color="#377eb8", width=0.85)

# Write each count just above its bar. The height is formatted explicitly as
# an integer instead of relying on the default string form of whatever numeric
# type the bar reports.
for bar in bars:
    yval = bar.get_height()
    ax.text(bar.get_x() + bar.get_width() / 2, yval + 0.05, f'{int(yval)}', ha='center', va='bottom')

# The bar labels carry the values, so the y ticks are removed
ax.set_yticks([])
# Switch the vertical grid lines off explicitly; grid() without a visibility
# argument only toggles the current state. With the y ticks removed there are
# no tick positions left for horizontal grid lines either.
ax.grid(False, axis='x')

# Tighten the x-axis limits around the bars
ax.set_xlim(-0.8, len(df_grouped) - 0.2)

# Add labels and title
ax.set_xlabel('Country')
ax.set_ylabel('Number of Projects')
ax.tick_params(axis='x', labelrotation=90)
ax.set_title('Number of Projects by Country')
fig.savefig('images/number_of_projects_by_country.png', dpi=300, bbox_inches='tight')
plt.show()


## Projects by country and origin

In [None]:
# Project-level rows, then per-country totals: the number of projects and the
# sum of each origin flag, sorted with the largest project count first
df_projects_country = get_projects_by_country()
df_grouped = (
    df_projects_country.groupby('Country')
    .agg({'Project': 'count', **{origin: 'sum' for origin in origin_columns}})
    .reset_index()
    .sort_values(by='Project', ascending=False)
)

# Keep the 8 countries with the most projects: with col_wrap=4 this fills
# exactly two rows of facets
top_8 = df_grouped.head(8)
df_melted = top_8.melt(
    id_vars='Country',
    value_vars=origin_columns,
    var_name='Origin',
    value_name='Number of Projects',
)

# One horizontal bar panel per country, bars coloured by origin
g = sns.catplot(
    data=df_melted,
    kind='bar',
    orient='h',
    x='Number of Projects',
    y='Origin',
    hue='Origin',
    col='Country',
    col_wrap=4,
    sharey=True,
    height=4,
    aspect=1,
    palette=color_dict,
    # NOTE(review): presumably ignored because hue + palette are given - confirm
    color='#377eb8',
    # Every category label is replaced by an empty string, so the origin is
    # identified through the legend only. (Original note: insert a line break
    # in "Alternative projects from the community".)
    # NOTE(review): seaborn documents `formatter` as affecting grouping as
    # well as tick labels - confirm the rendered bars are as intended.
    formatter=lambda x: "",
    legend=True,
)

# Set the title for the entire plot and name each panel after its country
g.figure.suptitle('Number of Projects by Country and Origin', fontsize=16, y=1.02)
g.set_titles('{col_name}')
g.set_axis_labels('', '')

# Lay the legend entries out in four columns
g.legend.set_ncols(4)

# Save the plot
g.savefig('images/number_of_projects_by_country_and_origin.png', dpi=300, bbox_inches='tight')
plt.show()


## Correlation between number of speakers and number of projects

In [None]:
# Prepare the data: one row per language with its speaker count, number of
# projects, family, country and summed origin flags.
# "Quechuan" and "Paraguayan Guaraní" are excluded because they are outliers.
outlier_languages = ['Quechuan', 'Paraguayan Guaraní']
df = df_projects[~df_projects['Language'].isin(outlier_languages)]
df = df.groupby('Language').agg({
    'Speakers': 'first',
    'Project': 'count',
    'Family': 'first',
    'Country': 'first',
    'State initiative': 'sum',
    'External organization': 'sum',
    'Universities and institutes': 'sum',
    'Alternative projects from the community': 'sum',
}).reset_index()

# Marker-size column: the project count shifted by 2
# (presumably so that languages with very few projects stay visible - confirm)
df['Adjusted Project'] = df['Project'] + 2

# Scatter plot of projects against speakers, coloured by language family
fig = px.scatter(
    df,
    x='Speakers',
    y='Project',
    size='Adjusted Project',
    color='Family',
    hover_data=['Language', 'Project'],
    height=800,
    size_max=40,
    title='Correlation between number of speakers and number of projects',
)

# Fit a linear regression of the project count on the number of speakers
X = df['Speakers'].values.reshape(-1, 1)
y = df['Project'].values
model = LinearRegression()
model.fit(X, y)

# Predict y values for the observed speaker counts
y_pred = model.predict(X)

# Print the R^2 value
print('R^2:', model.score(X, y))

# Add the regression line to the plot
trace = go.Scatter(x=df['Speakers'], y=y_pred, mode='lines', name='Regression Line')
fig.add_trace(trace)

# Legend title (kept although the legend itself is hidden below)
fig.update_layout(legend_title_text='Language Family')

# Rename the axes
fig.update_xaxes(title_text='Number of Speakers')
fig.update_yaxes(title_text='Number of Projects')

# Hide the legend
fig.update_layout(showlegend=False)

# Annotate the (up to) 11 languages with the most projects. Iterating over the
# selected rows instead of a fixed range(11) avoids an IndexError when fewer
# than 11 languages are left after filtering.
top_11 = df.nlargest(11, 'Project')
for _, row in top_11.iterrows():
    fig.add_annotation(
        x=row['Speakers'],
        y=row['Project'],
        text=row['Language'],
        showarrow=True,
        arrowhead=2,
        yshift=5,
    )

# Show the plot
fig.show()

## Projects by year and characteristic

In [None]:
# Yearly totals reshaped to long form: one row per (Year, Characteristic) pair
df = get_projects_by_year(df_projects)
df_melted = pd.melt(
    df,
    id_vars='Year',
    value_vars=characteristics_columns,
    var_name='Characteristic',
    value_name='Number of Projects',
)

# One horizontal bar panel per characteristic, four panels per row
g = sns.catplot(
    data=df_melted,
    kind='bar',
    orient='h',
    x='Number of Projects',
    y='Year',
    col='Characteristic',
    col_wrap=4,
    sharey=True,
    height=4,
    aspect=1,
    color='#377eb8',
)

# Panel titles show only the characteristic name
g.set_titles("{col_name}")

# No x label; the y axis is labelled "Year"
g.set_axis_labels("", "Year")

# Title for the whole figure
g.figure.suptitle('Number of projects by year and characteristic', fontsize=16, y=1.05)

# Write the figure to disk, then display it
g.savefig('images/number_of_projects_by_year_and_characteristic.png', dpi=300, bbox_inches='tight')
plt.show()

## Number of projects and their origin through the years

In [None]:
# Yearly totals, one column per flag
df = get_projects_by_year(df_projects)

# Long format: one row per (Year, origin column) pair
df_melted = df.melt(id_vars='Year', value_vars=origin_columns)

# One line per origin across the years
fig, ax = plt.subplots(figsize=(16, 8))
sns.lineplot(data=df_melted, x='Year', y='value', hue='variable', palette=color_dict, ax=ax)
ax.set_title('Number of Projects by Year and Origin over Time')
ax.set_xlabel('Year')
ax.set_ylabel('Number of projects')
ax.legend(title='Origin')

# One tick for every year between the first and the last year in the data
ax.set_xticks(np.arange(df['Year'].min(), df['Year'].max() + 1, 1))

fig.savefig('images/number_of_projects_by_year_and_origin_over_time.png', dpi=300, bbox_inches='tight')
plt.show()

## Proportion of projects by origin in five-year periods

In [None]:
# Yearly origin totals, indexed by year and restricted to the origin columns
df_projects_by_year = get_projects_by_year(df_projects).set_index('Year')[origin_columns]

# Three pies side by side, one per five-year window
fig, axs = plt.subplots(1, 3, figsize=(20, 6))
fig.suptitle('Project counts by origin each five years', fontsize=16)

for i, ax in enumerate(axs.flat):
    # Windows are fixed: 2007-2011, 2012-2016 and 2017-2021
    start = 2007 + 5 * i
    end = start + 5
    # Label-based slice (both ends inclusive), summed per origin column
    data = df_projects_by_year.loc[start:end-1].sum()
    # autopct receives each wedge's percentage; it is converted back to the
    # absolute count so the wedges are labelled with numbers of projects
    patches, texts, autotexts = ax.pie(
        data,
        autopct=lambda p: '{:.0f}'.format(p * sum(data) / 100),
        colors=color_palette,
        textprops={'color': "w", 'weight': 'bold', 'fontsize': 14},
    )
    ax.set_title(f'From {start} to {end-1}')

# Shared legend built from the wedges of the last pie
fig.legend(patches, data.index, loc="lower center", ncol=4)
# Shrink the subplot area on the right-hand side
fig.subplots_adjust(right=0.85)

fig.savefig('images/project_counts_by_origin_each_five_years.png', dpi=300, bbox_inches='tight')
plt.show()