In [9]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt

In [10]:
# Load the combined dataset; the relative path is resolved against the
# notebook's current working directory.
df = pd.read_csv('combined_file_with_year.csv')

In [11]:
# Inspect the column dtypes exactly as loaded, before any cleaning.
df.dtypes

Country                                              object
Preservation Requests                                object
Preservation Accounts Preserved                      object
Total Requests                                       object
Number of Requests Where Some Data Produced          object
Total Requests Accounts                              object
Total Requests Percentage                            object
Legal Process Request Total                          object
Legal Number of Requests Where Some Data Produced    object
Legal Process Request Total Accounts                 object
Legal Process Request Total Percentage               object
Emergency Request Total                              object
ER Number of Requests Where Some Data Produced       object
Emergency Request Total Accounts                     object
Emergency Request Total Percentage                   object
Year                                                  int64
dtype: object

In [12]:
# Count columns: the cleaning below strips "," from their text form and
# stores them as integers.
number_columns = [
    "Preservation Requests",
    "Preservation Accounts Preserved",
    "Total Requests",
    "Number of Requests Where Some Data Produced",
    "Total Requests Accounts",
    "Legal Process Request Total",
    "Legal Number of Requests Where Some Data Produced",
    "Legal Process Request Total Accounts",
    "Emergency Request Total",
    "ER Number of Requests Where Some Data Produced",
    "Emergency Request Total Accounts"
]

# Percentage columns: the cleaning below strips "%" from their text form and
# stores them as floats.
percent_columns = [
    "Total Requests Percentage",
    "Legal Process Request Total Percentage",
    "Emergency Request Total Percentage"
]

# Go through float first so missing values become NaN, then count missing
# values as 0 so the column can be stored as plain integers.
for col in number_columns:
    df[col] = (
        df[col]
        .astype(str)
        .str.replace(',', '')
        .astype(float)
        .fillna(0)
        .astype(int)
    )

# Percentages stay as floats; missing values remain NaN here.
for col in percent_columns:
    df[col] = df[col].astype(str).str.replace('%', '').astype(float)

# Show the dtypes after cleaning to confirm the conversion.
df.dtypes

Country                                               object
Preservation Requests                                  int64
Preservation Accounts Preserved                        int64
Total Requests                                         int64
Number of Requests Where Some Data Produced            int64
Total Requests Accounts                                int64
Total Requests Percentage                            float64
Legal Process Request Total                            int64
Legal Number of Requests Where Some Data Produced      int64
Legal Process Request Total Accounts                   int64
Legal Process Request Total Percentage               float64
Emergency Request Total                                int64
ER Number of Requests Where Some Data Produced         int64
Emergency Request Total Accounts                       int64
Emergency Request Total Percentage                   float64
Year                                                   int64
dtype: object

**In Milestone 2, I had a problem with the line graph: the x-axis was not showing clean years like 2020, 2021, and 2022, but was instead showing values like 2020, 2020.5, 2021, 2021.5, etc. To better arrange the data, I thought about using the melt() function, but I wasn't sure if it would address the axis issue. I decided to fix it in the next milestone and started thinking about adding dashboards too. In this milestone, I solved the x-axis issue by using update_layout() to manually set the tick values and labels. After fixing it, I created a line plot to show how different types of requests — Legal Process, Emergency, and Preservation — changed over time. I grouped the data by year, added markers to highlight each point clearly, and used a clean background and gridlines to make trends easier to see. My goal was to help the audience quickly spot patterns, like which request types are increasing faster or where sudden changes happened. I took reference from this link: https://plotly.com/python/tick-formatting/#:~:text=80-,Tickmode%20%2D%20Array,-If%20%22array%22.**

In [None]:
# Columns drawn as separate lines, one per request type.
request_columns = [
    "Legal Process Request Total",
    "Emergency Request Total",
    "Preservation Requests",
]

# Yearly totals per request type, with Year kept as a regular column.
request_types = df.groupby("Year", as_index=False)[request_columns].sum()

fig = px.line(
    request_types,
    x="Year",
    y=request_columns,
    markers=True,
    title="Request Types Over the Years",
)

# Pin one tick per year present in the data so the axis never shows
# fractional values such as 2020.5.
year_axis = {
    "tickmode": 'array',
    "tickvals": request_types['Year'],
    "ticktext": request_types['Year'].astype(str).tolist(),
    "title": "Year",
    "showgrid": True,
    "gridcolor": 'white',
}

totals_axis = {
    "title": "Total Requests",
    "showgrid": True,
    "gridcolor": 'white',
}

fig.update_layout(
    xaxis=year_axis,
    yaxis=totals_axis,
    plot_bgcolor="lightgrey",
    title_x=0.5,
    font={"size": 14},
    legend_title_text="Request Type",
)

fig.show()

**I created a line plot because I wanted to show how different types of requests (Legal Process Requests, Emergency Requests, and Preservation Requests) have changed over time. To do this, I grouped the data by year and summed the totals for each request type. I used markers on the lines to make it easier for the audience to see each data point clearly. I also added a light grey background and grid lines to make the chart easier to read. Here, my goal is to help the audience quickly spot trends over the years, such as which request types are growing faster or whether any sudden jumps happen.**

**In this step, I made sure that the *'Total Requests'* column was in the correct numeric format. This is important because sometimes the data might be stored as text, and converting it to numeric ensures that calculations (like sums) are done correctly when visualizing the data.**

In [13]:
# Safeguard: make sure 'Total Requests' is numeric before aggregating.
# The cleaning cell above already casts this column to int, so on a
# top-to-bottom run this leaves the values unchanged.
df['Total Requests'] = pd.to_numeric(df['Total Requests'])

**Next, I wanted to show which countries received the most total requests. I summed the total number of requests for each country and used a choropleth map to visualize it. I chose a blue color scale where darker shades represent more requests. Here, I wanted to give the audience a quick, intuitive feel for the global distribution of requests, highlighting the countries with the heaviest activity.**

In [None]:
# Total requests per country, summed over every year in the dataset.
country_requests = df.groupby('Country', as_index=False)["Total Requests"].sum()

# World map shaded by request volume; countries are matched by name.
fig_map = px.choropleth(
    country_requests,
    locations="Country",
    locationmode="country names",
    color="Total Requests",
    color_continuous_scale="Blues",
    title="Total Requests by Country (All Years Combined)",
)

# Drop the frame around the map but keep the coastlines visible.
fig_map.update_layout(geo={"showframe": False, "showcoastlines": True})

fig_map.show()


**After showing total volume by country, I wanted to dig a little deeper: which type of request is more dominant in each country? To do this, I grouped the data by country and request type (Preservation, Legal Process, Emergency), then used an animated choropleth map where the audience can switch between request types. I want to let the audience explore how different request types are distributed globally, and easily spot patterns depending on the request type.**

In [None]:
# Request-type columns compared on the map; each becomes one animation frame.
map_request_columns = [
    "Preservation Requests",
    "Legal Process Request Total",
    "Emergency Request Total",
]

# Make sure every plotted column is numeric before summing.
for col in map_request_columns:
    df[col] = pd.to_numeric(df[col])

# Per-country totals for each request type (wide format).
country_requests = df.groupby('Country', as_index=False)[map_request_columns].sum()

# Reshape to long format: one row per (country, request type) pair.
country_requests_melted = pd.melt(
    country_requests,
    id_vars="Country",
    var_name="Request Type",
    value_name="Total",
)

fig = px.choropleth(
    country_requests_melted,
    locations="Country",
    locationmode="country names",
    color="Total",
    color_continuous_scale="Blues",
    title="Requests by Country and Type",
    animation_frame="Request Type",
)

fig.update_layout(
    geo={"showframe": False, "showcoastlines": True},
    title_x=0.5,
)

fig.show()

**Then, I wanted to add the dimension of time to the global map. I grouped the data by both country and year, and created an animated choropleth map. Each frame shows how the total requests change across countries year by year.**

In [None]:
# Make sure the plotted column is numeric before summing.
df['Total Requests'] = pd.to_numeric(df['Total Requests'])

# One row per (year, country) pair with that pair's summed total requests.
country_year_requests = (
    df.groupby(['Year', 'Country'], as_index=False)['Total Requests'].sum()
)

# Animated world map: each frame is one year.
fig = px.choropleth(
    country_year_requests,
    locations="Country",
    locationmode="country names",
    color="Total Requests",
    animation_frame="Year",
    color_continuous_scale="Blues",
    title="Total Requests by Country Over the Years",
)

fig.update_layout(
    geo={"showframe": False, "showcoastlines": True},
    title_x=0.5,
)

fig.show()

**At this stage, I was interested in expanding the picture and providing a closer look at a few key countries. I chose ten countries, including the US, India, Germany, Japan, and others, rather than displaying every nation, which could be overwhelming. I decided on a heatmap since it's great for quickly identifying patterns. After that, I used a pivot_table() to organize the data into a grid with countries as rows, years as columns, and total requests as the values. This made it easy to plot in the heatmap format, where I could see at a glance which countries and years had higher or lower requests based on the color scale. This visual makes it really easy for the audience to immediately see which countries had spikes in requests, which stayed steady, and which ones suddenly dropped off. It lets the audience quickly compare countries side by side over the years, and spot interesting patterns like sudden surges, consistent growth, or unexpected drops. For the color scale 'RdYlBu_r', I took help from Google.**

In [None]:
# Countries highlighted in the heatmap; names must match the 'Country'
# column exactly, otherwise the row is silently left out by the filter.
selected_countries = ['Australia', 'Brazil', 'Canada', 'France', 'Germany', 
                      'India', 'Japan', 'South Korea', 'United Kingdom', 'United States']

df_selected = df[df['Country'].isin(selected_countries)]

# Grid of summed total requests: one row per country, one column per year.
heatmap_data = df_selected.pivot_table(
    index='Country',
    columns='Year',
    values='Total Requests',
    aggfunc='sum'
)

fig_heatmap = px.imshow(
    heatmap_data,
    labels=dict(x="Year", y="Country", color="Total Requests"),
    color_continuous_scale='RdYlBu_r', 
    title='Heatmap of Total Requests by Country and Year'
)

# Year is numeric, so the x-axis is continuous and can show fractional ticks
# (e.g. 2020.5). Pin one tick per year column, as done for the line chart.
heatmap_years = heatmap_data.columns.tolist()

fig_heatmap.update_layout(
    title_x=0.5,
    xaxis=dict(
        title="Year",
        tickmode='array',
        tickvals=heatmap_years,
        ticktext=[str(year) for year in heatmap_years]
    ),
    yaxis_title="Country"
)

fig_heatmap.show()

**Here, I sorted country_totals by 'Total Requests' using ascending=True because I wanted the countries with the smallest number of requests to show up first. This way, when I plot the heatmap, it's easier for people to read from countries with fewer requests at the top to countries with more requests at the bottom, making the whole chart look more organized and clear.**

In [None]:
# Countries highlighted in the heatmap; names must match the 'Country'
# column exactly, otherwise the row is silently left out by the filter.
selected_countries = ['Australia', 'Brazil', 'Canada', 'France', 'Germany', 
                      'India', 'Japan', 'South Korea', 'United Kingdom', 'United States']

df_selected = df[df['Country'].isin(selected_countries)]

# Overall total per country, used only to decide the row order.
country_totals = df_selected.groupby('Country')['Total Requests'].sum().reset_index()

# Ascending order puts the smallest totals first (top rows of the heatmap).
country_totals = country_totals.sort_values(by='Total Requests', ascending=True)

sorted_countries = country_totals['Country'].tolist()

# Grid of summed total requests: one row per country, one column per year.
heatmap_data = df_selected.pivot_table(
    index='Country',
    columns='Year',
    values='Total Requests',
    aggfunc='sum'
)

# Reorder the rows to follow the ascending-total ranking.
heatmap_data = heatmap_data.loc[sorted_countries]


fig_heatmap = px.imshow(
    heatmap_data,
    labels=dict(x="Year", y="Country", color="Total Requests"),
    color_continuous_scale='RdYlBu_r',
    title='Heatmap of Total Requests by Country and Year'
)

# Year is numeric, so the x-axis is continuous and can show fractional ticks
# (e.g. 2020.5). Pin one tick per year column, as done for the line chart.
heatmap_years = heatmap_data.columns.tolist()

fig_heatmap.update_layout(
    title_x=0.5,
    xaxis=dict(
        title="Year",
        tickmode='array',
        tickvals=heatmap_years,
        ticktext=[str(year) for year in heatmap_years]
    ),
    yaxis_title="Country"
)

fig_heatmap.show()

**I am happy with how my third milestone turned out. The plots are clearer and more organized compared to earlier versions, and I was able to fix issues such as the x-axis formatting in the line graph. I also used new visualizations, such as heatmaps and choropleth maps, to make the data presentation more engaging. I think the visualizations are now simpler to understand. Overall, I'm more confident in my ability to work with data and present it effectively. Next, I'll try to create a simple app that combines all of my visualizations and explanations into one interactive space.**