In [64]:
import polars as pl
import polars.selectors as cs
import plotly.express as px

In [73]:
df= (
        pl.read_excel(
        'undesa_pd_2024_ims_stock_by_sex_destination_and_origin.xlsx',
        sheet_name='Table 1',
        has_header=True,
        read_options={'header_row': 10}
    )
    .rename(
        {
            'Region, development group, country or area of destination' :
            'DESTINATION',
            'Location code of destination' :'DEST_CODE',
            'Region, development group, country or area of origin' :
            'ORIGIN',
            'Location code of origin' :'ORIGIN_CODE',
        }
    )
)
df_dest_codes = (
    df
    .select(
        LOCATION=pl.col('DESTINATION'),
        CODE = pl.col('DEST_CODE')
    )
    .unique('LOCATION')
    .sort('CODE')
    .with_columns(REF = pl.lit('DESTINATION'))
)

df_origin_codes = (
    df
    .select(
        LOCATION=pl.col('ORIGIN'),
        CODE = pl.col('ORIGIN_CODE')
    )
    .unique('LOCATION')
    .sort('CODE')
    .with_columns(REF = pl.lit('ORIGIN'))
)
df_codes = (
    # checks the code numbers are consistent between dest and origin countries
    pl.concat([df_dest_codes,df_origin_codes])
    .sort('LOCATION')
    .pivot(
        index='LOCATION',
        on='REF',
        values='CODE'
    )
    .with_columns(DIFF = (pl.col('DESTINATION') - pl.col('ORIGIN')))
    # filter excludes Others, Saint BarthÃ©lemy, and Saint Martin (French part)
    .filter(pl.col('DIFF') == 0) # gets rid or inconsistencies
    .rename({'DESTINATION': 'CODE'})
    .select('LOCATION', 'CODE')
)

df_codes.write_csv('df_codes.csv')
dict_codes = (
    dict(
        zip(
            df_codes['CODE'],
            df_codes['LOCATION'],
        )
    )
)
dict_codes
# df_join = (
#     df_dest_codes
#     .join(
#         df_orgin_codes,
#         how='left',
#         left_on= 'DEST_CODE',
#         right_on = 'ORIGIN_CODE'
#     )
# )
#df_dest_codes

# destination_codes = (
#     dict(
#         zip(
#             df.unique('DESTINATION')['DEST_CODE'],
#             df.unique('DESTINATION')['DESTINATION']
#         )
#     )
# )
# len(sorted(destination_codes))

# location_code_dict = (
#     df
# )
# df
# destination_codes

{903: 'AFRICA',
 935: 'ASIA',
 4: 'Afghanistan',
 8: 'Albania',
 12: 'Algeria',
 16: 'American Samoa*',
 20: 'Andorra',
 24: 'Angola',
 660: 'Anguilla*',
 28: 'Antigua and Barbuda',
 32: 'Argentina',
 51: 'Armenia',
 533: 'Aruba*',
 36: 'Australia*',
 1836: 'Australia/New Zealand',
 40: 'Austria',
 31: 'Azerbaijan',
 44: 'Bahamas',
 48: 'Bahrain',
 50: 'Bangladesh',
 52: 'Barbados',
 112: 'Belarus',
 56: 'Belgium',
 84: 'Belize',
 204: 'Benin',
 60: 'Bermuda*',
 64: 'Bhutan',
 68: 'Bolivia (Plurinational State of)',
 535: 'Bonaire, Sint Eustatius and Saba*',
 70: 'Bosnia and Herzegovina',
 72: 'Botswana',
 76: 'Brazil',
 92: 'British Virgin Islands*',
 96: 'Brunei Darussalam',
 100: 'Bulgaria',
 854: 'Burkina Faso',
 108: 'Burundi',
 132: 'Cabo Verde',
 116: 'Cambodia',
 120: 'Cameroon',
 124: 'Canada',
 915: 'Caribbean',
 136: 'Cayman Islands*',
 140: 'Central African Republic',
 916: 'Central America',
 5500: 'Central Asia',
 1831: 'Central and Southern Asia',
 148: 'Chad',
 830: 'Ch

In [66]:
df_dest_codes

LOCATION,CODE,REF
str,i64,str
"""Afghanistan""",4,"""DESTINATION"""
"""Albania""",8,"""DESTINATION"""
"""Algeria""",12,"""DESTINATION"""
"""American Samoa*""",16,"""DESTINATION"""
"""Andorra""",20,"""DESTINATION"""
…,…,…
"""Low-and-middle-income countrie…",1859,"""DESTINATION"""
"""Central Asia""",5500,"""DESTINATION"""
"""Southern Asia""",5501,"""DESTINATION"""
"""High-and-upper-middle-income c…",5503,"""DESTINATION"""


In [67]:
for key in destination_codes:
    print(key, destination_codes[key], origin_codes[key])

913 New Caledonia* Central and Southern Asia
8 Central and Southern Asia Cameroon
524 Western Africa Slovenia
788 Nauru Mongolia
296 Armenia Ghana
674 Syrian Arab Republic Southern Africa
266 India Malta
909 Argentina South-Eastern Asia
332 France* Belarus
20 Honduras Bangladesh
764 Middle-income countries Tuvalu
124 Burundi Sub-Saharan Africa
1836 Central America Mauritius
854 South Sudan Benin
682 Bangladesh Palau
64 Ukraine* Japan
12 Togo Israel
533 No income group available North Macedonia
308 Turks and Caicos Islands* Spain*
642 Oceania (excluding Australia and New Zealand) Barbados
748 LATIN AMERICA AND THE CARIBBEAN Poland
120 Nicaragua South Africa
903 Sudan Antigua and Barbuda
1517 Croatia Serbia*
620 Palau Guinea
51 Venezuela (Bolivarian Republic of) Saint Helena
466 Land-locked Developing Countries (LLDC) British Virgin Islands*
136 Tonga Land-locked Developing Countries (LLDC)
902 Spain* Saint Lucia
203 Iraq Denmark*
148 Saint Lucia Monaco
694 Dominican Republic Germany
941

KeyError: 663

In [None]:
origin_codes = (
    dict(
        zip(
            df.unique('ORIGIN')['ORIGIN_CODE'],
            df.unique('ORIGIN')['ORIGIN']
        )
    )
)
origin_codes

{710: 'San Marino',
 276: 'United Republic of Tanzania',
 450: 'Finland*',
 534: 'Seychelles',
 462: 'Faroe Islands*',
 156: 'Togo',
 120: 'South Africa',
 931: 'Zimbabwe',
 724: 'Canada',
 72: 'Marshall Islands',
 912: 'Kyrgyzstan',
 422: 'Bermuda*',
 957: 'Kenya',
 454: 'Jordan',
 380: 'Honduras',
 2003: 'Russian Federation',
 132: 'ASIA',
 5503: 'Western Sahara',
 364: 'Czechia',
 591: 'Paraguay',
 570: 'Micronesia',
 188: 'World',
 288: 'Channel Islands*',
 925: 'Eastern Asia',
 418: 'Brazil',
 496: 'Cuba',
 826: 'Eritrea',
 548: 'NORTHERN AMERICA',
 4: 'Micronesia (Fed. States of)',
 84: 'Uganda',
 706: 'Myanmar',
 728: 'Vanuatu',
 170: 'Afghanistan',
 528: 'Zambia',
 598: 'Least developed countries',
 604: 'Iraq',
 660: 'Northern Europe',
 608: 'High-income countries',
 136: 'Land-locked Developing Countries (LLDC)',
 674: 'Southern Africa',
 328: 'Philippines',
 934: 'Sudan',
 788: 'Mongolia',
 558: 'Bonaire, Sint Eustatius and Saba*',
 768: 'Upper-middle-income countries',
 191

In [None]:
import polars as pl
import polars.selectors as cs
import plotly.express as px

# dataset issue:
#   99.8% of dataset is from Washington State. I dropped all other locations.
#   Base MSRP is 0 for 99.5% of the entries, dropped this too
#   Utility company info is long and messy, dropped it to
#   Vehicle Indentification Numbers (VIN) dropped, no interest
#   Range data is not credible with 2/3 of values as 0, dropped it too

df = (
    pl.scan_csv('Electric_Vehicle_Population_Data.csv')  # lazyframe
    .select(
        STATE = pl.col('State'),
        YEAR = pl.col('Model Year'),
        MAKE = pl.col('Make'),
    )
    .filter(pl.col('STATE') == 'WA')
    .filter(pl.col('MAKE') != 'TESLA')
    .select(
        'YEAR', 'MAKE', # 'MODEL',
        YEAR_MAKE_TOT = pl.col('STATE').count().over(['YEAR', 'MAKE']),
        YEAR_TOT = pl.col('STATE').count().over('YEAR'),
    )
    .unique(['YEAR', 'MAKE'])
    .with_columns(
        PCT_SHARE = 100*(pl.col('YEAR_MAKE_TOT')/pl.col('YEAR_TOT'))
    )
    .collect()  # convert to dataframe for pivot
    .pivot(
        on='MAKE',
        index='YEAR',
        values='PCT_SHARE'
    )
    .lazy()     # back to lazyframe
    .filter(pl.col('YEAR') > 2020)
    .sort('YEAR')
    .with_columns(
        PCT_TOT = pl.sum_horizontal(cs.exclude(pl.col('YEAR')))
    )
    .with_columns(
        PCT_TOT = pl.when(pl.col('PCT_TOT') >= 5)
                    .then('PCT_TOT')
                    .otherwise(pl.lit(0.0))
    )
    .with_columns(
        pl.col('YEAR').cast(pl.String)
    )
    .collect()  # lazyframe to data frame from here to the end of script
)

#----- EXCLUDE MAKES WITH 1 OR MORE NULL VALUES, ALPHABETIC SORT ---------------
sorted_cols_null_filter = sorted(
    [
        c for c in df.columns 
        if (df[c].is_null().sum() <= 1) and
        (c not in ['YEAR', 'PCT_TOT'])
    ]
)

#----- MAKE LIST OF TOP 5 MAKES in 2025 ----------------------------------------
top_5_makes = (
    df
    .select(['YEAR'] + sorted_cols_null_filter + ['PCT_TOT'] )
    .transpose(
        include_header=True,
        header_name='MAKE',
        column_names='YEAR',
    )
    .sort('2025', descending=True)
    .head(6)    # make a list of the top 6 makes
    .tail(5)    # then exclude the top brand
    .select('MAKE')
    .to_series()
    .to_list()
)

#----- COLOR DICTIONARY FOR THE GRAPHS AND ANNOTATED LABELS --------------------
my_color_dict = dict(
    zip(top_5_makes,px.colors.qualitative.Vivid[:5] )
)

#----- PLOT SHARE OF TOP 5 ALTERNATIVE EVS--------------------------------------
my_subtitle = (
    'Companies not led by ' +
    '<b>M</b>ostly <b>U</b>nhinged <b>S</b>pace <b>K</b>ings'
)
fig=px.line(
    df,
    'YEAR',
    top_5_makes,
    # Just found out from plotly youtube that markers can be enabled in px.line
    markers=True, 
    title="WASHINGTON STATE'S TOP 5 ALTERNATE ELECTRIC VEHICLES",
    subtitle=my_subtitle,
    template='simple_white',
    line_shape='spline',
    color_discrete_map=my_color_dict,
    height=500, width=800
)
for make in top_5_makes:
    fig.add_annotation(
            x=0.96, xref='paper',
            y=df[make].to_list()[-1], yref='y',
            text=f'<b>{make}</b>',
            showarrow=False,
            xanchor='left',
            font=dict(color=my_color_dict.get(make))
        )
    
data_source = 'https://catalog.data.gov/dataset/electric-vehicle-population-data'
fig.update_layout(
    showlegend=False,
    xaxis=dict(
        title=dict(text=f'Data source: {data_source}', font=dict(color='gray')),
    ),
    yaxis=dict(
        title=dict(
            text='ALTERNATE EV SHARE', 
            font=dict(color='gray'),
        ),
        ticksuffix = "%",
    ),
    hovermode='x unified'
)
fig.show()