In [None]:
import polars as pl
import plotly.express as px

#   99.8% of this dataset is from Washington State; all other states are excluded
#   Base MSRP is 0 for 99.5% of the entries, so it is excluded
#   Utility company names are long and messy, so they are dropped from this analysis
# 


# Tokenise the 'POINT (<longitude> <latitude>)' text of the location column
# once, so LONG and LAT are both derived from the same parse instead of
# repeating the replace/split chain for each coordinate.
point_tokens = (
    pl.col('LOC')
    .str.replace('POINT (', '', literal=True)
    .str.replace(')', '', literal=True)
    .str.split(' ')
)

# Load the EV registrations, keep/rename the columns used in this analysis,
# restrict to Washington State, and split the location into numeric
# longitude/latitude columns.
df = (
    pl.read_csv('Electric_Vehicle_Population_Data.csv')
    .select(
        VIN=pl.col('VIN (1-10)'),
        COUNTY=pl.col('County'),
        CITY=pl.col('City'),
        STATE=pl.col('State'),
        ZIP=pl.col('Postal Code'),
        YEAR=pl.col('Model Year'),
        MAKE=pl.col('Make'),
        MODEL=pl.col('Model'),
        # Shorten the two verbose vehicle-type labels to compact codes.
        EVTYPE=(
            pl.col('Electric Vehicle Type')
            .str.replace(
                'Plug-in Hybrid Electric Vehicle (PHEV)',
                'HYBRID',
                literal=True,
            )
            .str.replace(
                'Battery Electric Vehicle (BEV)',
                'BATT',
                literal=True,
            )
        ),
        RANGE=pl.col('Electric Range'),
        LEG_DIST=pl.col('Legislative District'),
        LOC=pl.col('Vehicle Location'),
    )
    .filter(pl.col('STATE') == 'WA')
    .with_columns(
        # 1st token [0] is longitude (west-east), 2nd token [1] is latitude
        # (north-south). Both casts are non-strict so an unparsable token
        # becomes null for either coordinate rather than only for longitude.
        LONG=point_tokens.list.get(0).cast(pl.Float64, strict=False),
        LAT=point_tokens.list.get(1).cast(pl.Float64, strict=False),
    )
    .drop('LOC')
)

# df.write_excel(
#     'df_1.xlsx',
#     autofit=True,
#     freeze_panes=(1,0),
#     column_formats= {'ZIP':'%d', 'YEAR':'%d'} # force 1000s comma removal
# )


# Sanity check: frame dimensions, then a column-per-line preview of 10 random rows.
# glimpse() prints the preview itself and returns None, so wrapping it in
# print() would emit an extra 'None' line.
print(df.shape)
df.sample(10).glimpse()


(235198, 13)
Rows: 10
Columns: 13
$ VIN      <str> '1C4RJXR67R', '7SAYGDEE3N', 'WP0AK2A70P', '5YJ3E1EB4N', '1G1FY6S09L', 'JTDKAMFP8M', 'JTJAAAAB5P', 'WBY43AW02P', '7SAYGAEE5P', '5YJXCAE21L'
$ COUNTY   <str> 'Pierce', 'King', 'Whatcom', 'King', 'King', 'Pierce', 'King', 'King', 'King', 'King'
$ CITY     <str> 'South Hill', 'Seattle', 'Bellingham', 'Renton', 'Seattle', 'South Hill', 'Seattle', 'Kent', 'Bellevue', 'Shoreline'
$ STATE    <str> 'WA', 'WA', 'WA', 'WA', 'WA', 'WA', 'WA', 'WA', 'WA', 'WA'
$ ZIP      <i64> 98373, 98133, 98229, 98058, 98133, 98375, 98133, 98042, 98005, 98133
$ YEAR     <i64> 2024, 2022, 2023, 2022, 2020, 2021, 2023, 2023, 2023, 2020
$ MAKE     <str> 'JEEP', 'TESLA', 'PORSCHE', 'TESLA', 'CHEVROLET', 'TOYOTA', 'LEXUS', 'BMW', 'TESLA', 'TESLA'
$ MODEL    <str> 'WRANGLER', 'MODEL Y', 'PANAMERA', 'MODEL 3', 'BOLT EV', 'PRIUS PRIME', 'RZ', 'I4', 'MODEL Y', 'MODEL X'
$ EVTYPE   <str> 'HYBRID', 'BATT', 'HYBRID', 'BATT', 'BATT', 'HYBRID', 'BATT', 'BATT', 'BATT', 'BATT'
$

In [None]:
import polars as pl
import plotly.express as px

#   99.8% of this dataset is from Washington State; all other states are excluded
#   Base MSRP is 0 for 99.5% of the entries, so it is excluded
#   Utility company names are long and messy; these need to be cleaned up
# 


# Tokenise the 'POINT (<longitude> <latitude>)' text of the location column
# once, so LONG and LAT are both derived from the same parse instead of
# repeating the replace/split chain for each coordinate.
point_tokens = (
    pl.col('LOC')
    .str.replace('POINT (', '', literal=True)
    .str.replace(')', '', literal=True)
    .str.split(' ')
)

# Load the EV registrations, keep/rename the columns used in this analysis,
# restrict to Washington State, split the location into numeric
# longitude/latitude columns, and collapse the long utility names.
df = (
    pl.read_csv('Electric_Vehicle_Population_Data.csv')
    .select(
        VIN=pl.col('VIN (1-10)'),
        COUNTY=pl.col('County'),
        CITY=pl.col('City'),
        STATE=pl.col('State'),
        ZIP=pl.col('Postal Code'),
        YEAR=pl.col('Model Year'),
        MAKE=pl.col('Make'),
        MODEL=pl.col('Model'),
        # Shorten the two verbose vehicle-type labels to compact codes.
        EVTYPE=(
            pl.col('Electric Vehicle Type')
            .str.replace(
                'Plug-in Hybrid Electric Vehicle (PHEV)',
                'HYBRID',
                literal=True,
            )
            .str.replace(
                'Battery Electric Vehicle (BEV)',
                'BATT',
                literal=True,
            )
        ),
        RANGE=pl.col('Electric Range'),
        LEG_DIST=pl.col('Legislative District'),
        UTIL_CO=pl.col('Electric Utility'),
        LOC=pl.col('Vehicle Location'),
    )
    .filter(pl.col('STATE') == 'WA')
    .with_columns(
        # 1st token [0] is longitude (west-east), 2nd token [1] is latitude
        # (north-south). Both casts are non-strict so an unparsable token
        # becomes null for either coordinate rather than only for longitude.
        LONG=point_tokens.list.get(0).cast(pl.Float64, strict=False),
        LAT=point_tokens.list.get(1).cast(pl.Float64, strict=False),
        # String predicates live in the .str namespace (Expr has no
        # begins_with/contains). Naming the result UTIL_CO overwrites the raw
        # column; an unnamed when/then would be emitted as a separate
        # 'literal' column instead.
        UTIL_CO=(
            pl.when(
                pl.col('UTIL_CO').str.starts_with('BONNEVILLE POWER ADMINISTRATION')
            )
            .then(pl.lit('BONNEVILLE'))
            .when(pl.col('UTIL_CO').str.contains('SEATTLE', literal=True))
            .then(pl.lit('SEATTLE'))
            # Any other utility keeps its original name.
            .otherwise(pl.col('UTIL_CO'))
        ),
    )
    .drop('LOC')
)

# df.write_excel(
#     'df_1.xlsx',
#     autofit=True,
#     freeze_panes=(1,0)
# )


# Sanity check: frame dimensions, then a column-per-line preview of 10 random rows.
# glimpse() prints the preview itself and returns None, so wrapping it in
# print() would emit an extra 'None' line.
print(df.shape)
df.sample(10).glimpse()


# df = (

#     pl.read_csv('henley_results_cleaned.csv')
#     .drop('time', 'fawley_loser_leading')   # EDA shows 80% of time values are null, so drop this col
# )
#-----  EXPLORATORY DATA ANALYSIS ---------------------------------------------
# if False:
#     df_row_count = df.height
#     df_null = (df
#         #.fill_null(0)
#         .null_count()
#         .transpose(include_header=True)
#         .with_columns(
#             PCT_NUL = (
#             100*pl.col('column_0')/df_row_count
#             )
#         )
#         .sort('column_0', descending=True)
#     )
    # print(df_null)
    # # print(df.select('PCT_NULL', 'column_0'))
    # null_time_count = df.filter(pl.col('time').is_null()).height
    # not_null_time_count = df.filter(pl.col('time').is_not_null()).height

#     print(1/0)



#     pct_null_time = 100 * null_time_count / (null_time_count + not_null_time_count)
#     print(f'{null_time_count = }')
#     print(f'{not_null_time_count = }')
#     print(f'{pct_null_time = }')


# for col in df.columns:
#     print(df[col].value_counts())
# print(df.columns)
# print(df.glimpse())import polars as pl
# pl.show_versions()

# df = (
#     pl.read_csv('Electric_Vehicle_Population_Data.csv')
#     .select(
#         VIN = pl.col('VIN (1-10)'),
#         COUNTY = pl.col('County'),
#         CITY = pl.col('City'),
#         STATE = pl.col('State'),
#         ZIP = pl.col('Postal Code'),
#         YEAR = pl.col('Model Year'),
#         MAKE = pl.col('Make'),
#         MODEL = pl.col('Model'),
#         EVTYPE = (
#             pl.col('Electric Vehicle Type')
#                 .str.replace('Plug-in Hybrid Electric Vehicle (PHEV)', 'HYBRID', literal=True)
#                 .str.replace('Battery Electric Vehicle (BEV)', 'BATT', literal=True)
#         ),
#         RANGE = pl.col('Electric Range'),
#         #  MSRP = pl.col('Base MSRP'), # MSRP are mostly 0, don't use
#         LEG_DIST = pl.col('Legislative District'),
#         # UTIL_CO = pl.col('Electric Utility'),  only specified for WA State, so exclude
#         LOC = pl.col('Vehicle Location'),
#     )
#     .with_columns(
#         LONG= (
#             pl.col('LOC')
#                 .str.replace('POINT (', '', literal=True)
#                 .str.replace(')', '', literal=True) 
#                 .str.split(' ')
#                 .list.get(0)    # 1st element [0] is longitude (west-east)
#                 .cast(pl.Float64, strict=False)
#         )
#     )
#     .with_columns(
#         LAT = (
#             pl.col('LOC')
#                 .str.replace('POINT (', '', literal=True) 
#                 .str.replace(')', '', literal=True) 
#                 .str.split(' ')
#                 .list.get(1)   # 2nd element [1] is latitude (north-south)
#                 .cast(pl.Float64)
#         )
#    )
#    .drop('LOC')
# )

# df.write_excel(
#     'df_1.xlsx',
#     autofit=True,
#     freeze_panes=(1,0)
# )


# NOTE(review): this repeats the shape/preview summary printed earlier in this
# cell; consider removing the repetition.
# glimpse() prints the preview itself and returns None, so it is not wrapped
# in print() (that would emit an extra 'None' line).
print(df.shape)
df.sample(10).glimpse()


# df = (

#     pl.read_csv('henley_results_cleaned.csv')
#     .drop('time', 'fawley_loser_leading')   # EDA shows 80% of time values are null, so drop this col
# )
#-----  EXPLORATORY DATA ANALYSIS ---------------------------------------------
# if False:
#     df_row_count = df.height
#     df_null = (df
#         #.fill_null(0)
#         .null_count()
#         .transpose(include_header=True)
#         .with_columns(
#             PCT_NUL = (
#             100*pl.col('column_0')/df_row_count
#             )
#         )
#         .sort('column_0', descending=True)
#     )
    # print(df_null)
    # # print(df.select('PCT_NULL', 'column_0'))
    # null_time_count = df.filter(pl.col('time').is_null()).height
    # not_null_time_count = df.filter(pl.col('time').is_not_null()).height

#     print(1/0)



#     pct_null_time = 100 * null_time_count / (null_time_count + not_null_time_count)
#     print(f'{null_time_count = }')
#     print(f'{not_null_time_count = }')
#     print(f'{pct_null_time = }')


# for col in df.columns:
#     print(df[col].value_counts())
# print(df.columns)
# print(df.glimpse())
#
#
#
#
#
# pl.show_versions()

# df = (
#     pl.read_csv('Electric_Vehicle_Population_Data.csv')
#     .select(
#         VIN = pl.col('VIN (1-10)'),
#         COUNTY = pl.col('County'),
#         CITY = pl.col('City'),
#         STATE = pl.col('State'),
#         ZIP = pl.col('Postal Code'),
#         YEAR = pl.col('Model Year'),
#         MAKE = pl.col('Make'),
#         MODEL = pl.col('Model'),
#         EVTYPE = (
#             pl.col('Electric Vehicle Type')
#                 .str.replace('Plug-in Hybrid Electric Vehicle (PHEV)', 'HYBRID', literal=True)
#                 .str.replace('Battery Electric Vehicle (BEV)', 'BATT', literal=True)
#         ),
#         RANGE = pl.col('Electric Range'),
#         LEG_DIST = pl.col('Legislative District'),
#         UTIL_CO = pl.col('Electric Utility'),  # only specified for WA State, so exclude
#         LOC = pl.col('Vehicle Location'),
#     )
#     .with_columns(
#         LONG= (
#             pl.col('LOC')
#                 .str.replace('POINT (', '', literal=True)
#                 .str.replace(')', '', literal=True) 
#                 .str.split(' ')
#                 .list.get(0)    # 1st element [0] is longitude (west-east)
#                 .cast(pl.Float64, strict=False)
#         )
#     )
#     .with_columns(
#         LAT = (
#             pl.col('LOC')
#                 .str.replace('POINT (', '', literal=True) 
#                 .str.replace(')', '', literal=True) 
#                 .str.split(' ')
#                 .list.get(1)   # 2nd element [1] is latitude (north-south)
#                 .cast(pl.Float64)
#         )
#    )
#    .drop('LOC')
# )

# df.write_excel(
#     'df_1.xlsx',
#     autofit=True,
#     freeze_panes=(1,0)
# )


# NOTE(review): this repeats the shape/preview summary printed earlier in this
# cell; consider removing the repetition.
# glimpse() prints the preview itself and returns None, so it is not wrapped
# in print() (that would emit an extra 'None' line).
print(df.shape)
df.sample(10).glimpse()


# df = (

#     pl.read_csv('henley_results_cleaned.csv')
#     .drop('time', 'fawley_loser_leading')   # EDA shows 80% of time values are null, so drop this col
# )
#-----  EXPLORATORY DATA ANALYSIS ---------------------------------------------
# if False:
#     df_row_count = df.height
#     df_null = (df
#         #.fill_null(0)
#         .null_count()
#         .transpose(include_header=True)
#         .with_columns(
#             PCT_NUL = (
#             100*pl.col('column_0')/df_row_count
#             )
#         )
#         .sort('column_0', descending=True)
#     )
    # print(df_null)
    # # print(df.select('PCT_NULL', 'column_0'))
    # null_time_count = df.filter(pl.col('time').is_null()).height
    # not_null_time_count = df.filter(pl.col('time').is_not_null()).height

#     print(1/0)



#     pct_null_time = 100 * null_time_count / (null_time_count + not_null_time_count)
#     print(f'{null_time_count = }')
#     print(f'{not_null_time_count = }')
#     print(f'{pct_null_time = }')


# for col in df.columns:
#     print(df[col].value_counts())
# print(df.columns)
# print(df.glimpse())


AttributeError: 'Expr' object has no attribute 'begins_with'

In [None]:
# Count vehicles per utility name once and reuse the result for both the
# Excel export and the cell output. Sorting by 'count' (ascending) makes the
# displayed table match the exported sheet and gives a reproducible row order.
util_counts = df['UTIL_CO'].value_counts().sort('count')

# Export the tally for manual inspection of the messy utility names.
util_counts.write_excel(
    'df_util.xlsx',
    autofit=True,
    freeze_panes=(1, 0),
)

util_counts

UTIL_CO,count
str,u32
"""CITY OF SEATTLE - (WA)|CITY OF…",40106
"""BONNEVILLE POWER ADMINISTRATIO…",24
"""PUD NO 1 OF DOUGLAS COUNTY""",518
"""BONNEVILLE POWER ADMINISTRATIO…",4001
"""PUD NO 1 OF OKANOGAN COUNTY""",144
…,…
"""PACIFICORP""",1947
"""BONNEVILLE POWER ADMINISTRATIO…",2
"""BONNEVILLE POWER ADMINISTRATIO…",1217
"""PUD NO 1 OF CHELAN COUNTY""",1433
