In [2]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
import os

In [3]:
# Read the CSV from the working directory, then print the frame's
# column names, non-null counts and dtypes.
csv_path = 'solar_data.csv'
solar_data = pd.read_csv(csv_path)
solar_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3699 entries, 0 to 3698
Data columns (total 24 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   case_id     3699 non-null   int64  
 1   multi_poly  3699 non-null   object 
 2   eia_id      3699 non-null   int64  
 3   p_state     3699 non-null   object 
 4   p_county    3699 non-null   object 
 5   ylat        3699 non-null   float64
 6   xlong       3699 non-null   float64
 7   p_area      3699 non-null   int64  
 8   p_img_date  3699 non-null   int64  
 9   p_dig_conf  3699 non-null   int64  
 10  p_name      3699 non-null   object 
 11  p_year      3699 non-null   int64  
 12  p_pwr_reg   3685 non-null   object 
 13  p_tech_pri  3699 non-null   object 
 14  p_tech_sec  3699 non-null   object 
 15  p_axis      3436 non-null   object 
 16  p_azimuth   3699 non-null   int64  
 17  p_tilt      3265 non-null   float64
 18  p_battery   103 non-null    object 
 19  p_cap_ac    3699 non-null  

In [4]:
# Relabel the two comma-joined p_axis values as a single 'combination'
# category; every other value is left untouched.
axis_relabel = dict.fromkeys(
    ['fixed-tilt,single-axis', 'fixed-tilt,single-axis,dual-axis'],
    'combination',
)
solar_data['p_axis'] = solar_data['p_axis'].replace(axis_relabel)
solar_data['p_axis'].value_counts()

p_axis
fixed-tilt     1975
single-axis    1430
dual-axis        25
combination       6
Name: count, dtype: int64

In [5]:
# Group the raw p_type labels into three categories. The table is written
# as target -> raw labels and then inverted into the raw -> target mapping
# that Series.replace expects.
site_groups = {
    'landfill': ['landfill', 'landfill named'],
    'greenfield': ['greenfield'],
    'contaminated': ['PCSC', 'superfund', 'RCRA', 'AML'],
}
type_relabel = {
    raw_label: group
    for group, raw_labels in site_groups.items()
    for raw_label in raw_labels
}
solar_data['p_type'] = solar_data['p_type'].replace(type_relabel)
solar_data['p_type'].value_counts()


p_type
greenfield      3522
landfill         115
contaminated      62
Name: count, dtype: int64

In [6]:
# Normalise p_tech_sec labels:
#   * 'thin film' is rewritten to the hyphenated 'thin-film';
#   * the comma-joined multi-technology values collapse to 'combination'.
# Labels not listed here ('c-si', 'unknown', 'cpv', ...) pass through
# unchanged, so no identity entries are needed in the mapping.
tech_relabel = {
    'thin film': 'thin-film',
    'c-si,thin-film': 'combination',
    'c-si,cpv': 'combination',
    'c-si,thin-film,cpv': 'combination',
}
solar_data['p_tech_sec'] = solar_data['p_tech_sec'].replace(tech_relabel)

# Verify the changes (bare last expression, so the notebook displays it).
solar_data['p_tech_sec'].value_counts()

p_tech_sec
c-si           2962
thin-film       396
unknown         333
combination       4
cpv               4
Name: count, dtype: int64


In [7]:
# Keep 'non-agrivoltaic' as-is; every other value (missing values included,
# since they do not compare equal to the string) becomes 'agrivoltaic'.
is_non_agrivoltaic = solar_data['p_agrivolt'] == 'non-agrivoltaic'
solar_data['p_agrivolt'] = solar_data['p_agrivolt'].where(is_non_agrivoltaic, 'agrivoltaic')
solar_data['p_agrivolt'].value_counts()

p_agrivolt
non-agrivoltaic    3633
agrivoltaic          66
Name: count, dtype: int64

In [8]:
# Frequency of each non-null p_battery label (missing values are excluded).
solar_data['p_battery'].value_counts(dropna=True)

p_battery
batteries                            102
solar thermal with energy storage      1
Name: count, dtype: int64

In [9]:
# Drop columns by label rather than by position. These twelve labels are
# the ones found at positions 0-4, 8-13 and 18 of the frame as loaded, so
# the resulting frame is the same, but the cell no longer depends on the
# CSV's column order.
# Re-running this cell on the already-reduced frame raises KeyError
# (labels not found) instead of dropping further columns.
columns_to_drop = [
    'case_id', 'multi_poly', 'eia_id',
    'p_state', 'p_county',
    'p_img_date', 'p_dig_conf', 'p_name', 'p_year',
    'p_pwr_reg', 'p_tech_pri',
    'p_battery',
]
solar_data = solar_data.drop(columns=columns_to_drop)
solar_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3699 entries, 0 to 3698
Data columns (total 12 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   ylat        3699 non-null   float64
 1   xlong       3699 non-null   float64
 2   p_area      3699 non-null   int64  
 3   p_tech_sec  3699 non-null   object 
 4   p_axis      3436 non-null   object 
 5   p_azimuth   3699 non-null   int64  
 6   p_tilt      3265 non-null   float64
 7   p_cap_ac    3699 non-null   float64
 8   p_cap_dc    3699 non-null   float64
 9   p_type      3699 non-null   object 
 10  p_agrivolt  3699 non-null   object 
 11  p_zscore    3699 non-null   float64
dtypes: float64(6), int64(2), object(4)
memory usage: 346.9+ KB
