### Import Libraries

In [26]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate, cross_val_score

from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.metrics import precision_score, recall_score, precision_recall_curve,f1_score, fbeta_score

from sklearn import linear_model, svm, naive_bayes, neighbors, ensemble
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegressionCV

### Import Data

In [3]:
# Import solar data
# https://www.californiadgstats.ca.gov/downloads/
# CSI Working Data Set
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk. The warning fragment left in this cell's recorded
# output looks like pandas' mixed-type DtypeWarning (TODO confirm), which is
# exactly what this option addresses.
raw = pd.read_csv('WorkingDataSet_5-28-2020.csv', low_memory=False)

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
# Sanity check: dimensions of the loaded frame as (rows, columns).
raw.shape

(174344, 124)

In [5]:
# List the column labels available in the raw data set.
raw.columns

Index(['Application Number', 'Program Administrator', 'Program',
       'Incentive Design', 'Incentive Type', 'Incentive Step',
       'Incentive Amount', 'Total Cost', 'Nameplate Rating', 'CEC PTC Rating',
       ...
       'CEC PTC Rating Single-Axis Tracking',
       'CEC PTC Rating Dual-Axis Tracking', 'CSI Rating Fixed',
       'CSI Rating Single-Axis Tracking', 'CSI Rating Dual-Axis Tracking',
       'MASH Track 1A Incentive Amount', 'MASH Track 1B Incentive Amount',
       'MASH Track 2 Incentive Amount', 'MASH Track 1A % Capacity',
       'MASH Track 1B % Capacity'],
      dtype='object', length=124)

In [6]:
# Feature selection based on (limited) domain knowledge: keep a hand-picked
# subset of the raw columns. Status columns are still included at this stage.
selected_columns = [
    'Program Administrator',
    'Program',
    'Incentive Type',
    'Incentive Amount',
    'Total Cost',
    'Nameplate Rating',
    'CEC PTC Rating',
    'Design Factor',
    'CSI Rating',
    'Current Incentive Application Status',
    'System Owner Sector',
    'Host Customer Sector',
    'Host Customer Physical Address City',
    'Host Customer Physical Address County',
    'Host Customer Physical Zip Code',
    'Solar Contractor Company Name',
    'Seller Company Name',
    'PV Module#1 Manufacturer',
    'PV Module#1 Model',
    'Inverter#1 Manufacturer',
    'Inverter#1 Model',
    'Installed Status',
    'Completed/PBI-In Payment Status',
    'Tracking Type',
]
solar = raw[selected_columns]

In [7]:
# Keep only applications whose 'Current Incentive Application Status' is one
# of the four values below; rows with any other (or missing) status are dropped.
kept_statuses = [
    'Cancelled',
    'Completed',
    'Withdrawn',
    'Suspended - Incentive Claim Request Review',
]
sol = solar.loc[solar['Current Incentive Application Status'].isin(kept_statuses)]

In [8]:
# Discard every row that has a missing value in any of the selected columns.
sol = sol.dropna(axis=0, how='any')

In [9]:
# Build the binary target 'Completed': 1 where 'Installed Status' equals
# 'Installed', 0 for every other value.
# (Author's note: majority = 'Installed' -> 1, minority = 'Delisted'/canceled -> 0.)
installed_mask = sol['Installed Status'] == 'Installed'
sol['Completed'] = installed_mask.astype(int)
# The source column is no longer needed once the target has been derived.
sol = sol.drop('Installed Status', axis=1)

In [10]:
# Class balance of the binary target: row counts for Completed == 1
# (majority, 'Installed') and Completed == 0 (minority, 'Delisted').
sol['Completed'].value_counts()

1    142506
0     22122
Name: Completed, dtype: int64

In [11]:
# Remove the two status columns (data leakage with respect to the target)
# together with the two company-name columns.
columns_to_remove = [
    'Current Incentive Application Status',
    'Completed/PBI-In Payment Status',
    'Solar Contractor Company Name',
    'Seller Company Name',
]
sol = sol.drop(columns=columns_to_remove)

In [12]:
# Split the frame into the feature matrix X (everything except the target)
# and the target vector y.
target_column = 'Completed'
X = sol.drop(target_column, axis=1)
y = sol[target_column]

In [15]:
# Integer-encode the categorical feature columns of X, overwriting each column
# with its encoded values.
# One encoder instance is re-fitted for every column, so after the loop `le`
# only remembers the classes of the last column in `names`.
# NOTE(review): scikit-learn documents LabelEncoder as an encoder for target
# values (y); consider OrdinalEncoder for feature columns - confirm intent.
le = LabelEncoder()
names = [
    'Program Administrator',
    'Program',
    'Incentive Type',
    'System Owner Sector',
    'Host Customer Sector',
    'Host Customer Physical Address City',
    'Host Customer Physical Address County',
    'PV Module#1 Manufacturer',
    'PV Module#1 Model',
    'Inverter#1 Manufacturer',
    'Inverter#1 Model',
    'Tracking Type',
]

for k in names:
    # fit() returns the encoder itself, so fitting and transforming can be chained.
    X[k] = le.fit(X[k]).transform(X[k])

In [18]:
# Hold out 25% of the rows as a test set. The split is stratified on y and
# uses a fixed seed so that it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=123,
)

In [27]:
# Random Forest baseline.
# random_state is fixed (same seed as the train/test split) so that the
# cross-validated score and the fitted model are reproducible across runs;
# without it, each execution builds a different forest and reports a
# slightly different ROC AUC.
rf = ensemble.RandomForestClassifier(random_state=123)
# 5-fold cross-validation on the training split only, scored by ROC AUC.
rf_scores = cross_val_score(rf, X_train, y_train, cv=5, scoring='roc_auc', verbose=0)
# cross_val_score works on clones of the estimator, so `rf` itself is fitted
# here on the full training split for later use.
rf.fit(X_train, y_train)
print("Average ROC AUC: ", np.mean(rf_scores))



Average ROC AUC:  0.6916266993894513
