In [1]:
import pandas as pd
import pickle
import warnings

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score

In [2]:
debug = False
if debug:
    file_name = 'debug_data.pkl'
else:
    file_name = 'data.pkl'

In [3]:
df = pd.read_pickle('./Data/Cleandata/training.pkl')

In [4]:
X = df[['borough', 'job_type', 'block', 'lot', 'zip_code', 'work_type', 'bldg_type',
        "owner's_business_type", 'non-profit', 'latitude', 'longitude',
        'council_district', 'issuance_year', 'issuance_month', 'issuance_day', 'job_start_year', 'job_start_month', 
        'job_start_day']]
y = df['permit_type']

In [5]:
warnings.filterwarnings('ignore')

le = LabelEncoder()
X['borough'] = le.fit_transform(X['borough'])
X['job_type'] = le.fit_transform(X['job_type'])
X["owner's_business_type"] = le.fit_transform(X["owner's_business_type"])
X['work_type'] = le.fit_transform(X['work_type'])
X['bldg_type'] = le.fit_transform(X['bldg_type'])
# X['block'] = le.fit_transform(X['block'])
# X['lot'] = le.fit_transform(X['lot'])
# X['zip_code'] = le.fit_transform(X['zip_code'])
X['non-profit'] = le.fit_transform(X['non-profit'])

In [6]:
#list for cols to scale
cols_to_scale = ['block','lot', 'zip_code', 'latitude', 'longitude', 'council_district', 'issuance_year',
                 'issuance_month', 'issuance_day', 'job_start_year', 'job_start_month', 'job_start_day']

#create and fit scaler
scaler = StandardScaler()

#scale selected data
X[cols_to_scale] = scaler.fit_transform(X[cols_to_scale])

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

In [9]:
with open('./Data/log_model.pkl', 'wb') as f:
    pickle.dump(model, f)

In [10]:
with open('./Data/log_model.pkl', 'rb') as f:
    model = pickle.load(f)

In [11]:
y_pred = model.predict(X_test)

In [12]:
accuracy = accuracy_score(y_test, y_pred)

print(f'Accuracy: {round(accuracy* 100, 2)}%')

Accuracy: 74.09%


In [13]:
X_train

Unnamed: 0,borough,job_type,block,lot,zip_code,work_type,bldg_type,owner's_business_type,non-profit,latitude,longitude,council_district,issuance_year,issuance_month,issuance_day,job_start_year,job_start_month,job_start_day
363874,1,4,0.188824,-0.279876,1.002741,10,1,6,0,-0.940954,0.340531,1.354761,-0.985528,0.738507,-1.236972,-0.931754,0.716069,-1.245909
1833384,1,1,1.174738,-0.280427,1.032721,9,1,6,0,-1.505119,-0.215699,1.540493,0.878380,-0.148342,-1.465590,0.937789,-0.181093,-1.476083
1650678,3,0,-0.322240,-0.271067,1.262563,10,1,6,0,0.223794,1.080901,0.116548,0.754119,-0.443958,1.392131,0.813153,-0.480147,1.401094
2431479,1,0,1.765100,-0.245741,1.039383,10,0,6,0,-1.739938,0.553294,1.664314,-0.364226,1.625356,-0.436810,-0.308573,1.613230,-0.440300
296476,2,2,-0.347528,-0.271067,-0.972573,9,1,1,0,1.044509,-0.134550,-0.626379,-0.985528,-1.035191,-0.322502,-0.931754,-1.078254,-0.325213
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1692759,3,1,-0.325362,-0.227572,1.262563,2,1,13,0,0.100901,1.091032,0.116548,0.754119,0.442891,0.134734,0.563880,-0.181093,-1.476083
2356348,3,1,0.629956,3.848799,1.239246,9,1,1,0,0.461695,1.470102,0.054638,1.375422,0.147274,0.934896,1.436334,-1.078254,1.055833
2229102,1,1,-0.547334,-0.264460,1.009403,9,1,13,0,-0.723365,-0.014774,1.045208,1.126901,0.442891,0.134734,1.187061,0.417015,0.480397
2768328,2,1,-0.325675,-0.206651,-0.954252,9,1,1,0,1.196420,0.058267,-0.626379,-1.606831,0.147274,-1.694208,-1.554934,0.117961,-1.706258
