## 1) MSE 방식으로 학습 -> 수정함
## 2) feature의 개수가 너무 많음 -> 7개 조절
## 3) layer 깊이 조절하기 -> 3개로 조절
## 4) Normalize 다른거 -> PCA 변수자체를 norm으로 만듬

In [1]:
#일반 라이브러리
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

#API를 사용하기 위한 import
# from geoband.API import *
# import folium
# import json
# import geopandas as gpd

# 전처리 및 operator
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

# model
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import RandomizedSearchCV

In [2]:
PCA_data = pd.read_csv("PCA.csv")
PCA_data = PCA_data.iloc[:,1:]

data = pd.read_csv("total.csv")
data = data.iloc[:,1:]


In [3]:
data.columns.tolist()[-5:]

[' rob_satety_q1',
 'mur_safety_q2',
 'ta_safety_q3',
 'raw_odder_q4',
 'overall_q5']

In [4]:
target = "raw_odder_q4"

dataset = pd.concat([PCA_data[["PC1","PC2","PC3","PC4","PC5","PC6","PC7"]], data[["jur_stn",target]]], axis = 1)

train = dataset[dataset[target].isnull() == False]
test = dataset[dataset[target].isnull()]

print(train.shape)
print(test.shape)

(246, 9)
(82, 9)


In [5]:
train

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,jur_stn,raw_odder_q4
0,-5.843327,-1.552553,-0.386392,0.054962,0.000691,2.365791,0.396099,마산동부경찰서,63.6
1,-5.753760,-1.934429,-0.455216,0.511299,0.112767,2.344618,0.208428,마산동부경찰서,65.7
2,-6.051162,-1.223132,-0.257158,-0.117475,0.057478,2.496029,0.342167,마산동부경찰서,70.7
3,-6.093965,-1.350174,-0.233333,0.005199,0.116385,2.718029,0.422335,마산동부경찰서,70.0
4,-5.929747,-0.562165,0.425840,-0.649937,0.049884,2.583079,0.319921,마산동부경찰서,68.9
...,...,...,...,...,...,...,...,...,...
321,-3.092893,-4.157176,-1.933148,1.088230,4.034375,0.099208,-2.121681,창원중부경찰서,62.4
322,-3.211471,-3.146838,-1.648940,0.706636,4.047320,0.200138,-1.773088,창원중부경찰서,70.6
323,-3.235498,-3.143853,-1.607412,0.673844,4.086958,-0.036664,-2.125947,창원중부경찰서,68.6
324,-3.110050,-2.502579,-1.075531,0.192931,4.602668,-0.905996,-2.867498,창원중부경찰서,73.6


In [6]:
X = train[["PC1","PC2","PC3","PC4","PC5","PC6","PC7"]]
Y = train[target]

# train & test 2
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state = 0, shuffle = True,
                                                    train_size = 0.83)

print(X_train.shape, Y_train.shape)
print(X_test.shape, Y_test.shape)

(204, 7) (204,)
(42, 7) (42,)


In [7]:
# 모델전처리1
# from sklearn.preprocessing import MinMaxScaler
# scaler = MinMaxScaler()
# scaler.fit(X_train)
# X_train = scaler.transform(X_train)
# X_test = scaler.transform(X_test)


# 모델전처리2

# from sklearn.preprocessing import StandardScaler
# scaler = StandardScaler()
# scaler.fit(train)
# train = scaler.transform(train)

In [8]:
# step1) import
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader # custom_dataset & Dataloader

# step2) device
device = "cuda" if torch.cuda.is_available() else "cpu"
torch.manual_seed(777)
if device == "cuda":
    torch.cuda.manual_seed_all(777)
print(device)

cpu


In [9]:
X_test.shape[0]

42

In [10]:
X_train.shape[0] % 17
X_test.shape[0] % 14

0

In [11]:
# step3) hyper parameter
training_epoch = 10000  # epoch 증대
batch_size = 17       # 4개에서 증가시킴 --> 학습이 안정적
learning_rate = 0.01 # 0.005에서 줄임

In [12]:
# step4) dataset and dataloader
from torch.utils.data import Dataset

class CustomDataset(Dataset):
    def __init__(self,X,Y):
        self.x_data = X
        self.y_data = Y
        
    def __len__(self):
        return len(self.x_data)
    
    def __getitem__(self, idx):
        x = self.x_data[idx]
        y = self.y_data[idx]        
        x = torch.FloatTensor([x])
        y = torch.FloatTensor([y])
        
        return x,y
        
# train
X_train = np.array(X_train)
Y_train = np.array(Y_train)
train_dataset = CustomDataset(X_train, Y_train)
train_loader = DataLoader(train_dataset, batch_size = batch_size, shuffle = True, drop_last = True)

# test
X_test = np.array(X_test)
Y_test = np.array(Y_test)
test_dataset = CustomDataset(X_test, Y_test)
test_loader = DataLoader(test_dataset, batch_size = 14, shuffle = False, drop_last = False)
X_test = torch.FloatTensor([X_test])
Y_test = torch.FloatTensor([Y_test])


In [14]:
# step5) model

class LinearModel(nn.Module):
    def __init__(self):
        super(LinearModel, self).__init__()
        
        # feature의 개수가 34개 (전체모델)
        # feature의 개수가 17개 (PCA모델)
        self.layer1 = nn.Sequential(
            nn.Linear(7,8),
            nn.ReLU(),
        )
            
        self.layer2 = nn.Sequential(
            nn.Linear(8,1),
        )
                
    def forward(self,x):
        out = self.layer1(x)
        out = self.layer2(out)        
        #out = self.layer3(out)
        return out
    
    def initalize_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Linear):  
                nn.init.kaiming_normal_(m.weight, mode = "fan_out", nonlinearity= "leaky_relu" ) # 321 -> 20 
                nn.init.constant_(m.bias, 0)
        print("Weight Init Success!")
            
model = LinearModel().to(device)
model.initalize_weights()


Weight Init Success!


In [15]:
# step6) loss & optim
criterion = nn.MSELoss().to(device)
optimizer = optim.Adam(model.parameters(), lr = learning_rate)
lr_sche = optim.lr_scheduler.StepLR(optimizer, step_size= 200, gamma=0.9) # 10에서 20으로 바꿈


In [None]:
loss_list = []
r2_list = []

iteration = len(train_loader)
for epoch in range(training_epoch):
    model.train()
    r2 = 0
    loss = 0
    
    lr_sche.step()
    for sample in train_loader:
        optimizer.zero_grad()

        # Data
        X, Y = sample
        X = X.to(device)
        Y = Y.to(device)
        Y.squeeze_()

        # hypothesis
        hypothesis = model(X)
        hypothesis.squeeze_()

        cost = criterion(hypothesis, Y)
        cost.backward()
        optimizer.step()

        #calculate
        loss += cost.item()
        r2 += r2_score(Y,hypothesis.detach().numpy())

    loss /= iteration
    r2 /= iteration

    loss_list.append(loss)
    r2_list.append(r2)

    if (epoch+1) % 1000 == 0:
        print("[Epoch {:04d}] loss = {:.4f} r2 = {:.4f}".format(epoch+1, loss, r2))

    if (epoch+1) % 1000 == 0:
        with torch.no_grad():
            model.eval()
            test_loss = 0
            test_r2 = 0

            for X,Y in test_loader:
                X = X.to(device)
                Y = Y.to(device)
                Y.squeeze_()

                # forward
                hypothesis = model(X)
                hypothesis.squeeze_()

                test_loss += criterion(hypothesis, Y).item()
                test_r2 += r2_score(Y, hypothesis.detach().numpy())

            test_loss /= len(test_loader)
            test_r2 /= len(test_loader)
            print("[Test] loss = {:.4f} r2 = {:.4f}".format(test_loss, test_r2))
            print("\n")

print("Done")
            



[Epoch 1000] loss = 8.2052 r2 = 0.5264
[Test] loss = 14.6421 r2 = -0.0517




In [149]:
print(Y)
print(hypothesis)

tensor([71.2000, 77.7000, 73.5000, 67.9000, 71.0000, 69.9000, 72.3000, 73.3000,
        65.6000, 72.4000, 68.1000, 63.4000, 68.1000, 63.8000])
tensor([70.8447, 71.9116, 75.3496, 67.0268, 69.5330, 66.2083, 67.9208, 66.7531,
        73.8830, 70.1403, 67.1118, 65.6201, 69.8401, 68.6699])


In [150]:
torch.mean(abs(Y - hypothesis))

tensor(3.2366)

# q5


[Epoch 10000] loss = 3.0520 r2 = 0.6574 <br>
[Test] loss = 3.8111 r2 = 0.3510

In [68]:
t = torch.FloatTensor(np.array(test.iloc[:,:-2]))
q5 = model(t).squeeze().tolist()

In [73]:
q5 = pd.DataFrame(q5)
q5.head()

Unnamed: 0,0
0,72.227943
1,72.491867
2,75.432129
3,75.802956
4,76.064537


In [109]:
q5

Unnamed: 0,0
0,72.227943
1,72.491867
2,75.432129
3,75.802956
4,76.064537
...,...
77,72.854820
78,74.892082
79,74.913528
80,74.075310


# q1
[Epoch 5000] loss = 3.7710 r2 = 0.7012 <br>
[Test] loss = 5.2094 r2 = 0.2837

In [110]:
q1 = model(t).squeeze().tolist()
q1 = pd.DataFrame(q1)
q1.head()

Unnamed: 0,0
0,74.65258
1,74.779327
2,77.768486
3,77.897026
4,80.347832


# q2

[Epoch 10000] loss = 3.7328 r2 = 0.6673 <br>
[Test] loss = 6.9457 r2 = 0.2891

In [111]:
q2 = model(t).squeeze().tolist()
q2 = pd.DataFrame(q2)
q2.head()

Unnamed: 0,0
0,76.546989
1,76.633797
2,78.503731
3,78.437355
4,81.25901


# q3

[Epoch 10000] loss = 3.7243 r2 = 0.3939
[Test] loss = 8.2219 r2 = -0.5128

--> 성능이 굉장히 좋지 않아서 다중회귀분석 모델 사용

In [133]:
q3 = model(t).squeeze().tolist()
q3 = pd.DataFrame(q3)
q3.head()

Unnamed: 0,0
0,68.790192
1,68.882507
2,71.30928
3,71.521454
4,68.954582


# q4

[Epoch 10000] loss = 7.1436 r2 = 0.5659
[Test] loss = 13.2010 r2 = 0.0416

같은 경우도 성능이 굉장히 좋지 않아서 앙상블 모델에서 데이터 가져오기

In [151]:
q4 = model(t).squeeze().tolist()
q4 = pd.DataFrame(q4)
q4.head()

Unnamed: 0,0
0,67.609703
1,67.995857
2,75.961418
3,75.530128
4,71.116188


# 제출물 만들기

In [159]:
correct = pd.read_csv("./correct_answer.csv")
correct = correct.iloc[:,1:]
correct.columns = ["q1","q2","q3","q4","q5"]

Basic = pd.read_csv("./Basic.csv")
answers = pd.concat([Basic, correct], axis = 1)
answers.head()


Unnamed: 0,jur_stn,year,상반기,q1,q2,q3,q4,q5
0,마산동부경찰서,2020,0,75.549856,77.942222,67.951587,68.941667,72.715301
1,마산동부경찰서,2020,1,75.761459,77.453333,67.203968,69.733333,72.519672
2,마산중부경찰서,2020,0,78.185317,79.773333,69.655556,71.608333,74.402186
3,마산중부경찰서,2020,1,78.187602,79.757778,69.62381,71.075,74.236066
4,서울강남경찰서,2020,0,80.210797,81.506667,69.115079,72.266667,75.731148


In [164]:
q3_linear = pd.read_csv("q3.csv")
q3_linear = q3_linear.iloc[:,1:]

In [169]:
answers["q1"] = q1
answers["q2"] = q2
answers["q3"] = q3_linear
answers["q5"] = q5

In [172]:
answers = answers.drop(["year","상반기"], axis = 1)
answers["범죄 안전도"] = (answers["q1"] + answers["q2"]) /2
answers["분야별 안전도"] = answers["범죄 안전도"]*0.343 + answers["q3"]*0.305 + answers["q4"]*0.352
answers["종합체감안전도_점수"] = answers["q5"]*0.3 + answers["분야별 안전도"]*0.7

answers = answers[["jur_stn","종합체감안전도_점수", "분야별 안전도", "범죄 안전도", "q1","q2","q3","q4","q5"]]
answers.head()


Unnamed: 0,jur_stn,종합체감안전도_점수,분야별 안전도,범죄 안전도,q1,q2,q3,q4,q5
0,마산동부경찰서,70.478066,69.728118,75.599785,74.65258,76.546989,64.032541,68.941667,72.227943
1,마산동부경찰서,70.610993,69.804904,75.706562,74.779327,76.633797,63.250558,69.733333,72.491867
2,마산중부경찰서,73.248077,72.312054,78.136108,77.768486,78.503731,66.574543,71.608333,75.432129
3,마산중부경찰서,73.071024,71.900197,78.167191,77.897026,78.437355,65.804755,71.075,75.802956
4,서울강남경찰서,73.348605,72.184634,80.803421,80.347832,81.25901,62.397356,72.266667,76.064537


In [174]:
submit = pd.read_csv("./25.결과제출양식.csv")
police = submit["경찰서명"]

In [175]:
for i in range(len(police)):
    temp = answers[answers["jur_stn"] == police[i].strip()].iloc[:,1:]
    submit.iloc[i,2:] = temp.iloc[0,:].tolist()

In [187]:
submit

Unnamed: 0,순번,경찰서명,종합체감안전도_점수 \n(전반적안전도*0.3) + (분야별안전도*0.7),분야별 안전도\n(범죄안전도*0.343) + \n(교통사고안전도*0.305)\n + (법질서준수도*0.352),범죄 안전도\n(절도폭력안전도+강도살인안전도)/2,문항1. 절도 폭력안전도\n (문항1 점수 * 10),문항2. 강도 살인 안전도\n(문항2 점수 * 10),문항3. 교통사고 안전도\n(문항3 점수*10),문항4. 법질서 준수도\n(문항 4점수 * 10),문항5. 전반적 안전도\n(문항 5점수 * 10)
0,1,서울중부경찰서,70.532638,70.720719,74.291229,74.06012,74.522339,70.528177,67.408333,70.093781
1,2,서울종로경찰서,73.883076,73.585449,81.169403,79.464531,82.874275,69.521564,69.716667,74.577538
2,3,서울남대문경찰서,72.735462,72.75211,76.812042,76.550682,77.073402,74.699823,67.108333,72.696617
3,4,서울서대문경찰서,74.825725,73.58745,81.893677,79.555428,84.231926,69.877328,68.708333,77.715034
4,5,서울혜화경찰서,70.588895,70.749605,76.069687,73.800896,78.338478,71.479243,64.933333,70.213905
5,6,서울용산경찰서,74.855564,74.721614,81.307461,79.316536,83.298386,73.889688,69.025,75.168114
6,7,서울성북경찰서,78.00191,76.354148,87.028393,83.978897,90.077888,73.279813,68.616667,81.846687
7,8,서울동대문경찰서,72.048968,71.698822,77.584435,76.050835,79.118034,69.93539,67.491667,72.865974
8,9,서울마포경찰서,74.750065,73.754594,82.216045,81.63102,82.801071,71.47658,67.483333,77.07283
9,10,서울영등포경찰서,71.719919,71.620424,76.146023,75.608086,76.68396,72.709743,66.266667,71.952072


In [197]:
submit.to_csv("./submit4.csv", encoding= "utf-8", index = False)


In [194]:
df = pd.read_csv("4.체감안전도_통계(2017~2019).csv")
df[df["medium_category"].isin(["서울관악","서울관악서"])]


Unnamed: 0,large_category,medium_category,tot_f_safety,dept_safety,crime_safety,rob_satety_q1,mur_safety_q2,ta_safety_q3,raw_odder_q4,overall_q5
26,2017 상반기(경찰서별),서울관악,67.3,66.9,75.0,73.5,76.4,66.4,58.7,68.3
75,2017 하반기(경찰서별),서울관악,66.6,66.3,73.1,70.9,75.3,66.1,59.2,67.2
123,2018 상반기(경찰서별),서울관악서,68.6,68.3,75.7,75.5,75.9,66.4,62.1,69.3
171,2018 하반기(경찰서별),서울관악서,71.0,70.9,75.8,74.4,77.1,66.9,69.4,71.2
219,2019 상반기(경찰서별),서울관악서,69.6,69.3,75.3,74.5,76.1,64.6,67.5,70.3
267,2019 하반기(경찰서별),서울관악서,68.3,68.2,72.4,71.8,72.9,66.4,65.7,68.4


In [198]:
pd.read_csv("./submit4.csv")

Unnamed: 0,순번,경찰서명,종합체감안전도_점수 \n(전반적안전도*0.3) + (분야별안전도*0.7),분야별 안전도\n(범죄안전도*0.343) + \n(교통사고안전도*0.305)\n + (법질서준수도*0.352),범죄 안전도\n(절도폭력안전도+강도살인안전도)/2,문항1. 절도 폭력안전도\n (문항1 점수 * 10),문항2. 강도 살인 안전도\n(문항2 점수 * 10),문항3. 교통사고 안전도\n(문항3 점수*10),문항4. 법질서 준수도\n(문항 4점수 * 10),문항5. 전반적 안전도\n(문항 5점수 * 10)
0,1,서울중부경찰서,70.532638,70.720719,74.291229,74.06012,74.522339,70.528177,67.408333,70.093781
1,2,서울종로경찰서,73.883076,73.585449,81.169403,79.464531,82.874275,69.521564,69.716667,74.577538
2,3,서울남대문경찰서,72.735462,72.75211,76.812042,76.550682,77.073402,74.699823,67.108333,72.696617
3,4,서울서대문경찰서,74.825725,73.58745,81.893677,79.555428,84.231926,69.877328,68.708333,77.715034
4,5,서울혜화경찰서,70.588895,70.749605,76.069687,73.800896,78.338478,71.479243,64.933333,70.213905
5,6,서울용산경찰서,74.855564,74.721614,81.307461,79.316536,83.298386,73.889688,69.025,75.168114
6,7,서울성북경찰서,78.00191,76.354148,87.028393,83.978897,90.077888,73.279813,68.616667,81.846687
7,8,서울동대문경찰서,72.048968,71.698822,77.584435,76.050835,79.118034,69.93539,67.491667,72.865974
8,9,서울마포경찰서,74.750065,73.754594,82.216045,81.63102,82.801071,71.47658,67.483333,77.07283
9,10,서울영등포경찰서,71.719919,71.620424,76.146023,75.608086,76.68396,72.709743,66.266667,71.952072
