In [None]:
!pip install pymysql
!pip install -U imbalanced-learn

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting imbalanced-learn
  Using cached imbalanced_learn-0.9.1-py3-none-any.whl (199 kB)


In [None]:
import itertools
import pandas as pd
import numpy as np
import warnings
import time
import random
from sqlalchemy import create_engine
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
import statsmodels.api as sm
from glob import glob

In [None]:
def scaling_df(df, scaling_col=None, scaling_func=MinMaxScaler) :
    """
    Scale selected columns of a data frame, one independent scaler per column.

    The frame is modified in place: each listed column is overwritten with its
    scaled values, and the same frame object is returned.

    :param df: data frame whose columns are scaled
    :param scaling_col: default = None, list of column names to scale;
                        when None the frame is returned untouched
    :param scaling_func: scaler class instantiated once per column,
                         e.g. MinMaxScaler or StandardScaler
    :return: the same data frame with the listed columns scaled
    """
    # Nothing requested: hand the frame back unchanged.
    if scaling_col is None :
        return df

    for col_name in scaling_col :
        # A fresh scaler per column, fit on that column only.
        column_scaler = scaling_func()
        single_column = df[[col_name]]
        column_scaler.fit(single_column)
        df[col_name] = column_scaler.transform(single_column)
    return df

In [None]:
# Return the estimator refit with the best hyper-parameters found by GridSearchCV.
def optimizing_lr(clf, train_x, train_y, parameters=None):
  """
  Tune a classifier with GridSearchCV and return the best refit estimator.

  :param clf: unfitted estimator to tune (a LogisticRegression in this notebook)
  :param train_x: training features
  :param train_y: training labels
  :param parameters: optional param_grid for GridSearchCV; when None a fixed
                     L2 grid over several positive values of C is used
  :return: grid_cv.best_estimator_, i.e. the estimator refit with the best parameters
  """
  if parameters is None:
    # Fixed grid instead of a single round(random.random(), 1) candidate:
    # that value changed on every run, gave the search nothing to compare,
    # and rounds to 0.0 for draws below 0.05, which is not a valid C.
    parameters = {'penalty': ['l2'],
                  'C': [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]}

  start = time.time()
  grid_cv = GridSearchCV(clf, param_grid=parameters, n_jobs=-1, scoring='accuracy')
  grid_cv.fit(train_x, train_y)

  # Report the elapsed time and the winning hyper-parameters.
  print('GridSearch 수행시간 : ', time.time()-start)
  print('최적 하이퍼 파라미터 :',grid_cv.best_params_)

  print(grid_cv.best_estimator_)
  return grid_cv.best_estimator_

In [None]:
def logistic_regression(train_x, train_y, feature_importance = False, optimizing = False):
  """
  Fit a logistic regression classifier and return it.

  :param train_x: training features
  :param train_y: training labels
  :param feature_importance: accepted but not used by this function
  :param optimizing: when truthy, delegate to optimizing_lr and return the
                     estimator it selects; otherwise fit a default
                     LogisticRegression directly
  :return: the fitted classifier
  """
  base_model = LogisticRegression()

  # Plain fit with default hyper-parameters.
  if not optimizing :
    base_model.fit(train_x, train_y)
    return base_model

  # Hyper-parameter search path.
  return optimizing_lr(base_model, train_x, train_y)

In [None]:
def summary_lr(x, y):
  """
  Print a statsmodels regression summary of y on x with an intercept term.

  NOTE(review): `lr` elsewhere in this file refers to logistic regression,
  but this fits sm.OLS (ordinary least squares) - confirm that is intended.

  :param x: explanatory variables; a constant column is added before fitting
  :param y: response variable
  :return: None, the summary is printed
  """
  design_matrix = sm.add_constant(x)
  fitted = sm.OLS(y, design_matrix).fit()
  print(fitted.summary())

In [None]:
# Mount Google Drive at /content/drive (Google Colab only); the CSV files read
# and written by the cells below live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Columns scaled for the overpass data set.
# NOTE(review): 'traffic_volumne' (and 'hiehgt' below) look misspelled; presumably
# they match the CSV headers exactly - confirm before renaming.
scaling_col = [
    'width',
    'length',
    'dist_walkway',
    'dist_farm',
    'speed_limit',
    'num_near_species',
    'frequency_appearance_animals',
    'dist_river',
    'dist_hikingtrails',
    'fence_length',
    'fence_height',
    'traffic_volumne',
    'dist_buildings',
    'num_lane',
]
# The tunnel data set scales the same columns plus two more.
scaling_col_tun = [*scaling_col, 'hiehgt', 'openness']

## 육교형

In [None]:
# Load the overpass data set and standardize the columns listed in scaling_col.
# NOTE(review): the scalers are fit on the full frame before the train/test split
# below, so held-out rows contribute to the scaling statistics - consider fitting
# on the training rows only.
df1=pd.read_csv('/content/drive/MyDrive/데이터청년캠퍼스/Team2_Data File/maindata/overpass_knn3_drop.csv',encoding='cp949')
df1=scaling_df(df1, scaling_col=scaling_col, scaling_func=StandardScaler)

# Features are every column except the target; the target is 'efficiency'.
x=df1.drop('efficiency', axis=1)
y=df1['efficiency']

# Stratified 80/20 train/test split. random_state is fixed (same seed as SMOTE
# below) so the split is the same on every run.
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=0.2, stratify=y, random_state=0)
print('SMOTE 적용 전 값의 분포 :\n',pd.Series(train_y).value_counts())

# Oversample the training rows only with SMOTE to balance the classes.
train_over_x, train_over_y = SMOTE(random_state=0).fit_resample(train_x, train_y)
print('SMOTE 적용 후 값의 분포 :\n',pd.Series(train_over_y).value_counts())


# Fit the logistic regression on the oversampled data and evaluate it on the
# untouched test split.
lr_clf =logistic_regression(train_over_x, train_over_y, optimizing=True)
pred_y = lr_clf.predict(test_x)
print(classification_report(test_y,pred_y))

SMOTE 적용 전 값의 분포 :
 1.0    86
0.0    60
Name: efficiency, dtype: int64
SMOTE 적용 후 값의 분포 :
 0.0    86
1.0    86
Name: efficiency, dtype: int64
GridSearch 수행시간 :  0.16704583168029785
최적 하이퍼 파라미터 : {'C': 1.0, 'penalty': 'l2'}
LogisticRegression()
              precision    recall  f1-score   support

         0.0       0.53      0.67      0.59        15
         1.0       0.72      0.59      0.65        22

    accuracy                           0.62        37
   macro avg       0.62      0.63      0.62        37
weighted avg       0.64      0.62      0.62        37



In [None]:
# Estimate 'efficiency' for the rows in the *_not3 file with the classifier
# trained in the previous cell (this cell relies on `lr_clf` and `df1` from it).
path = '/content/drive/MyDrive/데이터청년캠퍼스/Team2_Data File/maindata/'
df2=pd.read_csv(path + 'overpass_knn3_drop_not3.csv',encoding='cp949')

# Standardize df2 with scalers fit on the training file, i.e. the same
# statistics that were applied to df1. Scaling df2 with its own mean/std would
# put its features on a different scale from the one the model was trained on.
train_raw=pd.read_csv(path + 'overpass_knn3_drop.csv',encoding='cp949')
for c in scaling_col:
    scaler = StandardScaler()
    scaler.fit(train_raw[[c]])
    df2[c] = scaler.transform(df2[[c]])
# Same object as df2; the name is kept in case another cell refers to it.
df2_scaling=df2

# Predict from every column except 'efficiency', then overwrite that column
# with the predicted labels.
not3_x=df2_scaling.drop('efficiency', axis=1)
pred_y = lr_clf.predict(not3_x)
df2['efficiency']=pred_y

# Stack the labelled rows (df1) and the newly predicted rows (df2).
df = pd.concat([df1, df2], ignore_index=True)

# Write the combined frame to CSV.
df.to_csv(path + 'overpass_knn_predic.csv', index=False, encoding='cp949')

## 터널형

In [None]:
# Load the tunnel data set and standardize the columns listed in scaling_col_tun.
# NOTE(review): the scalers are fit on the full frame before the train/test split
# below, so held-out rows contribute to the scaling statistics - consider fitting
# on the training rows only.
df1=pd.read_csv('/content/drive/MyDrive/데이터청년캠퍼스/Team2_Data File/maindata/tunnel_knn3_drop.csv',encoding='cp949')
df1=scaling_df(df1, scaling_col=scaling_col_tun, scaling_func=StandardScaler)

# Features are every column except the target; the target is 'efficiency'.
x=df1.drop('efficiency', axis=1)
y=df1['efficiency']

# Stratified 80/20 train/test split. random_state is fixed (same seed as SMOTE
# below) so the split is the same on every run.
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=0.2, stratify=y, random_state=0)
print('SMOTE 적용 전 값의 분포 :\n',pd.Series(train_y).value_counts())

# Oversample the training rows only with SMOTE to balance the classes.
train_over_x, train_over_y = SMOTE(random_state=0).fit_resample(train_x, train_y)
print('SMOTE 적용 후 값의 분포 :\n',pd.Series(train_over_y).value_counts())


# Fit the logistic regression on the oversampled data and evaluate it on the
# untouched test split.
lr_clf =logistic_regression(train_over_x, train_over_y, optimizing=True)
pred_y = lr_clf.predict(test_x)
print(classification_report(test_y,pred_y))

SMOTE 적용 전 값의 분포 :
 1.0    53
0.0    44
Name: efficiency, dtype: int64
SMOTE 적용 후 값의 분포 :
 0.0    53
1.0    53
Name: efficiency, dtype: int64
GridSearch 수행시간 :  0.1311798095703125
최적 하이퍼 파라미터 : {'C': 0.4, 'penalty': 'l2'}
LogisticRegression(C=0.4)
              precision    recall  f1-score   support

         0.0       0.70      0.64      0.67        11
         1.0       0.73      0.79      0.76        14

    accuracy                           0.72        25
   macro avg       0.72      0.71      0.71        25
weighted avg       0.72      0.72      0.72        25



In [None]:
# Estimate 'efficiency' for the rows in the *_not3 file with the classifier
# trained in the previous cell (this cell relies on `lr_clf` and `df1` from it).
path = '/content/drive/MyDrive/데이터청년캠퍼스/Team2_Data File/maindata/'
df2=pd.read_csv(path + 'tunnel_knn3_drop_not3.csv',encoding='cp949')

# Standardize df2 over scaling_col_tun - the column list the tunnel model was
# trained with - so that 'hiehgt' and 'openness' are scaled too. The scalers are
# fit on the training file, i.e. the same statistics that were applied to df1;
# scaling df2 with its own mean/std would put its features on a different scale
# from the one the model was trained on.
train_raw=pd.read_csv(path + 'tunnel_knn3_drop.csv',encoding='cp949')
for c in scaling_col_tun:
    scaler = StandardScaler()
    scaler.fit(train_raw[[c]])
    df2[c] = scaler.transform(df2[[c]])
# Same object as df2; the name is kept in case another cell refers to it.
df2_scaling=df2

# Predict from every column except 'efficiency', then overwrite that column
# with the predicted labels.
not3_x=df2_scaling.drop('efficiency', axis=1)
pred_y = lr_clf.predict(not3_x)
df2['efficiency']=pred_y

# Stack the labelled rows (df1) and the newly predicted rows (df2).
df = pd.concat([df1, df2], ignore_index=True)

# Write the combined frame to CSV.
df.to_csv(path + 'tunnel_knn_predic.csv', index=False, encoding='cp949')