In [12]:
!pip install pymysql
!pip install -U imbalanced-learn

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting imbalanced-learn
  Using cached imbalanced_learn-0.9.1-py3-none-any.whl (199 kB)


In [13]:
import itertools
import pandas as pd
import numpy as np
import warnings
import time
from sqlalchemy import create_engine
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
import statsmodels.api as sm

In [14]:
# GridSearchCV의 최적 하이퍼파라미터로 학습된 Estimator 반환하는 함수
def optimizing_lr(clf, train_x, train_y):
  parameters = {'penalty': ['l1','l2','elasticnet','none'],
                'C': np.arange(0,1,0.1),
                'random_state': range(3,7,2)}
  start = time.time()
  grid_cv = GridSearchCV(clf, param_grid=parameters, n_jobs=-1, scoring='accuracy', cv=5)
  grid_cv.fit(train_x, train_y)

  #최적의 하이퍼 파라미터 출력
  print('GridSearch 수행시간 : ', time.time()-start)
  print('최적 하이퍼 파라미터 :',grid_cv.best_params_)

  return grid_cv.best_estimator_

In [15]:
def logistic_regression(train_x, train_y, feature_importance = False, optimizing = False):

  # logistic regression
  lr_clf = LogisticRegression()

  # optimizing parameters
  if optimizing :
    lr_clf = optimizing_lr(lr_clf, train_x, train_y)
  else:
    lr_clf.fit(train_x, train_y)

  return lr_clf

In [16]:
def summary_lr(x, y):
  con_x = sm.add_constant(x)
  model = sm.OLS(y, con_x)
  result = model.fit()
  print(result.summary())

In [17]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## 육교형

In [18]:
df1=pd.read_csv('/content/drive/MyDrive/데이터청년캠퍼스/Team2_Data File/maindata/overpass_knn3_drop.csv',encoding='cp949')

# x, y
x=df1.drop('생태통로_효율성', axis=1)
y=df1['생태통로_효율성']

# train, test set
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=0.2, stratify=df1['생태통로_효율성'])
print('SMOTE 적용 전 값의 분포 :\n',pd.Series(train_y).value_counts())

# 불균형 데이터 Oversampling : SMOTE
train_over_x, train_over_y = SMOTE(random_state=0).fit_resample(train_x, train_y)
print('SMOTE 적용 후 값의 분포 :\n',pd.Series(train_over_y).value_counts())


# Logistic Regression
lr_clf =logistic_regression(train_over_x, train_over_y, optimizing=False)
pred_y = lr_clf.predict(test_x)
print(classification_report(test_y,pred_y))

SMOTE 적용 전 값의 분포 :
 1.0    86
0.0    60
Name: 생태통로_효율성, dtype: int64
SMOTE 적용 후 값의 분포 :
 0.0    86
1.0    86
Name: 생태통로_효율성, dtype: int64
              precision    recall  f1-score   support

         0.0       0.42      0.53      0.47        15
         1.0       0.61      0.50      0.55        22

    accuracy                           0.51        37
   macro avg       0.52      0.52      0.51        37
weighted avg       0.53      0.51      0.52        37



In [19]:
# 생태통로 효율성 '판단불가' 값 추정
df2=pd.read_csv('/content/drive/MyDrive/데이터청년캠퍼스/Team2_Data File/maindata/overpass_knn3_drop_not3.csv',encoding='cp949')

not3_x=df2.drop('생태통로_효율성', axis=1)
pred_y = lr_clf.predict(not3_x)
df2['생태통로_효율성']=pred_y

# df1, df2 merge
df = pd.concat([df1, df2], ignore_index=True)

# DF to CSV
path = '/content/drive/MyDrive/데이터청년캠퍼스/Team2_Data File/maindata/'
df.to_csv(path + 'overpass_mean_predic.csv', index=False, encoding='cp949')

## 터널형

In [20]:
df1=pd.read_csv('/content/drive/MyDrive/데이터청년캠퍼스/Team2_Data File/maindata/tunnel_knn3_drop.csv',encoding='cp949')

# x, y
x=df1.drop('생태통로_효율성', axis=1)
y=df1['생태통로_효율성']

# train, test set
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=0.2, stratify=df1['생태통로_효율성'])
print('SMOTE 적용 전 값의 분포 :\n',pd.Series(train_y).value_counts())

# 불균형 데이터 Oversampling : SMOTE
train_over_x, train_over_y = SMOTE(random_state=0).fit_resample(train_x, train_y)
print('SMOTE 적용 후 값의 분포 :\n',pd.Series(train_over_y).value_counts())


# Logistic Regression
lr_clf =logistic_regression(train_over_x, train_over_y, optimizing=False)
pred_y = lr_clf.predict(test_x)
print(classification_report(test_y,pred_y))

SMOTE 적용 전 값의 분포 :
 1.0    53
0.0    44
Name: 생태통로_효율성, dtype: int64
SMOTE 적용 후 값의 분포 :
 1.0    53
0.0    53
Name: 생태통로_효율성, dtype: int64
              precision    recall  f1-score   support

         0.0       0.58      0.64      0.61        11
         1.0       0.69      0.64      0.67        14

    accuracy                           0.64        25
   macro avg       0.64      0.64      0.64        25
weighted avg       0.64      0.64      0.64        25



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [21]:
# 생태통로 효율성 '판단불가' 값 추정
df2=pd.read_csv('/content/drive/MyDrive/데이터청년캠퍼스/Team2_Data File/maindata/tunnel_knn3_drop_not3.csv',encoding='cp949')

not3_x=df2.drop('생태통로_효율성', axis=1)
pred_y = lr_clf.predict(not3_x)
df2['생태통로_효율성']=pred_y

# df1, df2 merge
df = pd.concat([df1, df2], ignore_index=True)

# DF to CSV
path = '/content/drive/MyDrive/데이터청년캠퍼스/Team2_Data File/maindata/'
df.to_csv(path + 'tunnel_mean_predic.csv', index=False, encoding='cp949')