## 💻 환경 설정

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
import folium
import sys

import geopy
from tqdm import tqdm

from pycaret.regression import *

#경고창이 뜨지 않도록 해주는 것
import warnings
warnings.filterwarnings('ignore')

# notebook을 실행한 브라우저에서 바로 그림을 볼 수 있게 해주는 것
%matplotlib inline 

>주피터 노트북에서 시각화할때, matplotlib가 한글 폰트 지원하지 않아, 깨짐 처리해줘야 함

In [2]:
import platform                

from matplotlib import font_manager, rc
plt.rcParams['axes.unicode_minus']= False

if platform.system() == 'Darwin': #맥os 사용자의 경우에
    plt.style.use('seaborn-darkgrid') 
    rc('font', family = 'AppleGothic')
    
elif platform.system() == 'Windows':#윈도우 사용자의 경우에
    path = 'c:/Windows/Fonts/malgun.ttf'
    font_name = font_manager.FontProperties(fname=path).get_name()
    plt.style.use('seaborn-darkgrid') # https://python-graph-gallery.com/199-matplotlib-style-sheets/
    rc('font', family=font_name)

> pandas 작업 중 tqdm을 사용하기 위해 다음과 같은 설정을 합니다.

In [3]:
tqdm.pandas()

>복원성을 위해 아래 random_state를 상수로 고정합니다.

In [4]:
RANDOM_STATE= 0
np.random.seed(RANDOM_STATE)

##  💻데이터 불러오기 

In [5]:
df = pd.read_csv('./NYC_taxi') 

In [6]:
df.head()

Unnamed: 0,zip_code,pickup_hour,month,day,weekday,hour,is_weekend,cnt
0,11106,2015-01-01 00:00:00,1,1,3,0,0,75
1,10006,2015-01-01 00:00:00,1,1,3,0,0,64
2,11201,2015-01-01 00:00:00,1,1,3,0,0,169
3,11430,2015-01-01 00:00:00,1,1,3,0,0,222
4,10032,2015-01-01 00:00:00,1,1,3,0,0,64


## 💻 데이터 파악 및 이상유무 확인

In [7]:
df.shape

(87020, 8)

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 87020 entries, 0 to 87019
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   zip_code     87020 non-null  int64 
 1   pickup_hour  87020 non-null  object
 2   month        87020 non-null  int64 
 3   day          87020 non-null  int64 
 4   weekday      87020 non-null  int64 
 5   hour         87020 non-null  int64 
 6   is_weekend   87020 non-null  int64 
 7   cnt          87020 non-null  int64 
dtypes: int64(7), object(1)
memory usage: 5.3+ MB


> 결측치가 하나도 없습니다.

In [9]:
df['zip_code'].describe()

count    87020.000000
mean     10567.760296
std        579.688987
min      10001.000000
25%      10032.000000
50%      10177.000000
75%      11217.000000
max      14801.000000
Name: zip_code, dtype: float64

In [10]:
df['zip_code'].value_counts()

10019    744
10028    744
10002    743
10036    743
10128    743
10065    743
10029    743
10021    743
10010    743
10022    743
11104    742
10003    742
11101    742
10023    742
10014    742
10013    742
10017    742
10025    741
10016    741
10018    741
10024    741
10011    741
10012    741
10075    741
10009    741
10035    740
10026    740
10001    740
10031    739
11377    739
11106    739
11102    739
10038    739
10280    738
10027    738
10007    738
11201    738
10004    737
11211    737
10020    737
11217    737
10119    737
11430    737
10032    736
10006    736
10005    736
11215    736
11103    735
11231    734
10199    733
10030    733
10170    733
10282    732
11222    730
10110    725
10174    724
11105    723
11205    722
10168    721
10165    719
10153    718
11238    717
10037    715
10103    714
10172    713
11372    713
10171    712
11373    709
11371    707
10069    705
10152    705
10167    697
11369    693
10154    693
10451    693
10033    693
11370    689

In [11]:
df['zip_code'].value_counts().describe()

count    374.000000
mean     232.673797
std      301.011504
min        1.000000
25%        4.000000
50%       29.000000
75%      588.000000
max      744.000000
Name: zip_code, dtype: float64

In [12]:
# plt.figure(figsize=(15,8))
# sns.histplot(x='zip_code', data=df, discrete=True)

> zip_code는 총 374 개의 값이 있으며, 분포에 꽤 차이가 있습니다.

In [13]:
df['zip_code'].value_counts().loc[df['zip_code'].value_counts() > 100]

10019    744
10028    744
10002    743
10036    743
10128    743
10065    743
10029    743
10021    743
10010    743
10022    743
11104    742
10003    742
11101    742
10023    742
10014    742
10013    742
10017    742
10025    741
10016    741
10018    741
10024    741
10011    741
10012    741
10075    741
10009    741
10035    740
10026    740
10001    740
10031    739
11377    739
11106    739
11102    739
10038    739
10280    738
10027    738
10007    738
11201    738
10004    737
11211    737
10020    737
11217    737
10119    737
11430    737
10032    736
10006    736
10005    736
11215    736
11103    735
11231    734
10199    733
10030    733
10170    733
10282    732
11222    730
10110    725
10174    724
11105    723
11205    722
10168    721
10165    719
10153    718
11238    717
10037    715
10103    714
10172    713
11372    713
10171    712
11373    709
11371    707
10069    705
10152    705
10167    697
11369    693
10154    693
10451    693
10033    693
11370    689

> 100번 이상 등장하는 값이 154개로, 절반 이상이 100회가 되지 않습니다.

## 💻 위치 정보 가져오기

In [None]:
import geopy

In [14]:
geolocator = geopy.Nominatim(user_agent='gh-quest')
geolocator.geocode({"postalcode": 10001}, country_codes="us").raw['lat']
geolocator.geocode({"postalcode": 10001}, country_codes="us").raw['lon']

'-73.99411702534721'

In [15]:
def postcode_to_lat(row):
    postcode = row['zip_code']
    try:
        lat = geolocator.geocode({"postalcode": postcode}, country_codes="us").raw['lat']
    except:
        lat = 0
    return lat

def postcode_to_lon(row):
    postcode = row['zip_code']
    try:
        lon = geolocator.geocode({"postalcode": postcode}, country_codes="us").raw['lon']
    except:
        lon = 0
    return lon

In [20]:
zip_unique = pd.DataFrame(df['zip_code'].unique(), columns=['zip_code'])
zip_unique

Unnamed: 0,zip_code
0,11106
1,10006
2,11201
3,11430
4,10032
5,10075
6,10069
7,10001
8,10019
9,11216


In [21]:
zip_unique['lat'] = zip_unique.progress_apply(postcode_to_lat, axis=1)
zip_unique['lon'] = zip_unique.progress_apply(postcode_to_lon, axis=1)

100%|████████████████████████████████████████████████████████████████████████████████| 374/374 [03:08<00:00,  1.99it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 374/374 [03:06<00:00,  2.00it/s]


In [56]:
zip_unique

Unnamed: 0,zip_code,lat,lon
0,11106,40.761229,-73.929593
1,10006,40.708312,-74.01344
2,11201,40.6925,-73.991763
3,11430,40.653199,-73.784133
4,10032,40.837391,-73.941015
5,10075,40.773597,-73.956517
6,10069,40.77694,-73.988253
7,10001,40.748426,-73.994117
8,10019,40.763992,-73.985893
9,11216,40.680521,-73.949349


In [53]:
zip_unique.loc[zip_unique['lat'] == 0]

Unnamed: 0,zip_code,lat,lon


In [36]:
zip_unique['lat'] = pd.to_numeric(zip_unique['lat'])
zip_unique['lon'] = pd.to_numeric(zip_unique['lon'])

> API를 통해 나오지 않는 정보는 직접 검색해 찾아봅니다.
https://www.freemaptools.com/convert-us-zip-code-to-lat-lng.htm

In [54]:
zip_unique.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 374 entries, 0 to 373
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   zip_code  374 non-null    int64  
 1   lat       374 non-null    float64
 2   lon       374 non-null    float64
dtypes: float64(2), int64(1)
memory usage: 8.9 KB


In [55]:
zip_unique.loc[zip_unique['zip_code']==11424, 'lat'] = 40.71570
zip_unique.loc[zip_unique['zip_code']==11424, 'lon'] = -73.82670

zip_unique.loc[zip_unique['zip_code']==10119, 'lat'] = 40.75053
zip_unique.loc[zip_unique['zip_code']==10119, 'lon'] = -73.99309

zip_unique.loc[zip_unique['zip_code']==11351, 'lat'] = 40.75530
zip_unique.loc[zip_unique['zip_code']==11351, 'lon'] = -73.82680

In [69]:
zip_unique

Unnamed: 0,zip_code,lat,lon
0,11106,40.761229,-73.929593
1,10006,40.708312,-74.01344
2,11201,40.6925,-73.991763
3,11430,40.653199,-73.784133
4,10032,40.837391,-73.941015
5,10075,40.773597,-73.956517
6,10069,40.77694,-73.988253
7,10001,40.748426,-73.994117
8,10019,40.763992,-73.985893
9,11216,40.680521,-73.949349


## 💻 기존 데이터셋과 합치기

> 기존 df의 순서를 지키며 merge

In [70]:
df = df.merge(df.merge(zip_unique, how='left', sort=False))

In [71]:
df

Unnamed: 0,zip_code,pickup_hour,month,day,weekday,hour,is_weekend,cnt,lat,lon
0,11106,2015-01-01 00:00:00,1,1,3,0,0,75,40.761229,-73.929593
1,10006,2015-01-01 00:00:00,1,1,3,0,0,64,40.708312,-74.013440
2,11201,2015-01-01 00:00:00,1,1,3,0,0,169,40.692500,-73.991763
3,11430,2015-01-01 00:00:00,1,1,3,0,0,222,40.653199,-73.784133
4,10032,2015-01-01 00:00:00,1,1,3,0,0,64,40.837391,-73.941015
...,...,...,...,...,...,...,...,...,...,...
87015,11219,2015-01-31 23:00:00,1,31,5,23,1,1,40.632650,-73.996601
87016,10456,2015-01-31 23:00:00,1,31,5,23,1,3,40.829743,-73.908570
87017,11355,2015-01-31 23:00:00,1,31,5,23,1,1,40.750423,-73.819936
87018,11210,2015-01-31 23:00:00,1,31,5,23,1,1,40.627427,-73.945651


## 💻 Folium을 활용한 Heatmap 만들기

In [89]:
def generateBaseMap(default_location=[40.746176, -73.996453], # Boston의 좌표
                    default_zoom_start=9): #location과 시작 확대 정도를 파라미터로 받는 함수를 만들었어요
    base_map = folium.Map(location=default_location, control_scale=True, zoom_start=default_zoom_start)
    return base_map

In [90]:
base_map=generateBaseMap()
base_map

In [91]:
from folium.plugins import HeatMap
HeatMap(data=df[['lat', 'lon']].drop_duplicates().reset_index(drop=True), 
        radius=8, max_zoom=5).add_to(base_map)

<folium.plugins.heat_map.HeatMap at 0x214a4c9c490>

In [92]:
base_map

## 💻 추가된 정보로 csv 파일 생성

In [94]:
df.to_csv('NYC_taxi_lat_lon.csv', index=False)