# 나라, 지역 정보 찾기
- BIL파일에서 데이터 추출하기
- gdal 모듈 

In [None]:
# import numpy as np
# import pandas as pd
# from osgeo import gdal

# # BIL 파일 경로
# bil_file_path = './HWSD2.bil'

# # GDAL을 사용하여 BIL 파일 읽기
# dataset = gdal.Open(bil_file_path)

# # BIL 파일의 메타 데이터 정보
# geotransform = dataset.GetGeoTransform()
# band = dataset.GetRasterBand(1)
# data = band.ReadAsArray()

# # 헤더 파일의 정보를 통해 해상도 및 기원점 확인
# originX = geotransform[0]
# originY = geotransform[3]
# pixelWidth = geotransform[1]
# pixelHeight = geotransform[5]

# # 데이터 프레임 생성
# data_list = []

# for row in range(data.shape[0]):
#     for col in range(data.shape[1]):
#         # 픽셀 값을 가져오기
#         value = data[row, col]
        
#         # NoData 값 제외
#         if value != 65535:
#             # 위도와 경도 계산
#             x = originX + col * pixelWidth
#             y = originY + row * pixelHeight
            
#             data_list.append([x, y, value])

# # 데이터 프레임으로 변환
# df = pd.DataFrame(data_list, columns=['Longitude', 'Latitude', 'Soil Mapping Unit'])

# # 데이터 프레임 출력
# df


- 위 코드가 메모리를 너무 차지해서 다시 코드 
1. 데이터 타입 최적화: 데이터를 uint16 형식으로 변환하여 메모리 사용량을 줄입니다.
2. 유효한 데이터만 추출: 유효하지 않은 데이터(값이 65535인 데이터)를 제외하고 나머지 데이터를 처리합니다.
3. 메모리 최적화: 메모리를 최대한 효율적으로 사용하여 데이터를 처리합니다.

In [1]:
from osgeo import gdal
import numpy as np
import pandas as pd

# BIL 파일 경로
bil_file_path = r"C:\Users\user\OneDrive\문서\사용자 지정 Office 서식 파일\바탕 화면\ML_PRJ\ML_wine\GLOSIS 글로벌 토양 특성\HWSD2.bil"

# GDAL을 사용하여 BIL 파일 읽기 시도
dataset = gdal.Open(bil_file_path, gdal.GA_ReadOnly)

# dataset이 None이 아닌지 확인
if dataset is None:
    print("파일을 열 수 없습니다. 경로를 확인하거나 GDAL이 해당 형식을 지원하는지 확인하세요.")
else:
    # BIL 파일의 메타 데이터 정보
    geotransform = dataset.GetGeoTransform()
    band = dataset.GetRasterBand(1)

    # 데이터 읽기 (데이터 타입 최적화: uint16 사용)
    data = band.ReadAsArray().astype(np.uint16)

    # 유효한 데이터 추출 (65535는 제외)
    valid_data = np.where(data != 65535)

    # 위도와 경도 계산
    x_coords = geotransform[0] + valid_data[1] * geotransform[1] + valid_data[0] * geotransform[2]
    y_coords = geotransform[3] + valid_data[1] * geotransform[4] + valid_data[0] * geotransform[5]

    # 유효한 데이터 값
    values = data[valid_data]

    # 데이터 프레임으로 변환
    df = pd.DataFrame({
        'Longitude': x_coords,
        'Latitude': y_coords,
        'Soil Mapping Unit': values
    })

    # 데이터 프레임 저장
    df.to_csv('soil_mapping_data.csv', index=False)

    # 데이터 프레임 출력
    df


In [8]:
len(df.iloc[:,2].unique())

29469

In [9]:
HWSD_SMU = pd.read_csv('./HWSD2_SMU.csv')

In [11]:
len(HWSD_SMU.HWSD2_SMU_ID.unique())

#SMU 개수 비교해보니 비슷함

29538

- 드디어 위도,경도, SMU 맵핑된 데이터 추출했다.

# 위도, 경도에 따른 나라, 지역 구하기

- GoogleV3에 역 지오코딩 활용

In [14]:
country = df.copy()

In [18]:
country.tail()

Unnamed: 0,Longitude,Latitude,Soil Mapping Unit
222694353,-67.3,-55.975,13406
222694354,-67.291667,-55.975,13406
222694355,-67.283333,-55.975,13406
222694356,-67.275,-55.975,13406
222694357,-67.266667,-55.975,13406


- 약 2억개의 위치 정보

In [29]:
import googlemaps
API_KEY = ''


gmaps = googlemaps.Client(key=API_KEY)
reverse_geocode_result = gmaps.reverse_geocode((-67.266667,-55.975))
reverse_geocode_result

[{'address_components': [{'long_name': '3846P2MG+82',
    'short_name': '3846P2MG+82',
    'types': ['plus_code']}],
  'formatted_address': '3846P2MG+82',
  'geometry': {'bounds': {'northeast': {'lat': -67.26662499999999,
     'lng': -55.974875},
    'southwest': {'lat': -67.26675, 'lng': -55.97499999999999}},
   'location': {'lat': -67.266667, 'lng': -55.97499999999999},
   'location_type': 'GEOMETRIC_CENTER',
   'viewport': {'northeast': {'lat': -67.2653385197085,
     'lng': -55.9735885197085},
    'southwest': {'lat': -67.2680364802915, 'lng': -55.9762864802915}}},
  'place_id': 'GhIJCft2EhHRUMARzMzMzMz8S8A',
  'plus_code': {'global_code': '3846P2MG+82'},
  'types': ['plus_code']}]

In [30]:
plus_code = '3846P2MG+82'

# Plus Code를 위도와 경도로 변환
location = gmaps.geocode(plus_code)[0]['geometry']['location']
lat = location['lat']
lng = location['lng']
print(f'위도: {lat}, 경도: {lng}')

# 위도와 경도를 이용하여 역 지오코딩 수행
reverse_geocode_result = gmaps.reverse_geocode((lat, lng))
for component in reverse_geocode_result[0]['address_components']:
    if 'country' in component['types']:
        country = component['long_name']
        print(f'나라: {country}')
        break

위도: -67.2666875, 경도: -55.9749375


In [32]:
reverse_geocode_result = gmaps.reverse_geocode((lat, lng))
for component in reverse_geocode_result[0]['address_components']:
    if 'country' in component['types']:
        country = component['long_name']
        print(f'나라: {country}')
        break

In [35]:
reverse_geocode_result[0]['formatted_address']

'3846P2MG+82'

Plus Codes (예시, 3846P2MG+82)

- 도로명 주소가 없는 지역의 위치를 공유할 때
- 주소가 불명확하거나 번지수가 없는 지역에서 정확한 위치를 지정할 때
- 긴 주소를 입력하거나 공유하는 대신 짧은 코드를 사용하여 위치를 나타낼 때

In [71]:
# from tqdm.notebook import tqdm

# address = []

# for idx, row in tqdm(country.iterrows()):
#     lat = row.Latitude
#     lng = row.Longitude
#     reverse_geocode_result = gmaps.reverse_geocode((lat,lng))

#     address.append(reverse_geocode_result[0]['formatted_address'])

    

In [41]:
import numpy as np
country['address'] = np.nan

In [44]:
country.head()

Unnamed: 0,Longitude,Latitude,Soil Mapping Unit,address
0,-35.741667,83.625,6998,
1,-35.733333,83.625,6998,
2,-35.725,83.625,6998,
3,-35.716667,83.625,6998,
4,-35.708333,83.625,6998,


In [46]:
country['address'][0:70572] = address

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  country['address'][0:70572] = address


In [51]:
country.iloc[572:900]

Unnamed: 0,Longitude,Latitude,Soil Mapping Unit,address
572,-35.583333,83.616667,6998,C9M6JC88+MM
573,-35.575000,83.616667,6998,C9M6JC8G+M2
574,-35.566667,83.616667,6998,C9M6JC8M+M8
575,-35.558333,83.616667,6998,C9M6JC8R+MM
576,-35.550000,83.616667,6998,C9M6JC8X+MX
...,...,...,...,...
895,-32.450000,83.616667,3479,C9M9JH82+M2
896,-32.441667,83.616667,3479,C9M9JH85+M8
897,-32.433333,83.616667,3479,C9M9JH88+MM
898,-32.425000,83.616667,3479,C9M9JH8F+MX


In [50]:
country.to_csv('soil_mapping_data_2.csv')

In [1]:
import pandas as pd

soil_lo = pd.read_csv('soil_mapping_data_2.csv')
soil_lo.head()

  soil_lo = pd.read_csv('soil_mapping_data_2.csv')


Unnamed: 0.1,Unnamed: 0,Longitude,Latitude,Soil Mapping Unit,address
0,0,-35.741667,83.625,6998,C9M6J7F5+X8
1,1,-35.733333,83.625,6998,C9M6J7F8+XM
2,2,-35.725,83.625,6998,C9M6J7FG+X2
3,3,-35.716667,83.625,6998,C9M6J7FM+X8
4,4,-35.708333,83.625,6998,C9M6J7FR+XM


#### 원하는 주소 데이터가 아님 

---

In [2]:
soil_lo = soil_lo[['Longitude','Latitude','Soil Mapping Unit']]
soil_lo

Unnamed: 0,Longitude,Latitude,Soil Mapping Unit
0,-35.741667,83.625,6998
1,-35.733333,83.625,6998
2,-35.725000,83.625,6998
3,-35.716667,83.625,6998
4,-35.708333,83.625,6998
...,...,...,...
222694353,-67.300000,-55.975,13406
222694354,-67.291667,-55.975,13406
222694355,-67.283333,-55.975,13406
222694356,-67.275000,-55.975,13406


In [4]:
soil_lo_copy = soil_lo.copy()

***위도, 경도를 소수점 둘쨰자리로 맞춘 이유***

소수점 둘째 자리까지 반올림하면 약 1.1km x 1.1km 크기의 격자가 만들어지며, 이는 지역의 대표성의 특성을 충분히 반영할 수 있음.
- 그래서 토양과 기후의 위도,경도 범위를 동일하게 맞추고 맵핑하고 있음
- 우리는 대략적으로 이 위치를 대표하는 범위에서 기후, 토양 특성을 알면 됨

In [11]:
soil_lo['Longitude'] = round(soil_lo['Longitude'],2)
soil_lo['Latitude'] = round(soil_lo['Latitude'],2)

In [19]:
climate_lo = pd.read_csv('../dfLocation_8.csv')
lo = climate_lo[['NAME','LATITUDE','LONGITUDE']]
lo

Unnamed: 0,NAME,LATITUDE,LONGITUDE
0,"WARWICK 3.2 WNW, NY US",41.268357,-74.415952
1,"MONROE 1.7 SE, NY US",41.301445,-74.166969
2,"PORT JERVIS 0.4 SE, NY US",41.374007,-74.684483
3,"PINE BUSH 3.4 WSW, NY US",41.587962,-74.357663
4,"WALDEN 1.2 S, NY US",41.542690,-74.188349
...,...,...,...
9995,"VICTORIA 1.3 E, TX US",28.822000,-96.961000
9996,"VICTORIA 2.0 S, TX US",28.795720,-96.984380
9997,"INEZ 5.3 SSW, TX US",28.804625,-96.826581
9998,"INEZ 2.2 S, TX US",28.843042,-96.799064


In [14]:
soil_lo.loc[(round(soil_lo['Latitude'],2) == 41.60) & (round(soil_lo['Longitude'],2) == -75.70)]


Unnamed: 0,Longitude,Latitude,Soil Mapping Unit
107909679,-75.7,41.6,4637


In [22]:
from tqdm.notebook import tqdm_notebook
country_df_8 = pd.DataFrame()

for idx, row in tqdm_notebook(lo.iterrows(), total=lo.shape[0]):
    lat = round(row['LATITUDE'], 2)
    lng = round(row['LONGITUDE'], 2)
    df = soil_lo.loc[(round(soil_lo['Latitude'],2) == lat) & (round(soil_lo['Longitude'],2) == lng)].copy()
    df['Country'] = lo.loc[idx, 'NAME']

    if country_df_8.empty:
        country_df_8 = df
    else:
        country_df_8 = pd.concat([country_df_8, df], axis=0)

country_df_8.reset_index(drop=True, inplace=True)
country_df_8.to_csv('country_df_8.csv')
country_df_8   

  0%|          | 0/10000 [00:00<?, ?it/s]

Unnamed: 0,Longitude,Latitude,Soil Mapping Unit,Country
0,-74.4250,41.2667,4641,"WARWICK 3.2 WNW, NY US"
1,-74.4167,41.2667,4641,"WARWICK 3.2 WNW, NY US"
2,-74.1667,41.3000,4644,"MONROE 1.7 SE, NY US"
3,-74.6833,41.3667,4645,"PORT JERVIS 0.4 SE, NY US"
4,-74.6750,41.3667,4645,"PORT JERVIS 0.4 SE, NY US"
...,...,...,...,...
14462,-96.9750,28.8000,5036,"VICTORIA 2.0 S, TX US"
14463,-96.8333,28.8000,5036,"INEZ 5.3 SSW, TX US"
14464,-96.8000,28.8417,5036,"INEZ 2.2 S, TX US"
14465,-96.8417,28.8833,5036,"INEZ 2.9 W, TX US"


- 결국에는 기상 관측소별 토양 특성을 맵핑한 건데, 이걸 포도 품종과 어떻게 연결...?

In [23]:
climate_lo = pd.read_csv('../dfLocation_9.csv')
lo = climate_lo[['NAME','LATITUDE','LONGITUDE']]
lo

  climate_lo = pd.read_csv('../dfLocation_9.csv')


Unnamed: 0,NAME,LATITUDE,LONGITUDE
0,"VICTORIA 11.0 W, TX US",28.844667,-97.163211
1,"VICTORIA 2.1 NNW, TX US",28.853700,-96.990300
2,"VICTORIA 9.7 ESE, TX US",28.778986,-96.831036
3,"VICTORIA 14.0 SW, TX US",28.659784,-97.118868
4,"VICTORIA 12.1 W, TX US",28.842100,-97.182317
...,...,...,...
9995,"IOLA 1 W, KS US",37.923300,-95.424100
9996,"IOLA, KS US",37.916670,-95.400000
9997,"IONIA, KS US",39.661100,-98.348300
9998,"IRENE, KS US",37.833333,-101.933333


In [72]:
# from tqdm.notebook import tqdm_notebook
# country_df_9 = pd.DataFrame()

# for idx, row in tqdm_notebook(lo.iterrows(), total=lo.shape[0]):
#     lat = round(row['LATITUDE'], 2)
#     lng = round(row['LONGITUDE'], 2)
#     df = soil_lo.loc[(round(soil_lo['Latitude'],2) == lat) & (round(soil_lo['Longitude'],2) == lng)].copy()
#     df['Country'] = lo.loc[idx, 'NAME']

#     if country_df_9.empty:
#         country_df_9 = df
#     else:
#         country_df_9 = pd.concat([country_df_9, df], axis=0)

# country_df_9.reset_index(drop=True, inplace=True)
# country_df_9   

In [26]:
country_df_9.to_csv('country_df_9.csv')

In [None]:
climate_lo = pd.read_csv('../dfLocation_10.csv')
lo = climate_lo[['NAME','LATITUDE','LONGITUDE']]
lo

In [None]:
country_df_10 = pd.DataFrame()

for idx, row in tqdm_notebook(lo.iterrows(), total=lo.shape[0]):
    lat = round(row['LATITUDE'], 2)
    lng = round(row['LONGITUDE'], 2)
    df = soil_lo.loc[(round(soil_lo['Latitude'],2) == lat) & (round(soil_lo['Longitude'],2) == lng)].copy()
    df['Country'] = lo.loc[idx, 'NAME']

    if country_df_10.empty:
        country_df_10 = df
    else:
        country_df_10 = pd.concat([country_df_10, df], axis=0)

country_df_10.reset_index(drop=True, inplace=True)
country_df_10.to_csv('country_df_10.csv')
country_df_10  

- 어제부터 기상 관측소 기준으로 토양데이터를 연결하고 있었음
- 기상 관측소기준 대략적인 범위로 포도 생산지 알 수 있을 거라 생각
- 하지만 문제는 토양 특성이 다를 수 있음
- 그래서 현재 포도를 생산하고 있는 지역의 토양 ,기후 데이터를 아는게 우선

---

## 포도 생산지 위도, 경도 추출부터 다시 시작...

In [156]:
#토양 데이터 불러오기
import pandas as pd
import pyarrow.parquet as pq

# # Parquet 파일 열기
# soil_lo = pq.read_table('soil_mapping_data_2.parquet')

# # 테이블 데이터를 pandas 데이터프레임으로 변환
# soil_lo = soil_lo.to_pandas()

# # 데이터프레임 출력
# print(soil_lo.head(10))


# Parquet 파일을 작은 청크로 읽어들이기
# chunk_size = 10000  # 청크 크기
soil_lo = pd.read_parquet('soil_mapping_data_2.parquet')

# # 청크를 순회하며 처리
# for chunk in iterator:
#     # 청크 처리 (예: 데이터 프레임으로 변환)
#     df_chunk = chunk.to_pandas()
#     print(df_chunk.head())


In [None]:
# import googlemaps
# API_KEY = ''


# gmaps = googlemaps.Client(key=API_KEY)
# name = '' #포도 생산지역

# #위도, 경도 검색 테스트

# location = gmaps.geocode(name)[0]['geometry']['location']
# lat = location['lat']
# lng = location['lng']
# print(f'위도: {lat}, 경도: {lng}')


In [163]:
!pip install pyxlsb

Collecting pyxlsb
  Downloading pyxlsb-1.0.10-py2.py3-none-any.whl.metadata (2.5 kB)
Downloading pyxlsb-1.0.10-py2.py3-none-any.whl (23 kB)
Installing collected packages: pyxlsb
Successfully installed pyxlsb-1.0.10


In [172]:
import pandas as pd

#와인 생산지 데이터 불러오기

wine = pd.read_excel("./Wine and Region_DGH_pre.xlsb.xlsx")
wine

Unnamed: 0,Continent,Country,Region Lv1,Region Lv2,Region Lv3,Winery,Grape_Variety,Latitude,Longtitude
0,Africa,Algeria,Algiers,,,,,,
1,Africa,Algeria,Béjaïa,,,,,,
2,Africa,Algeria,Chlef Province,Dahra,,,,,
3,Africa,Algeria,Mascara,,,,,,
4,Africa,Algeria,Médéa,,,,,,
...,...,...,...,...,...,...,...,...,...
484,Europe,Hungary,Balaton,Nagy-Somló,,Somlói Vándor Pince,Juhfark,47.141182,17.367134
485,Europe,Hungary,Balaton,Nagy-Somló,,Weingut Moric,White Blend,47.833402,16.477539
486,Europe,Hungary,Balaton,Nagy-Somló,,Somlo Abbey Cellar,Juhfark,47.139014,17.373512
487,Europe,Hungary,Balaton,Nagy-Somló,,Kolonics Pinceszet Somlo,Harslevelu,47.135723,17.382081


In [173]:
wine = wine.loc[wine['Latitude'].notnull()]
wine

Unnamed: 0,Continent,Country,Region Lv1,Region Lv2,Region Lv3,Winery,Grape_Variety,Latitude,Longtitude
298,Europe,Germany,Ahr,Ahrweiler,,"Jean Stodden, Das Rotweingut",Pinot Noir,50.515048,7.036348
299,Europe,Germany,Ahr,Ahrweiler,,Weingut H.J. Kreuzberg,Pinot Noir,50.536753,7.043099
300,Europe,Germany,Ahr,Ahrweiler,,Weingut Adeneuer,Pinot Noir,50.547435,7.107272
301,Europe,Germany,Ahr,Ahrweiler,,Julia Bertram,Pinot Noir,50.514968,7.036928
302,Europe,Germany,Ahr,Altenahr,,Weingut Deutzerhof Cossmann-Hehle,Pinot Noir,50.526587,7.016387
...,...,...,...,...,...,...,...,...,...
484,Europe,Hungary,Balaton,Nagy-Somló,,Somlói Vándor Pince,Juhfark,47.141182,17.367134
485,Europe,Hungary,Balaton,Nagy-Somló,,Weingut Moric,White Blend,47.833402,16.477539
486,Europe,Hungary,Balaton,Nagy-Somló,,Somlo Abbey Cellar,Juhfark,47.139014,17.373512
487,Europe,Hungary,Balaton,Nagy-Somló,,Kolonics Pinceszet Somlo,Harslevelu,47.135723,17.382081


In [175]:
wine = wine[['Continent','Country','Region  Lv1','Grape_Variety','Latitude','Longtitude']]
wine

Unnamed: 0,Continent,Country,Region Lv1,Grape_Variety,Latitude,Longtitude
298,Europe,Germany,Ahr,Pinot Noir,50.515048,7.036348
299,Europe,Germany,Ahr,Pinot Noir,50.536753,7.043099
300,Europe,Germany,Ahr,Pinot Noir,50.547435,7.107272
301,Europe,Germany,Ahr,Pinot Noir,50.514968,7.036928
302,Europe,Germany,Ahr,Pinot Noir,50.526587,7.016387
...,...,...,...,...,...,...
484,Europe,Hungary,Balaton,Juhfark,47.141182,17.367134
485,Europe,Hungary,Balaton,White Blend,47.833402,16.477539
486,Europe,Hungary,Balaton,Juhfark,47.139014,17.373512
487,Europe,Hungary,Balaton,Harslevelu,47.135723,17.382081


In [43]:
import glob
import pandas as pd

wine_lo = pd.DataFrame()

for file in glob.glob("./wine/*.xlsx"):
    df = pd.read_excel(file)

    if wine_lo.empty:
        wine_lo = df
    
    else:
        wine_lo = pd.concat([wine_lo,df],axis=0)


wine_lo


Unnamed: 0,Continent,Country,Region Lv1,Region Lv2,Region Lv3,Winery,Grape_Variety,Latitude,Longtitude,Unnamed: 9,...,Unnamed: 16108,Unnamed: 16109,Unnamed: 16110,Unnamed: 16111,Unnamed: 16112,Unnamed: 16113,Unnamed: 16114,Unnamed: 16115,Unnamed: 16116,Unnamed: 16117
0,Americas,Argentina,Río Negro Province,Río Negro,,"Wapisa Malbec, Rio Negro, Argentina",Malbec,-40.765363,-63.357985,,...,,,,,,,,,,
1,Americas,Argentina,Río Negro Province,Río Negro,,"Humberto Canale Gran Reserva Malbec, Rio Negro...",Malbec,-39.067943,-67.640917,,...,,,,,,,,,,
2,Americas,Argentina,Río Negro Province,Río Negro,,"Bodega Noemia de Patagonia Malbec, Rio Negro, ...",Malbec,-39.033487,-67.329324,,...,,,,,,,,,,
3,Americas,Argentina,Río Negro Province,Río Negro,,"Bodega Noemia de Patagonia '2', Rio Negro, Arg...",Cabernet Sauvignon,-39.033487,-67.329324,,...,,,,,,,,,,
4,Americas,Argentina,Río Negro Province,Río Negro,,"Bodega Noemia de Patagonia J. Alberto, Rio Neg...",Malbec,-39.033487,-67.329324,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1126,Oceania,New Zealand,Hawke's Bay (GI),Central Hawke's Bay (GI),,,,,,,...,,,,,,,,,,
1127,Oceania,New Zealand,Marlborough (GI),Southern Valleys,,,,,,,...,,,,,,,,,,
1128,Oceania,New Zealand,Marlborough (GI),Wairau Valley,,,,,,,...,,,,,,,,,,
1129,Oceania,New Zealand,Northland (GI),,,,,,,,...,,,,,,,,,,


In [44]:
wine_lo.columns

Index(['Continent', 'Country', 'Region  Lv1', 'Region Lv2', 'Region Lv3',
       'Winery', 'Grape_Variety', 'Latitude', 'Longtitude', 'Unnamed: 9',
       ...
       'Unnamed: 16108', 'Unnamed: 16109', 'Unnamed: 16110', 'Unnamed: 16111',
       'Unnamed: 16112', 'Unnamed: 16113', 'Unnamed: 16114', 'Unnamed: 16115',
       'Unnamed: 16116', 'Unnamed: 16117'],
      dtype='object', length=16118)

In [45]:
wine_lo = wine_lo[['Continent','Country','Region  Lv1','Grape_Variety','Latitude','Longtitude']]

In [101]:
wine = wine_lo.loc[wine_lo['Latitude'].notnull()]
wine

Unnamed: 0,Continent,Country,Region Lv1,Grape_Variety,Latitude,Longtitude
0,Americas,Argentina,Río Negro Province,Malbec,-40.765363,-63.357985
1,Americas,Argentina,Río Negro Province,Malbec,-39.067943,-67.640917
2,Americas,Argentina,Río Negro Province,Malbec,-39.033487,-67.329324
3,Americas,Argentina,Río Negro Province,Cabernet Sauvignon,-39.033487,-67.329324
4,Americas,Argentina,Río Negro Province,Malbec,-39.033487,-67.329324
...,...,...,...,...,...,...
892,Oceania,Australia,Port Phillip,Syrah,-38.002726,144.227670
893,Oceania,Australia,Port Phillip,Pinot Noir,-38.002726,144.227670
894,Oceania,Australia,Port Phillip,Syrah,-37.214668,144.631572
895,Oceania,Australia,Port Phillip,Pinot Noir,-37.295397,144.710140


포도 품종 개수 -> 161 개

In [48]:
len(wine['Grape_Variety'].unique())

161

In [109]:
# Convert Latitude column to string and remove commas
wine['Latitude'] = wine['Latitude'].astype(str).str.replace(',', '')

# Convert the cleaned string column to numeric
wine['Latitude'] = pd.to_numeric(wine['Latitude'])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  wine['Latitude'] = wine['Latitude'].astype(str).str.replace(',', '')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  wine['Latitude'] = pd.to_numeric(wine['Latitude'])


In [110]:
# Convert Latitude column to string and remove commas
wine['Longitude'] = wine['Longitude'].astype(str).str.replace(',', '')

# Convert the cleaned string column to numeric
wine['Longitude'] = pd.to_numeric(wine['Longitude'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  wine['Longitude'] = wine['Longitude'].astype(str).str.replace(',', '')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  wine['Longitude'] = pd.to_numeric(wine['Longitude'])


In [111]:
wine.info()

<class 'pandas.core.frame.DataFrame'>
Index: 476 entries, 0 to 475
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Continent      476 non-null    object 
 1   Country        476 non-null    object 
 2   Region  Lv1    476 non-null    object 
 3   Grape_Variety  476 non-null    object 
 4   Latitude       476 non-null    float64
 5   Longitude      476 non-null    float64
dtypes: float64(2), object(4)
memory usage: 26.0+ KB


In [112]:
wine.isnull().sum()

Continent        0
Country          0
Region  Lv1      0
Grape_Variety    0
Latitude         0
Longitude        0
dtype: int64

In [50]:
wine.reset_index(drop=True,inplace=True)
wine

Unnamed: 0,Continent,Country,Region Lv1,Grape_Variety,Latitude,Longtitude
0,Americas,Argentina,Río Negro Province,Malbec,-40.765363,-63.357985
1,Americas,Argentina,Río Negro Province,Malbec,-39.067943,-67.640917
2,Americas,Argentina,Río Negro Province,Malbec,-39.033487,-67.329324
3,Americas,Argentina,Río Negro Province,Cabernet Sauvignon,-39.033487,-67.329324
4,Americas,Argentina,Río Negro Province,Malbec,-39.033487,-67.329324
...,...,...,...,...,...,...
1704,Oceania,Australia,Port Phillip,Syrah,-38.002726,144.227670
1705,Oceania,Australia,Port Phillip,Pinot Noir,-38.002726,144.227670
1706,Oceania,Australia,Port Phillip,Syrah,-37.214668,144.631572
1707,Oceania,Australia,Port Phillip,Pinot Noir,-37.295397,144.710140


In [None]:
#전체 지역 위도, 경도 검색

# df['lat'] = np.nan
# df['lng'] = np.nan

# for idx, row in df.iterrows():
    
#     location = gmaps.geocode(str(row.region))[0]['geometry']['location']
#     lat = location['lat']
#     lng = location['lng']

#     df.loc[idx,'lat'] = lat
#     df.loc[idx,'lng'] = lng



In [56]:
round(wine['Latitude'][0],2),round(wine['Longtitude'][0],2)

(-40.77, -63.36)

In [57]:
soil_lo.loc[(round(soil_lo['Latitude'],2) == round(wine['Latitude'][0],2)) & (round(soil_lo['Longitude'],2) == round(wine['Longtitude'][0],2))]

Unnamed: 0,Longitude,Latitude,Soil Mapping Unit
220769368,-63.358333,-40.766667,12245
220771021,-63.358333,-40.775,12245


토양 데이터에서 위도, 경도 검색해서 맵핑

In [177]:
from tqdm.notebook import tqdm_notebook

wine_soil = pd.DataFrame()

for idx, row in tqdm_notebook(wine.iterrows(), total=wine.shape[0]):
    #포도 데이터 위,경도
    lat = round(row['Latitude'], 2)
    lng = round(row['Longtitude'], 2)

    #토양 데이터에서 검색
    smu = soil_lo.loc[(round(soil_lo['Latitude'],2) == lat) & (round(soil_lo['Longitude'],2) == lng)].copy()
    smu['Continent'] = row.Continent
    smu['Country'] = row.Country
    smu['Region Lv1'] = row['Region  Lv1']
    smu['Grape_Variety'] = row['Grape_Variety']

    if wine_soil.empty:
        wine_soil = smu
    else:
        wine_soil = pd.concat([wine_soil, smu], axis=0)


wine_soil


    #포도 데이터에 SMU 합치기
    # wine.loc[idx,'SMU'] = smu

  

  0%|          | 0/191 [00:00<?, ?it/s]

Unnamed: 0,Longitude,Latitude,Soil Mapping Unit,Continent,Country,Region Lv1,Grape_Variety
83336405,7.041667,50.525000,10231,Europe,Germany,Ahr,Pinot Noir
83361821,7.041667,50.516667,10230,Europe,Germany,Ahr,Pinot Noir
83285508,7.041667,50.541667,10231,Europe,Germany,Ahr,Pinot Noir
83260054,7.108333,50.550000,10189,Europe,Germany,Ahr,Pinot Noir
83387222,7.041667,50.508333,10230,Europe,Germany,Ahr,Pinot Noir
...,...,...,...,...,...,...,...
91375195,16.483333,47.833333,9993,Europe,Hungary,Balaton,White Blend
93414392,17.366667,47.141667,9795,Europe,Hungary,Balaton,Juhfark
93414393,17.375000,47.141667,9795,Europe,Hungary,Balaton,Harslevelu
93414394,17.383333,47.141667,9795,Europe,Hungary,Balaton,Harslevelu


In [81]:
wine_soil.reset_index(drop=True, inplace=True)
wine_soil.to_csv('wine_soil.csv')

토양 raw 데이터 에서 SMU 코드로 합치기

In [178]:
#토양 raw데이터 만들기(layer 1 까지 있는거)
HWSD2_SMU = pd.read_csv('../soil_data_raw/soil_data_raw//HWSD2_SMU.CSV')
HWSD2_LAYERS_raw = pd.read_csv('../soil_data_raw/soil_data_raw/HWSD2_LAYERS_raw.csv')

soil_data = pd.merge(HWSD2_SMU,HWSD2_LAYERS_raw,how = 'inner', left_on='ID',right_on='ID')
soil_data.head()

  HWSD2_LAYERS_raw = pd.read_csv('../soil_data_raw/soil_data_raw/HWSD2_LAYERS_raw.csv')


Unnamed: 0,Unnamed: 0_x,ID,HWSD2_SMU_ID_x,WISE30s_SMU_ID_x,HWSD1_SMU_ID_x,COVERAGE_x,SHARE_x,WRB4_x,WRB_PHASES_x,WRB2_x,...,CEC_SOIL,CEC_CLAY,CEC_EFF,TEB,BSAT,ALUM_SAT,ESP,TCARBON_EQ,GYPSUM,ELEC_COND
0,0,669,12707,WD10012707,12707,3,40,ALfr,ALfr,AL,...,8,46,6,5,59,0,2,0.0,0.1,1
1,1,695,11825,WD30011825,11825,2,100,ALfr,ALfr,AL,...,9,20,6,4,41,50,2,0.0,0.1,0
2,2,696,11823,WD30011823,11823,2,100,ALfr,ALfr,AL,...,9,20,6,4,41,50,2,0.0,0.1,0
3,3,697,13458,WD30013458,13458,3,100,ALfr,ALfr,AL,...,9,20,6,4,41,50,2,0.0,0.1,0
4,4,698,11824,WD30011824,11824,2,100,ALfr,ALfr,AL,...,9,20,6,4,41,50,2,0.0,0.1,0


In [179]:
#포도 데이터 와 SMU로 합치기
soil_data.drop('Unnamed: 0_x', inplace=True, axis=1)

wine_soil_data = pd.merge(wine_soil,soil_data,how='inner',left_on='Soil Mapping Unit', right_on='HWSD2_SMU_ID_x')
wine_soil_data


Unnamed: 0,Longitude,Latitude,Soil Mapping Unit,Continent,Country,Region Lv1,Grape_Variety,ID,HWSD2_SMU_ID_x,WISE30s_SMU_ID_x,...,CEC_SOIL,CEC_CLAY,CEC_EFF,TEB,BSAT,ALUM_SAT,ESP,TCARBON_EQ,GYPSUM,ELEC_COND
0,7.041667,50.525000,10231,Europe,Germany,Ahr,Pinot Noir,51125,10231,WD30010231,...,18,31,9,6,37,27,1,0.0,1.6,1
1,7.041667,50.541667,10231,Europe,Germany,Ahr,Pinot Noir,51125,10231,WD30010231,...,18,31,9,6,37,27,1,0.0,1.6,1
2,7.016667,50.533333,10231,Europe,Germany,Ahr,Pinot Noir,51125,10231,WD30010231,...,18,31,9,6,37,27,1,0.0,1.6,1
3,7.041667,50.541667,10231,Europe,Germany,Ahr,Pinot Noir,51125,10231,WD30010231,...,18,31,9,6,37,27,1,0.0,1.6,1
4,7.041667,50.541667,10231,Europe,Germany,Ahr,Pinot Noir Precoce,51125,10231,WD30010231,...,18,31,9,6,37,27,1,0.0,1.6,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
256,17.375000,47.141667,9795,Europe,Hungary,Balaton,Harslevelu,207940,9795,WD40009795,...,13,47,14,13,81,13,2,0.0,0.1,1
257,17.383333,47.141667,9795,Europe,Hungary,Balaton,Harslevelu,207940,9795,WD40009795,...,13,47,14,13,81,13,2,0.0,0.1,1
258,16.475000,47.833333,9993,Europe,Hungary,Balaton,White Blend,32739,9993,WD40009993,...,26,65,26,26,93,0,1,0.0,0.8,1
259,16.483333,47.833333,9993,Europe,Hungary,Balaton,White Blend,32739,9993,WD40009993,...,26,65,26,26,93,0,1,0.0,0.8,1


In [16]:
wine_soil_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 566 entries, 0 to 565
Data columns (total 77 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Longitude          566 non-null    float64
 1   Latitude           566 non-null    float64
 2   Soil Mapping Unit  566 non-null    int64  
 3   Continent          566 non-null    object 
 4   Country            566 non-null    object 
 5   Region Lv1         566 non-null    object 
 6   ID                 566 non-null    int64  
 7   HWSD2_SMU_ID_x     566 non-null    int64  
 8   WISE30s_SMU_ID_x   566 non-null    object 
 9   HWSD1_SMU_ID_x     566 non-null    int64  
 10  COVERAGE_x         566 non-null    int64  
 11  SHARE_x            566 non-null    int64  
 12  WRB4_x             566 non-null    object 
 13  WRB_PHASES_x       566 non-null    object 
 14  WRB2_x             566 non-null    object 
 15  WRB2_CODE          566 non-null    int64  
 16  FAO90_x            566 non

In [17]:
wine_soil_data.columns

Index(['Longitude', 'Latitude', 'Soil Mapping Unit', 'Continent', 'Country',
       'Region Lv1', 'ID', 'HWSD2_SMU_ID_x', 'WISE30s_SMU_ID_x',
       'HWSD1_SMU_ID_x', 'COVERAGE_x', 'SHARE_x', 'WRB4_x', 'WRB_PHASES_x',
       'WRB2_x', 'WRB2_CODE', 'FAO90_x', 'KOPPEN', 'TEXTURE_USDA_x',
       'REF_BULK_DENSITY', 'BULK_DENSITY', 'DRAINAGE_x', 'ROOT_DEPTH_x',
       'AWC_x', 'PHASE1_x', 'PHASE2_x', 'ROOTS_x', 'IL_x', 'ADD_PROP_x',
       'Unnamed: 0_y', 'HWSD2_SMU_ID_y', 'NSC_MU_SOURCE1', 'NSC_MU_SOURCE2',
       'WISE30s_SMU_ID_y', 'HWSD1_SMU_ID_y', 'COVERAGE_y', 'SEQUENCE',
       'SHARE_y', 'NSC', 'WRB_PHASES_y', 'WRB4_y', 'WRB2_y', 'FAO90_y',
       'ROOT_DEPTH_y', 'PHASE1_y', 'PHASE2_y', 'ROOTS_y', 'IL_y', 'SWR',
       'DRAINAGE_y', 'AWC_y', 'ADD_PROP_y', 'LAYER', 'TOPDEP', 'BOTDEP',
       'COARSE', 'SAND', 'SILT', 'CLAY', 'TEXTURE_USDA_y', 'TEXTURE_SOTER',
       'BULK', 'REF_BULK', 'ORG_CARBON', 'PH_WATER', 'TOTAL_N', 'CN_RATIO',
       'CEC_SOIL', 'CEC_CLAY', 'CEC_EFF', 'TEB', 

In [180]:
# 와인생산지 + 토양 데이터 합치기 성공

target = ['HWSD1_SMU_ID_x','ID','HWSD2_SMU_ID_x','WISE30s_SMU_ID_x','COVERAGE_x','WRB_PHASES_x','WRB2_x','WRB2_CODE',
          'PHASE1_x','PHASE2_x','ADD_PROP_x','FAO90_x','Unnamed: 0_y','NSC_MU_SOURCE1','NSC_MU_SOURCE2','COVERAGE_y','NSC','WRB_PHASES_y','WRB2_y','FAO90_y',
          'PHASE1_y','PHASE2_y','ADD_PROP_y','LAYER','TOPDEP','BOTDEP','WISE30s_SMU_ID_y','HWSD1_SMU_ID_y']

wine_soil_data.drop(target,axis=1,inplace=True)
wine_soil_data

Unnamed: 0,Longitude,Latitude,Soil Mapping Unit,Continent,Country,Region Lv1,Grape_Variety,SHARE_x,WRB4_x,KOPPEN,...,CEC_SOIL,CEC_CLAY,CEC_EFF,TEB,BSAT,ALUM_SAT,ESP,TCARBON_EQ,GYPSUM,ELEC_COND
0,7.041667,50.525000,10231,Europe,Germany,Ahr,Pinot Noir,90,CMdy,C,...,18,31,9,6,37,27,1,0.0,1.6,1
1,7.041667,50.541667,10231,Europe,Germany,Ahr,Pinot Noir,90,CMdy,C,...,18,31,9,6,37,27,1,0.0,1.6,1
2,7.016667,50.533333,10231,Europe,Germany,Ahr,Pinot Noir,90,CMdy,C,...,18,31,9,6,37,27,1,0.0,1.6,1
3,7.041667,50.541667,10231,Europe,Germany,Ahr,Pinot Noir,90,CMdy,C,...,18,31,9,6,37,27,1,0.0,1.6,1
4,7.041667,50.541667,10231,Europe,Germany,Ahr,Pinot Noir Precoce,90,CMdy,C,...,18,31,9,6,37,27,1,0.0,1.6,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
256,17.375000,47.141667,9795,Europe,Hungary,Balaton,Harslevelu,60,LVcr,D,...,13,47,14,13,81,13,2,0.0,0.1,1
257,17.383333,47.141667,9795,Europe,Hungary,Balaton,Harslevelu,60,LVcr,D,...,13,47,14,13,81,13,2,0.0,0.1,1
258,16.475000,47.833333,9993,Europe,Hungary,Balaton,White Blend,60,CHha,D,...,26,65,26,26,93,0,1,0.0,0.8,1
259,16.483333,47.833333,9993,Europe,Hungary,Balaton,White Blend,60,CHha,D,...,26,65,26,26,93,0,1,0.0,0.8,1


In [118]:
wine_soil_data.drop('Unnamed: 0_x',axis=1,inplace=True)
wine_soil_data

Unnamed: 0,Longitude,Latitude,Soil Mapping Unit,Continent,Country,Region Lv1,Grape_Variety,SHARE_x,WRB4_x,KOPPEN,...,CEC_SOIL,CEC_CLAY,CEC_EFF,TEB,BSAT,ALUM_SAT,ESP,TCARBON_EQ,GYPSUM,ELEC_COND
0,-16.541667,28.375000,1419,Europe,Spain,Canary Islands,Alfrocheiro Preto,40,LVcr,B,...,10,44,12,12,83,0,2,0.0,0.1,1
1,-16.541667,28.366667,1419,Europe,Spain,Canary Islands,Alfrocheiro Preto,40,LVcr,B,...,10,44,12,12,83,0,2,0.0,0.1,1
2,-16.541667,28.375000,1419,Europe,Spain,Canary Islands,Vijariego Negro,40,LVcr,B,...,10,44,12,12,83,0,2,0.0,0.1,1
3,-16.541667,28.366667,1419,Europe,Spain,Canary Islands,Vijariego Negro,40,LVcr,B,...,10,44,12,12,83,0,2,0.0,0.1,1
4,-16.541667,28.375000,1419,Europe,Spain,Canary Islands,Listan Negro,40,LVcr,B,...,10,44,12,12,83,0,2,0.0,0.1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
667,18.375000,48.258333,10723,Europe,Slovakia,Nitrianska,Cabernet Franc,62,LVha,D,...,15,47,13,13,80,0,1,0.0,0.1,1
668,18.383333,48.258333,10723,Europe,Slovakia,Nitrianska,Cabernet Franc,62,LVha,D,...,15,47,13,13,80,0,1,0.0,0.1,1
669,17.516667,48.425000,10713,Europe,Slovakia,Malokarpatska,Blaufrankisch,50,CHha,D,...,26,65,26,26,93,0,1,0.0,0.8,1
670,17.516667,48.416667,10713,Europe,Slovakia,Malokarpatska,Blaufrankisch,50,CHha,D,...,26,65,26,26,93,0,1,0.0,0.8,1


In [181]:
import numpy as np
wine_soil_data['Top12_Grape']= np.nan
wine_soil_data

Unnamed: 0,Longitude,Latitude,Soil Mapping Unit,Continent,Country,Region Lv1,Grape_Variety,SHARE_x,WRB4_x,KOPPEN,...,CEC_CLAY,CEC_EFF,TEB,BSAT,ALUM_SAT,ESP,TCARBON_EQ,GYPSUM,ELEC_COND,Top12_Grape
0,7.041667,50.525000,10231,Europe,Germany,Ahr,Pinot Noir,90,CMdy,C,...,31,9,6,37,27,1,0.0,1.6,1,
1,7.041667,50.541667,10231,Europe,Germany,Ahr,Pinot Noir,90,CMdy,C,...,31,9,6,37,27,1,0.0,1.6,1,
2,7.016667,50.533333,10231,Europe,Germany,Ahr,Pinot Noir,90,CMdy,C,...,31,9,6,37,27,1,0.0,1.6,1,
3,7.041667,50.541667,10231,Europe,Germany,Ahr,Pinot Noir,90,CMdy,C,...,31,9,6,37,27,1,0.0,1.6,1,
4,7.041667,50.541667,10231,Europe,Germany,Ahr,Pinot Noir Precoce,90,CMdy,C,...,31,9,6,37,27,1,0.0,1.6,1,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
256,17.375000,47.141667,9795,Europe,Hungary,Balaton,Harslevelu,60,LVcr,D,...,47,14,13,81,13,2,0.0,0.1,1,
257,17.383333,47.141667,9795,Europe,Hungary,Balaton,Harslevelu,60,LVcr,D,...,47,14,13,81,13,2,0.0,0.1,1,
258,16.475000,47.833333,9993,Europe,Hungary,Balaton,White Blend,60,CHha,D,...,65,26,26,93,0,1,0.0,0.8,1,
259,16.483333,47.833333,9993,Europe,Hungary,Balaton,White Blend,60,CHha,D,...,65,26,26,93,0,1,0.0,0.8,1,


In [120]:
wine_soil_data.to_csv('wine_soil_data_2.csv')

### 위도, 경도 좌표를 넣으면 SMU코드가 나오는 함수

In [None]:
def SMU(lat,lng,soil_df):
    
    

---

### 기후 데이터에서 위,경도 검색해서 기후 합치기

In [2]:
import glob
import pandas as pd

climate_lo = pd.DataFrame()

for file in glob.glob("../climate_location/*.csv"):
    df = pd.read_csv(file)

    if climate_lo.empty:
        climate_lo = df
    
    else:
        climate_lo = pd.concat([climate_lo,df],axis=0)


climate_lo


  df = pd.read_csv(file)
  df = pd.read_csv(file)
  df = pd.read_csv(file)
  df = pd.read_csv(file)
  df = pd.read_csv(file)
  df = pd.read_csv(file)
  df = pd.read_csv(file)


Unnamed: 0.1,Unnamed: 0,STATION,DATE,LATITUDE,LONGITUDE,ELEVATION,NAME,CDSD,CDSD_ATTRIBUTES,CLDD,...,MN08,MN08_ATTRIBUTES,MN09,MN09_ATTRIBUTES,MX07,MX07_ATTRIBUTES,MX08,MX08_ATTRIBUTES,MX09,MX09_ATTRIBUTES
0,0,ACW00011604,1949-01,17.116670,-61.783330,10.1,"ST JOHNS COOLIDGE FIELD, AC",182.6,,182.6,...,,,,,,,,,,
1,1,ACW00011647,1957-09,17.133330,-61.783330,19.2,"ST JOHNS, AC",,,,...,,,,,,,,,,
2,2,AE000041196,1944-04,25.333000,55.517000,34.0,"SHARJAH INTER. AIRP, AE",,,184.1,...,,,,,,,,,,
3,3,AEM00041194,1983-01,25.255000,55.364000,10.4,"DUBAI INTERNATIONAL, AE",,,,...,,,,,,,,,,
4,4,AEM00041217,1983-06,24.433000,54.651000,26.8,"ABU DHABI INTERNATIONAL, AE",,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9995,USC00143984,1904-09,37.923300,-95.424100,290.8,"IOLA 1 W, KS US",,,,...,,,,,,,,,,
9996,9996,USC00143989,1905-09,37.916670,-95.400000,292.9,"IOLA, KS US",,,113.2,...,,,,,,,,,,
9997,9997,USC00143997,1894-01,39.661100,-98.348300,468.5,"IONIA, KS US",0.0,6.0,0.0,...,,,,,,,,,,
9998,9998,USC00144008,1910-03,37.833333,-101.933333,,"IRENE, KS US",,,3.7,...,,,,,,,,,,


In [3]:
target = ['STATION','LATITUDE','LONGITUDE','NAME']
climate_lo = climate_lo[target]
climate_lo

Unnamed: 0,STATION,LATITUDE,LONGITUDE,NAME
0,ACW00011604,17.116670,-61.783330,"ST JOHNS COOLIDGE FIELD, AC"
1,ACW00011647,17.133330,-61.783330,"ST JOHNS, AC"
2,AE000041196,25.333000,55.517000,"SHARJAH INTER. AIRP, AE"
3,AEM00041194,25.255000,55.364000,"DUBAI INTERNATIONAL, AE"
4,AEM00041217,24.433000,54.651000,"ABU DHABI INTERNATIONAL, AE"
...,...,...,...,...
9995,USC00143984,37.923300,-95.424100,"IOLA 1 W, KS US"
9996,USC00143989,37.916670,-95.400000,"IOLA, KS US"
9997,USC00143997,39.661100,-98.348300,"IONIA, KS US"
9998,USC00144008,37.833333,-101.933333,"IRENE, KS US"


In [33]:
from tqdm.notebook import tqdm_notebook
import pandas as pd

# 빈 데이터프레임 초기화
wine_total_data = pd.DataFrame()

# 각 행을 반복하며 처리
for idx, row in tqdm_notebook(wine_soil_data.iterrows(), total=wine_soil_data.shape[0]):
    # 포도 데이터 위,경도 반올림
    lat = round(row['Latitude'])
    lng = round(row['Longitude'])

    # 기후 데이터에서 검색
    climate_filtered = climate_lo[(climate_lo['LATITUDE'].round() == lat) & (climate_lo['LONGITUDE'].round() == lng)].copy()
    
    # 원래 데이터 가져오기
    wine_row = wine_soil_data.loc[idx]
    
    # 병합
    merged_data = wine_row.to_frame().T.merge(climate_filtered, how='left', left_on=['Latitude', 'Longitude'], right_on=['LATITUDE', 'LONGITUDE'])

    # 결과 데이터프레임에 추가
    if wine_total_data.empty:
        wine_total_data = merged_data
    else:
        wine_total_data = pd.concat([wine_total_data, merged_data], axis=0)

# 결과 확인
wine_total_data


  0%|          | 0/566 [00:00<?, ?it/s]

Unnamed: 0,Longitude,Latitude,Soil Mapping Unit,Continent,Country,Region Lv1,ID,HWSD2_SMU_ID_x,WISE30s_SMU_ID_x,COVERAGE_x,...,BSAT,ALUM_SAT,ESP,TCARBON_EQ,GYPSUM,ELEC_COND,STATION,LATITUDE,LONGITUDE,NAME
0,3.058333,36.75,7001,Africa,Algeria,Algiers,291461,7001,WD30007001,1,...,-9,-9,-9,-9.0,-9.0,-9,,,,
0,5.058333,36.75,7001,Africa,Algeria,Béjaïa,291461,7001,WD30007001,1,...,-9,-9,-9,-9.0,-9.0,-9,,,,
0,0.141667,35.4,7001,Africa,Algeria,Mascara,291461,7001,WD30007001,1,...,-9,-9,-9,-9.0,-9.0,-9,,,,
0,2.758333,36.258333,7001,Africa,Algeria,Médéa,291461,7001,WD30007001,1,...,-9,-9,-9,-9.0,-9.0,-9,,,,
0,-1.325,34.883333,7001,Africa,Algeria,Tlemcen,291461,7001,WD30007001,1,...,-9,-9,-9,-9.0,-9.0,-9,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,173.933333,-35.408333,6345,Oceania,New Zealand,Northland,4657,6345,WD30006345,4,...,50,11,1,0.0,0.0,0,,,,
0,175.016667,-37.616667,6353,Oceania,New Zealand,Waikato,50998,6353,WD30006353,4,...,37,27,1,0.0,1.6,1,,,,
0,175.016667,-37.625,6353,Oceania,New Zealand,Waikato,50998,6353,WD30006353,4,...,37,27,1,0.0,1.6,1,,,,
0,176.091667,-40.408333,6358,Oceania,New Zealand,Wairarapa,50989,6358,WD30006358,4,...,37,27,1,0.0,1.6,1,,,,


In [53]:
# 위, 경도 검색 실패
climate_lo.loc[(-35.6 <= climate_lo['LATITUDE']) & (climate_lo['LATITUDE'] <= -34.2) & (173.3 <= climate_lo['LONGITUDE']) & (climate_lo['LONGITUDE'] <= 174.7)]


Unnamed: 0,STATION,LATITUDE,LONGITUDE,NAME


위도, 경도로는 맵핑할 수가 없음

> 지역별 기상관측소 NCEI 코드로 일일히 맵핑해야함

In [57]:
climate_lo.reset_index(drop=True,inplace=True)

In [60]:
climate_lo['Country'] = climate_lo['STATION'].apply(lambda x : x[:2])


In [70]:
#그럼 와인 생산지의 기상 관측소 전부 검색하기 
wine_soil_data['Country'].unique()

array(['Algeria', 'Morocco', 'South Africa', 'Tunisia', 'Brazil', 'Chile',
       'Mexico', 'Uruguay', 'Albania', 'Austria',
       'Bosnia and Herzegovina', 'Bulgaria', 'Cyprus', 'Czech Republic',
       'France', 'Georgia', 'Hungary', 'Ireland', 'Italy',
       'North Macedonia', 'Poland', 'Portugal', 'Russia', 'Slovenia',
       'Spain', 'Switzerland', 'Turkey', 'Ukraine', 'Armenia',
       'Azerbaijan', 'China', 'India', 'Iran', 'Israel', 'Japan',
       'Lebanon', 'Palestinian territories', 'Syria', 'New Zealand',
       'Cape Verde', 'Argentina', 'Bolivia', 'Canada', 'Colombia',
       'Costa Rica', 'Peru', 'United States', 'Venezuela', 'Belgium',
       'Serbia', 'Croatia', 'Germany', 'Greece', 'Latvia', 'Lithuania',
       'Luxembourg', 'Moldova', 'Montenegro', 'Netherlands', 'Norway',
       'Romania', 'Slovakia', 'Sweden', 'Burma', 'Indonesia',
       'South Korea', 'Vietnam', 'Australia'], dtype=object)

- 현재 와인 생산지별 기상 관측소가 여러개 검색됨(위도, 경도로 검색한 것이 아닌 지역명으로 검색)
- 어떻게 합치면 좋을지?
    1. 검색된 기상관측소를 전부 추가 행으로?
    2. 혹은 대표 관측소 하나만?
- 노가다 필요함
    1. climate_lo 의 나라 코드로 특정 나라 검색
    2. GPT한테 NCEI 코드 던져줌
    3. wine_soil_data에서 특정나라의 지역별로 NCEI 코드 찾아달라고 요청
    4. GPT가 찾아주면 채워넣으면 됨

---

- Parquet 란?

Parquet는 대규모 데이터 처리를 위한 열 지향의 파일 포맷입니다. 대용량의 구조화된 데이터를 저장하기위해 설계되었고,
데이터의 압축 및 직렬화를 효율적으로 수행하여 데이터의 저장 공간은 줄이고 처리 속도를 높입니다. 하둡같은 대규모 데이터 처리 시스템에서 자주 사용됩니다.

In [27]:
!pip install pyarrow

Collecting pyarrow
  Downloading pyarrow-16.1.0-cp38-cp38-win_amd64.whl.metadata (3.1 kB)
Downloading pyarrow-16.1.0-cp38-cp38-win_amd64.whl (25.9 MB)
   ---------------------------------------- 0.0/25.9 MB ? eta -:--:--
   ---------------------------------------- 0.0/25.9 MB 1.3 MB/s eta 0:00:21
    --------------------------------------- 0.4/25.9 MB 6.4 MB/s eta 0:00:04
   - -------------------------------------- 0.8/25.9 MB 7.7 MB/s eta 0:00:04
   - -------------------------------------- 1.3/25.9 MB 7.3 MB/s eta 0:00:04
   -- ------------------------------------- 1.8/25.9 MB 8.6 MB/s eta 0:00:03
   --- ------------------------------------ 2.3/25.9 MB 9.3 MB/s eta 0:00:03
   ---- ----------------------------------- 2.9/25.9 MB 9.6 MB/s eta 0:00:03
   ----- ---------------------------------- 3.4/25.9 MB 9.4 MB/s eta 0:00:03
   ------ --------------------------------- 4.1/25.9 MB 9.8 MB/s eta 0:00:03
   ------- -------------------------------- 4.6/25.9 MB 9.9 MB/s eta 0:00:03
   ------

In [30]:
soil_lo_copy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 222694358 entries, 0 to 222694357
Data columns (total 3 columns):
 #   Column             Dtype  
---  ------             -----  
 0   Longitude          float64
 1   Latitude           float64
 2   Soil Mapping Unit  int64  
dtypes: float64(2), int64(1)
memory usage: 5.0 GB


In [31]:
import pyarrow as pa
import pyarrow.parquet as pq

# 파일명(읽기, 쓰기)
# csv_file = 'keyword.txt'
parquet_file = 'soil_mapping_data_2.parquet'

# csv 읽을 때 옵션 값(chunk, header, dtype)
# chunk_size = 100000
# header = ['base_date', 'category_code', 'rank', 'keyword', 'link_id']
# dtype = {
#     'base_date':'str',
#     'category_code':'str',
#     'rank':'int16',
#     'keyword':'str',
#     'link_id':'str',
# }

# csv_stream - chunk_size만큼 나눠서 읽기
# csv_stream = pd.read_csv(csv_file, chunksize=chunk_size, names=header, dtype=dtype, low_memory=False)

# parquet 스키마
parquet_schema = pa.schema([
    ('Longitude', pa.float64()),
    ('Latitude', pa.float64()),
    ('Soil Mapping Unit', pa.int64())
])

In [32]:
# for i, chunk in enumerate(csv_stream):
#     print(f"chunk : {i}")
 
#     # 첫 번째 조각인 경우 parquet_writer 객체 생성
#     if i == 0:
parquet_writer = pq.ParquetWriter(parquet_file, parquet_schema, compression='snappy')
table = pa.Table.from_pandas(soil_lo_copy, schema=parquet_schema)
parquet_writer.write_table(table)
 
parquet_writer.close()

In [4]:
#토양 데이터 불러오기
import pandas as pd
import pyarrow.parquet as pq

# # Parquet 파일 열기

iterator = pd.read_parquet('soil_mapping_data_2.parquet')


In [5]:
iterator.head()

Unnamed: 0,Longitude,Latitude,Soil Mapping Unit
0,-35.741667,83.625,6998
1,-35.733333,83.625,6998
2,-35.725,83.625,6998
3,-35.716667,83.625,6998
4,-35.708333,83.625,6998


기상관측소 데이터도 압축

In [4]:
climate_lo.info()

<class 'pandas.core.frame.DataFrame'>
Index: 112041 entries, 0 to 9999
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   STATION    112041 non-null  object 
 1   LATITUDE   112041 non-null  float64
 2   LONGITUDE  112041 non-null  float64
 3   NAME       112041 non-null  object 
dtypes: float64(2), object(2)
memory usage: 4.3+ MB


In [6]:
import pyarrow as pa
import pyarrow.parquet as pq

# 파일명(읽기, 쓰기)
# csv_file = 'keyword.txt'
parquet_file = 'climate_location.parquet'


# parquet 스키마
parquet_schema = pa.schema([
    ('STATION', pa.string()),
    ('LATITUDE', pa.float64()),
    ('LONGITUDE', pa.float64()),
    ('NAME', pa.string())
])

In [7]:
parquet_writer = pq.ParquetWriter(parquet_file, parquet_schema, compression='snappy')
table = pa.Table.from_pandas(climate_lo, schema=parquet_schema)
parquet_writer.write_table(table)
parquet_writer.close()

In [8]:
climate_lo = pd.read_parquet('climate_location.parquet')
climate_lo.head()

Unnamed: 0,STATION,LATITUDE,LONGITUDE,NAME
0,ACW00011604,17.11667,-61.78333,"ST JOHNS COOLIDGE FIELD, AC"
1,ACW00011647,17.13333,-61.78333,"ST JOHNS, AC"
2,AE000041196,25.333,55.517,"SHARJAH INTER. AIRP, AE"
3,AEM00041194,25.255,55.364,"DUBAI INTERNATIONAL, AE"
4,AEM00041217,24.433,54.651,"ABU DHABI INTERNATIONAL, AE"


---

In [121]:
wine_soil_data

Unnamed: 0,Longitude,Latitude,Soil Mapping Unit,Continent,Country,Region Lv1,Grape_Variety,SHARE_x,WRB4_x,KOPPEN,...,CEC_CLAY,CEC_EFF,TEB,BSAT,ALUM_SAT,ESP,TCARBON_EQ,GYPSUM,ELEC_COND,Top12_Grape
0,-16.541667,28.375000,1419,Europe,Spain,Canary Islands,Alfrocheiro Preto,40,LVcr,B,...,44,12,12,83,0,2,0.0,0.1,1,
1,-16.541667,28.366667,1419,Europe,Spain,Canary Islands,Alfrocheiro Preto,40,LVcr,B,...,44,12,12,83,0,2,0.0,0.1,1,
2,-16.541667,28.375000,1419,Europe,Spain,Canary Islands,Vijariego Negro,40,LVcr,B,...,44,12,12,83,0,2,0.0,0.1,1,
3,-16.541667,28.366667,1419,Europe,Spain,Canary Islands,Vijariego Negro,40,LVcr,B,...,44,12,12,83,0,2,0.0,0.1,1,
4,-16.541667,28.375000,1419,Europe,Spain,Canary Islands,Listan Negro,40,LVcr,B,...,44,12,12,83,0,2,0.0,0.1,1,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
667,18.375000,48.258333,10723,Europe,Slovakia,Nitrianska,Cabernet Franc,62,LVha,D,...,47,13,13,80,0,1,0.0,0.1,1,
668,18.383333,48.258333,10723,Europe,Slovakia,Nitrianska,Cabernet Franc,62,LVha,D,...,47,13,13,80,0,1,0.0,0.1,1,
669,17.516667,48.425000,10713,Europe,Slovakia,Malokarpatska,Blaufrankisch,50,CHha,D,...,65,26,26,93,0,1,0.0,0.8,1,
670,17.516667,48.416667,10713,Europe,Slovakia,Malokarpatska,Blaufrankisch,50,CHha,D,...,65,26,26,93,0,1,0.0,0.8,1,


In [125]:
import chardet

# 파일의 첫 10000 바이트를 읽어 인코딩을 확인합니다.
with open('../Project_data/Wine_Soil_Data_0718_v2.csv', 'rb') as f:
    result = chardet.detect(f.read(10000))

# 인코딩을 출력합니다.
print(result['encoding'])


MacRoman


In [182]:
df= pd.read_csv('../Project_data/Wine_Soil_Data_0719.csv')
df

Unnamed: 0.1,Unnamed: 0,Longitude,Latitude,Soil Mapping Unit,Continent,Country,Region Lv1,Grape_Variety,SHARE_x,WRB4_x,...,CEC_CLAY,CEC_EFF,TEB,BSAT,ALUM_SAT,ESP,TCARBON_EQ,GYPSUM,ELEC_COND,Top12_Grape
0,0,-63.358333,-40.766667,12245,Americas,Argentina,Río Negro Province,malbec,60,FLeu,...,63,22,22,94,0,4,2.0,0.7,2,1
1,1,-63.358333,-40.775000,12245,Americas,Argentina,Río Negro Province,malbec,60,FLeu,...,63,22,22,94,0,4,2.0,0.7,2,1
2,2,-67.641667,-39.066667,12325,Americas,Argentina,Río Negro Province,malbec,50,RGeu,...,75,20,20,98,0,3,1.8,4.0,1,1
3,3,-67.641667,-39.075000,12325,Americas,Argentina,Río Negro Province,malbec,50,RGeu,...,75,20,20,98,0,3,1.8,4.0,1,1
4,4,-67.333333,-39.033333,12325,Americas,Argentina,Río Negro Province,malbec,50,RGeu,...,75,20,20,98,0,3,1.8,4.0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3173,667,18.375000,48.258333,10723,Europe,Slovakia,Nitrianska,Cabernet Franc,62,LVha,...,47,13,13,80,0,1,0.0,0.1,1,0
3174,668,18.383333,48.258333,10723,Europe,Slovakia,Nitrianska,Cabernet Franc,62,LVha,...,47,13,13,80,0,1,0.0,0.1,1,0
3175,669,17.516667,48.425000,10713,Europe,Slovakia,Malokarpatska,Blaufrankisch,50,CHha,...,65,26,26,93,0,1,0.0,0.8,1,0
3176,670,17.516667,48.416667,10713,Europe,Slovakia,Malokarpatska,Blaufrankisch,50,CHha,...,65,26,26,93,0,1,0.0,0.8,1,0


In [183]:
df.drop(['Unnamed: 0'],axis=1,inplace=True)

In [134]:
df.columns

Index(['Longitude', 'Latitude', 'Soil Mapping Unit', 'Continent', 'Country',
       'Region Lv1', 'Grape_Variety', 'SHARE_x', 'WRB4_x', 'KOPPEN',
       'TEXTURE_USDA_x', 'REF_BULK_DENSITY', 'BULK_DENSITY', 'DRAINAGE_x',
       'ROOT_DEPTH_x', 'AWC_x', 'ROOTS_x', 'IL_x', 'HWSD2_SMU_ID_y',
       'SEQUENCE', 'SHARE_y', 'WRB4_y', 'ROOT_DEPTH_y', 'ROOTS_y', 'IL_y',
       'SWR', 'DRAINAGE_y', 'AWC_y', 'COARSE', 'SAND', 'SILT', 'CLAY',
       'TEXTURE_USDA_y', 'TEXTURE_SOTER', 'BULK', 'REF_BULK', 'ORG_CARBON',
       'PH_WATER', 'TOTAL_N', 'CN_RATIO', 'CEC_SOIL', 'CEC_CLAY', 'CEC_EFF',
       'TEB', 'BSAT', 'ALUM_SAT', 'ESP', 'TCARBON_EQ', 'GYPSUM', 'ELEC_COND',
       'Top10_Grape'],
      dtype='object')

In [135]:
df.columns = ['Longitude', 'Latitude', 'Soil Mapping Unit', 'Continent', 'Country',
       'Region Lv1', 'Grape_Variety', 'SHARE_x', 'WRB4_x', 'KOPPEN',
       'TEXTURE_USDA_x', 'REF_BULK_DENSITY', 'BULK_DENSITY', 'DRAINAGE_x',
       'ROOT_DEPTH_x', 'AWC_x', 'ROOTS_x', 'IL_x', 'HWSD2_SMU_ID_y',
       'SEQUENCE', 'SHARE_y', 'WRB4_y', 'ROOT_DEPTH_y', 'ROOTS_y', 'IL_y',
       'SWR', 'DRAINAGE_y', 'AWC_y', 'COARSE', 'SAND', 'SILT', 'CLAY',
       'TEXTURE_USDA_y', 'TEXTURE_SOTER', 'BULK', 'REF_BULK', 'ORG_CARBON',
       'PH_WATER', 'TOTAL_N', 'CN_RATIO', 'CEC_SOIL', 'CEC_CLAY', 'CEC_EFF',
       'TEB', 'BSAT', 'ALUM_SAT', 'ESP', 'TCARBON_EQ', 'GYPSUM', 'ELEC_COND',
       'Top12_Grape']

In [184]:
data = pd.concat([df,wine_soil_data],axis=0)
data

Unnamed: 0,Longitude,Latitude,Soil Mapping Unit,Continent,Country,Region Lv1,Grape_Variety,SHARE_x,WRB4_x,KOPPEN,...,CEC_CLAY,CEC_EFF,TEB,BSAT,ALUM_SAT,ESP,TCARBON_EQ,GYPSUM,ELEC_COND,Top12_Grape
0,-63.358333,-40.766667,12245,Americas,Argentina,Río Negro Province,malbec,60,FLeu,B,...,63,22,22,94,0,4,2.0,0.7,2,1.0
1,-63.358333,-40.775000,12245,Americas,Argentina,Río Negro Province,malbec,60,FLeu,B,...,63,22,22,94,0,4,2.0,0.7,2,1.0
2,-67.641667,-39.066667,12325,Americas,Argentina,Río Negro Province,malbec,50,RGeu,B,...,75,20,20,98,0,3,1.8,4.0,1,1.0
3,-67.641667,-39.075000,12325,Americas,Argentina,Río Negro Province,malbec,50,RGeu,B,...,75,20,20,98,0,3,1.8,4.0,1,1.0
4,-67.333333,-39.033333,12325,Americas,Argentina,Río Negro Province,malbec,50,RGeu,B,...,75,20,20,98,0,3,1.8,4.0,1,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
256,17.375000,47.141667,9795,Europe,Hungary,Balaton,Harslevelu,60,LVcr,D,...,47,14,13,81,13,2,0.0,0.1,1,
257,17.383333,47.141667,9795,Europe,Hungary,Balaton,Harslevelu,60,LVcr,D,...,47,14,13,81,13,2,0.0,0.1,1,
258,16.475000,47.833333,9993,Europe,Hungary,Balaton,White Blend,60,CHha,D,...,65,26,26,93,0,1,0.0,0.8,1,
259,16.483333,47.833333,9993,Europe,Hungary,Balaton,White Blend,60,CHha,D,...,65,26,26,93,0,1,0.0,0.8,1,


In [196]:
data['Grape_Variety'] = data['Grape_Variety'].str.lower()

In [197]:
grape= data.loc[(data.Top12_Grape == 1.0),'Grape_Variety'].unique()


In [198]:
grape

array(['malbec', 'cabernet sauvignon', 'pinot noir', 'merlot', 'syrah',
       'zinfandel', 'cabernet franc', 'grenache', 'nebbiolo',
       'tempranillo', 'sangiovese', 'montepulciano'], dtype=object)

In [199]:
top_12 = [1 if gv in grape else 0 for gv in data['Grape_Variety']]

In [200]:
data['Top12_Grape'] = top_12

In [201]:
data['Top12_Grape'].value_counts()

Top12_Grape
1    2217
0    1222
Name: count, dtype: int64

In [202]:
len(data['Top12_Grape'])

3439

In [203]:
data.to_csv('wine_soil_data_0721.csv')

---

# 최종 기후 데이터 합치기

In [206]:
import chardet

# 파일의 첫 10000 바이트를 읽어 인코딩을 확인합니다.
with open("C:/Users/user/OneDrive/문서/사용자 지정 Office 서식 파일/바탕 화면/ML_wine/ML_wine/GLOSIS 글로벌 토양 특성/climate_location/cli_2021.xlsx", 'rb') as f:
    result = chardet.detect(f.read(10000))

# 인코딩을 출력합니다.
print(result['encoding'])

None


In [209]:
climate = pd.read_excel("C:/Users/user/OneDrive/문서/사용자 지정 Office 서식 파일/바탕 화면/ML_wine/ML_wine/GLOSIS 글로벌 토양 특성/climate_location/cli_2021.xlsx",index_col=0)
climate

Unnamed: 0,lat,lon,TG,TN,TX,RR,RR1,Year
0,0.25,-80.25,299.502427,293.87598,306.98184,27.746031,5.309524,2021
1,0.25,-79.75,299.484624,293.64417,307.24510,63.801150,7.523810,2021
2,0.25,-79.25,299.607305,293.66605,307.08545,82.949785,8.095238,2021
3,0.25,-78.75,293.708853,287.09363,301.88147,60.724637,7.904762,2021
4,0.25,-78.25,288.315110,281.21375,297.13270,30.210875,7.190476,2021
...,...,...,...,...,...,...,...,...
67415,-0.25,130.25,300.482714,294.72333,306.24112,65.889495,5.880952,2021
67416,-0.25,130.75,300.609352,294.19830,306.83590,65.965905,6.333333,2021
67417,-0.25,131.25,299.835385,294.34714,305.33120,69.212400,6.214286,2021
67418,-0.25,132.25,299.222501,294.12173,304.20984,70.775925,6.547619,2021


In [211]:
data

Unnamed: 0,Longitude,Latitude,Soil Mapping Unit,Continent,Country,Region Lv1,Grape_Variety,SHARE_x,WRB4_x,KOPPEN,...,CEC_CLAY,CEC_EFF,TEB,BSAT,ALUM_SAT,ESP,TCARBON_EQ,GYPSUM,ELEC_COND,Top12_Grape
0,-63.358333,-40.766667,12245,Americas,Argentina,Río Negro Province,malbec,60,FLeu,B,...,63,22,22,94,0,4,2.0,0.7,2,1
1,-63.358333,-40.775000,12245,Americas,Argentina,Río Negro Province,malbec,60,FLeu,B,...,63,22,22,94,0,4,2.0,0.7,2,1
2,-67.641667,-39.066667,12325,Americas,Argentina,Río Negro Province,malbec,50,RGeu,B,...,75,20,20,98,0,3,1.8,4.0,1,1
3,-67.641667,-39.075000,12325,Americas,Argentina,Río Negro Province,malbec,50,RGeu,B,...,75,20,20,98,0,3,1.8,4.0,1,1
4,-67.333333,-39.033333,12325,Americas,Argentina,Río Negro Province,malbec,50,RGeu,B,...,75,20,20,98,0,3,1.8,4.0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
256,17.375000,47.141667,9795,Europe,Hungary,Balaton,harslevelu,60,LVcr,D,...,47,14,13,81,13,2,0.0,0.1,1,0
257,17.383333,47.141667,9795,Europe,Hungary,Balaton,harslevelu,60,LVcr,D,...,47,14,13,81,13,2,0.0,0.1,1,0
258,16.475000,47.833333,9993,Europe,Hungary,Balaton,white blend,60,CHha,D,...,65,26,26,93,0,1,0.0,0.8,1,0
259,16.483333,47.833333,9993,Europe,Hungary,Balaton,white blend,60,CHha,D,...,65,26,26,93,0,1,0.0,0.8,1,0


In [214]:
climate.columns

Index(['lat', 'lon', 'TG', 'TN', 'TX', 'RR', 'RR1', 'Year'], dtype='object')

In [219]:
# 기후 데이터 기준
from tqdm.notebook import tqdm_notebook
import pandas as pd

# 빈 데이터프레임 초기화
wine_total_data = pd.DataFrame()

# 각 행을 반복하며 처리
for idx, row in tqdm_notebook(climate.iterrows(), total=climate.shape[0]):
    # 기후 데이터 위,경도 반올림
    lat = round(row['lat'])
    lng = round(row['lon'])

    # 데이터에서 검색
    soil_filtered = data[(round(data['Latitude']) == lat) & (round(data['Longitude']) == lng)].copy()
    soil_filtered['TG'] = row['TG']
    soil_filtered['TN'] = row['TN']
    soil_filtered['TX'] = row['TX']
    soil_filtered['RR'] = row['RR']
    soil_filtered['RR1'] = row['RR1']
    soil_filtered['Year'] = row['Year']

    
    # 원래 데이터 가져오기
    
    # 결과 데이터프레임에 추가
    if wine_total_data.empty:
        wine_total_data = soil_filtered
    else:
        wine_total_data = pd.concat([wine_total_data, soil_filtered], axis=0)

# 결과 확인
wine_total_data


  0%|          | 0/67420 [00:00<?, ?it/s]

Unnamed: 0,Longitude,Latitude,Soil Mapping Unit,Continent,Country,Region Lv1,Grape_Variety,SHARE_x,WRB4_x,KOPPEN,...,TCARBON_EQ,GYPSUM,ELEC_COND,Top12_Grape,TG,TN,TX,RR,RR1,Year
670,-100.850000,21.208333,18344,Americas,Mexico,Coahuila / Durango,malbec,53,CLpt,B,...,17.2,0.2,0,1,296.846743,279.51572,317.52783,40.539153,4.904762,2021.0
670,-100.850000,21.208333,18344,Americas,Mexico,Coahuila / Durango,malbec,53,CLpt,B,...,17.2,0.2,0,1,296.016913,279.98334,316.09198,34.349409,5.238095,2021.0
666,-99.883333,20.700000,18324,Americas,Mexico,Querétaro,syrah,53,PHha,C,...,0.0,0.0,0,1,294.280882,279.96274,313.37300,27.237040,5.095238,2021.0
667,-99.883333,20.700000,18324,Americas,Mexico,Querétaro,tempranillo,53,PHha,C,...,0.0,0.0,0,1,294.280882,279.96274,313.37300,27.237040,5.095238,2021.0
668,-99.883333,20.700000,18324,Americas,Mexico,Querétaro,malbec,53,PHha,C,...,0.0,0.0,0,1,294.280882,279.96274,313.37300,27.237040,5.095238,2021.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
664,-64.608333,-21.575000,12532,Americas,Bolivia,Tarija Department,tannat,50,LPeu,C,...,0.0,0.3,1,0,292.704945,279.82860,305.25250,32.855223,5.857143,2021.0
665,-75.750000,-14.008333,18816,Americas,Peru,Ica Region,tannat,60,AR,B,...,0.0,0.1,0,0,294.970977,286.55646,304.60748,0.039163,0.000000,2021.0
665,-75.750000,-14.008333,18816,Americas,Peru,Ica Region,tannat,60,AR,B,...,0.0,0.1,0,0,294.268625,286.21810,304.25677,0.684856,0.000000,2021.0
665,-75.750000,-14.008333,18816,Americas,Peru,Ica Region,tannat,60,AR,B,...,0.0,0.1,0,0,295.222970,286.59293,305.37946,0.046065,0.000000,2021.0


In [220]:
len(data['Latitude']),len(wine_total_data['Latitude'])

(3439, 13054)

In [222]:
wine_total_data.to_csv('../Project_data/wine_total_data_0721.csv')

In [258]:
#토양 데이터 기준
#정수형 으로 변환

data['LatInt'] = data['Latitude'].astype(int)
data['LngInt'] = data['Longitude'].astype(int)

# Climate 데이터도 정수형으로 변환
climate['lat_int'] = climate['lat'].astype(int)
climate['lon_int'] = climate['lon'].astype(int)

# 데이터프레임 병합
merged_data_2 = pd.merge(
    data,
    climate,
    how='left',
    left_on=['LatInt', 'LngInt'],
    right_on=['lat_int', 'lon_int']
)

# 결과 확인
merged_data_2


Unnamed: 0,Longitude,Latitude,Soil Mapping Unit,Continent,Country,Region Lv1,Grape_Variety,SHARE_x,WRB4_x,KOPPEN,...,TG,TN,TX,RR,RR1,Year,lat_round,lon_round,lat_int,lon_int
0,-63.358333,-40.766667,12245,Americas,Argentina,Río Negro Province,malbec,60,FLeu,B,...,290.671964,278.31567,303.50006,14.568954,1.333333,2021,-41.0,-64.0,-40,-63
1,-63.358333,-40.766667,12245,Americas,Argentina,Río Negro Province,malbec,60,FLeu,B,...,290.655011,278.43283,303.47107,15.902047,1.428571,2021,-41.0,-63.0,-40,-63
2,-63.358333,-40.766667,12245,Americas,Argentina,Río Negro Province,malbec,60,FLeu,B,...,291.058975,278.14188,304.67847,13.090754,1.285714,2021,-40.0,-64.0,-40,-63
3,-63.358333,-40.766667,12245,Americas,Argentina,Río Negro Province,malbec,60,FLeu,B,...,290.893624,278.20532,304.39062,17.103797,1.500000,2021,-40.0,-63.0,-40,-63
4,-63.358333,-40.775000,12245,Americas,Argentina,Río Negro Province,malbec,60,FLeu,B,...,290.671964,278.31567,303.50006,14.568954,1.333333,2021,-41.0,-64.0,-40,-63
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13817,16.483333,47.833333,9993,Europe,Hungary,Balaton,white blend,60,CHha,D,...,288.641716,271.34473,301.28100,27.442335,4.714286,2021,48.0,17.0,47,16
13818,19.150000,46.391667,9768,Europe,Hungary,Duna,kadarka blau,50,PHcc,D,...,289.334150,270.87646,301.29350,24.005887,4.214286,2021,46.0,19.0,46,19
13819,19.150000,46.391667,9768,Europe,Hungary,Duna,kadarka blau,50,PHcc,D,...,289.495897,271.16970,301.07513,22.787967,4.261905,2021,46.0,20.0,46,19
13820,19.150000,46.391667,9768,Europe,Hungary,Duna,kadarka blau,50,PHcc,D,...,289.209322,270.88278,301.52680,22.684277,4.404762,2021,47.0,19.0,46,19


In [259]:
merged_data_2.to_csv('../Project_data/wine_total_data_0721.csv')