# 개요

# Prepare

# 1. 초기 세팅

## Library & data load

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
import re

In [5]:
# 보기 옵션
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

In [6]:
# 데이터 불러오기
# 2018~2019
df_lst = []
names = ['2018','201901_06','201907_12']
for name in names:
    address = "/Users/luci031/Desktop/Coding/g_auction/data_origin/auction_"+name+".csv"
    if name == '201907_12':
        a = pd.read_csv(address,encoding='euc_kr',sep=',')
    else:
        a = pd.read_csv(address,encoding='euc_kr',sep=';')
    df_lst.append(a)

# 2020~    
for year in range(2020,2023):
    for i in range(1,13):
        if i < 13:
            if i < 10:
                df_name = "auction_"+str(year)+"0"+str(i)
            else:
                df_name = "auction_"+str(year)+str(i)
        a = pd.read_csv("/Users/luci031/Desktop/Coding/g_auction/data_origin/"+df_name+".csv",encoding='euc_kr')
        df_lst.append(a)

    # 데이터 전처리
for df in df_lst:
    df.reset_index(inplace=True,drop=True)
    if 'Unnamed: 0' in df.columns:
        df.drop(columns=['Unnamed: 0'],inplace=True)

# 모든 데이터 통합
df = df_lst[0]
for dfs in df_lst[1:]:
    df = pd.concat([df,dfs])

df.reset_index(inplace=True,drop=True)

  a = pd.read_csv(address,encoding='euc_kr',sep=',')


# Pre-Processing

In [7]:
df

Unnamed: 0,품목명,단위,등급,가격,산지,친환경구분,입력일
0,[깻잎]깻잎(찹),2kg,특(1등),11500,대구,일반,20180622
1,[깻잎]깻잎(찹),4kg,특(1등),15500,대구,일반,20180622
2,[무]열무,8kg,특(1등),3300,경기 포천시,일반,20180622
3,[무]열무,4kg,특(1등),6900,경기 포천시,일반,20180622
4,[무]열무,4kg,특(1등),5300,경기 포천시,일반,20180622
...,...,...,...,...,...,...,...
29707989,[홍합국산]홍합국산,10kg,보통(3등),12000,"경남 마산(고성,진해)",일반,20221231
29707990,[홍합국산]홍합국산,10kg,보통(3등),9000,"경남 마산(고성,진해)",일반,20221231
29707991,[홍합국산]홍합국산,10kg,보통(3등),10000,"경남 마산(고성,진해)",일반,20221231
29707992,[홍합국산]홍합국산,10kg,보통(3등),8000,"경남 마산(고성,진해)",일반,20221231


In [8]:
# 결측치 확인
df.isnull().sum()

품목명           0
단위            0
등급            0
가격         4426
산지       100804
친환경구분         0
입력일           0
dtype: int64

- 결측치 처리

In [9]:
# 결측치 제거
df = df.dropna(axis=0)

- 영문명으로 전환

In [10]:
df = df.rename(columns = {'품목명':'prd',
                          '단위':'scale','등급':'class',
                          '가격':'price',
                          '산지':'origin',
                          '친환경구분':'eco',
                          '입력일':'reg_date'})

## column 별 정보

### class

In [11]:
# 등급명 변환
df111 = pd.DataFrame([['특(1등)', 1], ['상(2등)',2], ['보통(3등)',3], ['9등(등외)',9], ['없음',9], ['5등',5], ['4등',4], ['6등',6], ['7등',7],['8등',8]],columns=['class','new_class'])
df2 = df.merge(df111)
df2 = df2.drop(columns='class')

In [12]:
df2 = df2.rename(columns={'new_class':'class'})

In [13]:
df2['class'].value_counts()

1    22870269
2     2926103
3     1242318
9     1186171
4      128333
5      123751
6       38585
7       13562
8        3929
Name: class, dtype: int64

### eco

In [14]:
# 영문명으로 전환
# 전환기 -> 유기농 재배로 전환중
df2.loc[df2["eco"]=="일반","eco"] = "normal"
df2.loc[df2["eco"]=="우수농산물","eco"] = "good_prd"
df2.loc[df2["eco"]=="무농약","eco"] = "ecofriendly"
df2.loc[df2["eco"]=="유기농","eco"] = "organic"
df2.loc[df2["eco"]=="품질인증","eco"] = "certified"
df2.loc[df2["eco"]=="전환기","eco"] = "ing"
df2.loc[df2["eco"]=="저농약","eco"] = "low"
df2.loc[df2["eco"]=="산지안전성","eco"] = "safe"

In [15]:
df2['eco'].value_counts()

normal         25261802
good_prd        1915301
ecofriendly     1051712
certified        210780
ing               50144
organic           28524
low               14737
safe                 21
Name: eco, dtype: int64

### reg_date

In [16]:
df2['reg_date'] = df2['reg_date'].astype('str')
df2['reg_date'] = pd.to_datetime(df2['reg_date'].astype('str'),format='%Y%m%d')

### price

In [17]:
# 계산을 위해 scale 값 미리 변경
# 현재 값들은 string이므로 float 형태로 변환할 필요가 있음
# 몇몇 이상치 처리도 진행
def scale_checker(scale):
  if scale.startswith('.')==True:
    return format(float('0'+scale[:-2]),'.2f')
  else:
    return format(float((scale[:-2])),'.2f')
df2['scale'] = df2['scale'].apply(scale_checker)
df2['scale'] = df2['scale'].astype('float')
# df2['scale'] = format(round(df2['scale'].apply(pd.to_numeric),2),'.2f')

In [18]:
df2['price'] = pd.to_numeric(df2['price'])

In [19]:
df2[df2['price']<0]

Unnamed: 0,prd,scale,price,origin,eco,reg_date,class
8099973,[버섯]버섯(표고버섯),8.0,-56000,경기 연천군,normal,2020-09-29,1
8117334,[버섯]버섯(표고버섯),8.0,-56000,경기 연천군,normal,2020-09-30,1


In [20]:
# 1원짜리 거래 발견, 너무 거래값이 작은 값들은 삭제
pd.options.display.float_format = '{:.5f}'.format
df2['price'].describe()

count   28533021.00000
mean       23225.23175
std       137210.13061
min       -56000.00000
25%         7900.00000
50%        14000.00000
75%        25500.00000
max     99999999.00000
Name: price, dtype: float64

In [21]:
df2 = df2.drop(df2[df2['price']<500].index)
df2 = df2.drop(df2[df2['scale']==0].index)

- 품목명, 단위, 등급, 산지, 친환경구분, 입력일 일치하는 값들 거래 합치기

In [22]:
df2

Unnamed: 0,prd,scale,price,origin,eco,reg_date,class
0,[깻잎]깻잎(찹),2.00000,11500,대구,normal,2018-06-22,1
1,[깻잎]깻잎(찹),4.00000,15500,대구,normal,2018-06-22,1
2,[무]열무,8.00000,3300,경기 포천시,normal,2018-06-22,1
3,[무]열무,4.00000,6900,경기 포천시,normal,2018-06-22,1
4,[무]열무,4.00000,5300,경기 포천시,normal,2018-06-22,1
...,...,...,...,...,...,...,...
28533016,[호박]애호박,8.00000,10000,경상남도 진주시,normal,2022-12-31,4
28533017,[호박]애호박,8.00000,6000,경상남도 진주시,normal,2022-12-31,4
28533018,[호박]애호박,8.00000,16500,경상남도 진주시,normal,2022-12-31,4
28533019,[호박]애호박,8.00000,6000,경상남도 진주시,normal,2022-12-31,4


In [23]:
df2 = df2.groupby(['prd','origin','eco','reg_date','class']).sum().reset_index()

In [24]:
# kg 당 가격 생성
df2['price_kg'] = round((df2['price']/df2['scale']),2)

### prd
- 품목명 정리
- 품목 종류 정리

In [28]:
df2['price_kg'].describe()

count   4753344.00000
mean       3553.08898
std        8192.83400
min           0.08000
25%        1258.93000
50%        2272.32000
75%        4083.33000
max     7757360.00000
Name: price_kg, dtype: float64

In [29]:
# 대부분 [상품 종류]상품명 으로 정리되어 있는걸 알 수 있음
df2['prd'].value_counts()

[오이]백다다기       89464
[호박]애호박        63466
[가지]가지         58735
[버섯]새송이        42092
[딸기]설향         37813
               ...  
[미나리]단미나리          1
[포도]다노렛(포도)        1
[포도]다노렛            1
[버섯]목이수입           1
[땅콩]국산땅콩           1
Name: prd, Length: 3702, dtype: int64

In [30]:
# 정리를 위해 상품명 삭제
def prd_sort(words):
  p = re.compile('[ㄱ-힣]+')
  word = re.search(p,words)
  return word.group()

df2['prd'] = df2['prd'].apply(prd_sort)

In [31]:
# 기타 품목 삭제
df2 = df2.drop(df2[df2['prd']=='기타'].index)

In [32]:
# 국산,수입산,깐,기타 삭제
def prd_sort2(words):
  p = re.compile('[^국산수입깐기타]+')
  word = re.search(p,str(words))
  return word.group()

df2["prd"] = df2["prd"].apply(prd_sort2)


In [33]:
###
df2.loc[df2["prd"]=="감말랭이","prd"] = "감"
df2.loc[df2["prd"]=="떫은감","prd"] = "감"

# 고 -> 고수로 변경
df2.loc[df2["prd"]=="고","prd"] = "고수"
# 과실류, 과일류, 과채, 채소 -> 과채류로 통합
df2.loc[df2["prd"]=="과실류","prd"] = "과채류"
df2.loc[df2["prd"]=="과일류","prd"] = "과채류"
df2.loc[df2["prd"]=="과일야채류","prd"] = "과채류"
df2.loc[df2["prd"]=="과일과채류","prd"] = "과채류"
df2.loc[df2["prd"]=="과일","prd"] = "과채류"
df2.loc[df2["prd"]=="과채","prd"] = "과채류"
df2.loc[df2["prd"]=="채소","prd"] = "과채류"
df2.loc[df2["prd"]=="건채류","prd"] = "과채류"
df2.loc[df2["prd"]=="근채류","prd"] = "과채류"
df2.loc[df2["prd"]=="쌈용채소","prd"] = "과채류"
df2.loc[df2["prd"]=="쌈채류","prd"] = "과채류"
df2.loc[df2["prd"]=="양채류","prd"] = "과채류"
df2.loc[df2["prd"]=="엽경채류","prd"] = "과채류"
df2.loc[df2["prd"]=="채류","prd"] = "과채류"
df2.loc[df2["prd"]=="채소류","prd"] = "과채류"
# 고들빼 -> 고들빼기로 변경
df2.loc[df2["prd"]=="고들빼","prd"] = "고들빼기"

df2.loc[df2["prd"]=="금감","prd"] = "귤"
df2.loc[df2["prd"]=="만감","prd"] = "귤"

df2.loc[df2["prd"]=="검복","prd"] = "복어"
# 꼴뚜 -> 꼴두기로 변경
df2.loc[df2["prd"]=="꼴뚜","prd"] = "꼴뚜기"
# 그레이프푸룻 -> 자몽으로 통합
df2.loc[df2["prd"]=="그레이프푸룻","prd"] = "자몽"

# 느 -> 느타리버섯으로 통합
df2.loc[df2["prd"]=="느","prd"] = "느타리버섯"

# 대구 관련 품목들 정리
df2.loc[df2["prd"]=="대구고니","prd"] = "대구"
df2.loc[df2["prd"]=="대구머리","prd"] = "대구"
df2.loc[df2["prd"]=="대구포","prd"] = "대구"
df2.loc[df2["prd"]=="대구알","prd"] = "대구"
df2.loc[df2["prd"]=="대구원양","prd"] = "대구"

# 듀리안 -> 두리안으로 통합
df2.loc[df2["prd"]=="듀리안","prd"] = "두리안"
# 딸 -> 딸기로 변경
df2.loc[df2["prd"]=="딸","prd"] = "딸기"

# 로케트 -> 루꼴라로 변경
df2.loc[df2["prd"]=="로케트","prd"] = "루꼴라"

# 만가닥 -> 만가닥버섯으로 변경
df2.loc[df2["prd"]=="만가닥","prd"] = "만가닥버섯"
# 맛 -> 맛조개로 변경
df2.loc[df2["prd"]=="맛","prd"] = "맛조개"
# 망고스턴 -> 망고스틴으로 수정
df2.loc[df2["prd"]=="망고스턴","prd"] = "망고스틴"
# 메 -> 메기로 변경
df2.loc[df2["prd"]=="메","prd"] = "메기"
# 명태 관련 품목들 정리
df2.loc[df2["prd"]=="명태곤니","prd"] = "명태"
df2.loc[df2["prd"]=="명태알","prd"] = "명태"
df2.loc[df2["prd"]=="명태포","prd"] = "명태"
df2.loc[df2["prd"]=="코다리명태","prd"] = "명태"
# 무우 -> 무로 통합
df2.loc[df2["prd"]=="무우","prd"] = "무"

# 방풍 -> 방풍나물로 통합
df2.loc[df2["prd"]=="방풍","prd"] = "방풍나물"
# 버섯류 -> 버섯으로 변경
df2.loc[df2["prd"]=="버섯류","prd"] = "버섯"
# 벗,버찌 -> 체리로 통합
df2.loc[df2["prd"]=="벗","prd"] = "체리"
df2.loc[df2["prd"]=="버찌","prd"] = "체리"
df2.loc[df2["prd"]=="벗찌","prd"] = "체리"
# 브로커리, 브로코리 -> 브로콜리로 통합
df2.loc[df2["prd"]=="브로커리","prd"] = "브로콜리"
df2.loc[df2["prd"]=="브로코리","prd"] = "브로콜리"

# 삼 -> 수삼으로 변경
df2.loc[df2["prd"]=="삼","prd"] = "수삼"
# 새우살 -> 새우로 변경
df2.loc[df2["prd"]=="새우살","prd"] = "새우"
df2.loc[df2["prd"]=="남빙양새우","prd"] = "새우"
df2.loc[df2["prd"]=="닭새우","prd"] = "새우"
df2.loc[df2["prd"]=="대하","prd"] = "새우"
df2.loc[df2["prd"]=="동백하새우","prd"] = "새우"
df2.loc[df2["prd"]=="백새우","prd"] = "새우"
df2.loc[df2["prd"]=="보리새우","prd"] = "새우"
df2.loc[df2["prd"]=="적새우","prd"] = "새우"
df2.loc[df2["prd"]=="젓새우","prd"] = "새우"
df2.loc[df2["prd"]=="중하","prd"] = "새우"
# 세러리, 셀러리 -> 샐러리로 통합
df2.loc[df2["prd"]=="세러리","prd"] = "샐러리"
df2.loc[df2["prd"]=="셀러리","prd"] = "샐러리"
# 생고추 -> 고추로 통합
df2.loc[df2["prd"]=="생고추","prd"] = "고추"
df2.loc[df2["prd"]=="건고추","prd"] = "고추"
df2.loc[df2["prd"]=="고추잎","prd"] = "고추"
df2.loc[df2["prd"]=="붉은고추","prd"] = "고추"
df2.loc[df2["prd"]=="풋고추","prd"] = "고추"

# 아보카드 -> 아보카도로 변경
df2.loc[df2["prd"]=="아보카드","prd"] = "아보카도"
# 아스파라가스 -> 아스파라거스로 변경
df2.loc[df2["prd"]=="아스파라가스","prd"] = "아스파라거스"
# 알 -> 알타리무로 변경
df2.loc[df2["prd"]=="알","prd"] = "알타리무"
# 알배 -> 알배기로 변경
df2.loc[df2["prd"]=="알배","prd"] = "알배기"
# 양채 -> 양채류로 통합
df2.loc[df2["prd"]=="양채","prd"] = "양채류"
# 얼갈이 -> 얼갈이배추로 통합
df2.loc[df2["prd"]=="얼갈이","prd"] = "얼갈이배추"
# 엽채류, 엽채 -> 엽경채류로 통합
df2.loc[df2["prd"]=="엽채류","prd"] = "엽경채류"
df2.loc[df2["prd"]=="엽채","prd"] = "엽경채류"
# 오징어 관련 품목들 정리
df2.loc[df2["prd"]=="오징어다리","prd"] = "오징어"
df2.loc[df2["prd"]=="오징어원양","prd"] = "오징어"
df2.loc[df2["prd"]=="오징어채","prd"] = "오징어"
df2.loc[df2["prd"]=="활복오징어","prd"] = "오징어"
df2.loc[df2["prd"]=="오징어채원양","prd"] = "오징어"
df2.loc[df2["prd"]=="갑오징어살","prd"] = "오징어"
df2.loc[df2["prd"]=="갑오징어몸통","prd"] = "오징어"
df2.loc[df2["prd"]=="갑오징어다리","prd"] = "오징어"
df2.loc[df2["prd"]=="갑오징어","prd"] = "오징어"
df2.loc[df2["prd"]=="활복오징어원양","prd"] = "오징어"
df2.loc[df2["prd"]=="오징어살","prd"] = "오징어"
# 옥, 풋옥 -> 옥수수로 정리
df2.loc[df2["prd"]=="옥","prd"] = "옥수수"
df2.loc[df2["prd"]=="풋옥","prd"] = "옥수수"
# 임연 -> 임연수어로 정리
df2.loc[df2["prd"]=="임연","prd"] = "임연수어"


# 절단 -> 절단쭈꾸미 -> 쭈꾸미로 정리
df2.loc[df2["prd"]=="절단","prd"] = "쭈꾸미"
# 조개살 -> 조개로 정리
df2.loc[df2["prd"]=="조개살","prd"] = "조개"
df2.loc[df2["prd"]=="민들조개","prd"] = "조개"
df2.loc[df2["prd"]=="북방대합조개","prd"] = "조개"

# 참조 -> 참조기로 정리
df2.loc[df2["prd"]=="참조","prd"] = "참조기"

# 칼라후라워,칼리플라워, 칼리후라워 -> 컬리플라워로 통합
df2.loc[df2["prd"]=="칼라후라워","prd"] = "컬리플라워"
df2.loc[df2["prd"]=="칼리플라워","prd"] = "컬리플라워"
df2.loc[df2["prd"]=="칼리후라워","prd"] = "컬리플라워"
# 코라비 -> 콜라비로 통합
df2.loc[df2["prd"]=="코라비","prd"] = "콜라비"
# 키조개날개 -> 키조개로 통합
df2.loc[df2["prd"]=="키조개날개","prd"] = "키조개"

# 팽이 -> 팽이버섯으로 통합
df2.loc[df2["prd"]=="팽이","prd"] = "팽이버섯"
# 파세리 -> 파슬리로 변경
df2.loc[df2["prd"]=="파세리","prd"] = "파슬리"
# 표고 -> 표고버섯으로 통합
df2.loc[df2["prd"]=="표고","prd"] = "표고버섯"
# 포장바지락 -> 바지락으로 변경
df2.loc[df2["prd"]=="포장바지락","prd"] = "바지락"
df2.loc[df2["prd"]=="문어바지락","prd"] = "바지락"

### 일단 여기까지 실행

df2.loc[df2["prd"]=="갈치포","prd"] = "갈치"

#각굴 -> 굴로 통합
df2.loc[df2["prd"]=="각굴","prd"] = "굴"

#가오리채 -> 가오리로 통합
df2.loc[df2["prd"]=="가오리채","prd"] = "가오리"
df2.loc[df2["prd"]=="가오리포","prd"] = "가오리"
df2.loc[df2["prd"]=="간재미","prd"] = "가오리"
# 게지 -> 키조개로 통합
df2.loc[df2["prd"]=="게지","prd"] = "키조개"
#고 -> 고구마로 통합
df2.loc[df2["prd"]=="고","prd"] = "고구마"
df2.loc[df2["prd"]=="고구마순","prd"] = "고구마"
#꼴뚜 -> 꼴뚜기로 통합
df2.loc[df2["prd"]=="꼴뚜","prd"] = "꼴뚜기"

df2.loc[df2["prd"]=="꽁치원양","prd"] = "꽁치"

df2.loc[df2["prd"]=="노랑가오리","prd"] = "가오리"

df2.loc[df2["prd"]=="놀래","prd"] = "노래미"

df2.loc[df2["prd"]=="능성어원양","prd"] = "능성어"

df2.loc[df2["prd"]=="다래순","prd"] = "다래"

df2.loc[df2["prd"]=="다슬","prd"] = "다슬기"
#돗나물 -> 돌나물로 통합
df2.loc[df2["prd"]=="돗나물","prd"] = "돌나물"

df2.loc[df2["prd"]=="동죽살","prd"] = "동죽"
#레드쉬 -> 레디쉬로 통합
df2.loc[df2["prd"]=="레드쉬","prd"] = "레디쉬"
#만가닥 -> 만가닥버섯으로 통합
df2.loc[df2["prd"]=="만가닥","prd"] = "만가닥버섯"

df2.loc[df2["prd"]=="머위대","prd"] = "머위"
#메 -> 메기로 통합
df2.loc[df2["prd"]=="메","prd"] = "메기"

df2.loc[df2["prd"]=="민물새우원양","prd"] = "민물새우"
#봄동 -> 봄동배추로 통합
df2.loc[df2["prd"]=="봄동","prd"] = "봄동배추"
#봉지굴 -> 굴로 통합
df2.loc[df2["prd"]=="봉지굴","prd"] = "굴"
#박 -> 수박으로 통합
df2.loc[df2["prd"]=="박","prd"] = "수박"
#백조 -> 조기로 통합 
df2.loc[df2["prd"]=="백조","prd"] = "조기"
#벅굴 -> 벚굴로 통합
df2.loc[df2["prd"]=="벅굴","prd"] = "벚굴"

df2.loc[df2["prd"]=="병어살","prd"] = "병어"

df2.loc[df2["prd"]=="보리멸원양","prd"] = "보리멸"

df2.loc[df2["prd"]=="부지갱이","prd"] = "부지깽이"

#비 -> 비타민으로 통합
df2.loc[df2["prd"]=="비","prd"] = "비타민"
#빈스 -> 콩으로 통합
df2.loc[df2["prd"]=="빈스","prd"] = "콩"
#삐틀이 -> 고둥으로 통합
df2.loc[df2["prd"]=="삐틀이","prd"] = "고둥"
#새송이 -> 새송이버섯으로 통합
df2.loc[df2["prd"]=="새송이","prd"] = "새송이버섯"

df2.loc[df2["prd"]=="새싹","prd"] = "새싹채소"

df2.loc[df2["prd"]=="새치","prd"] = "황새치"
#세트 -> 과채류로 통합
df2.loc[df2["prd"]=="세트","prd"] = "과채류"
#알 -> 알로애로 통합
df2.loc[df2["prd"]=="알","prd"] = "알로에"

df2.loc[df2["prd"]=="양조","prd"] = "양조기"
#엄나무 -> 음나무로 통합
df2.loc[df2["prd"]=="엄나무","prd"] = "음나무"
#염고등어 -> 고등어로 통합
df2.loc[df2["prd"]=="염고등어","prd"] = "고등어"
#오만둥이 -> 미더덕으로 통합
df2.loc[df2["prd"]=="오만둥이","prd"] = "미더덕"
#오징어 관련 품목들 정리
df2.loc[df2["prd"]=="선동오징어","prd"] = "오징어"
df2.loc[df2["prd"]=="오징어몸통","prd"] = "오징어"
#우렁쉥이 -> 멍게로 통합
df2.loc[df2["prd"]=="우렁쉥이","prd"] = "멍게"
#우뭇가시리 -> 우뭇가사리로 통합
df2.loc[df2["prd"]=="우뭇가시리","prd"] = "우뭇가사리"
#적채 -> 적양배추로 통합
df2.loc[df2["prd"]=="적채","prd"] = "적양배추"
df2.loc[df2["prd"]=="빨간양배추","prd"] = "적양배추"
#절단낙지 -> 낙지로 통합
df2.loc[df2["prd"]=="절단낙지","prd"] = "낙지"
#조 -> 조개로 통합
df2.loc[df2["prd"]=="조","prd"] = "조개"
df2.loc[df2["prd"]=="개량조개","prd"] = "조개"
#청어원양 -> 청어로 통합
df2.loc[df2["prd"]=="청어원양","prd"] = "청어"
#키조개꼭지 -> 키조개로 통합
df2.loc[df2["prd"]=="키조개꼭지","prd"] = "키조개"

# 꽃게 관련 꽃게로 통합
df2.loc[df2["prd"]=="암꽃게","prd"] = "꽃게"
df2.loc[df2["prd"]=="숫꽃게","prd"] = "꽃게"
df2.loc[df2["prd"]=="꽃게살","prd"] = "꽃게"
# 우렁 -> 우렁이로 통합
df2.loc[df2["prd"]=="우렁","prd"] = "우렁이"
df2.loc[df2["prd"]=="논우렁이","prd"] = "우렁이"
# 알로애 -> 알로에로 변경
df2.loc[df2["prd"]=="알로애","prd"] = "알로에"

df2.loc[df2["prd"]=="영지버섯","prd"] = "영지"

df2.loc[df2["prd"]=="자연","prd"] = "송이"

df2.loc[df2["prd"]=="적근대","prd"] = "근대"

df2.loc[df2["prd"]=="적겨자","prd"] = "겨자"

df2.loc[df2["prd"]=="줄삼치","prd"] = "삼치"

df2.loc[df2["prd"]=="참꼬막","prd"] = "꼬막"

df2.loc[df2["prd"]=="참다래","prd"] = "다래"

df2.loc[df2["prd"]=="참당귀","prd"] = "당귀"

df2.loc[df2["prd"]=="참조기","prd"] = "조기"

df2.loc[df2["prd"]=="청겨자","prd"] = "겨자"
# 피마자잎 -> 피마자로 변경
df2.loc[df2["prd"]=="피미자잎","prd"] = "피마자"
df2.loc[df2["prd"]=="피마자잎","prd"] = "피마자"
# 식용허브 -> 허브로 변경
df2.loc[df2["prd"]=="식용허브","prd"] = "허브"
# 선인장열매 -> 선인장으로 변경
df2.loc[df2["prd"]=="선인장열매","prd"] = "선인장"
# 쌈추 -> 쌈채류로 변경
df2.loc[df2["prd"]=="쌈추","prd"] = "쌈채류"
# 덟은감 -> 감으로 변경
df2.loc[df2["prd"]=="덟은감","prd"] = "감"

df2.loc[df2["prd"]=="파래김","prd"] = "파래"

df2.loc[df2["prd"]=="패션푸룻","prd"] = "패션후르츠"
# 피뿔고동 -> 고둥으로 변경
df2.loc[df2["prd"]=="피뿔고동","prd"] = "고둥"
# 꼬시래 -> 꼬시래기으로 변경
df2.loc[df2["prd"]=="꼬시래","prd"] = "꼬시래기"
# 연어살 -> 연어로 변경
df2.loc[df2["prd"]=="연어살","prd"] = "연어"
# 비단멍게 -> 멍게로 변경
df2.loc[df2["prd"]=="비단멍게","prd"] = "멍게"
# 물가자미 -> 가자미로 변경
df2.loc[df2["prd"]=="물가자미","prd"] = "가자미"
# 물메 -> 메기로 변경
df2.loc[df2["prd"]=="물메","prd"] = "메기"
# 붉은메 -> 메기로 변경
df2.loc[df2["prd"]=="붉은메","prd"] = "메기"
# 무우 -> 무로 통합
df2.loc[df2["prd"]=="무우","prd"] = "무"

#각굴 -> 굴로 통합
df2.loc[df2["prd"]=="각굴","prd"] = "굴"
df2.loc[df2["prd"]=="봉지굴","prd"] = "굴"
df2.loc[df2["prd"]=="벅굴","prd"] = "굴"

# 배추 관련 품목들 정리
df2.loc[df2["prd"]=="봄동배추","prd"] = "배추"
df2.loc[df2["prd"]=="얼갈이배추","prd"] = "배추"
df2.loc[df2['prd'].str.contains('알배'),"prd"] = "배추"

# 조미제품 삭제
df2.drop(df2[df2['prd']=='조미제품'].index,inplace=True)
df2.drop(df2[df2['prd']=='어류원양'].index,inplace=True)
df2.drop(df2[df2['prd']=='동물'].index,inplace=True)
df2.drop(df2[df2['prd']=='깔게'].index,inplace=True)
df2.drop(df2[df2['prd']=='맛살'].index,inplace=True)
df2.drop(df2[df2['prd']=='서류'].index,inplace=True)
df2.drop(df2[df2['prd']=='시바'].index,inplace=True)
df2.drop(df2[df2['prd']=='양조'].index,inplace=True)
df2.drop(df2[df2['prd']=='어류'].index,inplace=True)
df2.drop(df2[df2['prd']=='절임'].index,inplace=True)
df2.drop(df2[df2['prd']=='춘채'].index,inplace=True)
df2.drop(df2[df2['prd']=='패류'].index,inplace=True)
df2.drop(df2[df2['prd'].str.contains('조미')].index,inplace=True)

In [34]:
#df[df['prd'].str.contains('패류')]#['prd'].unique()

In [35]:
a = sorted(df2['prd'].unique())
for i in range(len(a)):
  if i%15 != 0:
    print(a[i],end=' ')
  else:
    print(a[i],end=' ')
    print()

가리비 
가물치 가오리 가자미 가재 가죽나물 가지 갈치 감 감귤 감자 갓 강낭콩 강달이 개불 개암 
개조개 갯가재 갯장어 건고구마순 게 겨자 고구마 고니 고둥 고들빼기 고등어 고비 고사리 고수 고추 
곤달비 곤드레나물 골뱅이 곶감 과채류 굴 귤 그린빈스 근대 김 깻잎 꼬막 꼬시래기 꼴뚜기 꽁치 
꽃게 꽈리고추 낙지 날치알 냉이 넙치 노래미 노루궁뎅이버섯 농어 눈볼대 느타리버섯 능성어 다랑어 다래 다슬기 
다시마 단감 달래 당귀 당근 대구 대추 대파 더덕 도다리 도라지 도루묵 도토리 돈나물 돌게 
돌나물 돔 동부콩 동자개 동죽 두류 두릅 두리안 딸기 땅콩 라임 람부탄 레디쉬 레몬 루꼴라 
리치 마 마늘 만가닥버섯 맛조개 망고 망고스틴 망둥어 매생이 매실 머루 머위 멍게 메기 메론 
메밀 멸치 명이나물 명태 모과 모시조개 모자반 목이 무 무순 무청 무화과 문어 미꾸라지 미나리 
미더덕 미역 민들레 민물돔 민물새우 민어 바나나 바다가재 바지락 박대 밤 방어 방울토마토 방풍나물 배 
배추 백합 밴댕이 뱀장어 뱅어 버섯 벚굴 병어 보리 보리멸 복분자 복숭아 복어 볼락 부세 
부지깽이 부추 붕어 붕장어 브로콜리 블루베리 비단조개 비름 비타민 비트 빙어 사과 살구 삼나물 삼치 
상어 상추 상황버섯 새꼬막 새송이버섯 새싹채소 새우 새조개 샐러리 생강 생채 서대 석류 선인장 세발나물 
소라 속새 솔잎 송이 수박 수삼 숙주나물 순무 시금치 신선초 실파 쌈채류 쑥 쑥갓 씀바귀 
아귀 아로니아 아보카도 아스파라거스 아욱 알로에 알타리무 앵두 야자 야콘 양미리 양배추 양상추 양송이 양조기 
양채류 양태 양파 여주 연근 연어 열무 엽경채류 영지 오가피 오디 오렌지 오이 오징어 오징어알 
오크라 옥돔 옥수수 완두콩 용과 우럭조개 우렁이 우뭇가사리 우엉 우엉대 원추리 위소라 유자 유채 으름 
은어 은행 음나무 임연수어 잉어 자두 자라 자리돔 자몽 잔대 잣 장어 재첩 적양배추 전갱이 
전복 전어 조개 조기 조피볼락 죽순 죽합 준치 질경이 쪽파 쭈꾸미 참깨 참나물 참돔 참외 
참죽나무순 천도

In [36]:
# 기존 products 리스트와 비교
ex_prd = pd.read_csv("../products.csv")
df2[~df2['prd'].isin(ex_prd['prd'])]['prd'].unique()

array([], dtype=object)

In [37]:
new_prd = pd.DataFrame(df2['prd'].unique(),columns=['prd'])
products = ex_prd.append(new_prd[~new_prd['prd'].isin(ex_prd['prd'])],ignore_index=True)
products = ex_prd.append(new_prd,ignore_index=True)

  products = ex_prd.append(new_prd[~new_prd['prd'].isin(ex_prd['prd'])],ignore_index=True)
  products = ex_prd.append(new_prd,ignore_index=True)


In [38]:
# 저장
products.to_csv('../products.csv')

In [39]:
# 전체 거래량과 품목량
# 비교를 위해 분포 파악
# 거래량이 너무 적은 품목은 삭제할 필요성이 있음

prd_lst = (df2['prd'].value_counts()).to_frame()
prd_lst.describe()

Unnamed: 0,prd
count,329.0
mean,14444.06687
std,42433.35395
min,1.0
25%,45.0
50%,786.0
75%,6680.0
max,395206.0


In [40]:
# 500개 미만 품목은 비교가 힘드므로 삭제
# 줄어든 종류만 남기기

# prd_lst = prd_lst[prd_lst['prd']>=500]
# prd_lst.reset_index(inplace=True)
# prd_lst = prd_lst.rename(columns = {'index':'prd','prd':'count'})
# df3 = df2.merge(prd_lst)
# df3[df3['count']>=500]['prd'].value_counts()

In [41]:
# 확인 완료했으므로 count는 drop

# df3.drop('count',axis=1,inplace=True)

### scale

In [42]:
df3 = df2
np.set_printoptions(precision=2, suppress=True)
df3['scale'].unique()

array([ 10.  ,   5.  ,  20.  , ..., 450.01,  49.19,  26.85])

In [43]:
df3[df3['scale']>=50]

Unnamed: 0,prd,origin,eco,reg_date,class,scale,price,price_kg
416,가리비,경남 남해\t,normal,2022-10-14,3,50.00000,227000,4540.00000
420,가리비,경남 남해\t,normal,2022-10-21,3,55.00000,250000,4545.45000
421,가리비,경남 남해\t,normal,2022-10-22,3,50.00000,225000,4500.00000
427,가리비,경남 남해\t,normal,2022-10-29,3,50.00000,225000,4500.00000
428,가리비,경남 남해\t,normal,2022-10-31,3,60.00000,273000,4550.00000
...,...,...,...,...,...,...,...,...
4752509,홍합,전남 여수\t,normal,2022-12-27,3,150.00000,153000,1020.00000
4752510,홍합,전남 여수\t,normal,2022-12-28,3,140.00000,139000,992.86000
4752511,홍합,전남 여수\t,normal,2022-12-29,3,180.00000,182500,1013.89000
4752512,홍합,전남 여수\t,normal,2022-12-30,3,130.00000,135000,1038.46000


In [44]:
# 50kg 이상 거래된 건들 이상치라 판단하고 확인 진행
df3[df3['scale']>=50]['prd'].unique()

array(['가리비', '가자미', '가죽나물', '가지', '굴', '갈치', '감', '감귤', '감자', '오징어', '갓',
       '강낭콩', '개조개', '갯가재', '건고구마순', '고추', '과채류', '게', '키조개', '고구마', '고둥',
       '고들빼기', '고등어', '고사리', '곤달비', '곶감', '귤', '자몽', '근대', '두류', '버섯',
       '새우', '양채류', '엽경채류', '바지락', '홍합', '깻잎', '꼬막', '꽁치', '꽈리고추', '낙지',
       '날치알', '냉이', '우렁이', '눈볼대', '느타리버섯', '다래', '단감', '달래', '당근', '대구',
       '대추', '대파', '더덕', '도라지', '도토리', '돌나물', '동죽', '두릅', '두리안', '딸기',
       '땅콩', '레몬', '마', '마늘', '망고', '매생이', '매실', '머위', '메론', '명태', '모과',
       '모시조개', '무', '무청', '무화과', '메기', '미나리', '미더덕', '민들레', '바나나', '수박',
       '밤', '방울토마토', '방풍나물', '배', '배추', '백합', '뱀장어', '체리', '병어', '복숭아',
       '부세', '부추', '브로콜리', '블루베리', '비름', '비트', '콩', '적양배추', '사과', '살구',
       '삼치', '상추', '새꼬막', '새송이버섯', '새조개', '생강', '석류', '샐러리', '소라', '속새',
       '수삼', '숙주나물', '순무', '꽃게', '시금치', '실파', '쑥', '쑥갓', '아귀', '아보카도',
       '아스파라거스', '아욱', '알타리무', '앵두', '양배추', '양상추', '양송이', '양파', '연근',
       '연어', '열무', '오디', '오렌지', '오이', '옥수수', '완두콩', '멍게', 

In [45]:
# 배추
# display(df3[(df3['prd']=='배추')]['price_kg'].describe())
# display(df3[(df3['scale']>=50) & (df3['prd']=='배추')]['price_kg'].describe())
# display(df3[(df3['scale']>=50) & (df3['prd']=='배추')]['scale'].sum())
# display(df3[(df3['prd']=='배추')]['scale'].sum())
# df3 = df3.drop(df3[(df3['scale']>=50)&(df3['prd']=='배추')].index)

In [46]:
# 중간 저장
address2 = "/Users/luci031/Desktop/Coding/g_auction/data_ingredients/auction_mid2.parquet"
df3.to_parquet(address2,engine="pyarrow", compression='gzip')
# df3.to_csv("/Users/luci031/Desktop/Coding/g_auction/data_proceed/auction_total.csv")

# 여기서 다시 시작

In [47]:
address2 = "/Users/luci031/Desktop/Coding/g_auction/data_ingredients/auction_mid2.parquet"
df3 = pd.read_parquet(address2)

### origin

In [48]:
df3

Unnamed: 0,prd,origin,eco,reg_date,class,scale,price,price_kg
0,가리비,경남 거제(장승포),normal,2018-10-11,3,10.00000,19000,1900.00000
1,가리비,경남 거제(장승포),normal,2018-10-12,3,5.00000,20000,4000.00000
2,가리비,경남 거제(장승포),normal,2018-10-15,3,5.00000,20000,4000.00000
3,가리비,경남 거제(장승포),normal,2018-10-16,3,5.00000,19000,3800.00000
4,가리비,경남 거제(장승포),normal,2018-10-22,3,5.00000,18000,3600.00000
...,...,...,...,...,...,...,...,...
4753339,황강달이,전남 신안,normal,2021-05-26,3,20.00000,47000,2350.00000
4753340,황강달이,전남 신안,normal,2021-05-27,3,40.00000,94000,2350.00000
4753341,황새치,전북 군산\t,normal,2022-06-09,3,4.00000,6000,1500.00000
4753342,흑돔,국내산,normal,2021-03-30,3,10.00000,26700,2670.00000


In [49]:
# 도,시, 국산, 수입 표기 분류 필요
df3['origin'].unique()

array(['경남 거제(장승포)', '경남 남해', '경남 남해\t', ..., '충북 보은군 수한면', '전남 순천시 황전면',
       '충남 서천군 문산면'], dtype=object)

In [50]:
a = pd.DataFrame(df3['origin'].unique())

In [51]:
splits = a[0].str.split(' ')
cacul = splits.apply(lambda x: pd.Series(x))
df66 = pd.merge(a,cacul, left_index=True, right_index=True, how='inner')
df66

Unnamed: 0,0_x,0_y,1,2,3
0,경남 거제(장승포),경남,거제(장승포),,
1,경남 남해,경남,남해,,
2,경남 남해\t,경남,남해\t,,
3,"경남 마산(고성,진해)",경남,"마산(고성,진해)",,
4,경남 사천,경남,사천,,
...,...,...,...,...,...
1677,경남 의령군 용덕면,경남,의령군,용덕면,
1678,충남 부여군 홍산면,충남,부여군,홍산면,
1679,충북 보은군 수한면,충북,보은군,수한면,
1680,전남 순천시 황전면,전남,순천시,황전면,


In [52]:
# 세부 카테고리 정리 (state, city 편집 시 여기서 시작)

df4 = df66.rename(columns = {'0_x':'origin','0_y':'state',1:'city'})

# 2,3은 필요없는 정보이므로 삭제
df4 = df4.drop(columns=[2,3])

In [53]:
df4.head(1)

Unnamed: 0,origin,state,city
0,경남 거제(장승포),경남,거제(장승포)


In [54]:
df4['state'].unique()

array(['경남', '국내산', '국내산\t', '전남', '전북', '수입', '제주', '충남', '강원', '경기',
       '경북', '서울', '전라북도', '강원도', '경기도', '경상남도', '경상북도', '광주', '광주광역시',
       '광주시', '대구', '대구광역시', '대전', '부산', '서울시', '성남시', '세종', '전주시', '충북',
       '충청남도', '인천', '전라남도', '수입남아프리카공화국', '충청북도', '제주도', '제주자치도', '수입산',
       '제주특별자치', '서울특별시', '제주시', '제주특별자', '제주특별자치도', '미국', '대전광역시',
       '제주/서귀포', '대구시', '부산시', '인천시', '가락동', '오스트레일리아', '호주', '세종시', '중국',
       '공통출하처', '없음', '청주시', '인천광역시', '대전시', '서해안', '미등록', '남해안', '세종자치시',
       '멕시코', '베트남', '태국', '필리핀', '칠레', '남아프리카', '이스라엘', '페루', '해외',
       '서귀포시', '미국(US)', '불가리아', '인도네시아', '타이', '울산', '뉴질랜드', '브라질',
       '우즈베키스탄', '기타국', '이란', '남아프리카공화국', '러시아', '북한산', '대만', '포항시',
       '원양산', '그리스', '네델란드', '이탈리아', '기타외국', '부산광역시', '원양산(원양산)',
       '중국(CN)', '전북전주시완산구', '충북청주시상당구', '경기고양시덕양구', '경기고양시일산구',
       '경기성남시수정구', '충북청주시흥덕구', '이집트', '에콰도르', '캘리포니아', '국산', '해당사항없',
       '경남/남해군', '전남/고흥군', '충남/홍성군', '오스트레일', '인도', '인디아', '캄보디아',
       '필리핀(PH)'

In [55]:
# 시,도 제거
def name_sort(words):
  p = re.compile('[^시도]+')
  word = re.search(p,str(words))
  return word.group()

df4["state"] = df4["state"].apply(name_sort)

# 오타도 제거

def name_sort2(words):
  p = re.compile('[^\t]+')
  word = re.search(p,words)
  return word.group()

df4["state"] = df4["state"].apply(name_sort2)

In [56]:
df4["state"].unique()

array(['경남', '국내산', '전남', '전북', '수입', '제주', '충남', '강원', '경기', '경북', '서울',
       '전라북', '경상남', '경상북', '광주', '광주광역', '대구', '대구광역', '대전', '부산', '성남',
       '세종', '전주', '충북', '충청남', '인천', '전라남', '수입남아프리카공화국', '충청북', '제주자치',
       '수입산', '제주특별자치', '서울특별', '제주특별자', '미국', '대전광역', '제주/서귀포', '가락동',
       '오스트레일리아', '호주', '중국', '공통출하처', '없음', '청주', '인천광역', '서해안', '미등록',
       '남해안', '세종자치', '멕', '베트남', '태국', '필리핀', '칠레', '남아프리카', '이스라엘',
       '페루', '해외', '서귀포', '미국(US)', '불가리아', '인', '타이', '울산', '뉴질랜드',
       '브라질', '우즈베키스탄', '기타국', '이란', '남아프리카공화국', '러', '북한산', '대만', '포항',
       '원양산', '그리스', '네델란드', '이탈리아', '기타외국', '부산광역', '원양산(원양산)', '중국(CN)',
       '전북전주', '충북청주', '경기고양', '경기성남', '이집트', '에콰', '캘리포니아', '국산',
       '해당사항없', '경남/남해군', '전남/고흥군', '충남/홍성군', '오스트레일', '인디아', '캄보디아',
       '필리핀(PH)', '부천', '울산광역', '남인', '과테말라', '코스타리카', '콜롬비아', '미얀마',
       '아르헨티나', '우크라이나', '파나마', '에쿠아', '에콰돌', '수입산(기타)', '이', '세종특별자치',
       '수원', '충남/아산', '터키', '캐나다', '우즈베크', '우즈베키스', '일본', '경북/구미',


In [57]:
# state
# 특이케이스 제거

# 국내
df4.loc[df4["state"]=="국내산","city"] = "미상"
df4.loc[df4["state"]=="국내산","state"] = "국산"
df4.loc[df4["state"]=="원양산","city"] = "미상"
df4.loc[df4["state"]=="원양산","state"] = "국산"
df4.loc[df4["state"]=="동해안","city"] = "미상"
df4.loc[df4["state"]=="동해안","state"] = "국산"
df4.loc[df4["state"]=="공통출하처","city"] = "미상"
df4.loc[df4["state"]=="공통출하처","state"] = "국산"

## 도단위
df4.loc[df4["state"]=="충청북","state"] = "충북"
df4.loc[df4["state"]=="충청남","state"] = "충남"
df4.loc[df4["state"]=="전라북","state"] = "전북"
df4.loc[df4["state"]=="전라남","state"] = "전남"
df4.loc[df4["state"]=="경상북","state"] = "경북"
df4.loc[df4["state"]=="경상남","state"] = "경남"

## 시단위
### 광역시

df4.loc[df4["state"]=="대전","city"] = "대전"
df4.loc[df4["state"]=="대구","city"] = "대구"
df4.loc[df4["state"]=="서울특별","city"] = "서울"
df4.loc[df4["state"]=="서울특별","state"] = "서울"
df4.loc[df4["state"]=="인천광역","city"] = "인천"
df4.loc[df4["state"]=="인천광역","state"] = "인천"
df4.loc[df4["state"]=="부산","city"] = "부산"
df4.loc[df4["state"]=="부산광역","city"] = "부산"
df4.loc[df4["state"]=="부산광역","state"] = "부산"
df4.loc[df4["state"]=="가락동","city"] = "서울"
df4.loc[df4["state"]=="가락동","state"] = "서울"
df4.loc[df4["state"]=="세종","city"] = "세종"
df4.loc[df4["state"]=="세종특별자치","city"] = "세종"
df4.loc[df4["state"]=="세종특별자치","state"] = "세종"
df4.loc[df4["state"]=="세종자치","state"] = "세종"
df4.loc[df4["state"]=="세종자치","city"] = "세종"
df4.loc[df4["state"]=="대전","city"] = "대전"
df4.loc[df4["state"]=="제주","city"] = "제주"
df4.loc[df4["state"]=="제주특별자치","city"] = "제주"
df4.loc[df4["state"]=="제주특별자치","state"] = "제주"
df4.loc[df4["state"]=="제주특별자","city"] = "제주"
df4.loc[df4["state"]=="제주특별자","state"] = "제주"
df4.loc[df4["state"]=="제주자치","state"] = "제주"
df4.loc[df4["state"]=="제주자치","city"] = "제주"
df4.loc[df4["state"]=="제주/서귀포","city"] = "제주"
df4.loc[df4["state"]=="제주/서귀포","state"] = "제주"
df4.loc[df4["state"]=="서귀포","city"] = "제주"
df4.loc[df4["state"]=="서귀포","state"] = "제주"
df4.loc[df4["state"]=="세종","city"] = "세종"
df4.loc[df4["state"]=="울산","city"] = "울산"
df4.loc[df4["state"]=="울산","state"] = "울산"
df4.loc[df4["state"]=="광주광역","state"] = "광주"
df4.loc[df4["state"]=="광주광역","city"] = "광주"
df4.loc[df4["state"]=="대전광역","state"] = "대전"
df4.loc[df4["state"]=="대전광역","city"] = "대전"
df4.loc[df4["state"]=="대구광역","state"] = "대구"
df4.loc[df4["state"]=="대구광역","city"] = "대구"
df4.loc[df4["state"]=="울산광역","state"] = "울산"
df4.loc[df4["state"]=="울산광역","city"] = "울산"

### 일반시
df4.loc[df4["state"]=="성남","city"] = "성남"
df4.loc[df4["state"]=="성남","state"] = "경기"
df4.loc[df4["state"]=="경기성남","city"] = "성남"
df4.loc[df4["state"]=="경기성남","state"] = "경기"
df4.loc[df4["state"]=="전주","city"] = "전주"
df4.loc[df4["state"]=="전주","state"] = "전북"
df4.loc[df4["state"]=="청주","city"] = "청주"
df4.loc[df4["state"]=="청주","state"] = "충북"
df4.loc[df4["state"]=="수원","city"] = "수원"
df4.loc[df4["state"]=="수원","state"] = "경기"
df4.loc[df4["state"]=="부천","city"] = "부천"
df4.loc[df4["state"]=="부천","state"] = "경기"
df4.loc[df4["state"]=="경기고양","city"] = "고양"
df4.loc[df4["state"]=="경기고양","state"] = "경기"
df4.loc[df4["state"]=="충북청주","city"] = "청주"
df4.loc[df4["state"]=="충북청주","state"] = "충북"
df4.loc[df4["state"]=="충북청주시흥덕구","city"] = "청주"
df4.loc[df4["state"]=="충북청주시흥덕구","state"] = "충북"
df4.loc[df4["state"]=="전북전주","city"] = "전주"
df4.loc[df4["state"]=="전북전주","state"] = "전북"
df4.loc[df4["state"]=="경북/문경","city"] = "문경"
df4.loc[df4["state"]=="경북/문경","state"] = "경북"
df4.loc[df4["state"]=="포항","city"] = "포항"
df4.loc[df4["state"]=="포항","state"] = "경북"
df4.loc[df4["state"]=="경북/안동","city"] = "안동"
df4.loc[df4["state"]=="경북/안동","state"] = "경북"
df4.loc[df4["state"]=="전남/무안군","state"] = "전남"
df4.loc[df4["state"]=="전남/무안군","city"] = "무안"
df4.loc[df4["state"]=="무안","state"] = "전남"
df4.loc[df4["state"]=="무안","city"] = "무안"
df4.loc[df4["state"]=="경북/구미","city"] = "구미"
df4.loc[df4["state"]=="경북/구미","state"] = "경북"
df4.loc[df4["state"]=="경북/상주","city"] = "상주"
df4.loc[df4["state"]=="경북/상주","state"] = "경북"
df4.loc[df4["state"]=="경북/영천","city"] = "영천"
df4.loc[df4["state"]=="경북/영천","state"] = "경북"
df4.loc[df4["state"]=="경남/합천군","city"] = "합천"
df4.loc[df4["state"]=="경남/합천군","state"] = "경남"
df4.loc[df4["state"]=="전남/고흥군","city"] = "고흥"
df4.loc[df4["state"]=="전남/고흥군","state"] = "전남"
df4.loc[df4["state"]=="경남/남해군","state"] = "경남"
df4.loc[df4["state"]=="경남/남해군","city"] = "남해"
df4.loc[df4["state"]=="충남/아산","state"] = "충남"
df4.loc[df4["state"]=="충남/아산","city"] = "아산"
df4.loc[df4["state"]=="충남/홍성군","state"] = "충남"
df4.loc[df4["state"]=="충남/홍성군","city"] = "아산"
df4.loc[df4["state"]=="서해안","city"] = "미상"
df4.loc[df4["state"]=="서해안","state"] = "국산"
df4.loc[df4["state"]=="남해안","city"] = "미상"
df4.loc[df4["state"]=="남해안","state"] = "국산"


# 외국
## 아시아
df4.loc[df4["state"]=="칠레","city"] = "칠레"
df4.loc[df4["state"]=="칠레","state"] = "수입"
df4.loc[df4["state"]=="북한산","city"] = "북한"
df4.loc[df4["state"]=="북한산","state"] = "수입"
df4.loc[df4["state"]=="일본","city"] = "일본"
df4.loc[df4["state"]=="일본","state"] = "수입"
df4.loc[df4["state"]=="베트남","city"] = "베트남"
df4.loc[df4["state"]=="베트남","state"] = "수입"
df4.loc[df4["state"]=="우즈베크","city"] = "우즈베키스탄"
df4.loc[df4["state"]=="우즈베크","state"] = "수입"
df4.loc[df4["state"]=="이스라엘","city"] = "이스라엘"
df4.loc[df4["state"]=="이스라엘","state"] = "수입"
df4.loc[df4["state"]=="터키","city"] = "터키"
df4.loc[df4["state"]=="터키","state"] = "수입"
df4.loc[df4["state"]=="중국","city"] = "중국"
df4.loc[df4["state"]=="중국","state"] = "수입"
df4.loc[df4["state"]=="중국(CN)","city"] = "중국"
df4.loc[df4["state"]=="중국(CN)","state"] = "수입"
df4.loc[df4["state"]=="타이","city"] = "태국"
df4.loc[df4["state"]=="타이","state"] = "수입"
df4.loc[df4["state"]=="태국","city"] = "태국"
df4.loc[df4["state"]=="태국","state"] = "수입"
df4.loc[df4["state"]=="필리핀","city"] = "필리핀"
df4.loc[df4["state"]=="필리핀","state"] = "수입"
df4.loc[df4["state"]=="필리핀(PH)","city"] = "필리핀"
df4.loc[df4["state"]=="필리핀(PH)","state"] = "수입"
df4.loc[df4["state"]=="우즈베키스탄","city"] = "우즈베키스탄"
df4.loc[df4["state"]=="우즈베키스탄","state"] = "수입"
df4.loc[df4["state"]=="말레이지아","city"] = "말레이시아"
df4.loc[df4["state"]=="말레이지아","state"] = "수입"
df4.loc[df4["state"]=="대만","city"] = "대만"
df4.loc[df4["state"]=="대만","state"] = "수입"
df4.loc[df4["state"]=="베트남","city"] = "베트남"
df4.loc[df4["state"]=="베트남","state"] = "수입"
df4.loc[df4["state"]=="이란","city"] = "이란"
df4.loc[df4["state"]=="이란","state"] = "수입"
df4.loc[df4["state"]=="러","city"] = "러시아"
df4.loc[df4["state"]=="러","state"] = "수입"
df4.loc[df4["state"]=="캄보디아","city"] = "캄보디아"
df4.loc[df4["state"]=="캄보디아","state"] = "수입"
df4.loc[df4["state"]=="이집트","city"] = "이집트"
df4.loc[df4["state"]=="이집트","state"] = "수입"
df4.loc[df4["state"]=="인디아","city"] = "인도"
df4.loc[df4["state"]=="인디아","state"] = "수입"
df4.loc[df4["state"]=="미얀마","city"] = "미얀마"
df4.loc[df4["state"]=="미얀마","state"] = "수입"
df4.loc[(df4["state"]=="인")&(df4["origin"]=="인도네시아"),"city"] = "인도네시아"
df4.loc[(df4["state"]=="인")&(df4["origin"]=="인도네시아"),"state"] = "수입"
df4.loc[(df4["state"]=="인")&(df4["origin"]=="인도"),"city"] = "인도"
df4.loc[(df4["state"]=="인")&(df4["origin"]=="인도"),"state"] = "수입"

## 아메리카

df4.loc[df4["state"]=="캘리포니아","city"] = "미국"
df4.loc[df4["state"]=="캘리포니아","state"] = "수입"
df4.loc[df4["state"]=="포클랜드","city"] = "영국"
df4.loc[df4["state"]=="포클랜드","state"] = "수입"
df4.loc[df4["state"]=="콜롬비아","city"] = "콜롬비아"
df4.loc[df4["state"]=="콜롬비아","state"] = "수입"
df4.loc[df4["state"]=="에콰","city"] = "에콰도르"
df4.loc[df4["state"]=="에콰","state"] = "수입"
df4.loc[df4["state"]=="에쿠아","city"] = "에콰도르"
df4.loc[df4["state"]=="에쿠아","state"] = "수입"
df4.loc[df4["state"]=="에콰돌","city"] = "에콰도르"
df4.loc[df4["state"]=="에콰돌","state"] = "수입"
df4.loc[df4["state"]=="과테말라","city"] = "과테말라"
df4.loc[df4["state"]=="과테말라","state"] = "수입"
df4.loc[df4["state"]=="코스타리카","city"] = "코스타리카"
df4.loc[df4["state"]=="코스타리카","state"] = "수입"
df4.loc[df4["state"]=="멕","city"] = "멕시코"
df4.loc[df4["state"]=="멕","state"] = "수입"
df4.loc[df4["state"]=="페루","city"] = "페루"
df4.loc[df4["state"]=="페루","state"] = "수입"
df4.loc[df4["state"]=="미국","city"] = "미국"
df4.loc[df4["state"]=="미국","state"] = "수입"
df4.loc[df4["state"]=="미국(US)","city"] = "미국"
df4.loc[df4["state"]=="미국(US)","state"] = "수입"
df4.loc[df4["state"]=="아르헨티나","city"] = "아르헨티나"
df4.loc[df4["state"]=="아르헨티나","state"] = "수입"
df4.loc[df4["state"]=="브라질","city"] = "브라질"
df4.loc[df4["state"]=="브라질","state"] = "수입"
df4.loc[df4["state"]=="캐나다","city"] = "캐나다"
df4.loc[df4["state"]=="캐나다","state"] = "수입"
df4.loc[df4["state"]=="파나마","city"] = "파나마"
df4.loc[df4["state"]=="파나마","state"] = "수입"
df4.loc[df4["state"]=="우즈베키스","city"] = "우즈베키스탄"
df4.loc[df4["state"]=="우즈베키스","state"] = "수입"

## 유럽
df4.loc[df4["state"]=="스페인","city"] = "스페인"
df4.loc[df4["state"]=="스페인","state"] = "수입"
df4.loc[df4["state"]=="우크라이나","city"] = "우크라이나"
df4.loc[df4["state"]=="우크라이나","state"] = "수입"
df4.loc[df4["state"]=="이탈리아","city"] = "이탈리아"
df4.loc[df4["state"]=="이탈리아","state"] = "수입"
df4.loc[df4["state"]=="이태리","city"] = "이탈리아"
df4.loc[df4["state"]=="이태리","state"] = "수입"
df4.loc[df4["state"]=="불가리아","city"] = "불가리아"
df4.loc[df4["state"]=="불가리아","state"] = "수입"
df4.loc[df4["state"]=="그리스","city"] = "그리스"
df4.loc[df4["state"]=="그리스","state"] = "수입"
df4.loc[df4["state"]=="네델란드","city"] = "네덜란드"
df4.loc[df4["state"]=="네델란드","state"] = "수입"
df4.loc[df4["state"]=="프랑스","city"] = "프랑스"
df4.loc[df4["state"]=="프랑스","state"] = "수입"

## 아프리카
df4.loc[df4["state"]=="남아프리카","city"] = "남아공"
df4.loc[df4["state"]=="남아프리카","state"] = "수입"
df4.loc[df4["state"]=="남아프리카공화국","city"] = "남아공"
df4.loc[df4["state"]=="남아프리카공화국","state"] = "수입"
df4.loc[df4["state"]=="수입남아프리카공화국","city"] = "남아공"
df4.loc[df4["state"]=="수입남아프리카공화국","state"] = "수입"
df4.loc[df4["state"]=="남아공","city"] = "남아공"
df4.loc[df4["state"]=="남아공","state"] = "수입"


## 오세아니아
df4.loc[df4["state"]=="뉴질랜드","city"] = "뉴질랜드"
df4.loc[df4["state"]=="뉴질랜드","state"] = "수입"
df4.loc[df4["state"]=="뉴질랜드(NZ","city"] = "뉴질랜드"
df4.loc[df4["state"]=="뉴질랜드(NZ","state"] = "수입"
df4.loc[df4["state"]=="오스트레일","city"] = "호주"
df4.loc[df4["state"]=="오스트레일","state"] = "수입"
df4.loc[df4["state"]=="오스트레일리아","city"] = "호주"
df4.loc[df4["state"]=="오스트레일리아","state"] = "수입"
df4.loc[df4["state"]=="호주","city"] = "호주"
df4.loc[df4["state"]=="호주","state"] = "수입"
df4.loc[df4["state"]=="통가","city"] = "통가"
df4.loc[df4["state"]=="통가","state"] = "수입"
df4.loc[df4["state"]=="뉴칼레","city"] = "뉴칼레도니아"
df4.loc[df4["state"]=="뉴칼레","state"] = "수입"
df4.loc[df4["state"]=="뉴","city"] = "뉴칼레도니아"
df4.loc[df4["state"]=="뉴","state"] = "수입"
df4.loc[df4["state"]=="누벨칼레","city"] = "뉴칼레도니아"
df4.loc[df4["state"]=="누벨칼레","state"] = "수입"
df4.loc[df4["state"]=="파푸아","city"] = "파푸아뉴기니"
df4.loc[df4["state"]=="파푸아","state"] = "수입"
# 기타
df4.loc[df4["state"]=="수입산","state"] = "수입"
df4.loc[df4["state"]=="기타국","state"] = "수입"
df4.loc[df4["state"]=="기타외국","city"] = "미상"
df4.loc[df4["state"]=="기타외국","state"] = "수입"
df4.loc[df4["state"]=="해외","state"] = "수입"


In [58]:
# df4 = df4.drop(df4[(df4['state']=='없음')].index)
# df4 = df4.drop(df4[(df4['state']=='해당사항없')].index)
# df4.drop(df4[df4["state"]=="미등록"].index,inplace=True)
# df4.drop(df4[df4["state"]=="우편번호오류"].index,inplace=True)
# df4.drop(df4[df4["state"]=="수입산(기타)"].index,inplace=True)
# df4.drop(df4[df4["state"]=="기타국"].index,inplace=True)
# df4.drop(df4[df4["state"]=="해외"].index,inplace=True)
# df4.drop(df4[df4["state"]=="북대서양"].index,inplace=True)
# df4.drop(df4[df4["state"]=="남인"].index,inplace=True)
# df4.drop(df4[df4["state"]=="원양산(원양산)"].index,inplace=True)
# df4 = df4.drop(df4[(df4['origin']=='이시원')].index)

In [59]:
df4['state'].unique()

array(['경남', '국산', '전남', '전북', '수입', '제주', '충남', '강원', '경기', '경북', '서울',
       '광주', '대구', '대전', '부산', '세종', '충북', '인천', '없음', '미등록', '울산',
       '원양산(원양산)', '해당사항없', '남인', '수입산(기타)', '이', '북대서양', '우편번호오류'],
      dtype=object)

In [60]:
df4[df4['state']=='원양산(원양산)']

Unnamed: 0,origin,state,city
1027,원양산(원양산),원양산(원양산),


In [61]:
df3[df3['origin'].str.contains('해당사항없')]

Unnamed: 0,prd,origin,eco,reg_date,class,scale,price,price_kg
1516301,마늘,해당사항없,normal,2018-07-02,1,20.0,140000,7000.0
1516302,마늘,해당사항없,normal,2018-07-02,3,40.0,220000,5500.0
1516303,마늘,해당사항없,normal,2018-07-02,9,20.0,60000,3000.0
1516304,마늘,해당사항없,normal,2018-07-09,1,20.0,140000,7000.0
1516305,마늘,해당사항없,normal,2018-07-09,3,20.0,120000,6000.0
1516306,마늘,해당사항없,normal,2018-07-10,1,20.0,120000,6000.0
1516307,마늘,해당사항없,normal,2018-07-10,3,40.0,180000,4500.0
1516308,마늘,해당사항없,normal,2018-07-10,9,20.0,70000,3500.0
1516309,마늘,해당사항없,normal,2018-07-16,1,20.0,120000,6000.0
1516310,마늘,해당사항없,normal,2018-07-16,3,40.0,210000,5250.0


In [62]:
# df4[df4['state']=='남인']

In [63]:
# city
# 우선 결측치 제거
df4.loc[df4["city"].isnull(),"city"] = "미상"

In [64]:
df4['city'].isnull().sum()

0

In [65]:
df4.loc[df4['city']=='시',"city"] = "삭제"

In [66]:
# 이후 시,군 등 제거
def name_sort(words):
  p = re.compile('[^시군]+')
  word = re.search(p,words)
  return word.group()

df4["city"] = df4["city"].apply(name_sort)

In [67]:
# \t 도 제거
def name_sort2(words):
  p = re.compile('[^\t]+')
  word = re.search(p,words)
  return word.group()

df4["city"] = df4["city"].apply(name_sort2)

In [68]:
df4['city'].unique()

array(['거제(장승포)', '남해', '마산(고성,진해)', '사천', '삼천포', '통영', '미상', '고흥(나로도)',
       '여수', '산', '러', '미국', '베트남', '일본', '중국', '목포(무안)', '기타', '브라질',
       '아르헨티나', '칠레', '캐나다', '제주', '서산', '장항(서천)', '노르웨이', '대만', '영월',
       '광주', '여주', '이천', '고성', '밀양', '진주', '창녕', '하동', '함안', '함양', '문경',
       '상주', '청도', '송파구', '곡성', '광양', '구례', '담양', '보성', '순천', '해남', '남원',
       '순창', '고창', '논산', '당진', '보령', '부여', '서천', '청양', '태안', '홍성', '강릉',
       '양구', '원주', '인제', '철원', '춘천', '태백', '평창', '홍천', '화천', '횡성', '고양',
       '광명', '구리', '남양주', '성남', '안성', '양주', '양평', '연천', '용인', '평택', '포천',
       '하남', '화성', '김포', '파주', '거제', '김해', '의령', '진해', '창원', '합천', '경산',
       '고령', '위', '성주', '영주', '의성', '광산구', '남구', '북구', '대구', '대전', '부산',
       '강동구', '도봉구', '강서구', '동대문구', '세종', '고흥', '나주', '무안', '화순', '임실',
       '전주', '완주', '익산', '공주', '금산', '아산', '예산', '천안', '괴산', '음성', '제천',
       '진천', '충주', '청주', '정선', '가평', '봉화', '창', '강화', '인천', '강진', '완도',
       '모로코', '모리타니아', '세네갈', '스페인', '아랍에미레이트', '오만', '

In [69]:
# 나머지 케이스 정리

# 국내
df4.loc[df4["city"]=="기타","city"] = "미상"
## 일반시
df4.loc[df4["city"]=="위","city"] = "군위"
df4.loc[df4["city"]=="포","city"] = "군포"
df4.loc[df4["city"]=="고흥(나로도)","city"] = "고흥"
df4.loc[df4["city"]=="산","city"] = "군산"
df4.loc[df4["city"]=="장항(서천)","city"] = "서천"
df4.loc[df4["city"]=="삼천포","city"] = "사천"
df4.loc[df4["city"]=="삼천포","city"] = "사천"
df4.loc[df4["city"]=="순천(광양)","city"] = "순천"
df4.loc[df4["city"]=="대천(보령)","city"] = "보령"
df4.loc[df4["city"]=="대천(보령)","city"] = "보령"
df4.loc[df4["city"]=="화성(평택)","city"] = "화성"
df4.loc[df4["city"]=="흥","city"] = "시흥"
df4.loc[df4["city"]=="목포(무안)","city"] = "목포"
df4.loc[df4["city"]=="목포(무안)","state"] = "전남"
df4.loc[df4["city"]=="보성(벌교)","city"] = "보성"
df4.loc[df4["city"]=="울진(후포)","city"] = "울진"
df4.loc[df4["city"]=="논","city"] = "논산"
df4.loc[df4["city"]=="논","city"] = "논산"
df4.loc[df4["city"]=="순","city"] = "순창"
df4.loc[df4["city"]=="익","city"] = "익산"
df4.loc[df4["city"]=="미금","city"] = "성남"
df4.loc[df4["city"]=="주문진","city"] = "강릉"
df4.loc[df4["city"]=="울릉도","city"] = "울릉"
df4.loc[df4["city"]=="거제(장승포)","city"] = "거제"
df4.loc[df4["city"]=="남양","city"] = "남양주"
df4.loc[df4["city"]=="밀","city"] = "밀양"
df4.loc[df4["city"]=="예","city"] = "예산"
df4.loc[df4["city"]=="속초(아야진)","city"] = "속초"
df4.loc[(df4["city"]=="완")&(df4["state"]=="전북"),"city"] = "완주"
df4.loc[(df4["state"]=="전북")&(df4["city"]=="남"),"city"] = "남원"
#df4.loc[(df4['prd']=="호박") & (df4['state']=='전북'),"city"] = "전주"
df4.loc[(df4["state"]=="전남")&(df4["city"]=="신"),"city"] = "신안"
df4.loc[(df4["state"]=="전북")&(df4["city"]=="장"),"city"] = "장수"
df4.loc[(df4["state"]=="전남")&(df4["city"]=="진"),"city"] = "진도"
df4.loc[(df4["state"]=="충북")&(df4["city"]=="충"),"city"] = "충주"
df4.loc[(df4["state"]=="충북")&(df4["city"]=="청"),"city"] = "청주"
df4.loc[(df4["state"]=="충남")&(df4["city"]=="부"),"city"] = "부여"
df4.loc[(df4["state"]=="충남")&(df4["city"]=="예"),"city"] = "예산"
df4.loc[(df4["state"]=="충남")&(df4["city"]=="청"),"city"] = "청양"
df4.loc[(df4["state"]=="충남")&(df4["city"]=="보"),"city"] = "보령"
df4.loc[(df4["state"]=="충남")&(df4["city"]=="서"),"city"] = "서산"
df4.loc[(df4["state"]=="경남")&(df4["city"]=="의"),"city"] = "의령"
df4.loc[(df4["state"]=="전남")&(df4["city"]=="함"),"city"] = "함평"
df4.loc[(df4["state"]=="경북")&(df4["city"]=="영"),"city"] = "영주"
df4.loc[(df4["state"]=="경북")&(df4["city"]=="안"),"city"] = "안동"
df4.loc[(df4["state"]=="경남")&(df4["city"]=="거"),"city"] = "거창"
df4.loc[(df4["state"]=="경남")&(df4["city"]=="합"),"city"] = "합천"
df4.loc[(df4["state"]=="경남")&(df4["city"]=="밀"),"city"] = "밀양"
df4.loc[(df4["state"]=="경남")&(df4["city"]=="고"),"city"] = "고성"
df4.loc[(df4["state"]=="경남")&(df4["city"]=="진"),"city"] = "진주"
df4.loc[(df4["state"]=="경남")&(df4["city"]=="함"),"city"] = "함양"
df4.loc[(df4["state"]=="전북")&(df4["city"]=="김"),"city"] = "김제"
df4.loc[(df4["state"]=="전북")&(df4["city"]=="전"),"city"] = "전주"
df4.loc[(df4["state"]=="충남")&(df4["city"]=="아"),"city"] = "아산"
df4.loc[(df4["state"]=="전남")&(df4["city"]=="여"),"city"] = "여수"
df4.loc[(df4["state"]=="전남")&(df4["city"]=="나"),"city"] = "나주"
df4.loc[(df4["state"]=="전북")&(df4["city"]=="고"),"city"] = "고창"
df4.loc[(df4["state"]=="전북")&(df4["city"]=="부"),"city"] = "부안"
df4.loc[(df4["state"]=="전남")&(df4["city"]=="무"),"city"] = "무안"
df4.loc[(df4["state"]=="전남")&(df4["city"]=="고"),"city"] = "고흥"
df4.loc[(df4["state"]=="전남")&(df4["city"]=="해"),"city"] = "해남"
df4.loc[(df4["state"]=="경북")&(df4["city"]=="김"),"city"] = "김천"
df4.loc[(df4["state"]=="경북")&(df4["city"]=="청"),"city"] = "청도"
df4.loc[(df4["state"]=="경북")&(df4["city"]=="문"),"city"] = "문경"
df4.loc[(df4["state"]=="경남")&(df4["city"]=="하"),"city"] = "하동"
df4.loc[(df4["state"]=="경북")&(df4["city"]=="구"),"city"] = "구미"
df4.loc[(df4["state"]=="경북")&(df4["city"]=="상"),"city"] = "상주"
# 전라남도 장 -> 장흥, 장성 두개 있으므로 구분 불가, 삭제


## 광역시
df4.loc[df4["city"]=="강서구","city"] = "서울"
df4.loc[df4["city"]=="은평구","city"] = "서울"
df4.loc[df4["city"]=="강동구","city"] = "서울"
df4.loc[df4["city"]=="마포구","city"] = "서울"
df4.loc[df4["city"]=="중구","city"] = "서울"
df4.loc[df4["city"]=="송파구","city"] = "서울"
df4.loc[df4["city"]=="도봉구","city"] = "서울"
df4.loc[df4["city"]=="중랑구","city"] = "서울"
df4.loc[df4["city"]=="양천구","city"] = "서울"
df4.loc[df4["city"]=="성동구","city"] = "서울"
df4.loc[df4["city"]=="강남구","city"] = "서울"
df4.loc[df4["city"]=="동대문구","city"] = "서울"
df4.loc[df4["city"]=="영등포구","city"] = "서울"
df4.loc[df4["city"]=="영등포","city"] = "서울"
df4.loc[df4["city"]=="서초구","city"] = "서울"
df4.loc[df4["city"]=="구로구","city"] = "서울"
df4.loc[df4["city"]=="동대문","city"] = "서울"
df4.loc[df4["city"]=="금천구","city"] = "서울"
df4.loc[df4["city"]=="동작구","city"] = "서울"
df4.loc[df4["city"]=="용산구","city"] = "서울"
df4.loc[df4["city"]=="광진구","city"] = "서울"
df4.loc[df4["city"]=="종로구","city"] = "서울"
df4.loc[df4["city"]=="서대문구","city"] = "서울"
df4.loc[df4["city"]=="서대문","city"] = "서울"
df4.loc[df4["city"]=="성북구","city"] = "서울"
df4.loc[df4["city"]=="관악구","city"] = "서울"
df4.loc[df4["city"]=="강북구","city"] = "서울"
df4.loc[df4["city"]=="노원구","city"] = "서울"
df4.loc[df4["city"]=="송파","city"] = "서울"

df4.loc[df4["city"]=="광산구","city"] = "광주"
df4.loc[(df4["state"]=="광주")&(df4["city"]=="남구"),"city"] = "광주"
df4.loc[(df4["state"]=="광주")&(df4["city"]=="북구"),"city"] = "광주"
df4.loc[(df4["state"]=="광주")&(df4["city"]=="서구"),"city"] = "광주"
df4.loc[df4["city"]=="동구","city"] = "광주"
df4.loc[(df4["state"]=="인천")&(df4["city"]=="남구"),"city"] = "인천"
df4.loc[(df4["state"]=="인천")&(df4["city"]=="서구"),"city"] = "인천"
df4.loc[df4["city"]=="미추홀","city"] = "인천"
df4.loc[df4["city"]=="부평구","city"] = "인천"
df4.loc[df4["city"]=="남동구","city"] = "인천"
df4.loc[df4["city"]=="미추홀구","city"] = "인천"
df4.loc[df4["city"]=="계양구","city"] = "인천"
df4.loc[df4["city"]=="연수구","city"] = "인천"
df4.loc[df4["city"]=="마산(고성,진해)","city"] = "창원"
df4.loc[(df4["state"]=="경남")&(df4["city"]=="창"),"city"] = "창원"
df4.loc[(df4["state"]=="부산")&(df4["city"]=="북구"),"city"] = "부산"
df4.loc[df4["city"]=="오포","city"] = "부산"
df4.loc[df4["city"]=="부산진","city"] = "부산"
df4.loc[df4["city"]=="사상구","city"] = "부산"
df4.loc[df4["city"]=="부산진구","city"] = "부산"
df4.loc[df4["city"]=="북제주","city"] = "제주"
df4.loc[df4["city"]=="남제주","city"] = "제주"
df4.loc[df4["city"]=="서귀포","city"] = "제주"
df4.loc[(df4["state"]=="대구")&(df4["city"]=="남구"),"city"] = "대구"
df4.loc[(df4["state"]=="대구")&(df4["city"]=="북구"),"city"] = "대구"
df4.loc[(df4["state"]=="울산")&(df4["city"]=="남구"),"city"] = "울산"
df4.loc[(df4["state"]=="울산")&(df4["city"]=="북구"),"city"] = "울산"
df4.loc[(df4["state"]=="대전")&(df4["city"]=="서구"),"city"] = "대전"
df4.loc[df4["city"]=="달성","city"] = "대구"
df4.loc[df4["city"]=="고성(거진,대진)","city"] = "고성"
df4.loc[df4["city"]=="연기","state"] = "세종"
df4.loc[df4["city"]=="연기","city"] = "세종"
df4.loc[df4["city"]=="대덕구","city"] = "대전"
df4.loc[df4["city"]=="유성구","city"] = "대전"
df4.loc[df4["city"]=="울주","city"] = "울산"

# 해외
## 아시아
df4.loc[df4["city"]=="러","city"] = "러시아"
df4.loc[df4["city"]=="말레이지아","city"] = "말레이시아"
df4.loc[df4["city"]=="말레이","city"] = "말레이시아"
df4.loc[df4["city"]=="타이","city"] = "태국"
df4.loc[df4["city"]=="인도네","city"] = "인도네시아"
df4.loc[df4["city"]=="사우디","city"] = "사우디아라비아"
df4.loc[df4["city"]=="미국령","city"] = "미국"
df4.loc[df4["city"]=="코스","city"] = "오스트레일리아"
df4.loc[df4["city"]=="몬트세라트","city"] = "스페인"
df4.loc[df4["city"]=="그린랜드","city"] = "그린란드"
df4.loc[df4["city"]=="방글라데","city"] = "방글라데시"
df4.loc[df4["city"]=="모리타니아","city"] = "모리타니"
df4.loc[df4["city"]=="포루투갈","city"] = "포르투갈"
df4.loc[df4["city"]=="에라리온","city"] = "시에라리온"
## 아메리카
df4.loc[df4["city"]=="멕","city"] = "멕시코"
## 아프리카
df4.loc[df4["city"]=="남아공산","city"] = "남아공"
df4.loc[df4["city"]=="남아프리카공","city"] = "남아공"
df4.loc[df4["city"]=="국가","city"] = "삭제"
## 유럽
## 오세아니아

In [70]:
#df4.drop(df4[(df4['state']=='전남')&(df4["city"]=="영")].index,inplace=True)
#df4.drop(df4[(df4['state']=='경북')&(df4["city"]=="청")].index,inplace=True)
#df4.drop(df4[(df4['state']=='전남')&(df4["city"]=="장")].index,inplace=True)
#df4.drop(df4[(df4['state']=='충남')&(df4["city"]=="천")].index,inplace=True)
#df4.drop(df4[(df4['state']=='경북']&(df4["city"]=="성")].index,inplace=True)
# df4.drop(df4[df4["city"]=="불령리유니온"].index,inplace=True)

In [71]:
#df2[(df2['prd']=='양파') & (df2['price']==23000) & (df2['scale']==15)]['origin'].unique()

In [72]:
#df3[df3['origin']=='전라남도 영']

In [73]:
df4[df4['city']=='남구']

Unnamed: 0,origin,state,city


In [74]:
df4[df4['city']=='서구']['origin'].unique()

array([], dtype=object)

In [75]:
# 기존 city 리스트와 비교
ex_city = pd.read_csv("../cities.csv")
df4[~df4['city'].isin(ex_city['city'])]['city'].unique()

array([], dtype=object)

In [76]:
# 리스트 최신화
new_city = pd.DataFrame(df4['city'].unique(),columns=['city'])
cities = ex_city.append(new_city,ignore_index=True)

  cities = ex_city.append(new_city,ignore_index=True)


In [77]:
cities.drop(columns=['Unnamed: 0'],inplace=True)
cities.drop_duplicates(inplace=True)
cities.reset_index(inplace=True,drop=True)

In [78]:
#cities.to_csv('../cities.csv')

In [79]:
# 처리 완료하였으니 df4는 백업으로 두고 df5 활용
#df5 = df4.drop(columns='origin')

# 여기서 다시 시작

- origin 키로 써서 머지로 합치기
- 합친 뒤에 state, city 드랍하기
- 위에서 처리 안된 부분 확인하고 처리
- 파일도 새로 만들기

In [80]:
df3

Unnamed: 0,prd,origin,eco,reg_date,class,scale,price,price_kg
0,가리비,경남 거제(장승포),normal,2018-10-11,3,10.00000,19000,1900.00000
1,가리비,경남 거제(장승포),normal,2018-10-12,3,5.00000,20000,4000.00000
2,가리비,경남 거제(장승포),normal,2018-10-15,3,5.00000,20000,4000.00000
3,가리비,경남 거제(장승포),normal,2018-10-16,3,5.00000,19000,3800.00000
4,가리비,경남 거제(장승포),normal,2018-10-22,3,5.00000,18000,3600.00000
...,...,...,...,...,...,...,...,...
4753339,황강달이,전남 신안,normal,2021-05-26,3,20.00000,47000,2350.00000
4753340,황강달이,전남 신안,normal,2021-05-27,3,40.00000,94000,2350.00000
4753341,황새치,전북 군산\t,normal,2022-06-09,3,4.00000,6000,1500.00000
4753342,흑돔,국내산,normal,2021-03-30,3,10.00000,26700,2670.00000


In [81]:
df5 = df4.merge(df3,on='origin')
# 카운트 안지워진 파일이면 드랍하고 진행
#df5.drop('count',axis=1,inplace=True)
df5

Unnamed: 0,origin,state,city,prd,eco,reg_date,class,scale,price,price_kg
0,경남 거제(장승포),경남,거제,가리비,normal,2018-10-11,3,10.00000,19000,1900.00000
1,경남 거제(장승포),경남,거제,가리비,normal,2018-10-12,3,5.00000,20000,4000.00000
2,경남 거제(장승포),경남,거제,가리비,normal,2018-10-15,3,5.00000,20000,4000.00000
3,경남 거제(장승포),경남,거제,가리비,normal,2018-10-16,3,5.00000,19000,3800.00000
4,경남 거제(장승포),경남,거제,가리비,normal,2018-10-22,3,5.00000,18000,3600.00000
...,...,...,...,...,...,...,...,...,...,...
4752093,경남 의령군 용덕면,경남,의령,호박,normal,2019-12-11,1,10.00000,15500,1550.00000
4752094,충남 부여군 홍산면,충남,부여,호박,normal,2019-04-05,1,10.00000,4500,450.00000
4752095,충북 보은군 수한면,충북,보은,호박,normal,2020-06-05,1,10.00000,5500,550.00000
4752096,전남 순천시 황전면,전남,순천,홍고추,normal,2019-11-29,1,10.00000,33000,3300.00000


In [82]:
# state, city 중 문제 항목들 drop
# state
df5 = df5.drop(df5[(df5['state']=='없음')].index)
df5 = df5.drop(df5[(df5['state']=='해당사항없')].index)
df5.drop(df5[df5["state"]=="미등록"].index,inplace=True)
df5.drop(df5[df5["state"]=="우편번호오류"].index,inplace=True)
df5.drop(df5[df5["state"]=="수입산(기타)"].index,inplace=True)
df5.drop(df5[df5["state"]=="기타국"].index,inplace=True)
df5.drop(df5[df5["state"]=="해외"].index,inplace=True)
df5.drop(df5[df5["state"]=="북대서양"].index,inplace=True)
df5.drop(df5[df5["state"]=="남인"].index,inplace=True)
df5.drop(df5[df5["state"]=="원양산(원양산)"].index,inplace=True)
df5 = df5.drop(df5[(df5['origin']=='이시원')].index)
# city
df5.drop(df5[(df5['state']=='전남')&(df5["city"]=="영")].index,inplace=True)
df5.drop(df5[(df5['state']=='경북')&(df5["city"]=="청")].index,inplace=True)
df5.drop(df5[(df5['state']=='전남')&(df5["city"]=="장")].index,inplace=True)
df5.drop(df5[(df5['state']=='충남')&(df5["city"]=="천")].index,inplace=True)
df5.drop(df5[(df5['state']=='경북')&(df5["city"]=="성")].index,inplace=True)
df5.drop(df5[df5["city"]=="불령리유니온"].index,inplace=True)

In [83]:
df5[df5['city']=='장']

Unnamed: 0,origin,state,city,prd,eco,reg_date,class,scale,price,price_kg


In [84]:
# origin 드랍
df5.drop('origin',axis=1,inplace=True)

# 전처리 결과

In [85]:
df5.head()

Unnamed: 0,state,city,prd,eco,reg_date,class,scale,price,price_kg
0,경남,거제,가리비,normal,2018-10-11,3,10.0,19000,1900.0
1,경남,거제,가리비,normal,2018-10-12,3,5.0,20000,4000.0
2,경남,거제,가리비,normal,2018-10-15,3,5.0,20000,4000.0
3,경남,거제,가리비,normal,2018-10-16,3,5.0,19000,3800.0
4,경남,거제,가리비,normal,2018-10-22,3,5.0,18000,3600.0


In [86]:
df5.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4751864 entries, 0 to 4752097
Data columns (total 9 columns):
 #   Column    Dtype         
---  ------    -----         
 0   state     object        
 1   city      object        
 2   prd       object        
 3   eco       object        
 4   reg_date  datetime64[ns]
 5   class     int64         
 6   scale     float64       
 7   price     int64         
 8   price_kg  float64       
dtypes: datetime64[ns](1), float64(2), int64(2), object(4)
memory usage: 362.5+ MB


In [87]:
df5.isnull().sum()

state       0
city        0
prd         0
eco         0
reg_date    0
class       0
scale       0
price       0
price_kg    0
dtype: int64

In [88]:
df5['state'].unique()

array(['경남', '국산', '전남', '전북', '수입', '제주', '충남', '강원', '경기', '경북', '서울',
       '광주', '대구', '대전', '부산', '세종', '충북', '인천', '울산'], dtype=object)

In [89]:
df5['city'].unique()

array(['거제', '남해', '창원', '사천', '통영', '미상', '고흥', '여수', '군산', '러시아', '미국',
       '베트남', '일본', '중국', '목포', '브라질', '아르헨티나', '칠레', '캐나다', '제주', '서산',
       '서천', '노르웨이', '대만', '영월', '광주', '여주', '이천', '고성', '밀양', '진주', '창녕',
       '하동', '함안', '함양', '문경', '상주', '청도', '서울', '곡성', '광양', '구례', '담양',
       '보성', '순천', '해남', '남원', '순창', '고창', '논산', '당진', '보령', '부여', '청양',
       '태안', '홍성', '강릉', '양구', '원주', '인제', '철원', '춘천', '태백', '평창', '홍천',
       '화천', '횡성', '고양', '광명', '구리', '남양주', '성남', '안성', '양주', '양평', '연천',
       '용인', '평택', '포천', '하남', '화성', '김포', '파주', '김해', '의령', '진해', '합천',
       '경산', '고령', '군위', '성주', '영주', '의성', '대구', '대전', '부산', '세종', '나주',
       '무안', '화순', '임실', '전주', '완주', '익산', '공주', '금산', '아산', '예산', '천안',
       '괴산', '음성', '제천', '진천', '충주', '청주', '정선', '가평', '봉화', '강화', '인천',
       '강진', '완도', '모로코', '모리타니', '세네갈', '스페인', '아랍에미레이트', '오만', '파키스탄',
       '페루', '남아공', '영암', '산청', '김천', '예천', '장성', '영동', '영천', '영광', '정읍',
       '함평', '장흥', '장수', '부안', '청송', '거창', '구미

# 파일 저장

In [90]:
df5[(df5['prd'].str.contains('갓')) & (df5['reg_date']==dt.datetime(2020,2,13))]

Unnamed: 0,state,city,prd,eco,reg_date,class,scale,price,price_kg
2488563,경기,포천,갓,normal,2020-02-13,1,16.0,19600,1225.0
3094949,전남,무안,갓,normal,2020-02-13,1,20.0,24000,1200.0


In [91]:
address2 = "/Users/luci031/Desktop/Coding/g_auction/data_ingredients/auction_total3.parquet"
df5.to_parquet(address2,engine="pyarrow", compression='gzip')
df5.to_csv("/Users/luci031/Desktop/Coding/g_auction/data_proceed/auction_total3.csv")