# 形態素解析
- 空港の表記揺れは解消
- 一文字だけの単語の消去
- 「Verified Review」のような定型表記に由来する語（unverified / review / verify）も消去
- 全て小文字化している
- 動詞を含むバージョンとそうでないバージョンを作成
- 空港名は削除する必要あり

In [1]:
import re
import pandas as pd
import treetaggerwrapper

In [2]:
# Load the raw review CSV; the loop further down iterates over its 'text' column.
review = pd.read_csv(
    "/Users/kosuke/thesis/airport_thesis/data/review_raw_data.csv"
)
texts = review['text']

In [4]:
# English TreeTagger instance backed by the local TreeTagger installation.
tagger = treetaggerwrapper.TreeTagger(
    TAGDIR='/Users/kosuke/TreeTagger',
    TAGLANG='en',
)
# Spelling-variant normalisation table.
#
# `patterns[i]` is a regex whose matches are rewritten to `terms[i]`; the two
# lists are paired positionally, so keep them the same length and order.
# The patterns are applied, in list order, to a string holding one lower-cased
# lemma per line (tokens separated by "\n") with re.MULTILINE, so:
#   * `\n` inside a pattern joins consecutive tokens (multi-word names),
#   * `^...$` restricts an alternative to a whole token (used for short codes
#     that would otherwise match inside ordinary words),
#   * within one pattern the longest alternative must come first, because the
#     regex engine takes the first alternative that matches.
# Raw strings are used throughout so that regex escapes such as `\A` / `\Z`
# are not parsed as (invalid) Python string escapes.
patterns = [
    r"hong\nkong|hkg|hkia|^hk$",
    r"taiwan|taoyuan|taipei|tpe",
    r"klia1|klia2|klia|kuala\nlumpur|kuala\nlumper|^malaysia$|^kul$|kla2|^kl$",
    # \A / \Z anchor to the whole text, not to a single line.
    r"\Anarita\Z|nrt",
    r"haneda|hnd",
    r"manila|naia|mnl|ninoy\naquino",
    r"beijing|^pek$",
    r"bangkok|suvarnabhumi|bkk|^tg$",
    r"incheon|icn",
    r"changi|singapore|^sin$",
    r"chicago|o'hare|^ord$",
    r"new\nyork|jfk|nyc",
    r"san\nfrancisco|^sfo$",
    r"atlanta|^atl$",
    # `lax` is anchored: unanchored it rewrote "relax" to "relos_angels".
    r"los\nangeles|^lax$|tom\nbradley",
    r"shanghai\npudong|shanghai|pudong|pvg",
    r"jakarta|cgk",
    r"paris|cdg|charles\nde\ngaulle",
    r"rome|^fco$|fiumicino",
    r"frankfurt|^fra$",
    r"heathrow|^lhr$",
    r"delhi|^del$|^igi$",
    r"mumbai|^bom$",
    r"dubai|dxb",
    r"hamad|doha|^doh$",
    r"newark|^ewr$",
    # Accept both the correct spelling and the common misspelling.
    r"las\nvegas|vegas|maccarran|mccarran",
    r"denver|^den$|^dia$",
    r"orlando|orland|^mco$",
    r"ataturk|istanbul|^ist$",
    r"madrid|^mad$|barajas",
    r"amsterdam|^ams$|schiphol|schipol",
    r"toronto|pearson|yyz",
    r"munich|^muc$",
    r"gatwick|lgw",
    r"sydney|^syd$",
    r"barcelona|bcn",
    r"guangzhou|baiyun",
    r"miami|^mia$",
    # Tokens are newline-separated, so the multi-word names need `\n`; the
    # underscore forms are kept so already-normalised text still matches.
    r"charlotte\ndouglas|charlotte_douglas|^clt$",
    r"phoenix\nsky\nharbor|phoenix_sky_harbor|phoenix|phx",
    r"seattle|seatac|tacoma",
    r"dallas|dfw",
    r"houston|^iah$",
    # Longest alternative first, otherwise "st\npaul" is left behind.
    r"minneapolis\nst\npaul|minneapolis|^msp$",
    r"mexico\ncity|^mex$",
    r"wifi|wi-fi",
    r"line|queue",
    r"usa|^us$|^america$",
    # Leave "british\nairways" alone here so the airline rule below can
    # still see it; every other "british" becomes "uk".
    r"^uk$|british(?!\nairways)",
    r"terminal|t1|t2|t3|t4|t5|2a|2b|2c|2d|2e|2f|2g",
    r"abu\ndhabi",
    r"laguardia|la\nguardia",
    r"british\nairways|^ba$",
    r"virgin\natlantic",
    r"united\nairline|^ua$",
    # `e?` matches the correct spelling as well as the original "amrican".
    r"ame?rican\nairline|^aa$",
]

# Canonical replacement for the pattern at the same index in `patterns`.
# NOTE: "los_angels" and "orland" are spelled this way on purpose here: the
# stop-word pattern defined later in this file uses the same spellings, so
# they must only be corrected in both places at once.
terms = [
    "hong_kong",
    "taiwan",
    "kuala_lumpur",
    "narita",
    "haneda",
    "manila",
    "beijing",
    "bangkok",
    "incheon",
    "changi",
    "chicago",
    "new_york",
    "san_francisco",
    "atlanta",
    "los_angels",
    "shanghai",
    "jakarta",
    "paris",
    "rome",
    "frankfurt",
    "heathrow",
    "delhi",
    "mumbai",
    "dubai",
    "hamad",
    "newark",
    "las_vegas",
    "denver",
    "orland",
    "ataturk",
    "madrid",
    "amsterdam",
    "toronto",
    "munich",
    "gatwick",
    "sydney",
    "barcelona",
    "guangzhou",
    "miami",
    "charlotte_douglas",
    "phoenix",
    "seattle",
    "dallas",
    "houston",
    "minneapolis",
    "mexico_city",
    "wifi",
    "line",
    "usa",
    "uk",
    "terminal",
    "abu_dhabi",
    "laguardia",
    "british_airways",
    "virgin_atlantic",
    "united_airline",
    "american_airline",
]

In [5]:
# Regex matching tokens to delete from the normalised text: review-site
# boilerplate words plus the canonical airport/city names.
#
# The text it is applied to holds one token per line, so the pattern is
# anchored to whole lines (`^...$`, with the MULTILINE flag set inline via
# `(?m)`). This keeps a stop word from being cut out of the middle of a longer
# token (e.g. "paris" out of "comparison", "review" out of "reviewer").
# The trailing `\n?` also swallows the token's line break when there is one.
# "los_angels" and "orland" are the spellings produced by the normalisation
# table earlier in this file, so they are matched as-is.
stop_words = r"(?m)^(?:" + "|".join([
    "unverified",
    "review",
    "verify",
    "hong_kong",
    "taiwan",
    "kuala_lumpur",
    "narita",
    "haneda",
    "kansai",
    "manila",
    "beijing",
    "bangkok",
    "incheon",
    "changi",
    "chicago",
    "vancouver",
    "new_york",
    "san_francisco",
    "atlanta",
    "los_angels",
    "shanghai",
    "jakarta",
    "paris",
    "rome",
    "frankfurt",
    "heathrow",
    "delhi",
    "mumbai",
    "dubai",
    "hamad",
    "newark",
    "las_vegas",
    "denver",
    "orland",
    "ataturk",
    "madrid",
    "amsterdam",
    "toronto",
    "munich",
    "gatwick",
    "sydney",
    "barcelona",
    "guangzhou",
    "miami",
    "charlotte_douglas",
    "phoenix",
    "seattle",
    "dallas",
    "houston",
    "mexico_city",
]) + r")$\n?"

In [6]:
# Lemmatise every review, keep only selected parts of speech, normalise
# spelling variants via `patterns`/`terms`, strip `stop_words`, and collect
# the result as one space-separated string per review in `output['text']`.
morphed_text = []
# TreeTagger POS tags to keep (noun, adjective and verb tags in the English
# tagset -- confirm against the TreeTagger tagset documentation).
pos_list = ['NN', 'JJ', 'NNS', 'VVD', 'NP', 'VVN', 'JJS', 'VV', 'JJR']

# Compile every regex once up front instead of resolving it for each review.
regex_flags = re.MULTILINE | re.DOTALL
normalisers = [(re.compile(pattern, regex_flags), term)
               for term, pattern in zip(terms, patterns)]
stop_word_re = re.compile(stop_words, regex_flags)

for text in texts:
    morph = pd.DataFrame(treetaggerwrapper.make_tags(tagger.TagText(text)))
    if morph.empty:
        # No tags at all means there is no 'pos' column to filter on; emit an
        # empty string so `output` stays row-aligned with `review`.
        morphed_text.append("")
        continue
    morph_sp = morph.loc[morph['pos'].isin(pos_list), 'lemma'].str.lower()
    # Drop single-character lemmas.
    morph_sp = morph_sp[morph_sp.str.len() != 1]
    # One token per line, so `^`/`$` in the patterns address single tokens
    # and `\n` in a pattern spans consecutive tokens.
    tmp_text = '\n'.join(morph_sp)
    # sub() returns the input unchanged when nothing matches, so no separate
    # search is needed first.
    for regex, term in normalisers:
        tmp_text = regex.sub(term, tmp_text)
    tmp_text = stop_word_re.sub("", tmp_text)
    morphed_text.append(tmp_text.replace("\n", " "))
output = pd.DataFrame(morphed_text, columns=['text'])
