In [1]:
import gzip,os,tarfile,sys
sys.path.append(os.pardir+'/src')
from settings import *
from boto3.session import Session
import datetime
import traceback
import logging
import pandas as pd
from pandas import DataFrame
import xml.etree.ElementTree as et

In [None]:
# Download article data from S3
def downloadFile(bucket, tag, target_day, dest_dir=None):
    """Download every object whose key starts with ``tag + target_day``.

    Args:
        bucket: S3 bucket resource; must provide ``objects.filter(Prefix=...)``
            and ``download_file(key, path)``.
        tag: leading part of the key prefix (the notebook uses ``"EID34151_"``).
        target_day: remaining part of the key prefix.
        dest_dir: directory the files are saved into. Defaults to ``DATA_DIR``.

    Returns:
        list[str]: local paths that were written, in listing order.
    """
    if dest_dir is None:
        dest_dir = DATA_DIR

    prefix = tag + target_day
    saved_paths = []

    for obj in bucket.objects.filter(Prefix=prefix):
        # Name the local file after the object's own key so that several
        # matches for one prefix do not overwrite each other. Fall back to the
        # prefix when the key has no final path component.
        local_name = os.path.basename(obj.key) or prefix
        path = os.path.join(dest_dir, local_name)
        bucket.download_file(obj.key, path)
        saved_paths.append(path)

    return saved_paths

# Build the file names for `span` consecutive days beginning at `start`
def makeDateList(start, span, prefix='EID34151_', suffix='.xml.gz'):
    """Return one file name per day for ``span`` days starting at ``start``.

    Args:
        start: ``datetime`` (or ``date``) of the first day.
        span: number of days; anything ``int()`` accepts (the notebook passes
            a string). Zero or negative values yield an empty list.
        prefix: text placed before the ``YYYYMMDD`` date stamp.
        suffix: text placed after the date stamp.

    Returns:
        list[str]: names of the form ``prefix + YYYYMMDD + suffix``.
    """
    return [
        prefix + (start + datetime.timedelta(days=offset)).strftime("%Y%m%d") + suffix
        for offset in range(int(span))
    ]

In [None]:
start_time = datetime.datetime.now()

# First day to download and the number of days to cover (hard-coded here)
start_date = '20130101'
span = '365'
start = datetime.datetime.strptime(start_date, '%Y%m%d')

# Connect to S3
session = Session(aws_access_key_id=AWS_ACCESS_KEY_ID,aws_secret_access_key=AWS_SECRET_ACCESS_KEY)
s3 = session.resource('s3')
bucket = s3.Bucket(BUCKET_NAME)

# Key prefix and the list of per-day file names
tag = "EID34151_"
dateList = makeDateList(start, span)

for date in dateList:
    # Named loop_start rather than `time` so it cannot clash with the `time`
    # module that a later cell imports.
    loop_start = datetime.datetime.now()
    # makeDateList returns full file names; drop the leading tag because
    # downloadFile prepends it again.
    date = date[len(tag):]
    try:
        downloadFile(bucket, tag, date)
        print(date + ' was done ' + str(datetime.datetime.now()-loop_start))
    except Exception:
        print('error! ' + date)
        # logging.error returns None, so log the traceback directly instead of
        # printing the call's return value.
        logging.error(traceback.format_exc())

print('it taked ' + str(datetime.datetime.now() - start_time))

In [None]:
# 片山さんのプログラムによる抽出 csv1へ格納
import gzip,os,tarfile,sys
from settings import *
import pandas as pd
from pandas import DataFrame
from boto3.session import Session
import xml.etree.ElementTree as et
import datetime


# Decompress an .xml.gz file, parse its XML structure and write a CSV file
def convertToCSV(file_name, src_dir=None, dest_dir=None):
    """Convert one gzip-compressed XML file into a header-less CSV.

    Each ``ContentT`` element is treated as one article. Articles whose
    ``LanguageString`` is ``JAPANESE`` or ``ENGLISH`` are written with the
    columns Id, Headline, TimeOfArrival, Language (in that order). Exact
    duplicate rows are dropped.

    Args:
        file_name: name of the ``.xml.gz`` file inside ``src_dir``.
        src_dir: directory holding the input. Defaults to
            ``DATA_DIR/original_Data``.
        dest_dir: directory receiving the CSV. Defaults to ``DATA_DIR/csv1``.
    """
    start = datetime.datetime.now()

    if src_dir is None:
        src_dir = os.path.join(DATA_DIR, 'original_Data')
    if dest_dir is None:
        dest_dir = os.path.join(DATA_DIR, 'csv1')

    # Parse inside a with-block so the file is closed even if parsing fails
    with gzip.open(os.path.join(src_dir, file_name), 'rb') as f:
        root = et.parse(f).getroot()

    records = []
    # Number of articles that could not be read
    fail_cnt = 0

    # Element.iter replaces getiterator, which was removed in Python 3.9
    for content in root.iter('ContentT'):
        try:
            lang = content.find(".//LanguageString").text
            if lang == 'JAPANESE' or lang == 'ENGLISH':
                # Read every field before storing anything: a missing element
                # must not leave a half-filled row behind.
                record = (
                    content.find(".//Id/SUID").text,
                    content.find(".//Headline").text,
                    content.find(".//TimeOfArrival").text,
                    lang,
                )
                records.append(record)
        except Exception:
            # A required element was missing (find() returned None)
            fail_cnt += 1

    # Column order is stated explicitly because the CSV has no header row
    df = DataFrame(
        records, columns=["Id", "Headline", "TimeOfArrival", "Language"]
    ).drop_duplicates()

    # Save the DataFrame as a CSV file
    df.to_csv(os.path.join(dest_dir,file_name.replace(".xml.gz",".csv")),encoding='utf8',header=False,index=False)

    print('file_name:' + file_name + ' time:' + str(datetime.datetime.now()-start) + ' record_count:' + str(len(df)) + ' fail_count:' + str(fail_cnt))


start_time = datetime.datetime.now()
print('This processing started at ' + str(start_time))

# All entries in the raw-data folder
files = os.listdir(DATA_DIR+'/original_Data')
# All entries in the CSV output folder
csvs = os.listdir(DATA_DIR + '/csv1/')

for gz in files:
    # Convert only .xml.gz files whose CSV does not exist yet. The CSV name is
    # derived the same way convertToCSV derives its output name.
    if gz.endswith('.xml.gz') and gz.replace(".xml.gz", ".csv") not in csvs:
        convertToCSV(gz)

# Report the elapsed time in seconds (not the current timestamp)
print('It taked ' + str((datetime.datetime.now() - start_time).total_seconds()) + ' seconds')


In [19]:
# Nakashima's CSV conversion: keeps the article body and handles Japanese
# articles only. Output goes to EID34151/myCsv by default.
def my_convertToCSV(file_name, src_dir=None, dest_dir=None):
    """Convert one gzip-compressed XML file into a CSV that includes bodies.

    Each ``ContentT`` element is treated as one article. Only articles whose
    ``LanguageString`` is ``JAPANESE`` and whose ``Body`` text is not a single
    space are kept. Columns are Id, Headline, Body, TimeOfArrival; rows with a
    repeated Id are dropped (first occurrence wins). A header row is written.

    Args:
        file_name: name of the ``.xml.gz`` file inside ``src_dir``.
        src_dir: directory holding the input. Defaults to
            ``DATA_DIR/EID34151/original_Data/2017``.
        dest_dir: directory receiving the CSV. Defaults to
            ``DATA_DIR/EID34151/myCsv``.
    """
    start = datetime.datetime.now()

    if src_dir is None:
        src_dir = os.path.join(DATA_DIR, 'EID34151/original_Data/2017')
    if dest_dir is None:
        dest_dir = os.path.join(DATA_DIR, 'EID34151/myCsv')

    # Parse inside a with-block so the file is closed even if parsing fails
    with gzip.open(os.path.join(src_dir, file_name), 'rb') as f:
        root = et.parse(f).getroot()

    records = []
    # Number of articles that could not be read
    fail_cnt = 0

    # Element.iter replaces getiterator, which was removed in Python 3.9
    for content in root.iter('ContentT'):
        try:
            if content.find(".//LanguageString").text != 'JAPANESE':
                continue
            body = content.find(".//Body").text
            # Compare by value; `is not ' '` tested object identity
            if body == ' ':
                continue
            # Read every field before storing anything: a missing element
            # must not leave a half-filled row behind.
            records.append((
                content.find(".//Id/SUID").text,
                content.find(".//Headline").text,
                body,
                content.find(".//TimeOfArrival").text,
            ))
        except Exception:
            # A required element was missing (find() returned None)
            fail_cnt += 1

    print('fail_cnt:', fail_cnt)

    df = DataFrame(records, columns=["Id", "Headline", "Body", "TimeOfArrival"])
    df = df.drop_duplicates(['Id'])
    # Save the DataFrame as a CSV file
    df.to_csv(os.path.join(dest_dir,file_name.replace(".xml.gz",".csv")),encoding='utf8',index=False)

    print('file_name:' + file_name + ' time:' + str(datetime.datetime.now()-start) + ' record_count:' + str(len(df)) + ' fail_count:' + str(fail_cnt))

In [13]:
# List the raw files of each year. os.listdir returns entries in arbitrary
# order, so sort them: later cells pick files by position (files_2017[12]).
files_2014 = sorted(os.listdir(DATA_DIR+'/EID34151/original_Data/2014'))
files_2015 = sorted(os.listdir(DATA_DIR+'/EID34151/original_Data/2015'))
files_2016 = sorted(os.listdir(DATA_DIR+'/EID34151/original_Data/2016'))
files_2017 = sorted(os.listdir(DATA_DIR+'/EID34151/original_Data/2017'))

# Note: files_2014 is not part of this list
hoge = [files_2015, files_2016, files_2017]

# EID34151_20150203.xml.gz could not be decompressed
# EID34151_20151026.xml.gz could not be decompressed

In [21]:
# Show which file sits at index 12 (the one a later cell converts on its own)
files_2017[12]

'EID34151_20170113.xml.gz'

In [15]:
# 2016 files excluded from processing.
# NOTE(review): presumably these failed to decompress or parse - confirm.
_EXCLUDED_2016 = {
    'EID34151_20161216.xml.gz',
    'EID34151_20161215.xml.gz',
    'EID34151_20160428.xml.gz',
    'EID34151_20160524.xml.gz',
    'EID34151_20160602.xml.gz',
    'EID34151_20160607.xml.gz',
    'EID34151_20160211.xml.gz',
    'EID34151_20160301.xml.gz',
}
# Filter the existing list object in place (other names refer to the same
# list). Unlike list.remove, this does not raise when the cell is run again.
files_2016[:] = [name for name in files_2016 if name not in _EXCLUDED_2016]

In [20]:
# Convert every 2017 file; report failures and carry on with the next file
for gz_name in files_2017:
    try:
        my_convertToCSV(gz_name)
    except Exception as exc:
        # `except Exception` (not a bare except) so the long-running loop can
        # still be interrupted from the keyboard.
        print('error')
        print(gz_name)
        print(repr(exc))

fail_cnt: 62
file_name:EID34151_20170101.xml.gz time:0:00:12.992308 record_count:679 fail_count:62
fail_cnt: 60
file_name:EID34151_20170102.xml.gz time:0:00:34.181156 record_count:172 fail_count:60
fail_cnt: 402
file_name:EID34151_20170103.xml.gz time:0:01:39.915972 record_count:834 fail_count:402
fail_cnt: 873
file_name:EID34151_20170104.xml.gz time:0:01:46.678015 record_count:1554 fail_count:873
fail_cnt: 1347
file_name:EID34151_20170105.xml.gz time:0:01:53.983500 record_count:1119 fail_count:1347
fail_cnt: 1412
file_name:EID34151_20170106.xml.gz time:0:01:48.867080 record_count:876 fail_count:1412
fail_cnt: 893
file_name:EID34151_20170107.xml.gz time:0:00:18.803075 record_count:51 fail_count:893
fail_cnt: 72
file_name:EID34151_20170108.xml.gz time:0:00:37.313134 record_count:509 fail_count:72
fail_cnt: 464
file_name:EID34151_20170109.xml.gz time:0:02:18.626374 record_count:1158 fail_count:464
fail_cnt: 905
file_name:EID34151_20170110.xml.gz time:0:02:10.804679 record_count:915 fail_

In [22]:
# Convert the file at index 12 on its own; the recorded output of this cell is
# an XML ParseError ("not well-formed (invalid token)").
my_convertToCSV(files_2017[12])

ParseError: not well-formed (invalid token): line 41236769, column 0 (<string>)

以下実験

In [None]:
# Data-analysis experiment: load one CSV and take its Headline column
from janome.tokenizer import Tokenizer
file_name = files[1]
hoge = pd.read_csv(
    os.path.join(DATA_DIR, 'csvData1', file_name.replace(".xml.gz", ".csv"))
)
headlines = hoge.loc[:, 'Headline']

In [None]:
# Usage reminder: tokenize a sample sentence and print every token whose third
# part-of-speech field is '人名' (person name).
# To try it on loaded data, pass headlines[0] instead of the literal.
t = Tokenizer()
tokens = t.tokenize('安倍晋三首相と、麻生太郎副総理兼財務相が、新たな「密約」を結んだという情報が飛び込んできた。中島悠太郎とドナルド・トランプは眠い')
for token in tokens:
    if token.part_of_speech.split(',')[2] != '人名':
        continue
    print(token)
        

In [None]:
# Reload the same CSV and drop the Id, Body and TimeOfArrival columns
file_name = files[1]
hoge = pd.read_csv(
    os.path.join(DATA_DIR, 'csvData1', file_name.replace(".xml.gz", ".csv"))
).drop(columns=['Id', 'Body', 'TimeOfArrival'])

In [None]:
# Number of rows in hoge
len(hoge)

In [None]:
# Preview the first rows of hoge
hoge.head()

In [None]:
# Print every token tagged as a person name ('人名') in each headline
t = Tokenizer()
for headline in hoge['Headline']:
    tokens = t.tokenize(headline)
    for token in tokens:
        if token.part_of_speech.split(',')[2] != '人名':
            continue
        print(token)

In [None]:
# Check the type obtained by flattening the frame with stack() then tolist()
type(hoge.stack().tolist())

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# just for measuring time
import time
start = time.time()

# Build the tokenizer once and reuse it, instead of creating a new Tokenizer
# for every document passed to myTokenizer.
_person_tokenizer = Tokenizer()

# Tokenizer callable handed to CountVectorizer; needs to receive unicode text
def myTokenizer(text):
    """Return the surface forms of the tokens tagged as person names.

    A token is kept when the third comma-separated field of its
    ``part_of_speech`` equals '人名' (person name).

    Args:
        text: the string to tokenize.

    Returns:
        list[str]: surface strings of the matching tokens, in order.
    """
    return [
        token.surface
        for token in _person_tokenizer.tokenize(text)
        if token.part_of_speech.split(',')[2] == '人名'
    ]


# Flatten the frame into a plain list of strings once and reuse it
docs = hoge.stack().tolist()
vectorizer = CountVectorizer(ngram_range=(1, 2),tokenizer=myTokenizer,min_df=2)
# fit_transform learns the vocabulary and builds the count matrix in one pass,
# so each document is tokenized once rather than twice (fit, then transform).
bow = vectorizer.fit_transform(docs)

# for time
elapsed_time = time.time() - start
print(elapsed_time)
print("bag_of_words with df as 2: {}\n".format(repr(bow)))

In [None]:
import numpy as np
# Largest count each feature reaches in any single document
max_value = bow.max(axis=0).toarray().ravel()
sorted_by_num = max_value.argsort()
# get_feature_names() was removed in scikit-learn 1.2; use
# get_feature_names_out() when available and fall back on older versions.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = np.array(vectorizer.get_feature_names_out())
else:
    feature_names = np.array(vectorizer.get_feature_names())
# bow holds raw counts from CountVectorizer, so label the output as counts
print("Features with highest count: \n{}\n".format(
      feature_names[sorted_by_num[-30:]]))

In [None]:
# vocabulary_ maps each term to its column index. Order the terms by that
# index and show only the first 30, as the message says.
print("First 30 features:\n{}".format(
    sorted(vectorizer.vocabulary_, key=vectorizer.vocabulary_.get)[:30]))

In [None]:
# Total count of every feature, computed once for all columns instead of
# extracting one sparse column per vocabulary entry.
column_totals = np.asarray(bow.sum(axis=0)).ravel()
freqs = [(word, column_totals[idx]) for word, idx in vectorizer.vocabulary_.items()]
#sort from largest to smallest
print (sorted (freqs, key = lambda x: -x[1]))

In [None]:
# https://labs.goo.ne.jp/api/jp/named-entity-extraction/
# うまくいかないようであれば

In [None]:
# 図示にあたって

In [None]:
# Midnight on 2014-01-01
start = datetime.datetime(2014, 1, 1)

In [None]:
# Build a 7-day list of file names from `start` and check the result's type
a = makeDateList(start, '7')
type(a)

In [None]:
# Two sample datetimes: 2014-02-01 and 2014-03-01 (both at midnight)
a = datetime.datetime(2014, 2, 1)
b = datetime.datetime(2014, 3, 1)

In [None]:
# Compare the two datetimes defined in the previous cell
a<b

In [None]:
# Takes a dict of date -> headline
# flag selects the unit: 'month' or 'year'
def shapeData(data, flag):
    """Shape headline data per month or per year (not implemented yet).

    The per-unit shaping was never written: the branches were empty, which
    made the cell a syntax error. They are now explicit placeholders, so a
    valid flag returns an empty DataFrame.

    Args:
        data: dict mapping a date to a headline (currently unused).
        flag: ``'month'`` or ``'year'``.

    Returns:
        pandas.DataFrame: currently always empty.

    Raises:
        ValueError: if ``flag`` is neither ``'month'`` nor ``'year'``.
    """
    if flag not in ('month', 'year'):
        raise ValueError("flag must be 'month' or 'year', got {!r}".format(flag))

    res = pd.DataFrame()
    if flag == 'month':
        # TODO: implement monthly shaping
        pass
    else:
        # TODO: implement yearly shaping
        pass

    return res