# Imports

In [1]:
import tarfile
import numpy as np
import codecs
import zlib
from bs4 import BeautifulSoup

import spinn3rApi_pb2 as proto_api
import protoStream_pb2 as proto_stream
from google.protobuf.internal.decoder import _DecodeVarint32

# Helper Functions

In [2]:
# Each .protostream file contains exactly 200 distinct records

def _extract_entry_fields(entry):
    """Flatten one parsed ``Entry`` message into a dict of the fields we keep.

    Extraction is best-effort: every field is read independently, and a field
    whose lookup fails (for example indexing ``[0]`` on an empty repeated
    field, or content that cannot be decompressed/parsed) is simply left out
    of the result instead of aborting the whole record.

    Parameters
    ----------
    entry : proto_api.Entry
        An already-parsed entry message.

    Returns
    -------
    dict
        Column name -> extracted value. Keys whose extraction failed are
        absent, so different rows may carry different key sets.
    """
    # (column name, zero-argument getter). The getters are lambdas so that the
    # whole attribute chain is evaluated inside the try block below.
    extractors = (
        # Language
        ('Lang_Code', lambda: entry.feed_entry.lang[0].code),
        # Language Probability
        ('Lang_Prob', lambda: entry.feed_entry.lang[0].probability),
        # Title
        ('Post_Title', lambda: entry.feed_entry.title),
        # Content: zlib-compressed markup reduced to its visible text
        ('Post_Content', lambda: BeautifulSoup(
            zlib.decompress(entry.feed_entry.content.data),
            "html.parser").text),
        # Link
        ('Post_Link', lambda: entry.feed_entry.link[0].href),
        # Author Name
        ('Author_Name', lambda: entry.feed_entry.author[0].name),
        # Author Link
        ('Author_Link', lambda: entry.feed_entry.author[0].link[0].href),
        # Date & Time
        ('Datetime', lambda: entry.feed_entry.last_published),
        # Identifier
        ('Identifier', lambda: entry.feed_entry.identifier),
        # Spam Probability
        ('Spam', lambda: entry.feed_entry.spam_probability),
        # Publisher Type
        ('Type', lambda: entry.source.publisher_type),
        # Category
        ('Category', lambda: entry.feed_entry.category),
    )

    data = {}
    for key, extract in extractors:
        try:
            data[key] = extract()
        except Exception:
            # Deliberately best-effort: skip just this field. `Exception`
            # (rather than a bare `except:`) lets KeyboardInterrupt and
            # SystemExit propagate so a long batch run can still be stopped.
            continue
    return data


def read_protostream_file(file):
    """Parse the raw bytes of one ``.protostream`` file into a list of row dicts.

    The stream is read as a sequence of varint-length-prefixed protobuf
    messages: one header, then repeated delimiter messages. A delimiter of
    type ``ENTRY`` is followed by one length-prefixed ``Entry`` message; a
    delimiter of type ``END`` stops the scan.

    Parameters
    ----------
    file : bytes
        Full contents of the stream (the caller passes the result of
        ``.read()`` on an extracted tar member).

    Returns
    -------
    list of dict
        One dict per ``ENTRY`` record, built by ``_extract_entry_fields``.
        An empty input yields an empty list.
    """
    # An empty buffer has no header to decode; report "no records" instead of
    # letting the varint decoder fail on it.
    if not file:
        return []

    decoder = _DecodeVarint32

    ## the two framing messages that are reused while scanning; the third
    ## message type (Entry) is created fresh for every record below
    header    = proto_stream.ProtoStreamHeader()
    delimiter = proto_stream.ProtoStreamDelimiter()

    ## get the header
    # length, pos = decoder(file, pos) => pos is starting point, length is the length of the record
    length, pos = decoder(file, 0)
    header.ParseFromString(file[pos:pos + length])
    ## should check its contents

    pos += length
    data_rows = []

    while pos < len(file):
        length, pos = decoder(file, pos)
        delimiter.ParseFromString(file[pos:pos + length])
        if delimiter.delimiter_type == delimiter.END:
            break
        elif delimiter.delimiter_type == delimiter.ENTRY:
            # Step over the delimiter, then read the entry that follows it.
            pos += length
            length, pos = decoder(file, pos)
            entry = proto_api.Entry()
            entry.ParseFromString(file[pos:pos + length])
            data_rows.append(_extract_entry_fields(entry))
        # Skip whichever message was read last: the entry payload in the
        # ENTRY branch, otherwise the delimiter itself.
        pos += length
    return data_rows

# Main

In [None]:
# Location and names of the gzipped tar archives to process.
# NOTE(review): PREFIX is an absolute path to a local volume; the cell only
# runs on a machine where that volume is mounted.
PREFIX = '/Volumes/Khalil Mrini/'
FILES = '01-14-OTHER 01-14-SOCIAL_MEDIA 02-11-OTHER 02-11-SOCIAL_MEDIA'.split(' ')
EXTENSION = '.tar.gz'

# Accumulates the row dicts parsed from every member of every archive.
data_rows = []

for file_name in FILES:
    print('Opening', file_name, '...')
    # The context manager closes the archive handle even if parsing a member
    # raises, instead of leaking one open archive per loop iteration.
    with tarfile.open(PREFIX + file_name + EXTENSION, "r:gz") as tar:
        members = tar.getmembers()
        member_count = len(members)
        print('There are {} files in {}.'.format(member_count, file_name + EXTENSION))
        for member_index, member in enumerate(members):
            # '\r' rewrites the same output line so progress does not scroll.
            print(member_index, 'out of', member_count, end='\r', flush=True)
            file_data = tar.extractfile(member)
            # extractfile() returns None for members that are not regular
            # files or links (e.g. directories); there is nothing to parse.
            if file_data is None:
                continue
            with file_data:
                content = file_data.read()
            data_rows.extend(read_protostream_file(content))

Opening 01-14-OTHER ...
There are 3577 files in 01-14-OTHER.tar.gz.
39 out of 3577

  ' Beautiful Soup.' % markup)


375 out of 3577

  ' that document to Beautiful Soup.' % decoded_markup


473 out of 3577

  ' that document to Beautiful Soup.' % decoded_markup


482 out of 3577

  ' that document to Beautiful Soup.' % decoded_markup


633 out of 3577

  ' that document to Beautiful Soup.' % decoded_markup


752 out of 3577

  ' that document to Beautiful Soup.' % decoded_markup


930 out of 3577

  ' that document to Beautiful Soup.' % decoded_markup


1020 out of 3577

  ' that document to Beautiful Soup.' % decoded_markup


1167 out of 3577


http://sphotos.ak.fbcdn.net/hphotos-ak-snc4/hs766.snc4/66604_445849826119_262902376119_6061706_7256736_n.jpg" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.
  ' that document to Beautiful Soup.' % decoded_markup


1317 out of 3577

  ' that document to Beautiful Soup.' % decoded_markup


1340 out of 3577


http://pics.plentyoffish.com/dating/78/54/2wyorrb2rj_116662594.jpg" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.
  ' that document to Beautiful Soup.' % decoded_markup

Thanks!!!" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.
  ' that document to Beautiful Soup.' % decoded_markup


1372 out of 3577

  ' that document to Beautiful Soup.' % decoded_markup


1381 out of 3577 out of 3577

  ' that document to Beautiful Soup.' % decoded_markup


1425 out of 3577

# Reading a Single `.protostream` File

In [None]:
# Each .protostream file contains exactly 200 distinct records

def read_protostream_entry(file):
    """Parse the raw bytes of one ``.protostream`` file into ``Entry`` messages.

    Uses the same framing as the row-extracting reader: a varint-length-
    prefixed header, then repeated length-prefixed delimiter messages. An
    ``ENTRY`` delimiter is followed by one length-prefixed ``Entry``; an
    ``END`` delimiter stops the scan. Unlike the row-extracting reader, the
    parsed messages are returned untouched so that every field stays
    available for inspection.

    Parameters
    ----------
    file : bytes
        Full contents of the stream.

    Returns
    -------
    list of proto_api.Entry
        The parsed entries in stream order. An empty input yields an empty
        list.
    """
    # An empty buffer has no header to decode; report "no records" instead of
    # letting the varint decoder fail on it.
    if not file:
        return []

    decoder = _DecodeVarint32

    ## the two framing messages that are reused while scanning; the third
    ## message type (Entry) is created fresh for every record below
    header    = proto_stream.ProtoStreamHeader()
    delimiter = proto_stream.ProtoStreamDelimiter()

    ## get the header
    # length, pos = decoder(file, pos) => pos is starting point, length is the length of the record
    length, pos = decoder(file, 0)
    header.ParseFromString(file[pos:pos + length])
    ## should check its contents

    pos += length
    data_rows = []

    while pos < len(file):
        length, pos = decoder(file, pos)
        delimiter.ParseFromString(file[pos:pos + length])
        if delimiter.delimiter_type == delimiter.END:
            break
        elif delimiter.delimiter_type == delimiter.ENTRY:
            # Step over the delimiter, then read the entry that follows it.
            pos += length
            length, pos = decoder(file, pos)
            entry = proto_api.Entry()
            entry.ParseFromString(file[pos:pos + length])
            data_rows.append(entry)
        # Skip whichever message was read last: the entry payload in the
        # ENTRY branch, otherwise the delimiter itself.
        pos += length
    return data_rows

Languages: en, fr, ar
Keywords:
- Tunisia: tunis+
- Egypt: egypt