# OAB to CSV
Convert the mythical OAB format into a CSV.
Note that the conversion is rough: some fields are left raw, and binary fields are written out as hex-encoded bytes.

This should help you open the file and extract the information that you want, but is in no way an exhaustive solution.

Just drop your `udetails.oab` file in the same directory as this notebook.

OAB Parser code from:
https://github.com/byteDJINN/BOA

In [None]:
from struct import unpack, error
from io import BytesIO
import math
import binascii
from schema import PidTagSchema
import json
import re 
from tqdm import tqdm

In [None]:
# Helpers for translating numeric property ids into PidTagSchema entries.
def hexify(PropID):
  """Return PropID as upper-case hex, zero-padded to eight digits, without the 0x prefix."""
  # "#010X" renders e.g. 0X0000ABCD (width 10 including the prefix);
  # slicing off the first two characters drops the "0X".
  prefixed = format(PropID, "#010X")
  return prefixed[2:]

def lookup(ulPropID):
  """Return the PidTagSchema property name for ulPropID, or hex(ulPropID) if it is not in the schema."""
  key = hexify(ulPropID)
  if key not in PidTagSchema:
    # Unknown property: fall back to the plain hex representation.
    return hex(ulPropID)
  # Schema entries are (name, type) pairs; only the name is wanted here.
  name, _ptype = PidTagSchema[key]
  return name
# Originally, this script only output the DisplayName of each entry.
# Those results are collected in this dict: the parsing cell fills it with
# the name part of a matching DisplayName mapped to the number found in its
# trailing parentheses.
d = {}

In [None]:
# Parse udetails.oab into `users` (records read without errors) and
# `broken_users` (records that hit a struct read error part-way through).
# When reading a binary file, always add a 'b' to the file open mode
with open('udetails.oab', 'rb') as f:
  ###############
  # BEGIN MAGIC #
  ###############
  # File header: three little-endian uint32 values.
  (ulVersion, ulSerial, ulTotRecs) = unpack('<III', f.read(4 * 3))
  assert ulVersion == 32, 'This only supports OAB Version 4 Details File'
  print("Total Record Count: ", ulTotRecs)
  # OAB_META_DATA (the code treats cbSize as including its own four bytes)
  cbSize = unpack('<I', f.read(4))[0]
  meta = BytesIO(f.read(cbSize - 4))
  # the length of the header attributes
  # we don't know and don't really need to know how to parse these
  HDR_cAtts = unpack('<I', meta.read(4))[0]
  print("rgHdrAtt HDR_cAtts",HDR_cAtts)
  for rgProp in range(HDR_cAtts):
    ulPropID = unpack('<I', meta.read(4))[0]
    ulFlags  = unpack('<I', meta.read(4))[0]
  # these are the attributes that we actually care about
  OAB_cAtts = unpack('<I', meta.read(4))[0]
  OAB_Atts = []
  print("rgOabAtts OAB_cAtts", OAB_cAtts)
  for rgProp in range(OAB_cAtts):
    ulPropID = unpack('<I', meta.read(4))[0]
    ulFlags  = unpack('<I', meta.read(4))[0]
    OAB_Atts.append(ulPropID)
  print("Actual Count", len(OAB_Atts))
  # OAB_V4_REC (Header Properties): read and discard
  cbSize = unpack('<I', f.read(4))[0]
  f.read(cbSize - 4)
  ###############
  #  END  MAGIC #
  ###############

  def read_str(stream):
    """Read a null-terminated string from stream and return it as text."""
    # strings in the OAB format are null-terminated
    buf = bytearray()
    while True:
      n = stream.read(1)
      if n == b"\0" or n == b"":
        break
      buf += n
    # Bytes that are not valid UTF-8 are replaced instead of raising
    # UnicodeDecodeError, which would otherwise abort the whole conversion.
    return buf.decode('utf-8', errors='replace')

  def read_int(stream):
    """Read a variable-length integer from stream.

    A first byte of 0x81..0x84 means that (byte - 0x80) little-endian bytes
    follow and hold the value. Any other first byte above 127 yields -1.
    Otherwise the first byte is the value itself.
    Raises struct.error when the stream is exhausted.
    """
    byte_count = unpack('<B', stream.read(1))[0]
    if 0x81 <= byte_count <= 0x84:
      # Pad to four bytes so the value can be unpacked as one uint32.
      return unpack('<I', (stream.read(byte_count - 0x80) + b"\0\0\0")[0:4])[0]
    if byte_count > 127:
      return -1
    return byte_count

  # Matches display names such as "Some Name (123)".
  display_name_re = re.compile(r"^([A-Za-z ]+) (\([0-9]+\))$")

  # Stores users that have all their data intact and with no read errors.
  users = []
  # Stores users that errored out on reading.
  # You should probably double check they're ok with your .oab file
  broken_users = []

  error_count = 0
  seen_emails = set()

  # 900000 is only an upper bound on the number of records to attempt;
  # the loop stops as soon as the end of the file is reached.
  for counter in tqdm(range(900000)):
    # Start every record from a clean dict so the error handler below never
    # sees the previous record's data.
    user = {}
    try:
      read = f.read(4)
      # A binary read returns b'' at EOF (never ''); a short read means the
      # file is truncated. Either way there is no further record.
      if len(read) < 4:
        print(f"Stopped at {counter}")
        break
      # this is the size of the chunk, incidentally its inclusive
      cbSize = unpack('<I', read)[0]
      # so to read the rest, we subtract four
      chunk = BytesIO(f.read(cbSize - 4))
      # One presence bit per attribute, most significant bit first.
      presenceBitArray = bytearray(chunk.read(int(math.ceil(OAB_cAtts / 8.0))))
      indices = [i for i in range(OAB_cAtts) if (presenceBitArray[i // 8] >> (7 - (i % 8))) & 1 == 1]

      # Start building a user by manually scanning the file for key:value pairs
      for i in indices:
        PropID = hexify(OAB_Atts[i])
        if PropID not in PidTagSchema:
          # Unknown property: skipped without consuming any bytes.
          # NOTE(review): if such a property carries a value in the chunk,
          # the fields after it will be misread - confirm against your file.
          continue

        (Name, Type) = PidTagSchema[PropID]

        if Type == "PtypString8" or Type == "PtypString":
          val = read_str(chunk)
          user[Name] = val
          # This is the previous script default behavior: collect
          # "name -> number" pairs from display names like "Some Name (123)".
          if Name == "DisplayName":
            a = display_name_re.match(val)
            if a:
              d[a.group(1)] = a.group(2)[1:-1]
        elif Type == "PtypBoolean":
          user[Name] = unpack('<?', chunk.read(1))[0]
        elif Type == "PtypInteger32":
          user[Name] = read_int(chunk)
        elif Type == "PtypBinary":
          blob = chunk.read(read_int(chunk))
          user[Name] = binascii.b2a_hex(blob)
        elif Type == "PtypMultipleString" or Type == "PtypMultipleString8":
          item_count = read_int(chunk)
          user[Name] = [read_str(chunk) for _ in range(item_count)]
        elif Type == "PtypMultipleInteger32":
          item_count = read_int(chunk)
          arr = []
          for _ in range(item_count):
            val = read_int(chunk)
            if Name == "OfflineAddressBookTruncatedProperties":
              # These values are property ids; show their names when known.
              val = hexify(val)
              if val in PidTagSchema:
                val = PidTagSchema[val][0]
            arr.append(val)
          user[Name] = arr
        elif Type == "PtypMultipleBinary":
          item_count = read_int(chunk)
          arr = []
          for _ in range(item_count):
            blob_len = read_int(chunk)
            blob = chunk.read(blob_len)
            arr.append(binascii.b2a_hex(blob))
          user[Name] = arr
        else:
          # Raising a plain string is itself a TypeError in Python 3.
          raise ValueError("Unknown property type (" + Type + ")")
      remains = chunk.read()
      users.append(user)
    except KeyboardInterrupt:
      exit(1)
    except error:
      # A struct read failed part-way through the record.
      # Log what was parsed so far and move on.
      error_count += 1
      email = user.get("SmtpAddress")
      if email is not None:
        # Safety net kept from the original script: stop when the same
        # broken address shows up twice.
        if email in seen_emails:
          print(f"Duplicate {email} found. Stopping now")
          break
        seen_emails.add(email)
      # Records that failed before any field was read carry no information.
      if user:
        broken_users.append(user)
   

In [None]:
     
# Old writing behavior, kept for reference (the original author reported it
# never worked because JSON does not accept bytes):
#json_out = open('test.json', 'w')
#json_out.write(json.dumps(d, indent=4))
#json_out.close()


# Cache the parsed records if you want: one str(dict) per line,
# broken records first, then the clean ones.
for path, records in (("broken_users.txt", broken_users), ("users.txt", users)):
  with open(path, "w", encoding="utf-8") as file:
    for user in records:
      file.write(f"{user}\n")


# Data Processing
Here we convert the parsed records (a list of Python dicts) into a pretty CSV.

In [None]:
import pandas as pd
# Show up to 50 columns when a DataFrame is displayed.
pd.options.display.max_columns = 50
# Broken records come first, followed by the cleanly parsed ones.
data = [*broken_users, *users]
# One row per record; the columns come from the dict keys.
df = pd.DataFrame(data)

In [None]:
# Here's the DataFrame! You can edit it, remove columns, search and do other DS stuff to it!
# A bare expression on the last line of a notebook cell is displayed as the cell's output.
df

In [None]:
# The money: write the DataFrame to converted.csv, encoded as UTF-8.
df.to_csv("converted.csv", encoding="utf-8")