#### Processing 'personal_dataset.txt' into a dictionary and saving it to a JSON file

In [7]:
import re
import json

# Default file to work on is personal_dataset.txt
def to_json(file="personal_dataset.txt"):
    
    # Opening file containing data.
    with open(file,"r") as f:
        data = f.read()
        f.close()
    
    # Getting the content of the file line by line. Entire data for each person is 
    # contained within blocks of characters separated with new lines
    content = re.findall(r'.+',data)
    
    # Creating empty lists for various data
    phone, raw_names, email, raw_job, date = [],[],[],[],[]
    
    # Looping through data elements (blocks of characters) in content
    for i in content:
        
        # [0] so in some cases we dont append this as list of lists
        phone.append(re.findall(r"\d-\d{3}-\d{3}-\d{4}",i)[0])
        raw_names.append(re.findall(r"[A-Z][a-z]+_[A-Z][a-z]+",i)[0].split('_'))
        email.append(re.findall(r"[A-Z][a-z]+_[A-Z][a-z]+\d{4}@[a-z0-9]+\.[a-z]+",i)[0])
        raw_job.append(re.findall(r"[A-Z]+[a-z]+[A-Z]*[a-z]*",i))
        date.append(re.findall(r"\d{1,2}/\d{1,2}/\d{4}",i)[0])

    # Flattening list of names
    flat_names = [] 
    for people in raw_names:
        for first_last_name in people:
            flat_names.append(first_last_name)


    # Minig job titles parts 1-4

    # 1. Joining results into one string without any spaces
    join_job = [''.join(x) for x in raw_job]

    # 2. Splitting string by the uppercase letters
    split_job = []
    for i in join_job:
        # no double uppercase words "IT" "HR" etc
        split_job.append(re.findall('[A-Z][^A-Z]*',i))

    # 3. Cross-referencing results with flat_names to remove first and last names
    job = [' '.join([x for x in line if x not in flat_names]) for line in split_job]

    # 4. Joining together single characters (and only them) like "I T" or " H R"s
    for i in range(len(job)):
        job[i] = re.sub(r'(?<=\b[A-Z]) (?=[A-Z]\b)', '', job[i])

        # RegEx synax explanation:
        # (?<=\b[A-Z])   assert that what precedes is a single letter, which itself
        #                is preceded by a word boundary
        # [ ]            match a single space (brackets used for clarity only)
        # (?=[A-Z]\b)    assert that what follows is also a single letter, which again
        #                is followed by a word boundary

    # Creating dictionary and populating it with mined data
    data_dict = {}
    for i in range(len(content)):
        for p,d,n,e,j in zip(raw_names,phone,date,email,job):
            data_dict[i+1] = {"First Name":raw_names[i][0],
                      "Last Name":raw_names[i][1],
                      "Email":email[i],
                      "Date":date[i],
                      "Job Title":job[i]}

    # Exporting data_dict to json format
    with open("personal_dataset.json", "w") as f:
        json.dump(data_dict, f)
        f.close()
            
# Run the conversion with the function's default arguments
# (input file "personal_dataset.txt").
to_json()