In [1]:
import pandas as pd
import re

In [2]:
# --- Day 4: Passport Processing ---
# You arrive at the airport only to realize that you grabbed your North Pole Credentials instead of your passport. While these documents are extremely similar, North Pole Credentials aren't issued by a country and therefore aren't actually valid documentation for travel in most of the world.

# It seems like you're not the only one having problems, though; a very long line has formed for the automatic passport scanners, and the delay could upset your travel itinerary.

# Due to some questionable network security, you realize you might be able to solve both of these problems at the same time.

# The automatic passport scanners are slow because they're having trouble detecting which passports have all required fields. The expected fields are as follows:

# byr (Birth Year)
# iyr (Issue Year)
# eyr (Expiration Year)
# hgt (Height)
# hcl (Hair Color)
# ecl (Eye Color)
# pid (Passport ID)
# cid (Country ID)
# Passport data is validated in batch files (your puzzle input). Each passport is represented as a sequence of key:value pairs separated by spaces or newlines. Passports are separated by blank lines.

# Here is an example batch file containing four passports:

# ecl:gry pid:860033327 eyr:2020 hcl:#fffffd
# byr:1937 iyr:2017 cid:147 hgt:183cm

# iyr:2013 ecl:amb cid:350 eyr:2023 pid:028048884
# hcl:#cfa07d byr:1929

# hcl:#ae17e1 iyr:2013
# eyr:2024
# ecl:brn pid:760753108 byr:1931
# hgt:179cm

# hcl:#cfa07d eyr:2025 pid:166559648
# iyr:2011 ecl:brn hgt:59in
# The first passport is valid - all eight fields are present. The second passport is invalid - it is missing hgt (the Height field).

# The third passport is interesting; the only missing field is cid, so it looks like data from North Pole Credentials, not a passport at all! Surely, nobody would mind if you made the system temporarily ignore missing cid fields. Treat this "passport" as valid.

# The fourth passport is missing two fields, cid and byr. Missing cid is fine, but missing any other field is not, so this passport is invalid.

# According to the above rules, your improved system would report 2 valid passports.

# Count the number of valid passports - those that have all required fields. Treat cid as optional. In your batch file, how many passports are valid?

In [3]:
with open('verify.txt') as f:
    lines = f.read()

ids = lines.split("\n\n")

In [4]:
df = pd.DataFrame(ids)

In [5]:
df = df.rename(columns = {0:"ids"})
df.head()

Unnamed: 0,ids
0,hgt:176cm\niyr:2013\nhcl:#fffffd ecl:amb\nbyr:...
1,hcl:#b5c3db ecl:grn hgt:155cm pid:#baec97 iyr:...
2,pid:526669252 eyr:1972\nhgt:152cm ecl:dne byr:...
3,eyr:2028 hcl:#c0946f hgt:73in byr:1926 ecl:#47...
4,pid:472686027 ecl:oth iyr:2019\ncid:277 byr:19...


In [6]:
def remove_line_break(value):
    
    value = re.sub(r'\n',' ',value)
    
    return value

In [7]:
df['ids'] = df.ids.apply(remove_line_break)
df.head()

Unnamed: 0,ids
0,hgt:176cm iyr:2013 hcl:#fffffd ecl:amb byr:200...
1,hcl:#b5c3db ecl:grn hgt:155cm pid:#baec97 iyr:...
2,pid:526669252 eyr:1972 hgt:152cm ecl:dne byr:1...
3,eyr:2028 hcl:#c0946f hgt:73in byr:1926 ecl:#47...
4,pid:472686027 ecl:oth iyr:2019 cid:277 byr:194...


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 255 entries, 0 to 254
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   ids     255 non-null    object
dtypes: object(1)
memory usage: 2.1+ KB


In [9]:
df = df.astype('string',copy=False)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 255 entries, 0 to 254
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   ids     255 non-null    string
dtypes: string(1)
memory usage: 2.1 KB


In [10]:
# byr (Birth Year)
# iyr (Issue Year)
# eyr (Expiration Year)
# hgt (Height)
# hcl (Hair Color)
# ecl (Eye Color)
# pid (Passport ID)
# cid (Country ID)

In [11]:
value = 'pid:526669252 eyr:1972 hgt:152cm ecl:dne byr:1'

'hgt:' not in value

False

In [12]:
   
def check_ids(value):
    
    items = ['byr:','iyr:','eyr:','hgt:','hcl:','ecl:','pid:']

    for item in items:

        if item not in value:

            print(item)
            print(value)
            return False

    return True



In [13]:
df['id_check'] = df.ids.apply(check_ids)

ecl:
hcl:#cfa07d hgt:157cm byr:1994 eyr:2027 pid:344443856 iyr:2016
eyr:
byr:2014 hcl:z iyr:2029 cid:279 pid:28914607 hgt:75cm ecl:xry
ecl:
hgt:167cm byr:2009 eyr:1975 cid:295 pid:174cm iyr:2029 hcl:z
byr:
hcl:#b6652a pid:485327267 ecl:brn hgt:155cm eyr:2028 iyr:2019
byr:
hcl:#b6652a ecl:hzl eyr:2023 iyr:2012 pid:513268492 hgt:159cm
byr:
cid:84 hgt:71in ecl:blu pid:982719716 eyr:2020 iyr:2014
byr:
ecl:gry hcl:#888785 eyr:2020 pid:442479017 iyr:2016
iyr:
ecl:grn hgt:64in pid:796889811 hcl:#18171d byr:1929 eyr:2027
eyr:
iyr:2018 hgt:73in pid:652356158 hcl:#c0946f ecl:grn byr:1973
pid:
cid:210 ecl:brn iyr:2017 eyr:2030 hgt:176cm hcl:#efcc98 byr:1965
byr:
iyr:2015 hcl:#602927 cid:268 eyr:2021 ecl:amb hgt:186cm pid:318676962
pid:
eyr:2028 byr:1974 ecl:brn iyr:2010 hcl:#18171d hgt:160cm
byr:
iyr:2026 pid:184cm ecl:gmt hcl:z hgt:71cm eyr:2029
hgt:
byr:1961 iyr:2010 ecl:blu eyr:2023 pid:245858010
iyr:
hgt:193cm pid:821303249 eyr:2020 hcl:#6b5442 cid:130 byr:1946
byr:
hgt:164in pid:953500867 ey

In [14]:
df.id_check.value_counts()

True     230
False     25
Name: id_check, dtype: int64

In [15]:
# --- Day 4: Passport Processing ---
# You arrive at the airport only to realize that you grabbed your North Pole Credentials instead of your passport. While these documents are extremely similar, North Pole Credentials aren't issued by a country and therefore aren't actually valid documentation for travel in most of the world.

# It seems like you're not the only one having problems, though; a very long line has formed for the automatic passport scanners, and the delay could upset your travel itinerary.

# Due to some questionable network security, you realize you might be able to solve both of these problems at the same time.

# The automatic passport scanners are slow because they're having trouble detecting which passports have all required fields. The expected fields are as follows:

# byr (Birth Year)
# iyr (Issue Year)
# eyr (Expiration Year)
# hgt (Height)
# hcl (Hair Color)
# ecl (Eye Color)
# pid (Passport ID)
# cid (Country ID)
# Passport data is validated in batch files (your puzzle input). Each passport is represented as a sequence of key:value pairs separated by spaces or newlines. Passports are separated by blank lines.

# Here is an example batch file containing four passports:

# ecl:gry pid:860033327 eyr:2020 hcl:#fffffd
# byr:1937 iyr:2017 cid:147 hgt:183cm

# iyr:2013 ecl:amb cid:350 eyr:2023 pid:028048884
# hcl:#cfa07d byr:1929

# hcl:#ae17e1 iyr:2013
# eyr:2024
# ecl:brn pid:760753108 byr:1931
# hgt:179cm

# hcl:#cfa07d eyr:2025 pid:166559648
# iyr:2011 ecl:brn hgt:59in
# The first passport is valid - all eight fields are present. The second passport is invalid - it is missing hgt (the Height field).

# The third passport is interesting; the only missing field is cid, so it looks like data from North Pole Credentials, not a passport at all! Surely, nobody would mind if you made the system temporarily ignore missing cid fields. Treat this "passport" as valid.

# The fourth passport is missing two fields, cid and byr. Missing cid is fine, but missing any other field is not, so this passport is invalid.

# According to the above rules, your improved system would report 2 valid passports.

# Count the number of valid passports - those that have all required fields. Treat cid as optional. In your batch file, how many passports are valid?

# Your puzzle answer was 230.

# The first half of this puzzle is complete! It provides one gold star: *

# --- Part Two ---
# The line is moving more quickly now, but you overhear airport security talking about how passports with invalid data are getting through. Better add some data validation, quick!

# You can continue to ignore the cid field, but each other field has strict rules about what values are valid for automatic validation:

# byr (Birth Year) - four digits; at least 1920 and at most 2002.
# iyr (Issue Year) - four digits; at least 2010 and at most 2020.
# eyr (Expiration Year) - four digits; at least 2020 and at most 2030.
# hgt (Height) - a number followed by either cm or in:
# If cm, the number must be at least 150 and at most 193.
# If in, the number must be at least 59 and at most 76.
# hcl (Hair Color) - a # followed by exactly six characters 0-9 or a-f.
# ecl (Eye Color) - exactly one of: amb blu brn gry grn hzl oth.
# pid (Passport ID) - a nine-digit number, including leading zeroes.
# cid (Country ID) - ignored, missing or not.
# Your job is to count the passports where all required fields are both present and valid according to the above rules. Here are some example values:

# byr valid:   2002
# byr invalid: 2003

# hgt valid:   60in
# hgt valid:   190cm
# hgt invalid: 190in
# hgt invalid: 190

# hcl valid:   #123abc
# hcl invalid: #123abz
# hcl invalid: 123abc

# ecl valid:   brn
# ecl invalid: wat

# pid valid:   000000001
# pid invalid: 0123456789
# Here are some invalid passports:

# eyr:1972 cid:100
# hcl:#18171d ecl:amb hgt:170 pid:186cm iyr:2018 byr:1926

# iyr:2019
# hcl:#602927 eyr:1967 hgt:170cm
# ecl:grn pid:012533040 byr:1946

# hcl:dab227 iyr:2012
# ecl:brn hgt:182cm pid:021572410 eyr:2020 byr:1992 cid:277

# hgt:59cm ecl:zzz
# eyr:2038 hcl:74454a iyr:2023
# pid:3556412378 byr:2007
# Here are some valid passports:

# pid:087499704 hgt:74in ecl:grn iyr:2012 eyr:2030 byr:1980
# hcl:#623a2f

# eyr:2029 ecl:blu cid:129 byr:1989
# iyr:2014 pid:896056539 hcl:#a97842 hgt:165cm

# hcl:#888785
# hgt:164cm byr:2001 iyr:2015 cid:88
# pid:545766238 ecl:hzl
# eyr:2022

# iyr:2010 hgt:158cm hcl:#b6652a ecl:blu byr:1944 eyr:2021 pid:093154719
# Count the number of valid passports - those that have all required fields and valid values. Continue to treat cid as optional. In your batch file, how many passports are valid?

In [16]:
# byr (Birth Year) - four digits; at least 1920 and at most 2002.
# iyr (Issue Year) - four digits; at least 2010 and at most 2020.
# eyr (Expiration Year) - four digits; at least 2020 and at most 2030.
# hgt (Height) - a number followed by either cm or in:
# hcl (Hair Color) - a # followed by exactly six characters 0-9 or a-f.
# ecl (Eye Color) - exactly one of: amb blu brn gry grn hzl oth.
# pid (Passport ID) - a nine-digit number, including leading zeroes.

In [17]:
df.ids[1]

'hcl:#b5c3db ecl:grn hgt:155cm pid:#baec97 iyr:2017 byr:1939 eyr:2020'

In [18]:
df = df[df.id_check == True].copy()
df.id_check.value_counts()

True    230
Name: id_check, dtype: int64

In [19]:
value = 'hgt:176cm iyr:2013 hcl:#fffffd ecl:amb byr:2000 eyr:2034 cid:89 pid:934693255'

count = 0

hgt = re.findall(r'hgt:(\S+\w\w)',value)[0]
iyr = re.findall(r'iyr:(\S+)',value)[0]
hcl = re.findall(r'hcl:(\S+)',value)[0]
ecl = re.findall(r'ecl:(\S+)',value)[0]
byr = re.findall(r'byr:(\S+)',value)[0]
eyr = re.findall(r'eyr:(\S+)',value)[0]
cid = re.findall(r'cid:(\S+)',value)[0]
pid = re.findall(r'pid:(\S+)',value)[0]

# byr (Birth Year) - four digits; at least 1920 and at most 2002.

if ((len(byr) == 4) & (int(byr) >= 1920) & (int(byr) <= 2002)):
    
    count += 1

# iyr (Issue Year) - four digits; at least 2010 and at most 2020.

if ((len(iyr) == 4) & (int(iyr) >= 2010) & (int(iyr) <= 2020)):
    
    count += 1
    
# eyr (Expiration Year) - four digits; at least 2020 and at most 2030.

if ((len(eyr) == 4) & (int(eyr) >= 2020) & (int(eyr) <= 2030)):
    
    count += 1

# hgt (Height) - a number followed by either cm or in:
# If cm, the number must be at least 150 and at most 193.
# If in, the number must be at least 59 and at most 76.

hight_type = hgt[-2:]

hight_number = int(hgt[0:3])

if hight_type == 'cm':
    
    if hight_number >= 150 & hight_number <= 193:
        
        count += 1
        
elif hight_typ == 'in':
    
    if hight_number >= 59 & hight_number <= 76:
        
        count += 1


# hcl (Hair Color) - a # followed by exactly six characters 0-9 or a-f.

hashtag = hcl[0]

alph_num = hcl[1:]

if (hashtag == "#") & (len(alph_num) == 6) & (alph_num.isalnum() == True):
    
    count += 1
# ecl (Eye Color) - exactly one of: amb blu brn gry grn hzl oth.

if (ecl == "amb") | (ecl == "blu") | (ecl == "brn") | (ecl == "gry") | (ecl == "grn") | (ecl == "hzl") | (ecl == "oth"):
    
    count += 1
# pid (Passport ID) - a nine-digit number, including leading zeroes.

if len(pid) == 9:
    
    count += 1

count == 7

False

In [26]:

def valid_ids(value):

    count = 0

    # byr (Birth Year) - four digits; at least 1920 and at most 2002.

    byr = re.findall(r'byr:(\S+)',value)[0]
    
    if ((len(byr) == 4) & (int(byr) >= 1920) & (int(byr) <= 2002)):

        count += 1

    # iyr (Issue Year) - four digits; at least 2010 and at most 2020.

    iyr = re.findall(r'iyr:(\S+)',value)[0]
    
    if ((len(iyr) == 4) & (int(iyr) >= 2010) & (int(iyr) <= 2020)):

        count += 1

    # eyr (Expiration Year) - four digits; at least 2020 and at most 2030.

    eyr = re.findall(r'eyr:(\S+)',value)[0]
    
    if ((len(eyr) == 4) & (int(eyr) >= 2020) & (int(eyr) <= 2030)):

        count += 1

    # hgt (Height) - a number followed by either cm or in:
    # If cm, the number must be at least 150 and at most 193.
    # If in, the number must be at least 59 and at most 76.

    hgt = re.findall(r'hgt:(\S*\w\w)',value)[0]
    
    hight_type = hgt[-2:]
    
    if (hight_type == 'cm') | (hight_type == 'in'):
        
        hight_number = int(hgt[:-2])
    
        if hight_type == 'cm':

            if (hight_number >= 150) & (hight_number <= 193):

                count += 1

        elif hight_type == 'in':

            if (hight_number >= 59) & (hight_number <= 76):

                count += 1

    # hcl (Hair Color) - a # followed by exactly six characters 0-9 or a-f.

    hcl = re.findall(r'hcl:(\S+)',value)[0]
    
    hashtag = hcl[0]

    alph_num = hcl[1:]

    if (hashtag == "#") & (len(alph_num) == 6) & (alph_num.isalnum() == True):

        count += 1
    # ecl (Eye Color) - exactly one of: amb blu brn gry grn hzl oth.

    ecl = re.findall(r'ecl:(\S+)',value)[0]
    
    if (ecl == "amb") | (ecl == "blu") | (ecl == "brn") | (ecl == "gry") | (ecl == "grn") | (ecl == "hzl") | (ecl == "oth"):

        count += 1
    # pid (Passport ID) - a nine-digit number, including leading zeroes.

    pid = re.findall(r'pid:(\S+)',value)[0]
    
    if len(pid) == 9:

        count += 1

    return count == 7

In [27]:
df['id_check_2'] = df.ids.apply(valid_ids)

In [28]:
df.head()

Unnamed: 0,ids,id_check,id_check_2
0,hgt:176cm iyr:2013 hcl:#fffffd ecl:amb byr:200...,True,False
1,hcl:#b5c3db ecl:grn hgt:155cm pid:#baec97 iyr:...,True,False
2,pid:526669252 eyr:1972 hgt:152cm ecl:dne byr:1...,True,False
3,eyr:2028 hcl:#c0946f hgt:73in byr:1926 ecl:#47...,True,False
4,pid:472686027 ecl:oth iyr:2019 cid:277 byr:194...,True,True


In [29]:
df.id_check_2.value_counts()

True     156
False     74
Name: id_check_2, dtype: int64