In [109]:
from random import randint, seed
from time import clock

"""
This program will generate random phone numbers with different formats
The formats I will use for this are:
(xxx)xxx-xxxx
xxx-xxx-xxxx
xxx.xxx.xxxx
xxx xxx xxxx

Each number will also have a chance for a leading 1, ex:
1 (xxx)xxx-xxxx
"""

# Seed the generator from the operating system's entropy source (or the
# current time when none is available). `time.clock()` was removed in
# Python 3.8 and only measured CPU time, which is nearly the same on every
# fresh start and therefore a poor seed.
# NOTE(review): the `from time import clock` import above is now unused and
# fails on Python 3.8+; it should be dropped.
seed()

def get_number(style):
    """Return a random 10-digit phone number rendered in the requested style.

    Parameters
    ----------
    style : int
        1 -> ``(xxx)xxx-xxxx``
        2 -> ``xxx-xxx-xxxx``
        3 -> ``xxx.xxx.xxxx``
        4 -> ``xxx xxx xxxx``
        Any other value selects the ``'default'`` template, which prefixes
        the ten digits with ``'Error '``.

    Returns
    -------
    str
        The formatted number. Every digit is drawn independently with
        ``randint(0, 9)``.
    """
    # The dictionary holds the styles, .format at the end puts in numbers
    templates = {
        1: '({0}{1}{2}){3}{4}{5}-{6}{7}{8}{9}',
        2: '{0}{1}{2}-{3}{4}{5}-{6}{7}{8}{9}',
        3: '{0}{1}{2}.{3}{4}{5}.{6}{7}{8}{9}',
        4: '{0}{1}{2} {3}{4}{5} {6}{7}{8}{9}',
        'default': 'Error {0}{1}{2}{3}{4}{5}{6}{7}{8}{9}'
    }
    # Fall back to the *template* stored under 'default'. Passing the bare
    # string 'default' as the fallback would return the literal word
    # "default" for an unknown style instead of the error template.
    template = templates.get(style, templates['default'])
    return template.format(*[randint(0, 9) for _ in range(10)])

First, I wrote a function that returns a random phone number in one of several formats.

In [110]:
# Show one sample number for each of the four supported styles (1 through 4).
for style in (1, 2, 3, 4):
    sample = get_number(style)
    print(sample)

(911)303-4058
931-978-2183
923.465.0013
261 086 2399


Then I created a function to generate the file. The assignment calls for thousands of numbers, but for testing I'm only generating 100.

In [111]:
def Generate_File(fileName, size):
    """Write ``size`` random phone numbers to ``fileName``, one per line.

    Each number is produced by ``get_number`` with a style picked uniformly
    from 1 to 4. Lines are separated by ``'\\n'`` and the file has no
    trailing newline. Any existing file at ``fileName`` is overwritten.

    Parameters
    ----------
    fileName : str
        Path of the file to create.
    size : int
        How many numbers to write. Zero or a negative value produces an
        empty file.

    Notes
    -----
    The optional leading ``1 `` prefix is not generated.
    """
    # Re-seed from the OS entropy source. time.clock() no longer exists in
    # Python 3.8+ and was a low-entropy seed in any case.
    seed()
    with open(fileName, 'w') as write_file:
        for index in range(size):
            # Separator goes *between* numbers so the file does not end
            # with a blank line.
            if index:
                write_file.write('\n')
            write_file.write(get_number(randint(1, 4)))

How many numbers should we generate? If we want a 50% chance that at least two numbers in the file are the same, we need to generate about 117,740 phone numbers. To get to this answer, consider the probability $P(\text{Pair})$. Using the complement, $P(\text{No Pair})$, we can write $P(\text{Pair}) = 1 - P(\text{No Pair})$. For any two given numbers there is a nearly 100% chance that they do not match: $P(\text{no match}) = \frac{10^{10}-1}{10^{10}} < 1$. Treating the pairs as independent (a very good approximation here), we get $P(\text{Pair}) = 1 - \left(\frac{10^{10}-1}{10^{10}}\right)^{f(n)}$, where $f(n)$ is the number of distinct pairs in a file of $n$ numbers. That function is $f(n) = \frac{1}{2}n(n-1)$, which is recognizable as the sum of the positive integers from $1$ to $n-1$. This makes sense, because each phone number has a chance to match every other number in the file. In a file of 4 phone numbers, the first has an opportunity to match 3 numbers, the second has an opportunity to match 2 numbers, the third has only 1 number, and the fourth has exhausted its options before we even get to it. The sum of these opportunities for matching numbers is the function of $n$ we need. The final formula, for a 50% chance of having matching numbers in a generated file, is therefore $0.5 = 1 - \left(\frac{10^{10}-1}{10^{10}}\right)^{\frac{1}{2}n(n-1)}$, which yields $n \approx 117{,}740$.

In [112]:
import re

def fix_file(in_file, out_file):
    """Normalise the phone numbers in ``in_file`` to ten bare digits.

    Every input line that matches one of the accepted layouts (optional
    parentheses around the area code; an optional space, dot or dash
    between the groups) is written to ``out_file`` as ``'xxxxxxxxxx\\n'``.
    Lines that do not match are dropped silently.

    The input is read completely before the output is opened, so
    ``in_file`` and ``out_file`` may be the same path.

    Parameters
    ----------
    in_file : str
        Path of the file to read.
    out_file : str
        Path of the file to write (overwritten if it exists).
    """
    # Raw string: '\(' , '\d' and '\s' are regex escapes, not Python string
    # escapes, and non-raw invalid escapes are deprecated.
    phone = re.compile(r'^\(?(\d{3})\)?[\s.-]?(\d{3})[\s.-]?(\d{4})$')
    normalised = []
    with open(in_file, 'r') as reader:
        for line in reader:
            found = phone.match(line)
            if found:
                normalised.append(''.join(found.groups()) + '\n')
    # The `with` blocks close the files; no explicit close() is needed.
    with open(out_file, 'w') as writer:
        writer.writelines(normalised)

In [119]:
def remove_matching(in_file):
    """Remove duplicate phone numbers from ``in_file`` in place.

    The file is first normalised with ``fix_file`` (so any accepted input
    layout works), then rewritten with one ``(xxx)xxx-xxxx`` line per
    remaining number. For a number that occurs several times, the earlier
    occurrences are dropped and the last one is kept; the relative order of
    the remaining lines is unchanged.

    Parameters
    ----------
    in_file : str
        Path of the file to deduplicate. It is overwritten.

    Returns
    -------
    list of int
        The removed values in ascending order. A number that occurred k
        times appears k - 1 times. Values are ints, so leading zeros are
        not shown. The same list is also printed.
    """
    fix_file(in_file, in_file)
    with open(in_file, 'r') as reader:
        nums = [int(line) for line in reader]

    # Equal values are adjacent after sorting. The sentinel must be
    # something no phone number can equal: starting from 0 would report a
    # lone 0000000000 as a duplicate.
    prev = None
    dups = []
    for n in sorted(nums):
        if n == prev:
            dups.append(n)
        prev = n
    print(dups)

    # How many copies of each value still have to be skipped while writing.
    surplus = {}
    for n in dups:
        surplus[n] = surplus.get(n, 0) + 1

    with open(in_file, 'w') as writer:
        for n in nums:
            if surplus.get(n, 0):
                # Drop the earlier occurrences, keep the last one.
                surplus[n] -= 1
                continue
            # Zero-pad back to ten digits; int() stripped leading zeros.
            digits = '{0:010d}'.format(n)
            writer.write('({0}){1}-{2}\n'.format(digits[:3], digits[3:6], digits[6:10]))
    return dups

In [120]:
def find_803s(in_file):
    """Return every number in ``in_file`` whose area code is 803.

    Only lines of the exact form ``(xxx)xxx-xxxx`` are considered; blank
    or otherwise malformed lines are skipped.

    Parameters
    ----------
    in_file : str
        Path of the file to scan.

    Returns
    -------
    list of str
        The matching numbers as ten-digit strings, in file order.
    """
    # One raw-string pattern covers both the area code and the full number.
    number = re.compile(r'^\((\d{3})\)(\d{3})-(\d{4})$')
    res = []
    with open(in_file, 'r') as reader:
        for line in reader:
            found = number.match(line)
            if found is None:
                # Without this guard a blank or malformed line raises
                # AttributeError on the missing match object.
                continue
            if int(found.group(1)) == 803:
                res.append(''.join(found.groups()))
    return res

In [121]:
'''
Testing the methods written above:
1) Generate a File
2) Fix the File format
3) Remove the duplicates
4) Find all values beginning with (803)
'''
# 1) Generate the test file. 117740 is the file size that the analysis
#    above gives for a 50% chance of at least one duplicate.
Generate_File('phones.txt',117740)

In [122]:
# 2) Rewrite every number as ten bare digits, one per line, in a new file.
fix_file('phones.txt','phones_fixed.txt')

In [123]:
# 3) Remove the duplicates in place; the removed values are printed and returned.
# Note - this works whether or not the file has already been through
# fix_file, because remove_matching calls fix_file on it first.
remove_matching('phones_fixed.txt')

[7213351575, 7260298253]


[7213351575, 7260298253]

In [124]:
# 4) List every number whose area code is 803. The file is in
#    (xxx)xxx-xxxx form at this point because remove_matching rewrote it.
find_803s('phones_fixed.txt')

['8037000203',
 '8039368206',
 '8033455174',
 '8034116881',
 '8038405083',
 '8031469178',
 '8031089531',
 '8039272361',
 '8030632752',
 '8030784994',
 '8030434934',
 '8038711360',
 '8032184480',
 '8034983431',
 '8037673750',
 '8035670042',
 '8036961971',
 '8034036155',
 '8031980159',
 '8036712736',
 '8030183999',
 '8033765414',
 '8039516317',
 '8037183531',
 '8033439306',
 '8037653211',
 '8039528615',
 '8034596166',
 '8036479992',
 '8031081271',
 '8032542362',
 '8039065069',
 '8034924317',
 '8031077526',
 '8034269201',
 '8038952874',
 '8039780972',
 '8038136522',
 '8030660418',
 '8035962207',
 '8035102552',
 '8030045718',
 '8030294606',
 '8037078423',
 '8033801584',
 '8038425242',
 '8039822679',
 '8039566626',
 '8031774619',
 '8036680332',
 '8036291346',
 '8038554656',
 '8038373901',
 '8036013561',
 '8037724946',
 '8032283882',
 '8036513176',
 '8033713777',
 '8033247374',
 '8033126942',
 '8031783055',
 '8036575510',
 '8038112037',
 '8036400621',
 '8036442115',
 '8039636937',
 '80373060

In [23]:
# Can take a while to run
# Generates 100 files, calculates the probability of a duplicate in those files through testing
s = 0
print('Duplicates removed:')
for n in range(1,101):
    print('{0}) '.format(n),end='')
    Generate_File('phones_probability_tester.txt', 117740)
    s += bool(remove_matching('phones_probability_tester.txt'))
print(s/100.0)

Duplicates removed:
1) []
2) [9074599889, 9792731918]
3) []
4) []
5) [4771124965]
6) []
7) []
8) [9930137773]
9) [3386607888]
10) [6497765312]
11) []
12) []
13) [7838100691]
14) [6508757365]
15) [4106699662]
16) []
17) [8516741729]
18) []
19) []
20) []
21) []
22) [7294070941, 7910167824]
23) []
24) []
25) [409567005]
26) []
27) [534985521]
28) [1050575041]
29) [1960902843, 6606442797]
30) [114027139, 4068900186, 6312971157]
31) []
32) [9544615295]
33) [2829340897]
34) []
35) [9990472800]
36) []
37) []
38) []
39) []
40) []
41) [3317604268]
42) []
43) [2366421743]
44) [574842247, 6530386374]
45) [3080130309, 7955175765, 9002881799]
46) []
47) []
48) []
49) [9312072590]
50) [3441991388]
51) [4871515523]
52) [1781154541]
53) []
54) [2281364392, 2990868187]
55) [8951890394]
56) [212197687]
57) [990872428, 8038484942]
58) [3380746980]
59) [2119146070, 4192971310]
60) []
61) []
62) []
63) [9055193738]
64) []
65) [1121384204]
66) []
67) [5627320140]
68) []
69) []
70) [545387672, 8221671971]
71

In [68]:
# Sanity check: the pattern captures the three digits inside the leading
# parentheses. Raw string so '\(' and '\d' reach the regex engine as
# written instead of being treated as (invalid) Python string escapes.
area_code = re.compile(r'^\((\d{3})\)')
area_code.match('(803)867-5309').group(1)

'803'