# Count the number of lines in Python for each file

## 1) Command Line

In [1]:
! pwd

/home/dsc/Data/challenge


In [2]:
! ls -l

total 1013892
-rw-rw-r--. 1 dsc dsc 554970628 Jan  4  2016 bookings.csv.bz2
-rw-rw-r--. 1 dsc dsc      5310 Dec  2 17:10 ch_01.ipynb
-rw-rw-r--. 1 dsc dsc     12930 Dec  2 14:36 ch_02.ipynb
-rw-rw-r--. 1 dsc dsc     10832 Dec  2 14:36 ch_03.ipynb
-rw-rw-r--. 1 dsc dsc      8019 Dec  2 14:36 ch_04.ipynb
-rw-rw-r--. 1 dsc dsc 483188920 Jan  4  2016 searches.csv.bz2


In [3]:
# make a sample
! bzcat bookings.csv.bz2 | head -1000 > bookings.sample.csv


bzcat: I/O or other error, bailing out.  Possible reason follows.
bzcat: Broken pipe
	Input file = bookings.csv.bz2, output file = (stdout)


In [4]:
! ls -l

total 1014308
-rw-rw-r--. 1 dsc dsc 554970628 Jan  4  2016 bookings.csv.bz2
-rw-rw-r--. 1 dsc dsc    425006 Dec  2 17:13 bookings.sample.csv
-rw-rw-r--. 1 dsc dsc      6610 Dec  2 17:12 ch_01.ipynb
-rw-rw-r--. 1 dsc dsc     12930 Dec  2 14:36 ch_02.ipynb
-rw-rw-r--. 1 dsc dsc     10832 Dec  2 14:36 ch_03.ipynb
-rw-rw-r--. 1 dsc dsc      8019 Dec  2 14:36 ch_04.ipynb
-rw-rw-r--. 1 dsc dsc 483188920 Jan  4  2016 searches.csv.bz2


In [5]:
#the sample should be of the same type as origin so we compress it wuth bzip2
! bzip2 bookings.sample.csv

In [7]:
!bzcat bookings.sample.csv.bz2 | wc -l

1000


## 2) Python:

#### 2a) Python without uncompressing

In [8]:
import bz2

In [9]:
fileBz2=bz2.BZ2File('./bookings.sample.csv.bz2')

In [10]:
k=0
for line in fileBz2:
    k+=1
print "%s has %d lines."%(fileBz2.name,k)

./bookings.sample.csv.bz2 has 1000 lines.


#### 2b) Python on row uncompressed file

In [11]:
! cp ./bookings.sample.csv.bz2 ./bookings.sample.csv.csv.bz2

In [12]:
! bunzip2 ./bookings.sample.csv.csv.bz2

In [13]:
! ls -l

total 1014368
-rw-rw-r--. 1 dsc dsc 554970628 Jan  4  2016 bookings.csv.bz2
-rw-rw-r--. 1 dsc dsc     54834 Dec  2 17:13 bookings.sample.csv.bz2
-rw-rw-r--. 1 dsc dsc    425006 Dec  2 17:23 bookings.sample.csv.csv
-rw-rw-r--. 1 dsc dsc      8213 Dec  2 17:22 ch_01.ipynb
-rw-rw-r--. 1 dsc dsc     12930 Dec  2 14:36 ch_02.ipynb
-rw-rw-r--. 1 dsc dsc     10832 Dec  2 14:36 ch_03.ipynb
-rw-rw-r--. 1 dsc dsc      8019 Dec  2 14:36 ch_04.ipynb
-rw-rw-r--. 1 dsc dsc 483188920 Jan  4  2016 searches.csv.bz2


In [15]:
with open('./bookings.sample.csv.csv') as input_file:
    k=0
    for like in input_file:
        if k==10:
            break
        k+=1
print "File has %d lines."%k

File has 10 lines.


In [16]:
%whos

Variable     Type       Data/Info
---------------------------------
bz2          module     <module 'bz2' from '/home<...>n2.7/lib-dynload/bz2.so'>
fileBz2      BZ2File    <bz2.BZ2File object at 0x7ff09c2099f0>
input_file   file       <closed file './bookings.<...>de 'r' at 0x7ff09c2b0c90>
k            int        10
like         str        2013-03-25 00:00:00^1V   <...>3:56^2^2013^3^NULL     \n
line         str        2013-03-06 00:00:00^1P   <...>1:52^1^2013^3^NULL     \n


## 3) What if the file didnt exist? Use Try-except...

In [23]:
try:
    with open('./bookbnings.sample.csv.csv') as input_file:
        for k, line in enumerate(input_file):
            pass
    int ('sadasd')
    print "File has %d lines" %(k+1)
except IOError:
    print "Did not open"
except ValueError:
    print " Could not convert a int"
except:
    print "Unknown error"

Did not open


## 4) Did he mean each csv file? Find file size for all csv files in the path... Use glob library

https://docs.python.org/2/library/glob.html

The glob module finds all the pathnames matching a specified pattern according to the rules used by the Unix shell, although results are returned in arbitrary order. No tilde expansion is done, but *, ?, and character ranges expressed with [] will be correctly matched. 

In [24]:
import glob

In [25]:
files_to_read=glob.glob("*.ipynb")

In [26]:
files_to_read

['ch_02.ipynb', 'ch_01.ipynb', 'ch_04.ipynb', 'ch_03.ipynb']

In [27]:
ls -l

total 1014368
-rw-rw-r--. 1 dsc dsc 554970628 Jan  4  2016 [0m[01;31mbookings.csv.bz2[0m
-rw-rw-r--. 1 dsc dsc     54834 Dec  2 17:13 [01;31mbookings.sample.csv.bz2[0m
-rw-rw-r--. 1 dsc dsc    425006 Dec  2 17:23 bookings.sample.csv.csv
-rw-rw-r--. 1 dsc dsc     10649 Dec  2 17:38 ch_01.ipynb
-rw-rw-r--. 1 dsc dsc     12930 Dec  2 14:36 ch_02.ipynb
-rw-rw-r--. 1 dsc dsc     10832 Dec  2 14:36 ch_03.ipynb
-rw-rw-r--. 1 dsc dsc      8019 Dec  2 14:36 ch_04.ipynb
-rw-rw-r--. 1 dsc dsc 483188920 Jan  4  2016 [01;31msearches.csv.bz2[0m


In [None]:
def number_of_line_csv(filename):
    with open(filename, "r") as file_input:
        for k, line in enumerate(file_input):
            pass
    return k+1

In [None]:
import glob
files_to_read = glob.glob("*.ipynb")
for file_name in files_to_read:
    print "number of lines in %s : %d"% (file_name, number_of_line_csv(file_name))

In [None]:
import glob
print glob.glob("")
print glob.glob("*")
print glob.glob("*.bz2")