# File input and output (IO)

In [None]:
# Image wraps an image file so that the notebook can render it inline.
from IPython.display import Image
slide = Image(filename = 'fileFormats.jpg')
slide  # a bare expression on the last line is displayed as the cell output

### Reading text files

In [None]:
# Show the built-in documentation for open().
help(open)

In [None]:
# A text file can be thought of as a sequence of lines.

# Open demo.txt for reading. The handle is not closed in this cell;
# later cells inspect the still-open file_object.
file_object = open('demo.txt', 'r')

# What type of object did open() hand back?
print("file_object is of type %s" % (type(file_object),))

# Note: file_object can be treated as a sequence of strings (one per line).

In [None]:
# List the names of everything available on the file object.
dir(file_object) # attributes and methods of file objects

In [None]:
# Examples: a few attributes carried by the file object.
print(file_object.name)      # the name that was passed to open()
print(file_object.mode)      # the access mode the file was opened with
print(file_object.closed)    # whether close() has been called on it yet

In [None]:
my_file = open('demo.txt', 'r')
count = 0
for line in my_file:    # iterating over a file object yields one line at a time
    count += 1
    print(line)         # the line keeps its own '\n' and print adds another
print("My file has %d lines." % count)
my_file.close()

#### Note: Each line includes a non-printing character called the newline character "\n"

### Exercise 0: 
Read the text file demo.txt and count the number of lines excluding empty lines.

In [None]:
my_file = open('demo.txt', 'r')
count = 0
for line in my_file:        # treating my_file as a sequence of strings
    text = line.strip()     # drop the newline and any surrounding whitespace
    # A line counts as empty when nothing is left after stripping. This also
    # skips whitespace-only lines and lines that end in '\r\n', which a plain
    # startswith('\n') test would have counted as non-empty.
    if text:
        count += 1
        print(text)
print("My file has %d lines." % count)
my_file.close()

In [None]:
my_file = open('demo.txt', 'r')

# Called without a size, read() returns the _entire_ file as one string object.
data = my_file.read()
print("Contents of file are of type %s" % (type(data),))

# The handle can be closed now: the text is held in memory in `data`.
my_file.close()

heading = "Contents of file"
print("\n%s\n%s" % (heading, "-" * len(heading)))   # heading underlined with dashes
print(data)

In [None]:
# Show the documentation of the file object's read() method.
help(my_file.read)

In [None]:
my_file = open('demo.txt', 'r')
data = my_file.read(64)         # read at most 64 bytes per call
while data != "":               # read() returns an empty string at end of file
    print(data)                 # handle the chunk first, then fetch the next one
    data = my_file.read(64)
my_file.close()

In [None]:
my_file = open('demo.txt')       # the mode defaults to 'r'
data = my_file.readline()        # returns the next line, newline included
my_file.close()                  # the line is in memory, so the handle can go
print(data)

In [None]:
my_file = open('demo.txt')
data = my_file.readlines()       # a list holding one string per line (newlines are kept)
my_file.close()
print(data)                      # printing a list shows the repr of each element


### Exercise 1: 
Read the text file demo.txt and find all instances of the word "Luke"

In [None]:
my_file = open('demo.txt', 'r')
count = 0
for line in my_file:              # treating my_file as a sequence of strings
    # str.count() returns the number of non-overlapping occurrences, so a
    # line that mentions Luke several times contributes each of them.
    count += line.count('Luke')
print("Luke appears %d times." % count)
my_file.close()

### Automatically closing files

In [None]:

# The with statement closes the file as soon as its block is left.
with open('demo.txt', 'r') as f:
    data = f.read()

# f still names the file object, now closed, so its attributes can be read.
print(f.closed)
print(f.mode)

In [None]:
with open('demo.txt', 'r') as f:
    for line_number in (1, 2):      # read and show the first two lines
        data = f.readline()
        print(data)                 # print adds a newline character '\n' of its own

### File position

In [None]:
with open('demo.txt') as f:
    f.seek(5)                   # move the file position to offset 5 from the start
    data = f.readline()         # reads from that position to the end of the line
print(data)

In [None]:
with open('demo.txt') as f:
    f.seek(5)                   # skip the first 5 bytes
    data = f.readline()
    k = f.tell()                # current position, i.e. where the next read would start
print(data)
print(k)

### File access modes

<table style="width:100%">
  <tr>
    <th>Mode</th>
    <th>Description</th> 
  </tr>
  <tr>
    <td>r</td>
    <td>Opens a file for reading only. Default mode.</td> 
  </tr>
  <tr>
    <td>rb</td>
    <td>Opens a file for reading only in binary format.</td> 
  </tr>
  <tr>
    <td>r+</td>
    <td>Opens a file for both reading and writing.</td> 
  </tr>
  <tr>
    <td>rb+</td>
    <td>Opens a file for both reading and writing in binary format.</td> 
  </tr>
  <tr>
    <td>w</td>
    <td>Opens a file for writing only. Overwrites file if it exists. Creates a new file if it does not exist.</td> 
  </tr>
  <tr>
    <td>wb</td>
    <td>Opens a file for writing only in binary format.</td> 
  </tr>
  <tr>
    <td>w+</td>
    <td>Opens a file for both writing and reading.</td> 
  </tr>
  <tr>
    <td>wb+</td>
    <td>Opens a file for both writing and reading in binary format.</td> 
  </tr>
  <tr>
    <td>a</td>
    <td>Opens a file for appending. The file pointer is at the end of the file if it exists. </td> 
  </tr>
  <tr>
    <td>ab</td>
    <td>Opens a file for appending in binary format.</td> 
  </tr>
  <tr>
    <td>a+</td>
    <td>Opens a file for both appending and reading.</td> 
  </tr>
  <tr>
    <td>ab+</td>
    <td>Opens a file for both appending and reading in binary format.</td> 
  </tr>
</table>

### Writing text files

In [None]:
# 'w' creates newfile.txt (an existing file of that name is overwritten).
with open('newfile.txt', 'w') as f:
    f.write('Hello world!')           # write() adds nothing: no '\n' unless you pass one
print("Created file : %s" % f.name)

In [None]:
# Example: neither write() nor writelines() inserts a separator, so the
# file ends up holding the single unterminated line 'Noble gases:HeNeAr'.
with open('elements.txt', 'w') as f:
    f.write('Noble gases:')
    f.writelines(['He', 'Ne', 'Ar'])  # the strings are written back to back, no '\n' added

In [None]:
# Shell escape: print the current contents of elements.txt.
!cat elements.txt

In [None]:
with open('elements.txt', 'a+') as f:  # 'a' append mode, 'a+' appending and reading
    # Where an 'a+' file starts reading is platform dependent (start of file
    # on some systems, end of file on others), so rewind explicitly.
    f.seek(0)
    contents = f.read()
    print('Size (in bytes) of contents:  %d' % len(contents))
    print(f.tell())                    # after read() the file pointer is at EOF
    f.write('Kr\n')                    # in append mode, writes always go to the end of the file

In [None]:
# Shell escape: print the current contents of elements.txt.
!cat elements.txt

In [None]:
# 'r+' opens an existing file for both reading and writing.
with open('elements.txt', 'r+') as f:
    print(f.tell())                   # file pointer is at 'beginning of file'
    # Writing from the beginning replaces the bytes already stored there;
    # anything beyond the newly written text is left as it was.
    f.writelines(['Halogens:\n', 'F\n', 'Cl\n'])

In [None]:
# Shell escape: print the current contents of elements.txt.
!cat elements.txt

### Using print >>

In [None]:
# print >> f sends the output to the file object f instead of the screen
# and, like an ordinary print, ends every item with a newline.
with open('elements.txt', 'w') as f:
    print >> f, 'Noble gases'
    for gas in ('He', 'Ne', 'Ar', 'Kr'):
        print >> f, gas

## Summary of basic file IO functions and methods

<table style="width:100%">
  <tr>
    <th>Methods and functions</th>
    <th>Description</th> 
  </tr>
  <tr>
    <td>open()</td>
    <td>Returns a file object and is most commonly used with two arguments: open(filename, mode)</td> 
  </tr>
  <tr>
    <td>file.close()</td>
    <td>Close the file.</td> 
  </tr>
  <tr>
    <td>file.read([size])</td>
    <td>Read the entire file. If size is specified then read at most size bytes.</td> 
  </tr>
  <tr>
    <td>file.readline([size])</td>
    <td>Read one line from the file. If size is specified then read at most size bytes.</td> 
  </tr>
  <tr>
    <td>file.readlines([size])</td>
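    <td>Read all the lines from the file and return them as a list. If size is specified it is treated as a hint: reading stops once whole lines totalling approximately size bytes have been read.</td> 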
  </tr>
  <tr>
    <td>file.write(string)</td>
    <td>Writes the contents of string to the file.</td> 
  </tr>
  <tr>
    <td>file.tell()</td>
    <td>Returns file object's current position in the file.</td> 
  </tr>
  <tr>
    <td>file.seek(int)</td>
    <td>Changes the file object's current position to the given int.</td> 
  </tr>
</table>

### Exercise 2:
Write a program that reads file 'demo.txt' and writes out a new file with the lines in reversed order (i.e. the first line in the old file becomes the last one in the new file.)

### Handling delimited files

In [None]:
# Load the standard-library csv module and show its documentation.
import csv
help(csv)

In [None]:
import csv
with open('grades.csv', 'r') as fileObj:    # the with block closes the file even if parsing fails
    csvObj = csv.reader(fileObj)            # creates CSV reader object
    for row in csvObj:
        print(row)    # Each row of the input data is parsed and converted to a list of strings.

In [None]:
with open('grades.csv', 'a') as fileObj:     # 'a' appends; existing rows are kept
    writer = csv.writer(fileObj)
    writer.writerow( ('5040','80','90','95','A') )

# Re-open for reading inside a with block so that this handle is closed as well.
with open('grades.csv', 'r') as fileObj:
    print(fileObj.read())

In [None]:
# Example: searching CSV file

# Ask for the ID before opening the file, so that the file is not held
# open while waiting for the user to type.
studentID = raw_input("Enter SID: ")

with open('grades.csv', 'r') as fileObj:
    csvObj = csv.reader(fileObj)
    for row in csvObj:
        # Membership test: a matching row is printed exactly once, even
        # when several of its fields are equal to the ID.
        # NOTE(review): any column can match, not only the ID column --
        # confirm the layout of grades.csv before narrowing the test.
        if studentID in row:
            print(row)

### Binary data IO

In [None]:
s = b"Hello world!"

with open('hello.bin', 'wb') as f:
    f.write(s)

# Read the file back one byte at a time.
chunks = []
with open('hello.bin', 'rb') as f:
    byte = f.read(1)
    while byte:                 # an empty result means end of file (also true for bytes objects)
        chunks.append(byte)     # keep the byte before fetching the next one
        byte = f.read(1)

# Show the bytes separated by spaces; the data is plain ASCII, so decoding is safe.
print(b' '.join(chunks).decode('ascii'))


# The differences between binary and ascii encoding won't be obvious for simple alphanumeric 
# strings, but will become important if you're processing text that includes characters not 
# in the ascii character set.


In [None]:
# readline() on a binary file returns everything up to and including the
# first newline byte (or the whole file if there is none).
with open('cat.jpg', 'rb') as f:
    data = f.readline()
print(data)

# Show the same bytes as colon-separated two-digit hex values.
':'.join('%02x' % ord(x) for x in data)

In [None]:
with open('cat.jpg', 'rb') as f:
    data = f.read()

# Choose the message template from the first two bytes of the file.
info = ('This is a jpeg file (%d bytes long)'
        if data.startswith(b'\xff\xd8')
        else 'This is a random file (%d bytes long)')

print(info % len(data))

In [None]:
# Render cat.jpg inline; the bare name on the last line displays the image.
from IPython.display import Image
kitty = Image(filename = 'cat.jpg')
kitty

### OS dependent functions

In [None]:
import os
help(os.read)
fd = os.open('demo.txt', os.O_RDONLY)   # this cell only reads, so read-only access is enough
try:
    ret = os.read(fd, 15)               # read at most 15 bytes from the descriptor
finally:
    os.close(fd)                        # release the descriptor even if the read fails
print('Result from os.read:' + '\n' + 20*'-' + '\n' + ret)

In [None]:
# FILE TESTING
# os.path.exists() reports whether a path (file or directory) is present.
import os
print(os.path.exists('/etc/passwd'))
print(os.path.exists('/etc/spam'))

In [None]:
filename = '/etc/spam'
if not os.path.exists(filename):
    # Nothing to read: report the missing path instead.
    print(filename + ' does not exist')
else:
    with open(filename) as f:
        data = f.readline()         # only the first line is read
    print(data)

In [None]:
print(os.path.isfile('/etc/passwd'))    # is the path a regular file?
print(os.path.isdir('/etc/passwd'))     # is the path a directory?
print(os.path.islink('python'))         # is the path a symbolic link?
print(os.path.realpath('python'))       # canonical path with symbolic links resolved
print(os.path.getsize('/etc/passwd'))   # size in bytes

In [None]:
listdir = os.listdir(".")     # names of the entries in the current directory
for entry in listdir:         # 'entry' avoids shadowing the Python 2 built-in 'file'
    print(entry)

In [None]:
# The shutil module offers a number of high-level operations on files and collections of files. 
# In particular, functions are provided which support file copying and removal. 
# For operations on individual files, see also the os module.
import shutil
shutil.copy('cat.jpg','catcopy.jpg')  # copy cat.jpg to a new file named catcopy.jpg

In [None]:
# Python's os module provides functions for file-processing operations,
# such as renaming and deleting files.

In [None]:
# Show the documentation of the functions used for copying, moving and renaming.
help (shutil.copy)
help (shutil.move)
help (os.rename)

# Rename a file: keep a backup copy of demo.txt first, then rename the original.
shutil.copy('demo.txt','demo.txt.save')
os.rename( 'demo.txt', 'ep7intro.txt' )

In [None]:
# Delete the file ep7intro.txt, then recreate demo.txt from the copy demo.txt.save.
os.remove('ep7intro.txt')
shutil.copy('demo.txt.save','demo.txt')

In [None]:
# Create a directory "newdir", make it the current working directory,
# and print the working directory to confirm the change.
os.mkdir("newdir")
os.chdir("newdir")
print os.getcwd()

In [None]:
# Go back up to the parent directory and print the working directory again.
os.chdir("..")
print os.getcwd()

In [None]:
# Remove the directory "newdir"; os.rmdir() only works on an empty directory.
os.rmdir("newdir")

In [None]:
# os.environ maps environment variable names to their values;
# looking up a name that is not set raises KeyError.
print os.environ['HOME']

### Python object serialization

The pickle module turns an arbitrary Python object into a series of bytes. This process is also called serialization. 
  - Useful for storing data
  - Inter-process communication

In [None]:
# The pickle module serializes Python objects.
import pickle

data1 = [{'a': 'A', 'b': 2, 'c': 3.0}]
print('DATA: %s' % (data1,))

# pickle.dumps() returns the serialized form of the object as a string
# instead of writing it to a file.
data_string = pickle.dumps(data1)
print('PICKLE: %s' % (data_string,))
# With the default protocol of Python 2, the pickle contains only ASCII characters.
# By default, the pickle will contain only ASCII characters. 

In [None]:
# Serialized data can be written to a file, socket, pipe, etc. and later
# read back and unpickled to build a new object with the same values.
# Only unpickle data from a source you trust: loading a pickle can run
# arbitrary code.
data1 = [{'a': 'A', 'b': 2, 'c': 3.0}]
print('BEFORE: %s' % (data1,))

data2 = pickle.loads(data_string)
print('AFTER: %s' % (data2,))

print('EQUAL?: %s' % (data1 == data2))    # compares the values
print('SAME ?: %s' % (data1 is data2))    # compares object identity


In [None]:
import pickle

# Pickles are byte streams, so the file is opened in binary mode both
# for writing ('wb') and for reading ('rb').
with open('pickledData.bin', 'wb') as f:
    pickle.dump(data1, f)       # serialize data1 straight into the file

with open('pickledData.bin', 'rb') as f:
    data2 = pickle.load(f)      # rebuild an object from the file contents

print('Data: %s' % (data1,))
print('EQUAL?: %s' % (data1 == data2))