# Interacting with the OS and filesystem

In [1]:
import os
import math

In [2]:
# checking the present working directory
os.getcwd()

'C:\\Users\\joe_h\\Projects - Data Analysis\\Loans Project'

In [3]:
# get the list of files in a directory:
help(os.listdir)

Help on built-in function listdir in module nt:

listdir(path=None)
    Return a list containing the names of the files in the directory.
    
    path can be specified as either str, bytes, or a path-like object.  If path is bytes,
      the filenames returned will also be bytes; in all other circumstances
      the filenames returned will be str.
    If path is None, uses the path='.'.
    On some platforms, path may also be specified as an open file descriptor;\
      the file descriptor must refer to a directory.
      If this functionality is unavailable, using it raises NotImplementedError.
    
    The list is in arbitrary order.  It does not include the special
    entries '.' and '..' even if they are present in the directory.



In [4]:
# here ('.') refers to the current directory
os.listdir('.')

['.ipynb_checkpoints',
 'Arithmetic operations and broadcasting.ipynb',
 'climate.csv',
 'climate_results.csv',
 'data',
 'Loans Project (Part 1).ipynb',
 'Loans Project (Part 2).ipynb']

In [5]:
os.listdir('C:/Users/joe_h/Projects - Data Analysis/Data Analysis - Useful Calculation Basics')

[]

In [6]:
# can also create new files in new directories
os.makedirs('./data', exist_ok=True)
os.listdir('./data')

['loans1.txt', 'loans2.txt', 'loans3.txt']

In [7]:
# downloading some files into the newly created 'data' directory using urllib module

In [8]:
# these are the urls to import from
url1 = 'https://hub.jovian.ml/wp-content/uploads/2020/08/loans1.txt'
url2 = 'https://hub.jovian.ml/wp-content/uploads/2020/08/loans2.txt'
url3 = 'https://hub.jovian.ml/wp-content/uploads/2020/08/loans3.txt'

In [9]:
import urllib.request

In [10]:
urllib.request.urlretrieve(url1, './data/loans1.txt')

('./data/loans1.txt', <http.client.HTTPMessage at 0x1e53060cdf0>)

In [11]:
urllib.request.urlretrieve(url1, './data/loans2.txt')

('./data/loans2.txt', <http.client.HTTPMessage at 0x1e53060c070>)

In [12]:
urllib.request.urlretrieve(url1, './data/loans3.txt')

('./data/loans3.txt', <http.client.HTTPMessage at 0x1e530720850>)

## Reading from a file
#### to read it first needs to be opened

In [13]:
# opening the file with read (r) and saving in the variable 'file1'

file1 = open('./data/loans1.txt', mode = 'r')

In [14]:
# file1.read gets the contents of the entire file into a single string

file1_contents = file1.read()

file1_contents

'amount,duration,rate,down_payment\n100000,36,0.08,20000\n200000,12,0.1,\n628400,120,0.12,100000\n4637400,240,0.06,\n42900,90,0.07,8900\n916000,16,0.13,\n45230,48,0.08,4300\n991360,99,0.08,\n423000,27,0.09,47200'

In [15]:
print(file1_contents)

amount,duration,rate,down_payment
100000,36,0.08,20000
200000,12,0.1,
628400,120,0.12,100000
4637400,240,0.06,
42900,90,0.07,8900
916000,16,0.13,
45230,48,0.08,4300
991360,99,0.08,
423000,27,0.09,47200


In [16]:
# after you are finished with a file it's important to close the file!
# this is because the file takes up memory so it should be closed to speed things up

file1.close()

## Closing files automatically with the 'with' statement

In [17]:
#

with open('./data/loans2.txt', 'r') as file2:
    file2_contents = file2.read()
    print(file2_contents)

amount,duration,rate,down_payment
100000,36,0.08,20000
200000,12,0.1,
628400,120,0.12,100000
4637400,240,0.06,
42900,90,0.07,8900
916000,16,0.13,
45230,48,0.08,4300
991360,99,0.08,
423000,27,0.09,47200


## Reading a file line by line

In [18]:
# this will print the file line by line

with open('./data/loans3.txt', 'r') as file3:
    file3_lines = file3.readlines()

## Processing Data from Files

In [19]:
print(file2_contents)

amount,duration,rate,down_payment
100000,36,0.08,20000
200000,12,0.1,
628400,120,0.12,100000
4637400,240,0.06,
42900,90,0.07,8900
916000,16,0.13,
45230,48,0.08,4300
991360,99,0.08,
423000,27,0.09,47200


In [20]:
# split the line into its parts
'100000,36,0.08,20000'.split(',')

['100000', '36', '0.08', '20000']

In [21]:
# create a dictionary for each loan and using the headers as keys
loan1 = {
    'amount': 100000,
    'duration': 36,
    'rate': 0.08,
    'down_payment': 20000
}

In [22]:
# creating a list of dictionaries to keep track of all the loans
# since we're performing the same operations on multiple files, define a function to do this.

# this function parse_header will take a line as input and return a list of column headers as output

def parse_headers(header_line):
    return header_line.strip().split(',')

In [23]:
file3_lines[0]

'amount,duration,rate,down_payment\n'

In [24]:
# using the function on the headers so now the 'headers' object is a list of the headers
headers = parse_headers(file3_lines[0])

In [25]:
headers

['amount', 'duration', 'rate', 'down_payment']

In [26]:
# now defining a function which takes a line and returns a list of floating point numbers

def parse_values(data_line):
    values = []
    for item in data_line.strip().split(','):
        values.append(float(item))
    return values

In [27]:
# now each line is stored as an index in the list 'file3_lines', this case calling line 5(4)

file3_lines[3]

'628400,120,0.12,100000\n'

In [28]:
file3_lines[3].strip().split(',')

['628400', '120', '0.12', '100000']

In [29]:
# now using the new function on particular lines to convert the strings to floats

parse_values(file3_lines[3])

[628400.0, 120.0, 0.12, 100000.0]

In [30]:
# however, this method doesn't work with empty strings, e.g. the third line (2)
# it throws an error as line 3 doesn't have a down_payment

file3_lines[2].strip().split(',')
parse_values(file3_lines[2])

ValueError: could not convert string to float: ''

In [32]:
# to control for errors like this we write an if statement, inside the 'parse_values' function, which takes this into account
# in the if statement - if item is an empty string then append it with 0.0

def parse_values(data_line):
    values = []
    for item in data_line.strip().split(','):
        if item == '':
            values.append(0.0)
        else:
            values.append(float(item))
    return values

In [33]:
# now the third line from file3 can be parsed this time! (0.0 got appended as the down_payment value)

file3_lines[2].strip().split(',')
parse_values(file3_lines[2])

[200000.0, 12.0, 0.1, 0.0]

In [34]:
# defining a function to create an item dictionary which takes a list of values and the list of headers and
# returns a dictionary with the values associated with their respective header keys

# the zip() function takes iterables (can be zero or more), aggregates them in a tuple, and returns it

def create_item_dict(values, headers):
    result = {}
    for value, header in zip(values, headers):
        result[header] = value
    return result

In [35]:
values1 = parse_values(file3_lines[2])
values1

[200000.0, 12.0, 0.1, 0.0]

In [36]:
# this function takes the headers and the values and creates a dictionary
create_item_dict(values1, headers)

{'amount': 200000.0, 'duration': 12.0, 'rate': 0.1, 'down_payment': 0.0}

## Combining the functions together in a new function called 'read_csv'

In [37]:
def read_csv(path):
    result = []
    # open the file in read mode
    with open(path, 'r') as f:
        # get a list of lines
        lines = f.readlines()
        # parse the header
        headers = parse_headers(lines[0])
        # loop over the remaining lines
        for data_line in lines[1:]:
            # parse the values
            values = parse_values(data_line)
            # create a dictionary using values and headers
            item_dict = create_item_dict(values, headers)
            # add the dictionary to the result
            result.append(item_dict)
        return result

In [38]:
# lets test it out

with open('./data/loans2.txt') as file2:
    print(file2.read())

amount,duration,rate,down_payment
100000,36,0.08,20000
200000,12,0.1,
628400,120,0.12,100000
4637400,240,0.06,
42900,90,0.07,8900
916000,16,0.13,
45230,48,0.08,4300
991360,99,0.08,
423000,27,0.09,47200


In [39]:
# it works! we now have a list of dictionaries including every aspect of every transaction

read_csv('./data/loans2.txt')

[{'amount': 100000.0, 'duration': 36.0, 'rate': 0.08, 'down_payment': 20000.0},
 {'amount': 200000.0, 'duration': 12.0, 'rate': 0.1, 'down_payment': 0.0},
 {'amount': 628400.0,
  'duration': 120.0,
  'rate': 0.12,
  'down_payment': 100000.0},
 {'amount': 4637400.0, 'duration': 240.0, 'rate': 0.06, 'down_payment': 0.0},
 {'amount': 42900.0, 'duration': 90.0, 'rate': 0.07, 'down_payment': 8900.0},
 {'amount': 916000.0, 'duration': 16.0, 'rate': 0.13, 'down_payment': 0.0},
 {'amount': 45230.0, 'duration': 48.0, 'rate': 0.08, 'down_payment': 4300.0},
 {'amount': 991360.0, 'duration': 99.0, 'rate': 0.08, 'down_payment': 0.0},
 {'amount': 423000.0, 'duration': 27.0, 'rate': 0.09, 'down_payment': 47200.0}]

## The function below is from the 'Loans Project (Part 1)' and we can now use it
## to calculate EMIs for all loans in the file

In [40]:
def loan_emi(amount, duration, rate, down_payment=0):
    
    """""Calculates the equal monthly installment(EMI) for a loan.
    
    Arguments:
    amount = total amount to be spent (loan + down payment)
    duration = duration of loan in months
    rate = rate of monthly interest
    down payment = optional initial payment - deducted from amount
    """
    
    loan_amount = amount - down_payment
    try:
        emi = loan_amount * rate * ((1+rate)**duration) / (((1+rate)**duration-1))
    except ZeroDivisionError:
        emi = loan_amount / duration
    emi = math.ceil(emi)
    return emi

In [41]:
# the function 'read_csv' with the loans2 file is stored in a new variable 'loans2'

loans2 = read_csv('./data/loans2.txt')

In [42]:
# now we can create a function to calculate the equal monthly installments for each of the loans in the list

for loan in loans2:
    loan['emi'] = loan_emi(loan['amount'],
                           loan['duration'],
                           loan['rate']/12, # monthly rate of interest (file contains yearly)
                           loan['down_payment'])

In [43]:
# Now 'loans2' object contains the EMI for ever loan in the list!

loans2

[{'amount': 100000.0,
  'duration': 36.0,
  'rate': 0.08,
  'down_payment': 20000.0,
  'emi': 2507},
 {'amount': 200000.0,
  'duration': 12.0,
  'rate': 0.1,
  'down_payment': 0.0,
  'emi': 17584},
 {'amount': 628400.0,
  'duration': 120.0,
  'rate': 0.12,
  'down_payment': 100000.0,
  'emi': 7582},
 {'amount': 4637400.0,
  'duration': 240.0,
  'rate': 0.06,
  'down_payment': 0.0,
  'emi': 33224},
 {'amount': 42900.0,
  'duration': 90.0,
  'rate': 0.07,
  'down_payment': 8900.0,
  'emi': 487},
 {'amount': 916000.0,
  'duration': 16.0,
  'rate': 0.13,
  'down_payment': 0.0,
  'emi': 62664},
 {'amount': 45230.0,
  'duration': 48.0,
  'rate': 0.08,
  'down_payment': 4300.0,
  'emi': 1000},
 {'amount': 991360.0,
  'duration': 99.0,
  'rate': 0.08,
  'down_payment': 0.0,
  'emi': 13712},
 {'amount': 423000.0,
  'duration': 27.0,
  'rate': 0.09,
  'down_payment': 47200.0,
  'emi': 15428}]

## Extracting the logic into a new function so it can be used for other files too
## Now this function can work out the EMI of any loan files with the same format

In [44]:
def compute_emis(loans):
    for loan in loans:
        loan['emi'] = loan_emi(
            loan['amount'],
            loan['duration'],
            loan['rate']/12, # monthly rate of interest (file contains yearly)
            loan['down_payment'])

### Writing the results back to a csv file

In [45]:
# getting the loans2 csv

loans2 = read_csv('./data/loans2.txt')

In [46]:
# passing it through the function we made to work out all the EMIs

compute_emis(loans2)

In [47]:
# now loans2 contains the emis for every loan in the file

loans2

[{'amount': 100000.0,
  'duration': 36.0,
  'rate': 0.08,
  'down_payment': 20000.0,
  'emi': 2507},
 {'amount': 200000.0,
  'duration': 12.0,
  'rate': 0.1,
  'down_payment': 0.0,
  'emi': 17584},
 {'amount': 628400.0,
  'duration': 120.0,
  'rate': 0.12,
  'down_payment': 100000.0,
  'emi': 7582},
 {'amount': 4637400.0,
  'duration': 240.0,
  'rate': 0.06,
  'down_payment': 0.0,
  'emi': 33224},
 {'amount': 42900.0,
  'duration': 90.0,
  'rate': 0.07,
  'down_payment': 8900.0,
  'emi': 487},
 {'amount': 916000.0,
  'duration': 16.0,
  'rate': 0.13,
  'down_payment': 0.0,
  'emi': 62664},
 {'amount': 45230.0,
  'duration': 48.0,
  'rate': 0.08,
  'down_payment': 4300.0,
  'emi': 1000},
 {'amount': 991360.0,
  'duration': 99.0,
  'rate': 0.08,
  'down_payment': 0.0,
  'emi': 13712},
 {'amount': 423000.0,
  'duration': 27.0,
  'rate': 0.09,
  'down_payment': 47200.0,
  'emi': 15428}]

In [49]:
# now we write this back to a new file called 'emis2'

with open('./data/emis2.txt', 'w') as f:
    for loan in loans2:
        f.write('{},{},{},{},{}\n'.format(
            loan['amount'],
            loan['duration'],
            loan['rate'],
            loan['down_payment'],
            loan['emi']))

In [50]:
# now verifying that the file was created and written to as expected

os.listdir('data')

# we can see the file includes emis2.txt which confirms it

['emis2.txt', 'loans1.txt', 'loans2.txt', 'loans3.txt']

## Creating a generic function for taking a list of dictionaries and writes it to a file in csv format

In [53]:
def write_csv(items, path):
    # opens the file in write mode
    with open(path, 'w') as f:
        # return if there's nothing to write
        if len(items) == 0:
            return
        
        # writing the headers in the first line
        headers = list(items[0].keys())
        f.write(','.join(headers) + '\n')
        
        # write one item per line
        for item in items:
            values = []
            for header in headers:
                values.append(str(item.get(header, "")))
            f.write(','.join(values) + "\n")

In [55]:
# trying out the function on a another file (loans3)

loans3 = read_csv('./data/loans3.txt')

In [56]:
# passing loans3 into the function that calculates emis

compute_emis(loans3)

In [57]:
# writing the results to a new csv called 'emis3'

write_csv(loans3, './data/emis3.txt')

In [58]:
# reading the newly made file to verify it has been created correctly

with open('./data/emis3.txt', 'r') as f:
    print(f.read())

amount,duration,rate,down_payment,emi
100000.0,36.0,0.08,20000.0,2507
200000.0,12.0,0.1,0.0,17584
628400.0,120.0,0.12,100000.0,7582
4637400.0,240.0,0.06,0.0,33224
42900.0,90.0,0.07,8900.0,487
916000.0,16.0,0.13,0.0,62664
45230.0,48.0,0.08,4300.0,1000
991360.0,99.0,0.08,0.0,13712
423000.0,27.0,0.09,47200.0,15428



# Creating a for loop that:
1. reads each downloaded file
2. calculates the EMI
3. and writes results back to new files

In [63]:
# for loans 1,2 and 3, read the csv, compute the EMIs and write results back to a new csv

for i in range(1,4):
    loans = read_csv('./data/loans{}.txt'.format(i))
    compute_emis(loans)
    write_csv(loans, './data/emis{}.txt'.format(i))

In [64]:
# checking the directory to verify the new csvs have been written

os.listdir('./data')

['emis1.txt',
 'emis2.txt',
 'emis3.txt',
 'loans1.txt',
 'loans2.txt',
 'loans3.txt']