## 0. Prerequisite

In [1]:
# from google.colab import drive

# drive.mount('/content/gdrive/', force_remount=True)


## p.4 Reading Text Files

In [2]:
# !cat "./AIP_05_Data Processing and File Access_data/FirstPresidents.txt"

In [3]:
def main():
  '''
  Show the contents of the presidents file three times, once with each
  reading technique (for loop, list comprehension, readline), with a
  blank line between consecutive listings.
  '''
  file = "./AIP_05_Data Processing and File Access_data/FirstPresidents.txt"
  displayers = (displayWithForLoop, displayWithListComprehension, displayWithReadline)
  for position, display in enumerate(displayers):
    if position > 0:
      print()  # separator between two listings
    display(file)

def displayWithForLoop(file):
  '''
  Print every line of a text file by iterating over the file object.

  file: path of the text file to read.
  Trailing whitespace (including the newline) is removed from each line
  before it is printed.
  '''
  # The with statement closes the file even if reading or printing raises.
  with open(file, 'r') as infile:
    for line in infile:
      print(line.rstrip())

def displayWithListComprehension(file):
  '''
  Read a text file into a list with a list comprehension and print the list.

  file: path of the text file to read.
  Each list item is one line with its trailing whitespace removed.
  '''
  # The with statement closes the file even if reading raises.
  with open(file, 'r') as infile:
    listPres = [line.rstrip() for line in infile]
  print(listPres)

def displayWithReadline(file):
  '''
  Print every line of a text file using explicit readline() calls.

  file: path of the text file to read.
  Trailing whitespace (including the newline) is removed from each line
  before it is printed.
  '''
  # The with statement closes the file even if reading or printing raises.
  with open(file, 'r') as infile:
    line = infile.readline()
    # readline() returns "" only at end of file; a blank line is "\n".
    while line != "":
      print(line.rstrip())
      line = infile.readline()

main()

George Washington
John Adams
Thomas Jefferson

['George Washington', 'John Adams', 'Thomas Jefferson']

George Washington
John Adams
Thomas Jefferson


## p.9 Creating Text Files

In [4]:
# !cat /content/gdrive/MyDrive/States.txt

In [5]:
def main():
  '''
  Read the state names from one file, write them in sorted order to a
  second file, and print the list obtained from each file.
  '''
  sourcePath = './AIP_05_Data Processing and File Access_data/States.txt'
  sortedPath = './AIP_05_Data Processing and File Access_data/StatesAlpha.txt'

  # List as stored in the original file.
  originalStates = createListFromFile(sourcePath)
  print('statesList', originalStates)

  # Write the sorted file, then read it back to show the result.
  createSortedFile(originalStates, sortedPath)
  print('sortedStatesList', createListFromFile(sortedPath))

def createListFromFile(fileName):
  '''
  Return the lines of a text file as a list of strings.

  fileName: path of the text file to read.
  Trailing whitespace (including the newline) is removed from every line.
  '''
  # The with statement closes the file even if reading raises.
  with open(fileName, 'r') as infile:
    return [line.rstrip() for line in infile]

def createSortedFile(listName, fileName):
  '''
  Write the items of a list to a text file in ascending order, one per line.

  listName: list of strings without trailing newlines; it is not modified.
  fileName: path of the file to create (an existing file is overwritten).
  '''
  # Sort a copy and add the newlines while writing, so the caller's list is
  # neither reordered nor left with "\n" appended to every element.
  with open(fileName, 'w') as outfile:
    outfile.writelines(item + "\n" for item in sorted(listName))

main()

statesList ['Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California', 'Colorado', 'Connecticut', 'Delaware', 'Florida', 'Georgia', 'Hawaii', 'Idaho', 'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky', 'Louisiana', 'Maine', 'Maryland', 'Massachusetts', 'Michigan', 'Minnesota', 'Mississippi', 'Missouri', 'Montana', 'Nebraska', 'Nevada', 'New Hampshire', 'New Jersey', 'New Mexico', 'New York', 'North Carolina', 'North Dakota', 'Ohio', 'Oklahoma', 'Oregon', 'Pennsylvania', 'Rhode Island', 'South Carolina', 'South Dakota', 'Tennessee', 'Texas', 'Utah', 'Vermont', 'Virginia', 'Washington', 'West Virginia', 'Wisconsin', 'Wyoming']
sortedStatesList ['Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California', 'Colorado', 'Connecticut', 'Delaware', 'Florida', 'Georgia', 'Hawaii', 'Idaho', 'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky', 'Louisiana', 'Maine', 'Maryland', 'Massachusetts', 'Michigan', 'Minnesota', 'Mississippi', 'Missouri', 'Montana', 'Nebraska', 'Nevada', 'New Hampshire', 'New

## p.26 Accessing the Data in a CSV File

In [6]:
# !cat /content/gdrive/MyDrive/UN.txt

In [7]:
import csv
from itertools import groupby
# Helper returning the first element of a sequence.
first = lambda x: x[0]
# UN data file used by the functions of this section.
# NOTE: later cells assign a different path to fileName, so this cell has to
# be re-run before this section's functions are called again.
fileName = './AIP_05_Data Processing and File Access_data/UN.txt'

def getContinentList(path=None):
  '''
  Return the set of distinct values found in the second column
  (the continent) of a CSV file.

  path: CSV file to scan; defaults to the module-level fileName.
  Rows with fewer than two fields, such as blank lines, are skipped.
  '''
  if path is None:
    path = fileName
  # newline='' lets the csv module do its own newline handling.
  with open(path, 'r', newline='') as f_read:
    # A set comprehension removes duplicates directly; no need to buffer
    # every row or to group them first.
    return {row[1] for row in csv.reader(f_read) if len(row) > 1}



In [8]:
def main():
  '''
  Display the countries in a specified continent.

  The user is prompted for a continent name; every record of the data file
  whose second field equals that name has its first field printed.
  '''
  continent = input(f"Enter the name of a continent: {*getContinentList(),} ")
  # Drop stray surrounding spaces and title-case the reply so that
  # all-lowercase input still matches the names stored in the file.
  continent = continent.strip().title()
  if continent == "Antarctica":
    print("There are no countries in Antarctica.")
    return
  # The with statement closes the file even if an error occurs while reading.
  with open(fileName, 'r') as infile:
    for line in infile:
      data = line.split(',')
      # Skip blank or malformed lines that have no continent field.
      if len(data) > 1 and data[1] == continent:
        print(data[0])

main()

Afghanistan
Armenia
Azerbaijan
Bahrain
Bangladesh
Bhutan
Brunei Darussalam
Cambodia
China
Cyprus
Democratic People's Republic of Korea
Georgia
India
Indonesia
Iran
Iraq
Israel
Japan
Jordan
Kazakhstan
Kuwait
Kyrgyzstan
Lao People's Democratic Republic
Lebanon
Malaysia
Maldives
Mongolia
Myanmar
Nepal
Oman
Pakistan
Philippines
Qatar
Republic of Korea
Saudi Arabia
Singapore
Sri Lanka
Syrian Arab Republic
Tajikistan
Thailand
Timor-Leste
Turkey
Turkmenistan
United Arab Emirates
Uzbekistan
Vietnam
Yemen


## p.28 Analyzing the Data in a CSV File with a List

In [9]:
# Input: data file read by placeRecordsIntoList(), one comma-separated record per line.
fileName = './AIP_05_Data Processing and File Access_data/UN.txt'
# Output: file written by createNewFile() with each country and its area.
outputFileName = './AIP_05_Data Processing and File Access_data/UNByArea.txt'

from enum import Enum
class Column(Enum):
  '''
  Zero-based positions of the fields within one comma-separated record
  of the UN data file.
  '''
  COUNTRY = 0     # country name
  CONTINENT = 1   # continent name
  POPULATION = 2  # population (numeric; unit not stated here -- TODO confirm)
  AREA = 3        # area, reported as "sq. mi." by displayFiveLargestCountries()

def main():
  '''
  Load the country records, order them by area (largest first),
  display the five largest countries, and create a file containing
  every country with its area in that order.
  '''
  countries = placeRecordsIntoList(fileName)
  # Address the area field through Column, as the other functions of this
  # cell do, instead of the bare index 3.
  countries.sort(key=lambda country: country[Column.AREA.value], reverse=True)

  displayFiveLargestCountries(countries)
  createNewFile(countries)

def placeRecordsIntoList(fileName):
  '''
  Read a comma-separated text file into a list of records.

  fileName: path of the file; every non-blank line is split on commas.
  Returns a list of lists. The POPULATION and AREA fields are converted
  from text to numbers (int or float, whichever the literal denotes);
  all other fields remain strings.
  Raises ValueError or SyntaxError if one of those two fields is not a
  plain literal.
  '''
  import ast  # local import: only needed for the literal parsing below

  numericColumns = (Column.POPULATION.value, Column.AREA.value)
  listOfRecords = []
  # The with statement closes the file even if a record fails to parse.
  with open(fileName, 'r') as infile:
    for line in infile:
      line = line.rstrip()
      if not line:
        continue  # ignore blank lines instead of failing on a missing field
      record = line.split(',')
      for index in numericColumns:
        # SECURITY: the original used eval(), which executes any expression
        # found in the data file. ast.literal_eval accepts literals only and
        # still yields int for "3855000" and float for "34.8".
        record[index] = ast.literal_eval(record[index].strip())
      listOfRecords.append(record)

  return listOfRecords

def displayFiveLargestCountries(countries, n=5):
  '''
  Print a two-column table (country, area) for the first n records.

  countries: list of records, already ordered as they should be shown;
             field 0 is the country name and field 3 is an integer area.
  n: maximum number of records to display (default 5).
  '''
  print("{0:20}{1:9}".format("Country", "Area (sq. mi.)"))
  # Slicing never runs past the end, so a list with fewer than n records
  # is displayed in full instead of raising IndexError.
  for country in countries[:n]:
    print("{0:20}{1:9,d}".format(country[0], country[3]))

def createNewFile(countries):
  '''
  Create file of countries and their areas.

  countries: list of records indexed by the Column positions.
  One "country,area" line per record is written to outputFileName, in the
  order given; an existing file is overwritten.
  '''
  # The with statement closes (and flushes) the file even if a record is
  # malformed and writing raises part-way through.
  with open(outputFileName, 'w') as outfile:
    for country in countries:
      outfile.write(country[Column.COUNTRY.value] + ',' + str(country[Column.AREA.value]) + "\n")

main()

Country             Area (sq. mi.)
Russian Federation  6,592,800
Canada              3,855,000
United States       3,794,066
China               3,696,100
Brazil              3,287,597


In [10]:
# !cat /content/gdrive/MyDrive/UNbyArea.txt

## p.39 Using a Dictionary as a Frequency Table

In [11]:
# !cat /content/gdrive/MyDrive/Gettysburg.txt

In [12]:
# Text file analysed in this section; main() passes it to formListOfWords().
fileName = './AIP_05_Data Processing and File Access_data/Gettysburg.txt'

def main():
  '''
  Report word statistics for the Gettysburg Address (stored in the
  file as a single line): the total and distinct word counts,
  followed by the most frequent words.
  '''
  words = formListOfWords(fileName)
  frequencies = createFrequencyDictionary(words)

  # Summary counts first, then the table of frequent words.
  displayWordCount(words, frequencies)
  displayMostCommonWords(frequencies)

def formListOfWords(fileName):
  '''
  Return the words of a text file as a list of lowercase strings.

  fileName: path of the text file to read.
  Only the letters a-z are kept. Punctuation and digits are removed without
  splitting the word they occur in; any whitespace separates words.
  '''
  # The with statement closes the file even if reading raises.
  with open(fileName) as infile:
    originalText = infile.read().lower()

  # Collect the kept characters in a list and join once, which avoids
  # repeated string concatenation.
  kept = []
  for ch in originalText:
    if 'a' <= ch <= 'z':
      kept.append(ch)
    elif ch.isspace():
      # Newlines and tabs become spaces so that words on adjacent lines
      # are not glued together.
      kept.append(' ')

  # Place the individual words into a list.
  return ''.join(kept).split()

def createFrequencyDictionary(listOfWords):
  '''
  Build and return a dictionary that maps each word of listOfWords
  to the number of times it occurs in the list.
  '''
  counts = {}
  for token in listOfWords:
    # get() supplies 0 the first time a word is seen.
    counts[token] = counts.get(token, 0) + 1
  return counts

def displayWordCount(listOfWords, freq):
  '''
  Print the total number of words and the number of distinct words,
  followed by a blank line.
  '''
  summary = (
    (len(listOfWords), "words."),
    (len(freq), "different words."),
  )
  for count, label in summary:
    print("The Gettysburg Address contains", count, label)
  print()



def displayMostCommonWords(freq):
  '''
  Print the words whose frequency is at least 6, most frequent first.
  Words with equal frequency keep their dictionary order.
  '''
  print("The most common words and their frequencies are:")
  frequentPairs = [(word, count) for word, count in freq.items() if count >= 6]
  ranked = sorted(frequentPairs, key=lambda pair: pair[1], reverse=True)
  for word, count in ranked:
    print("    ", word + ':', count)

main()

The Gettysburg Address contains 268 words.
The Gettysburg Address contains 139 different words.

The most common words and their frequencies are:
     that: 13
     the: 11
     we: 10
     to: 8
     here: 8
     a: 7
     and: 6


## p.45 Dictionary-Valued Dictionaries

In [13]:
# !cat /content/gdrive/MyDrive/UNDict.dat

In [14]:
import pickle

# Pickled dictionary file for this section; main() passes it to getDictionary().
fileName = './AIP_05_Data Processing and File Access_data/UNDict.dat'

def main():
  '''
  Display countries (and their population) from a specified continent.

  The continent name is read from the user and must match the stored
  names exactly, apart from surrounding whitespace.
  '''
  nations = getDictionary(fileName)
  # end=' ' keeps "continent" and the "other ..." prompt text from running
  # together on the screen.
  print("Enter the name of a continent", end=' ')
  continent = input("other than Antarctica: ('Europe', 'Asia', 'North America', 'South America', 'Africa', 'Australia/Oceania') ")
  # Stray spaces around the reply would otherwise make every lookup fail.
  continent = continent.strip()
  continentDict = constructContinentNations(nations, continent)
  displaySortedResults(continentDict)

def getDictionary(fileName):
  '''
  Load and return the object stored in a pickle file.

  fileName: path of the binary pickle file.
  '''
  # SECURITY: pickle.load can execute arbitrary code while unpickling;
  # only use this with files from a trusted source.
  # The with statement closes the file even if unpickling raises.
  with open(fileName, 'rb') as infile:
    return pickle.load(infile)

def constructContinentNations(nations, continent):
  '''
  Return a new dictionary holding only those entries of nations
  whose "cont" value equals the given continent.
  '''
  return {
    name: details
    for name, details in nations.items()
    if details["cont"] == continent
  }

def displaySortedResults(dictionaryName):
  '''
  Print one "name: population" line per entry, ordered from the
  largest "popl" value to the smallest.
  '''
  def population(entry):
    # entry is a (name, details) pair taken from items()
    return entry[1]["popl"]

  for name, details in sorted(dictionaryName.items(), key=population, reverse=True):
    print(" {0:s}: {1:,.2f}".format(name, details["popl"]))

main()

Enter the name of a continent China: 1,355.70
 India: 1,236.30
 Indonesia: 253.60
 Pakistan: 196.20
 Bangladesh: 166.30
 Japan: 127.10
 Philippines: 107.70
 Vietnam: 93.40
 Turkey: 81.60
 Iran: 80.80
 Thailand: 67.70
 Myanmar: 55.70
 Republic of Korea: 49.00
 Iraq: 32.60
 Afghanistan: 31.80
 Nepal: 31.00
 Malaysia: 30.10
 Uzbekistan: 28.90
 Saudi Arabia: 27.30
 Yemen: 26.10
 Democratic People's Republic of Korea: 24.90
 Sri Lanka: 22.90
 Syrian Arab Republic: 18.00
 Kazakhstan: 18.00
 Cambodia: 15.50
 Azerbaijan: 9.70
 Tajikistan: 8.10
 Jordan: 7.90
 Israel: 7.80
 Lao People's Democratic Republic: 6.80
 Lebanon: 5.90
 Singapore: 5.60
 Kyrgyzstan: 5.60
 United Arab Emirates: 5.60
 Turkmenistan: 5.10
 Georgia: 4.90
 Oman: 3.20
 Armenia: 3.10
 Mongolia: 3.00
 Kuwait: 2.70
 Qatar: 2.10
 Bahrain: 1.30
 Timor-Leste: 1.20
 Cyprus: 1.20
 Bhutan: 0.73
 Brunei Darussalam: 0.42
 Maldives: 0.39


## p.48 Using a Dictionary with Tuples as Keys

In [15]:
# !cat /content/gdrive/MyDrive/USpresStatesDict.dat

In [16]:
import pickle

# Pickled dictionary file for this section; main() passes it to createDictFromBinaryFile().
fileName = './AIP_05_Data Processing and File Access_data/USpresStatesDict.dat'

def main():
  '''
  Load the presidents dictionary, ask the user for a state, and display
  the presidents associated with that state in sorted key order.
  '''
  presidentsByName = createDictFromBinaryFile(fileName)
  # getState() yields either a state name or a ready-made "There are no ..." message.
  reply = getState(presidentsByName)
  displayOutput(reply, presidentsByName)

def createDictFromBinaryFile(fileName):
  '''
  Load and return the object stored in a pickle file.

  fileName: path of the binary pickle file.
  '''
  # SECURITY: pickle.load can execute arbitrary code while unpickling;
  # only use this with files from a trusted source.
  # The with statement closes the file even if unpickling raises.
  with open(fileName, 'rb') as infile:
    return pickle.load(infile)

def getState(dictName):
  '''
  Ask the user for a state name.

  Returns the name unchanged when it occurs among the values of dictName;
  otherwise returns a message beginning with "There are no presidents from".
  '''
  state = input("Enter the name of a state: ")
  if state not in dictName.values():
    return "There are no presidents from " + state + '.'
  return state

def displayOutput(state, dictName):
  '''
  Print the outcome of the state lookup.

  state: a state name, or a message starting with "There", which is
         printed as is.
  dictName: dictionary whose keys are two-item sequences and whose values
            are state names; each matching key is printed as
            "key[1] key[0]", in sorted key order.
  '''
  if state.startswith("There"):
    print(state)
    return

  print("Presidents from", state + ':')
  matching = [pres for pres in sorted(dictName) if dictName[pres] == state]
  for pres in matching:
    print(" " + pres[1] + " " + pres[0])

main()


There are no presidents from Asia.


## p.22 Sets

In [17]:
# !cat /content/gdrive/MyDrive/VPres.txt

In [18]:
# !cat /content/gdrive/MyDrive/USPres.txt

In [19]:

# Input: names read by createSetFromFile() as the vice-presidents set.
vPresFileName = './AIP_05_Data Processing and File Access_data/VPres.txt'
# Input: names read by createSetFromFile() as the presidents set.
usPresFileName = './AIP_05_Data Processing and File Access_data/USPres.txt'
# Output: file written by writeNamesToFile() with the names found in both inputs.
writtenFileName = './AIP_05_Data Processing and File Access_data/PresAndVPres.txt'

def main():
  '''
  Write to a file the names that appear in both the vice-presidents
  file and the presidents file.
  '''
  vicePresidents = createSetFromFile(vPresFileName)
  presidents = createSetFromFile(usPresFileName)
  # Names present in both sets belong to people who held both offices.
  servedInBoth = createIntersection(vicePresidents, presidents)
  writeNamesToFile(servedInBoth, writtenFileName)

def createSetFromFile(fileName):
  '''
  Return the set of lines of a text file, each ending with a newline character.

  fileName: path of the text file to read.
  The newline is kept on every item (writeNamesToFile relies on it) and is
  added to the last line when the file does not end with one, so the same
  name compares equal wherever it appears in a file.
  '''
  # The with statement closes the file even if reading raises.
  with open(fileName, 'r') as infile:
    return {name if name.endswith("\n") else name + "\n" for name in infile}

def createIntersection(set1, set2):
  '''
  Return a new set with the elements of set1 that also occur in set2.
  '''
  common = set1.intersection(set2)
  return common

def writeNamesToFile(setName, fileName):
  '''
  Write the strings of a set to a text file.

  setName: set of strings, each already ending with a newline character.
  fileName: path of the file to create (an existing file is overwritten).
  '''
  # A set has no defined order, so writing it directly can produce a
  # different file on every run; sorting makes the output reproducible.
  # The with statement closes the file even if writing raises.
  with open(fileName, 'w') as outfile:
    outfile.writelines(sorted(setName))

main()

In [20]:
# !cat /content/gdrive/MyDrive/PresAndVPres.txt

# Lab

In [21]:
# from google.colab import drive

# drive.mount('/content/gdrive/', force_remount=True)



## 1. Count Unique Words

Given a text file, count the unique words present in the file.








In [22]:
# !cat /content/gdrive/MyDrive/input1.txt

## 2. Find the Frequency of Each Word

Given a text file, find the frequency of each word present in the file.




In [23]:
# !cat /content/gdrive/MyDrive/input1.txt

## 3. Read a CSV File into a List of Dictionaries

Given a CSV file, read its contents into a list of dictionaries, one per row, whose keys are the column names.



In [24]:
# !cat /content/gdrive/MyDrive/input3.txt

## 4. Replace Specific Words

Given a text file and a dictionary of words to replace, replace each of those words in the file with its replacement and write the resulting text to 'output5.txt'.







In [25]:
# !cat /content/gdrive/MyDrive/input5.txt

In [26]:
# Replacement table for the "Replace Specific Words" exercise:
# each key is a word to look for and each value is the text that replaces it.
dict_words = {
    "intelligence": "Intel",
    "computer": "Computer",
    "artificial": "Artificial",
    "human": "Human",
    "problem": "Issue",
    "learning": "Studying"
}

In [27]:
# !cat /content/gdrive/MyDrive/output5.txt