# 03_02: Loading Text Files

In [1]:
import math
import collections

import numpy as np
import pandas as pd
import matplotlib.pyplot as pp

%matplotlib inline

In [2]:
# iterating over an open file yields its lines, one by one
# (each line keeps its trailing newline); the `with` block makes sure
# the file handle is closed as soon as the loop is done

words = []
with open('words.txt', 'r') as infile:
    for line in infile:
        words.append(line)

In [3]:
# how many entries were collected into words
len(words)

235886

In [4]:
# peek at the first ten entries only, rather than echoing the whole list
words[:10]

['A\n',
 'a\n',
 'aa\n',
 'aal\n',
 'aalii\n',
 'aam\n',
 'Aani\n',
 'aardvark\n',
 'aardwolf\n',
 'Aaron\n']

In [5]:
# str.strip() with no arguments removes leading and trailing whitespace,
# which includes the newline at the end
'Aaron\n'.strip()

'Aaron'

In [6]:
# string methods return new strings, so they can be chained:
# strip the whitespace first, then lowercase the result
'Aaron\n'.strip().lower()

'aaron'

In [7]:
# rebuild the list, stripping surrounding whitespace and lowercasing
# every line; the `with` block closes the file once the loop finishes
words = []
with open('words.txt', 'r') as infile:
    for line in infile:
        words.append(line.strip().lower())

In [8]:
# first ten entries of the cleaned list; lowercasing can turn distinct
# lines (e.g. 'A' and 'a') into duplicate entries
words[:10]

['a',
 'a',
 'aa',
 'aal',
 'aalii',
 'aam',
 'aani',
 'aardvark',
 'aardwolf',
 'aaron']

In [9]:
# collect the cleaned lines into a set, which discards duplicates;
# the `with` block closes the file once the loop finishes
words = set()
with open('words.txt', 'r') as infile:
    for line in infile:
        words.add(line.strip().lower())

In [10]:
# a set comprehension that collects stripped and lowercased lines...
# (the file is opened in a `with` block so that it is closed afterwards)
with open('words.txt', 'r') as infile:
    words = {line.strip().lower() for line in infile}

In [11]:
# echo the set; sets are unordered, so the entries appear in arbitrary order
# NOTE(review): this displays the entire set -- consider showing only its
# size or a small sample to keep the notebook output short
words

{'pennatae',
 'interpolitical',
 'sketching',
 'stockbrokerage',
 'sack',
 'aegipan',
 'impale',
 'sorehearted',
 'nonexaggeration',
 'admonish',
 'misarchist',
 'bavarian',
 'intrarachidian',
 'proterothesis',
 'perjury',
 'bivoltine',
 'unexplorative',
 'shagpate',
 'restuff',
 'unfrequency',
 'policemanlike',
 'definitize',
 'liturgiologist',
 'bladewise',
 'zabaism',
 'axhead',
 'resinol',
 'inflexibleness',
 'faradism',
 'flavid',
 'vulcanite',
 'loo',
 'implacable',
 'mulley',
 'prosubmission',
 'kalandariyah',
 'gymnastically',
 'irrepressible',
 'pulsojet',
 'sensorivasomotor',
 'slumberful',
 'ennobler',
 'bouw',
 'semieremitical',
 'negrotic',
 'palaeoencephalon',
 'smashage',
 'sheeted',
 'gammexane',
 'sufferable',
 'gaufrette',
 'hypnotize',
 'limnoria',
 'inclinatory',
 'temporizer',
 'turkmen',
 'disproportionalness',
 'favositidae',
 'nursingly',
 'jolly',
 'montanic',
 'phlogistian',
 'myosotis',
 'musicophobia',
 'limpingness',
 'originality',
 'maniac',
 'hostless',


In [12]:
# ...turned into a sorted list
# (the file is opened in a `with` block so that it is closed afterwards)
with open('words.txt', 'r') as infile:
    words = sorted({line.strip().lower() for line in infile})

In [13]:
# echo the sorted, de-duplicated list
# NOTE(review): this displays the entire list -- consider a slice such as
# words[:10] to keep the notebook output short
words

['a',
 'aa',
 'aal',
 'aalii',
 'aam',
 'aani',
 'aardvark',
 'aardwolf',
 'aaron',
 'aaronic',
 'aaronical',
 'aaronite',
 'aaronitic',
 'aaru',
 'ab',
 'aba',
 'ababdeh',
 'ababua',
 'abac',
 'abaca',
 'abacate',
 'abacay',
 'abacinate',
 'abacination',
 'abaciscus',
 'abacist',
 'aback',
 'abactinal',
 'abactinally',
 'abaction',
 'abactor',
 'abaculus',
 'abacus',
 'abadite',
 'abaff',
 'abaft',
 'abaisance',
 'abaiser',
 'abaissed',
 'abalienate',
 'abalienation',
 'abalone',
 'abama',
 'abampere',
 'abandon',
 'abandonable',
 'abandoned',
 'abandonedly',
 'abandonee',
 'abandoner',
 'abandonment',
 'abanic',
 'abantes',
 'abaptiston',
 'abarambo',
 'abaris',
 'abarthrosis',
 'abarticular',
 'abarticulation',
 'abas',
 'abase',
 'abased',
 'abasedly',
 'abasedness',
 'abasement',
 'abaser',
 'abasgi',
 'abash',
 'abashed',
 'abashedly',
 'abashedness',
 'abashless',
 'abashlessly',
 'abashment',
 'abasia',
 'abasic',
 'abask',
 'abassin',
 'abastardize',
 'abatable',
 'abate',
 'a

In [14]:
# return a list of all lines from an open file
# (readlines() keeps the trailing newline on each line; the `with` block
# closes the file once the lines have been read)
with open('francais.txt', 'r', encoding='latin1') as infile:
    francais_lines = infile.readlines()

# a bare expression on the last line is what the cell displays
francais_lines

['\n',
 'a\n',
 'ab\n',
 'abaissa\n',
 'abaissai\n',
 'abaissaient\n',
 'abaissais\n',
 'abaissait\n',
 'abaissant\n',
 'abaissas\n',
 'abaissasse\n',
 'abaissassent\n',
 'abaissasses\n',
 'abaissassiez\n',
 'abaissassions\n',
 'abaissâmes\n',
 'abaissât\n',
 'abaissâtes\n',
 'abaisse\n',
 'abaissement\n',
 'abaissements\n',
 'abaissent\n',
 'abaisser\n',
 'abaissera\n',
 'abaisserai\n',
 'abaisseraient\n',
 'abaisserais\n',
 'abaisserait\n',
 'abaisseras\n',
 'abaisserez\n',
 'abaisseriez\n',
 'abaisserions\n',
 'abaisserons\n',
 'abaisseront\n',
 'abaisses\n',
 'abaisseur\n',
 'abaisseurs\n',
 'abaissez\n',
 'abaissé\n',
 'abaissée\n',
 'abaissées\n',
 'abaissés\n',
 'abaissèrent\n',
 'abaissiez\n',
 'abaissions\n',
 'abaissons\n',
 'abandon\n',
 'abandonna\n',
 'abandonnai\n',
 'abandonnaient\n',
 'abandonnais\n',
 'abandonnait\n',
 'abandonnant\n',
 'abandonnas\n',
 'abandonnasse\n',
 'abandonnassent\n',
 'abandonnasses\n',
 'abandonnassiez\n',
 'abandonnassions\n',
 'abandonnâmes\