# Getting Comfortable with Different Kinds of Data Sources

In [1]:
# import packages
import numpy as np
import pandas as pd

In [2]:
# Read a CSV file whose first line holds the column names; read_csv uses
# that line as the header by default.
df1 = pd.read_csv('datasets/CSV_EX_1.csv')
df1

Unnamed: 0,Bedroom,Sq. foot,Locality,Price ($)
0,2,1500,Good,300000
1,3,1300,Fair,240000
2,3,1900,Very good,450000
3,3,1850,Bad,280000
4,2,1640,Good,310000


In [3]:
# Read a CSV file that has no header line, using the defaults.
# Gotcha shown in the output: the first data row (2, 1500, Good, 300000)
# is consumed as the column names.
df2 = pd.read_csv('datasets/CSV_EX_2.csv')
df2

Unnamed: 0,2,1500,Good,300000
0,3,1300,Fair,240000
1,3,1900,Very good,450000
2,3,1850,Bad,280000
3,2,1640,Good,310000


In [4]:
# Read the same file with header=None so that no line is treated as a
# header; the columns are labelled 0, 1, 2, 3 instead (see output).
df2 = pd.read_csv('datasets/CSV_EX_2.csv', header=None)
df2

Unnamed: 0,0,1,2,3
0,2,1500,Good,300000
1,3,1300,Fair,240000
2,3,1900,Very good,450000
3,3,1850,Bad,280000
4,2,1640,Good,310000


In [5]:
# Keep header=None (the file has no header line) and supply the column
# labels explicitly through the names argument.
# NOTE(review): 'Sq.fit' looks like a typo for the 'Sq. foot' label used by
# the other example files -- confirm before relying on this column name.
df2 = pd.read_csv('datasets/CSV_EX_2.csv',
                  header=None,
                  names=['Bedroom', 'Sq.fit', 'Locality', 'Price($)'])
df2

Unnamed: 0,Bedroom,Sq.fit,Locality,Price($)
0,2,1500,Good,300000
1,3,1300,Fair,240000
2,3,1900,Very good,450000
3,3,1850,Bad,280000
4,2,1640,Good,310000


In [6]:
# Read a file whose fields are separated by ';' using the default comma
# separator: each line ends up in a single column (see output).
df3 = pd.read_csv('datasets/CSV_EX_3.csv')
df3

Unnamed: 0,Bedroom; Sq. foot; Locality; Price ($)
0,2; 1500; Good; 300000
1,3; 1300; Fair; 240000
2,3; 1900; Very good; 450000
3,3; 1850; Bad; 280000
4,2; 1640; Good; 310000


In [7]:
# Specify the delimiter with sep=';' so the fields are split into
# separate columns.
df3 = pd.read_csv('datasets/CSV_EX_3.csv', sep=';')
df3

Unnamed: 0,Bedroom,Sq. foot,Locality,Price ($)
0,2,1500,Good,300000
1,3,1300,Fair,240000
2,3,1900,Very good,450000
3,3,1850,Bad,280000
4,2,1640,Good,310000


In [8]:
# Pass names for a file that already has a header line, without telling
# read_csv about that header: the original header line is kept as the
# first data row (see output).
df4 = pd.read_csv('datasets/CSV_EX_1.csv', names=['A','B','C','D'])
df4

Unnamed: 0,A,B,C,D
0,Bedroom,Sq. foot,Locality,Price ($)
1,2,1500,Good,300000
2,3,1300,Fair,240000
3,3,1900,Very good,450000
4,3,1850,Bad,280000
5,2,1640,Good,310000


In [9]:
# Set header=0 and provide a names list: the first line is treated as the
# header and replaced by the given names, so it no longer appears as data.
df4 = pd.read_csv('datasets/CSV_EX_1.csv',
                 header=0,
                 names=['A','B','C','D'])
df4

Unnamed: 0,A,B,C,D
0,2,1500,Good,300000
1,3,1300,Fair,240000
2,3,1900,Very good,450000
3,3,1850,Bad,280000
4,2,1640,Good,310000


In [10]:
# Read a CSV file that has two non-data lines before the real header.
# With the defaults the first line becomes the header and the real header
# shows up as a data row (see output).
df5 = pd.read_csv('datasets/CSV_EX_skiprows.csv')
df5

Unnamed: 0,Filetype: CSV,Unnamed: 1,Unnamed: 2,Unnamed: 3
0,,Info about some houses,,
1,Bedroom,Sq. foot,Locality,Price ($)
2,2,1500,Good,300000
3,3,1300,Fair,240000
4,3,1900,Very good,450000
5,3,1850,Bad,280000
6,2,1640,Good,310000


In [11]:
# Skip the first two lines so that the third line of the file is used as
# the header.
df5 = pd.read_csv('datasets/CSV_EX_skiprows.csv', skiprows=2)
df5

Unnamed: 0,Bedroom,Sq. foot,Locality,Price ($)
0,2,1500,Good,300000
1,3,1300,Fair,240000
2,3,1900,Very good,450000
3,3,1850,Bad,280000
4,2,1640,Good,310000


In [12]:
# Read a CSV file that has two leading non-data lines and a trailing
# footer line; with the defaults all of them end up in the frame
# (see output).
df6 = pd.read_csv('datasets/CSV_EX_skipfooter.csv')
df6

Unnamed: 0,Filetype: CSV,Unnamed: 1,Unnamed: 2,Unnamed: 3
0,,Info about some houses,,
1,Bedroom,Sq. foot,Locality,Price ($)
2,2,1500,Good,300000
3,3,1300,Fair,240000
4,3,1900,Very good,450000
5,3,1850,Bad,280000
6,2,1640,Good,310000
7,,This is the end of file,,


In [13]:
# Use skiprows=2 to drop the two leading lines and skipfooter=1 to drop
# the last line of the file.
# engine='python' is required because skipfooter is not supported by the
# default C parser engine.
df6 = pd.read_csv('datasets/CSV_EX_skipfooter.csv',
                 skiprows=2,
                 skipfooter=1,
                 engine='python')
df6

Unnamed: 0,Bedroom,Sq. foot,Locality,Price ($)
0,2,1500,Good,300000
1,3,1300,Fair,240000
2,3,1900,Very good,450000
3,3,1850,Bad,280000
4,2,1640,Good,310000


In [14]:
# Create an empty list; the chunk-reading loop below appends one DataFrame
# per chunk to it.
list_of_dataframes = []

In [15]:
# Number of data rows to read in each chunk
rows_in_a_chunk = 10

In [16]:
# Number of chunks to read from the file
num_chunks = 5

In [17]:
# Read only the first two rows into a throwaway DataFrame; it is used
# solely to obtain the column names of the file.
df_dummy = pd.read_csv('datasets/Boston_housing.csv', nrows=2)
colnames = df_dummy.columns

In [18]:
# Read the CSV file a fixed number of rows at a time.
# The list is rebuilt from scratch in this cell so that re-running it does
# not append duplicate chunks to a list left over from an earlier run.
list_of_dataframes = []
for i in range(0, num_chunks*rows_in_a_chunk, rows_in_a_chunk):
    # skiprows=i drops the first i lines of the file; header=0 then consumes
    # the next line as a header and replaces it with `names`. For i == 0 that
    # line is the real header; for later chunks it is the last data row of
    # the previous chunk (already read), so each chunk holds data rows
    # i .. i + rows_in_a_chunk - 1 with no gaps or overlaps.
    df = pd.read_csv('datasets/Boston_housing.csv',
                    header=0,
                    skiprows=i,
                    nrows=rows_in_a_chunk,
                    names=colnames)
    list_of_dataframes.append(df)

list_of_dataframes

[      CRIM    ZN  INDUS  CHAS    NOX     RM    AGE     DIS  RAD  TAX  PTRATIO  \
 0  0.00632  18.0   2.31     0  0.538  6.575   65.2  4.0900    1  296     15.3   
 1  0.02731   0.0   7.07     0  0.469  6.421   78.9  4.9671    2  242     17.8   
 2  0.02729   0.0   7.07     0  0.469  7.185   61.1  4.9671    2  242     17.8   
 3  0.03237   0.0   2.18     0  0.458  6.998   45.8  6.0622    3  222     18.7   
 4  0.06905   0.0   2.18     0  0.458  7.147   54.2  6.0622    3  222     18.7   
 5  0.02985   0.0   2.18     0  0.458  6.430   58.7  6.0622    3  222     18.7   
 6  0.08829  12.5   7.87     0  0.524  6.012   66.6  5.5605    5  311     15.2   
 7  0.14455  12.5   7.87     0  0.524  6.172   96.1  5.9505    5  311     15.2   
 8  0.21124  12.5   7.87     0  0.524  5.631  100.0  6.0821    5  311     15.2   
 9  0.17004  12.5   7.87     0  0.524  6.004   85.9  6.5921    5  311     15.2   
 
         B  LSTAT  PRICE  
 0  396.90   4.98   24.0  
 1  396.90   9.14   21.6  
 2  392.83   4.

In [19]:
# Read a comma-separated .txt file with read_table and its default
# separator (a tab): each line ends up in a single column (see output).
df13 = pd.read_table('datasets/Table_EX_1.txt')
df13

Unnamed: 0,"Bedroom, Sq. foot, Locality, Price ($)"
0,"2, 1500, Good, 300000"
1,"3, 1300, Fair, 240000"
2,"3, 1900, Very good, 450000"
3,"3, 1850, Bad, 280000"
4,"2, 1640, Good, 310000"


In [20]:
# Pass sep=',' so that read_table splits each line on commas into
# separate columns.
df13 = pd.read_table('datasets/Table_EX_1.txt', sep=',')
df13

Unnamed: 0,Bedroom,Sq. foot,Locality,Price ($)
0,2,1500,Good,300000
1,3,1300,Fair,240000
2,3,1900,Very good,450000
3,3,1850,Bad,280000
4,2,1640,Good,310000


In [21]:
# Use read_html to parse the HTML tables on the Wikipedia medal-table page;
# the result is a list of DataFrames, and header=0 uses the first row of
# each table as its column labels.
# Note: this fetches the live page, so it needs network access, and the
# number of tables found may differ from the recorded output if the page
# has been edited.
list_of_df = pd.read_html("https://en.wikipedia.org/wiki/2016_Summer_Olympics_medal_table", header=0)
len(list_of_df)

6

In [22]:
# Take the second DataFrame in the list (index 1) -- per the output this is
# the medal table -- and preview its first rows.
df15 = list_of_df[1]
df15.head()

Unnamed: 0,Rank,NOC,Gold,Silver,Bronze,Total
0,1,United States (USA),46,37,38,121
1,2,Great Britain (GBR),27,23,17,67
2,3,China (CHN),26,18,26,70
3,4,Russia (RUS),19,17,20,56
4,5,Germany (GER),17,10,15,42


In [23]:
# Load the movie records from the JSON file into a DataFrame and preview
# the first rows.
df16 = pd.read_json('datasets/movies.json')
df16.head()

Unnamed: 0,title,year,cast,genres
0,After Dark in Central Park,1900,[],[]
1,Boarding School Girls' Pajama Parade,1900,[],[]
2,Buffalo Bill's Wild West Parad,1900,[],[]
3,Caught,1900,[],[]
4,Clowns Spinning Hats,1900,[],[]


In [24]:
# Look up the cast of the film titled 'The Avengers' released in 2012
cast_of_avengers = df16.loc[
    df16['title'].eq('The Avengers') & df16['year'].eq(2012),
    'cast',
]
print(list(cast_of_avengers))

[['Robert Downey, Jr.', 'Chris Evans', 'Mark Ruffalo', 'Chris Hemsworth', 'Scarlett Johansson', 'Jeremy Renner', 'Tom Hiddleston', 'Clark Gregg', 'Cobie Smulders', 'Stellan Skarsgård', 'Samuel L. Jackson']]


In [25]:
# Read the table on page 1 of the PDF with tabula. header=None is passed
# through pandas_options so that no row is used as column labels.
# This cell only reads page 1; page 2 is read separately in the next cell,
# and the two results are not joined here.
# Per the output, the result is displayed as a list containing a DataFrame.
from tabula import read_pdf
df18_1 = read_pdf('datasets/Housing_data.pdf',
                 pages=[1],
                 pandas_options={'header':None})

df18_1

[         0     1     2  3      4      5     6       7  8    9
 0  0.17004  12.5  7.87  0  0.524  6.004  85.9  6.5921  5  311
 1  0.22489  12.5  7.87  0  0.524  6.377  94.3  6.3467  5  311
 2  0.11747  12.5  7.87  0  0.524  6.009  82.9  6.2267  5  311
 3  0.09378  12.5  7.87  0  0.524  5.889  39.0  5.4509  5  311]

In [26]:
# Retrieve the table from page 2 of the same PDF, again with header=None
# so that no row is used as column labels.
df18_2 = read_pdf('datasets/Housing_data.pdf',
                 pages=[2],
                 pandas_options={'header':None})

df18_2

[      0       1      2     3
 0  15.2  386.71  17.10  18.9
 1  15.2  392.52  20.45  15.0
 2  15.2  396.90  13.27  18.9
 3  15.2  390.50  15.71  21.7]

In [27]:
#  import library
from bs4 import BeautifulSoup

In [28]:
# Use bs4 to parse an HTML file and show the type of the parsed object.
# The parser is named explicitly: without it BeautifulSoup picks whichever
# parser happens to be installed, so results can differ between machines.
# The file handle is called `fd` (not `df`) so it is not mistaken for a
# DataFrame.
with open('datasets/test.html', 'r') as fd:
    soup = BeautifulSoup(fd, 'html.parser')
    print(type(soup))

<class 'bs4.BeautifulSoup'>


In [29]:
# Print the parsed document as indented markup, one tag per line
print(soup.prettify())

<html>
 <body>
  <h1>
   Lorem ipsum dolor sit amet consectetuer adipiscing 
elit
  </h1>
  <p>
   Lorem ipsum dolor sit amet, consectetuer adipiscing 
elit. Aenean commodo ligula eget dolor. Aenean massa
   <strong>
    strong
   </strong>
   . Cum sociis natoque penatibus 
et magnis dis parturient montes, nascetur ridiculus 
mus. Donec quam felis, ultricies nec, pellentesque 
eu, pretium quis, sem. Nulla consequat massa quis 
enim. Donec pede justo, fringilla vel, aliquet nec, 
vulputate eget, arcu. In enim justo, rhoncus ut, 
imperdiet a, venenatis vitae, justo. Nullam dictum 
felis eu pede
   <a class="external ext" href="#">
    link
   </a>
   mollis pretium. Integer tincidunt. Cras dapibus. 
Vivamus elementum semper nisi. Aenean vulputate 
eleifend tellus. Aenean leo ligula, porttitor eu, 
consequat vitae, eleifend ac, enim. Aliquam lorem ante, 
dapibus in, viverra quis, feugiat a, tellus. Phasellus 
viverra nulla ut metus varius laoreet. Quisque rutrum. 
Aenean imperdiet. Etiam

In [30]:
# Parse the HTML file and print its first <p> tag (soup.p).
# The parser is named explicitly: without it BeautifulSoup picks whichever
# parser happens to be installed, so results can differ between machines.
with open('datasets/test.html', 'r') as fd:
    soup = BeautifulSoup(fd, 'html.parser')
    print(soup.p)

<p>Lorem ipsum dolor sit amet, consectetuer adipiscing 
elit. Aenean commodo ligula eget dolor. Aenean massa 
<strong>strong</strong>. Cum sociis natoque penatibus 
et magnis dis parturient montes, nascetur ridiculus 
mus. Donec quam felis, ultricies nec, pellentesque 
eu, pretium quis, sem. Nulla consequat massa quis 
enim. Donec pede justo, fringilla vel, aliquet nec, 
vulputate eget, arcu. In enim justo, rhoncus ut, 
imperdiet a, venenatis vitae, justo. Nullam dictum 
felis eu pede <a class="external ext" href="#">link</a> 
mollis pretium. Integer tincidunt. Cras dapibus. 
Vivamus elementum semper nisi. Aenean vulputate 
eleifend tellus. Aenean leo ligula, porttitor eu, 
consequat vitae, eleifend ac, enim. Aliquam lorem ante, 
dapibus in, viverra quis, feugiat a, tellus. Phasellus 
viverra nulla ut metus varius laoreet. Quisque rutrum. 
Aenean imperdiet. Etiam ultricies nisi vel augue. 
Curabitur ullamcorper ultricies nisi.</p>


In [31]:
# Use the find_all method to collect every <p> tag and count them.
# The parser is named explicitly: without it BeautifulSoup picks whichever
# parser happens to be installed, so results can differ between machines.
with open('datasets/test.html', 'r') as fd:
    soup = BeautifulSoup(fd, 'html.parser')
    all_ps = soup.find_all('p')
    print(f'Total number of <p>: {len(all_ps)}')

Total number of <p>: 6


In [32]:
# Get the contents (list of direct children) of the first <table> tag.
# The parser is named explicitly: without it BeautifulSoup picks whichever
# parser happens to be installed, so results can differ between machines.
with open('datasets/test.html', 'r') as fd:
    soup = BeautifulSoup(fd, 'html.parser')
    table = soup.table
    print(table.contents)

['\n', <tr>
<th>Entry Header 1</th>
<th>Entry Header 2</th>
<th>Entry Header 3</th>
<th>Entry Header 4</th>
</tr>, '\n', <tr>
<td>Entry First Line 1</td>
<td>Entry First Line 2</td>
<td>Entry First Line 3</td>
<td>Entry First Line 4</td>
</tr>, '\n', <tr>
<td>Entry Line 1</td>
<td>Entry Line 2</td>
<td>Entry Line 3</td>
<td>Entry Line 4</td>
</tr>, '\n', <tr>
<td>Entry Last Line 1</td>
<td>Entry Last Line 2</td>
<td>Entry Last Line 3</td>
<td>Entry Last Line 4</td>
</tr>, '\n']


In [33]:
# Iterate over the direct children of the first <table> tag, printing a
# separator after each one (whitespace-only text nodes are children too).
# The parser is named explicitly: without it BeautifulSoup picks whichever
# parser happens to be installed, so results can differ between machines.
with open("datasets/test.html", "r") as fd:
    soup = BeautifulSoup(fd, "html.parser")
    table = soup.table
    for child in table.children:
        print(child)
        print("*****")



*****
<tr>
<th>Entry Header 1</th>
<th>Entry Header 2</th>
<th>Entry Header 3</th>
<th>Entry Header 4</th>
</tr>
*****


*****
<tr>
<td>Entry First Line 1</td>
<td>Entry First Line 2</td>
<td>Entry First Line 3</td>
<td>Entry First Line 4</td>
</tr>
*****


*****
<tr>
<td>Entry Line 1</td>
<td>Entry Line 2</td>
<td>Entry Line 3</td>
<td>Entry Line 4</td>
</tr>
*****


*****
<tr>
<td>Entry Last Line 1</td>
<td>Entry Last Line 2</td>
<td>Entry Last Line 3</td>
<td>Entry Last Line 4</td>
</tr>
*****


*****


In [34]:
# Compare the number of direct children of the table with the number of
# all its descendants (children, their children, and so on).
# The parser is named explicitly: without it BeautifulSoup picks whichever
# parser happens to be installed, so results can differ between machines.
with open("datasets/test.html", "r") as fd:
    soup = BeautifulSoup(fd, "html.parser")
    table = soup.table
    children = table.children
    des = table.descendants
    print(len(list(children)), len(list(des)))

9 61


In [35]:
# Collect every <tr> element of the document.
# The file is opened in a with-block so the handle is closed as soon as the
# document has been parsed (the original left it open). The parser is named
# explicitly so the result does not depend on which parsers are installed.
with open("datasets/test.html", "r") as fd:
    soup = BeautifulSoup(fd, "html.parser")
# find_all is the current name of the deprecated findAll alias
data = soup.find_all('tr')
print(f"Data is a {type(data)} and {len(data)} items long")

Data is a <class 'bs4.element.ResultSet'> and 4 items long


In [36]:
# Split the rows: the first <tr> holds the <th> header cells (see output),
# the remaining rows are kept separately as the data rows.
data_without_header = data[1:]
headers = data[0]
headers

<tr>
<th>Entry Header 1</th>
<th>Entry Header 2</th>
<th>Entry Header 3</th>
<th>Entry Header 4</th>
</tr>

In [37]:
# Extract the text of every <th> cell in the header row.
# find_all / get_text replace the deprecated findAll / getText aliases.
col_headers = [th.get_text() for th in headers.find_all('th')]
col_headers

['Entry Header 1', 'Entry Header 2', 'Entry Header 3', 'Entry Header 4']

In [38]:
# Build one list of cell texts per data row (one inner list per <tr>).
# find_all / get_text replace the deprecated findAll / getText aliases.
df_data = [[td.get_text() for td in tr.find_all('td')] for tr in data_without_header]
df_data

[['Entry First Line 1',
  'Entry First Line 2',
  'Entry First Line 3',
  'Entry First Line 4'],
 ['Entry Line 1', 'Entry Line 2', 'Entry Line 3', 'Entry Line 4'],
 ['Entry Last Line 1',
  'Entry Last Line 2',
  'Entry Last Line 3',
  'Entry Last Line 4']]

In [39]:
# Assemble the extracted rows into a DataFrame, using the header texts as
# column labels, and preview it.
df = pd.DataFrame(df_data, columns=col_headers)
df.head()

Unnamed: 0,Entry Header 1,Entry Header 2,Entry Header 3,Entry Header 4
0,Entry First Line 1,Entry First Line 2,Entry First Line 3,Entry First Line 4
1,Entry Line 1,Entry Line 2,Entry Line 3,Entry Line 4
2,Entry Last Line 1,Entry Last Line 2,Entry Last Line 3,Entry Last Line 4


In [40]:
# Save the DataFrame as an Excel file.
# ExcelWriter is used as a context manager so the workbook is written and
# closed on exit; the explicit writer.save() call was removed in pandas 2.0.
# sheet_name is passed by keyword because passing it positionally is
# deprecated.
with pd.ExcelWriter('datasets/test_output.xlsx') as writer:
    df.to_excel(writer, sheet_name='Sheet1')
writer

<pandas.io.excel._xlsxwriter._XlsxWriter at 0x21ad5aa0548>