# <font color='red'> All the required steps to work with Data in Python
***

## <font color='blue'>1- Working with CSV files (local or online)

### <font color='green'>1.1 import all required packages

In [22]:
import numpy as np # for math
import pandas as pd #for dataframes and many math and stat
%matplotlib inline 
import matplotlib as mpl # for plotting all
import matplotlib.pyplot as plt # import for pyplot

### <font color='green'>1.2 import your csv file into a dataframe from url

In [None]:
dfcovid = pd.read_csv('https://covid.ourworldindata.org/data/owid-covid-data.csv')

### <font color='green'>1.3 show data columns and types

In [None]:
dfcovid.dtypes

### <font color='green'>1.4 clean and transform data

#### <font color='purple'>1.4.1 format any date fields into datetime type

In [None]:
dfcovid['date']=pd.to_datetime(dfcovid['date'])
dfcovid.dtypes

## <font color='blue'>2 Working with XML files

### <font color='green'>2.1 Working with local xml files

#### <font color='purple'>2.1.1 Import all required packages

In [None]:
import xml.etree.ElementTree as ET

#### 2.1.2 <font color='purple'>Import your csv file into a dataframe from url

In [None]:
tree = ET.parse('covid.xml')

#### <font color='purple'>2.1.3 Get the root of the document

In [None]:
root = tree.getroot()
root

#### <font color='purple'>2.1.4 Get the root subchild

In [None]:
root.tag

#### <font color='purple'>2.1.5 Get the root attributes if it has any

In [None]:
root.attrib

#### <font color='purple'>2.1.6 Get the root childs tags and text if it has any

In [None]:
[elem.tag for elem in root.iter()]

#### <font color='purple'>2.1.7 Get the document and butify the look

In [None]:
print(ET.tostring(root, encoding='utf8').decode('utf8'))

#### <font color='purple'>2.1.8 Get the child and subchilds tags and text(value)

In [None]:
for child in root:
    for childs in child:
        print(childs.tag, childs.text)

#### <font color='purple'>2.1.9 Get a specific subchild tags and text(value)

In [None]:
for subchild in root.iter('cases'):
    print(subchild.tag,subchild.text)

#### <font color='purple'>2.1.10 Convert the document to a dataframe and select teh columns you want

In [None]:
df_cols = ["date", "country", "continent", "population", "cases"]
rows = []
for node in root: 
    dateRep = node.find("dateRep").text
    countriesAndTerritories = node.find("countriesAndTerritories").text
    continentExp = node.find("continentExp").text
    popData2020 = node.find("popData2020").text
    cases= node.find("cases").text
    rows.append({"date": dateRep, "country": countriesAndTerritories, "continent": continentExp, "population": popData2020, "cases": cases})

df = pd.DataFrame(rows, columns = df_cols)
df

### <font color='green'>2.2 Online XML scrapping from page

#### <font color='purple'>2.2.1 Using the urlib to get data

##### 2.2.1.1 <font color='orange'>Import the required packages

In [None]:
import xml.etree.ElementTree as ET
import urllib
import pandas

##### <font color='orange'>2.2.1.2 call url and get reposnse and change it to text Using requests libaray

In [None]:
url = 'https://opendata.ecdc.europa.eu/covid19/nationalcasedeath_eueea_daily_ei/xml/'
response = urllib.request.urlopen(url).read()
tree = ET.fromstring(response)
tree

##### <font color='orange'>2.2.1.3 Find the main tag

In [None]:
tree.tag

##### <font color='orange'>2.2.1.4 Cound teh child tags (records)

In [None]:
len(list(tree))

##### <font color='orange'>2.2.1.5 You can print the childs of the child if you want

In [None]:
for child in tree:
    for childs in child:
        print(childs.tag, childs.text)

##### <font color='orange'>2.2.1.6 Altrernativly use elem

In [None]:
[elem.tag for elem in tree.iter()]

##### <font color='orange'>2.2.1.7 Use iterator to get all the nodes

In [None]:
for node in tree.iter('*'):
    print(node.tag)

##### <font color='orange'>2.2.1.8 Butify and view the document

In [None]:
print(ET.tostring(tree, encoding='utf8').decode('utf8'))

##### <font color='orange'>2.2.1.9 Copy the fields you are interested with to a data frame

In [None]:
cols = ["date", "country", "continent", "population", "cases"]
rows = []
for node in tree: 
    dateRep = node.find("dateRep").text
    countriesAndTerritories = node.find("countriesAndTerritories").text
    continentExp = node.find("continentExp").text
    popData2020 = node.find("popData2020").text
    cases= node.find("cases").text
    rows.append({"date": dateRep, "country": countriesAndTerritories, "continent": continentExp, "population": popData2020, "cases": cases})

df = pd.DataFrame(rows, columns = df_cols)
df

#### <font color='purple'>2.2.2 Using bs4 libaray

##### <font color='orange'>2.2.2.1 call url and get reposnse and change it to text 

In [None]:
response = requests.get('https://opendata.ecdc.europa.eu/covid19/nationalcasedeath_eueea_daily_ei/xml/').text

##### <font color='orange'>2.2.2.2 View the tags

In [None]:
tree = bs(response, 'xml')
print(tree)

##### <font color='orange'>2.2.2.3 Use for loop in BS4 to get all tags

In [None]:
for tag in tree.findChildren():
    print(tag.name)

##### <font color='orange'>2.2.2.4 Butify it

In [None]:
tree = bs(response, 'xml').prettify()
print(tree)

##### <font color='orange'>2.2.2.5 get the child count of the document

In [None]:
len(list(tree))

##### <font color='orange'>2.2.2.6 Find a specific subchild text

In [None]:
tree = bs(response, 'xml')

dateRep = tree.find_all('dateRep')
countriesAndTerritories = tree.find_all('countriesAndTerritories')
continentExp = tree.find_all('continentExp')
popData2020 = tree.find_all('popData2020')
cases= tree.find_all('cases')

df_cols = ['date', 'country', 'continent', 'population', 'cases']
rows = []
for i in range(len(dateRep)): 
    row= [dateRep[i].get_text(), countriesAndTerritories[i].get_text(), continentExp[i].get_text(), popData2020[i].get_text(), cases[i].get_text(), ]
    rows.append(row)
df = pd.DataFrame(rows, columns = df_cols, dtype = float)
df.head()

## <font color='blue'>3 Import Data from a webpage

### <font color='green'>3 Import Table from a webpage

#### <font color='purple'>3.1.1 import libraries 

In [None]:
import requests
import pandas as pd

#### <font color='purple'>3.1.2 call url

In [None]:
url='https://en.wikipedia.org/wiki/List_of_2018_box_office_number-one_films_in_France'
req=requests.get(url)
req.status_code

#### <font color='purple'>3.1.3 read the response html file as text tables

In [None]:
data=pd.read_html(req.text)

#### <font color='purple'>3.1.4 Find how many tables in teh pages and query them for quick view

In [None]:
print('number of tables are : ' + str(len(data)))
for tables in data:
    print(tables)

#### <font color='purple'>3.1.5 import the table you want into a dataframe

In [None]:
df=data[0] #first table in the page
df.head()

#### <font color='purple'>3.1.6 more examples

In [None]:
#importing the libraries
import requests
import pandas as pd

url='http://www.omafra.gov.on.ca/english/engineer/facts/12-051.htm'
req=requests.get(url)
data=pd.read_html(req.text)
for tables in data:
    print(tables)
    
df=data[6] #first table in the page
df.head()

In [None]:
#importing the libraries
import requests
import pandas as pd

url='https://www.worldometers.info/world-population/population-by-country/'
req=requests.get(url)
data=pd.read_html(req.text)
for tables in data:
    print(tables)
    
df=data[0] #first table in the page
df.head()

### <font color='green'>3 Import data from a webpage using webscraping

#### <font color='purple'>3.1.1 import libraries 

In [10]:
import pandas as pd
#import numpy as np
#import matplotlib.pyplot as plt
#import seaborn as sns
#%matplotlib inline
#import re
#import time
#from datetime import datetime
#import matplotlib.dates as mdates
#import matplotlib.ticker as ticker
#from urllib.request import urlopen
from bs4 import BeautifulSoup as bs
import requests

#### <font color='purple'>3.1.2 Example search Amazon best electroinc sellers and get back pages info

In [44]:
#define how many pages to scape, usually best sellers of any item in Amazon are 2 pages

no_pages = 2

def get_data(pageNo):  
   # Call teh page and get the response into a soup object
    headers = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0", "Accept-Encoding":"gzip, deflate", "Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", "DNT":"1","Connection":"close", "Upgrade-Insecure-Requests":"1"}

    r = requests.get('https://www.amazon.ca/Best-Sellers-Electronics/zgbs/electronics/ref=zg_bs_pg_'+str(pageNo)+'?_encoding=UTF8&pg='+str(pageNo), headers=headers)#, proxies=proxies)
    content = r.content
    soup = bs(content)

#loop throught the soup object and get certain tags that satisfy your needs, usuall look into the class of the tag that has the vlaue

    alls = []
    #fin the class of the tag that is common between all items
    for d in soup.findAll('div', attrs={'class':'a-section a-spacing-none aok-relative'}):
        #in that tag the childs will have the items you want
        name = d.find('span', attrs={'class':'zg-text-center-align'}) #name of teh item
        n = name.find_all('img', alt=True)
        rating = d.find('span', attrs={'class':'a-icon-alt'}) # rating out of 5
        users_rated = d.find('a', attrs={'class':'a-size-small a-link-normal'}) # total number of reviews
        price = d.find('span', attrs={'class':'p13n-sc-price'})   # the price

        all1=[]
#set the condition to get the data back
        if name is not None:
            #print(n[0]['alt'])
            all1.append(n[0]['alt'])
        else:
            all1.append("unknown-product")
        if rating is not None:
            #print(rating.text)
            all1.append(rating.text)
        else:
            all1.append('-1')

        if users_rated is not None:
            #print(price.text)
            all1.append(users_rated.text)
        else:
            all1.append('0')     

        if price is not None:
            #print(price.text)
            all1.append(price.text)
        else:
            all1.append('0')
        alls.append(all1)        
    return alls

results = []
#loop in number of pages and run the extract function
for i in range(1, no_pages+1):
    results.append(get_data(i))
#format the output and save into a dataframe
flatten = lambda l: [item for sublist in l for item in sublist]
df = pd.DataFrame(flatten(results),columns=['Name','Rating','Customers_Rated', 'Price'])

#you can save in a csv file for future use
# df.to_csv('amazon_products.csv', index=False, encoding='utf-8')

#A widget to select the rating of items to view them
select_tech1 = widgets.Dropdown(
    options=df['Rating'].unique(),
    description='Select a Page:',
    disabled=False
)
output=widgets.Output()
display(select_tech1, output)
def on_value_change(change):
    with output:
        output.clear_output()
        df1 = df[df['Rating']==select_tech1.value]
        print('You have ' + str(len(df1)) + ' products with that rating')
        print(df1.head())
#when change the value of drop down run teh function
select_tech1.observe(on_value_change, names= 'value')

Dropdown(description='Select a Page:', options=('4.7 out of 5 stars', '4.6 out of 5 stars', '4.0 out of 5 star…

Output()

In [16]:
df.head()

Unnamed: 0,Book Name,Author,Rating,Customers_Rated,Price
0,Fire TV Stick 4K streaming device with Alexa b...,0,4.7 out of 5 stars,30687,$69.99
1,Fire TV Stick (3rd Gen) with Alexa Voice Remot...,0,4.6 out of 5 stars,683,$59.99
2,Fire TV Stick Lite with Alexa Voice Remote Lit...,0,4.7 out of 5 stars,9017,$49.99
3,TP-Link AC750 WiFi Range Extender RE200 - Cove...,0,4.0 out of 5 stars,16201,$29.99
4,Seagate Portable 2TB External Hard Drive Porta...,0,4.7 out of 5 stars,114196,$79.99


#### <font color='purple'> 3.1.3 Example search Amazon for any item and return the serach data using Selemium

In [48]:
import csv
from bs4 import BeautifulSoup
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager

def extract(item):
    atag = item.h2.a
    description = atag.text.strip()
    
    url = 'https://www.amazon.com' + atag.get('href')
    try:
        price_parent = item.find('span','a-price')
        price = price_parent.find('span','a-offscreen').text
    except AttributeError:
        return
    
    try:
        rating = item.find('span',{'class': 'a-icon-alt'}).text
        num_review = item.find('span',{'class': 'a-size-base'}).text
    except:
        rating = ''
        num_review = 0
    result = (description,price,rating,num_review,url)
    
    return result  

search = widgets.Text(
    value='Watch',
    placeholder='Type something',
    description='Search Text:',
    disabled=False
)
recs = widgets.IntText(
    value=7,
    description='# of Pages:',
    disabled=False
)
print('enter the serach item you want to search in Amazon, and the number od pages to serach')
display(search, recs)

enter the serach item you want to search in Amazon, and the number od pages to serach


Text(value='Watch', description='Search Text:', placeholder='Type something')

IntText(value=7, description='# of Pages:')

In [36]:

#Create an extraction model that will retrieve the desired product information 


#Main program function where the the search and extract functions are used to apply the extraction model to the first 6 pages of amazon.
#The data extracted is formatted and added to a csv file named after the desired product. 
#search_term = input('What would you like to search Amazon for? ')
driver = webdriver.Chrome(ChromeDriverManager().install())
    
records = []
general = 'https://www.amazon.com/s?k={}&ref=nb_sb_noss_2'
new_search = (search.value).replace(' ', '+')
new = general.format(new_search)
new += '&page={}'
#print('number of pages' + str(len(new)))
for page in range(1,recs.value):
    driver.get(new.format(page))
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    results = soup.find_all('div',{'data-component-type':"s-search-result"})
        
    for i in results:
        record = extract(i)
        if record:
            record =record+(page,)
            records.append(record)
driver.close()
df = pd.DataFrame(records,columns=['description','Price','Rating','Review Count', 'URL', 'page'])
print(len(df))
df.head()

[WDM] - Current google-chrome version is 91.0.4472
[WDM] - Get LATEST driver version for 91.0.4472
[WDM] - Driver [C:\Users\nalla\.wdm\drivers\chromedriver\win32\91.0.4472.101\chromedriver.exe] found in cache


 
30


Unnamed: 0,description,Price,Rating,Review Count,URL,page
0,Beurer EM44 TENS UNIT Muscle Stimulator for Ba...,$49.43,4.6 out of 5 stars,494,https://www.amazon.com/gp/slredirect/picassoRe...,1
1,Omron M3 Medical Accessory Bp Testing Digital ...,$135.98,4.4 out of 5 stars,8,https://www.amazon.com/Omron-Medical-Accessory...,1
2,Omron Body Composition Monitor and Scale with ...,$60.18,4.2 out of 5 stars,259,https://www.amazon.com/Omron-Composition-Monit...,1
3,Omron Body Composition Monitor with Scale - 7 ...,$79.99,4.3 out of 5 stars,1520,https://www.amazon.com/Omron-Body-Composition-...,1
4,Omron Blood Pressure Monitor - M2 Classic,$83.04,4.2 out of 5 stars,26,https://www.amazon.com/Omron-Blood-Pressure-Mo...,1


In [39]:
field1 = widgets.Dropdown(
    options=df.columns,
    value=df.columns[1],
    description='Select X',
    disabled=False,
)
field2= widgets.Dropdown(
    options=df.columns,
    value=df.columns[5],
    description='Select Y',
    disabled=False,
)
display(field1, field2)
output1=widgets.Output()
output1
def on_value_change(change):
    with output1:
        output1.clear_output()

        plt.title(field1.label + " VS " + field2.label) 
        plt.xlabel(field1.label)
        plt.ylabel(field2.label)
        
#         plt.plot(df[field1.value].values,df[field2.value].values, 
#                 'o', color='red')
        plt.scatter(df[field1.value].values,df[field2.value].values,
            c=df['Price'], cmap='viridis',
            s=df['page'], linewidth=0, alpha=0.5)
        plt.axis(aspect='equal')
        plt.xlabel(field1.value)
        plt.ylabel(field2.value)
        plt.colorbar(label='log$_{10}$(df[field2.value])')
        plt.clim(3, 7)
        plt.title(field1.value+" VS" + field2.value)
        
#         fig =plt.figure(figsize=(12, 12))
#         ax=plt.axes()
#         ax.plot(daily[field1.value].values,daily[field2.value].values, 
#                 color='red', linestyle='dotted')

field1.observe(on_value_change, names= 'value')
field2.observe(on_value_change, names= 'value')

Dropdown(description='Select X', index=1, options=('description', 'Price', 'Rating', 'Review Count', 'URL', 'p…

Dropdown(description='Select Y', index=5, options=('description', 'Price', 'Rating', 'Review Count', 'URL', 'p…

## 4 Working with json files

### 4.1 Local json files

#### 4.1.1 Import libraries

In [None]:
import json

In [None]:

df = pd.read_json('owid-covid-data.json').T

df.head()

In [None]:
# with open('owid-covid-data.json','r') as inputFile:
#   tree = json.loads(inputFile.read())
# df = pd.json_normalize(tree)

### 4.2 Online json files

#### 4.2.1 Import libraries

In [None]:
import json
import requests
import pandas as pd

In [None]:
response = requests.get("https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/owid-covid-data.json")
tree = json.loads(response.text)
print(tree.keys())

In [None]:
tree = response.json()

In [None]:
for countires in tree.keys():
    #if tree.keys() =='AFG':
        print(countries['data'])
# data1 = data['data']
# data1

In [None]:
df = pd.DataFrame(tree).T
df.head()

In [None]:
r.content

In [None]:
r.status_code

In [None]:
r.raw

In [None]:
r.raw.read(10)

In [None]:
for head in r.headers:
    print(head)

In [None]:
r.headers['Content-Type']

In [None]:
r.json

In [49]:
## 4 Working with Parquet files

In [None]:
df= pd.read_parquet('userdata1.parquet')
df.head()

In [50]:
df= pd.read_parquet('https://github.com/Teradata/kylo/blob/master/samples/sample-data/parquet/userdata1.parquet?raw=true')
df.head()

Unnamed: 0,registration_dttm,id,first_name,last_name,email,gender,ip_address,cc,country,birthdate,salary,title,comments
0,2016-02-03 07:55:29,1,Amanda,Jordan,ajordan0@com.com,Female,1.197.201.2,6759521864920116.0,Indonesia,3/8/1971,49756.53,Internal Auditor,100.0
1,2016-02-03 17:04:03,2,Albert,Freeman,afreeman1@is.gd,Male,218.111.175.34,,Canada,1/16/1968,150280.17,Accountant IV,
2,2016-02-03 01:09:31,3,Evelyn,Morgan,emorgan2@altervista.org,Female,7.161.136.94,6767119071901597.0,Russia,2/1/1960,144972.51,Structural Engineer,
3,2016-02-03 00:36:21,4,Denise,Riley,driley3@gmpg.org,Female,140.35.109.83,3576031598965625.0,China,4/8/1997,90263.05,Senior Cost Accountant,
4,2016-02-03 05:05:31,5,Carlos,Burns,cburns4@miitbeian.gov.cn,,169.113.235.40,5602256255204850.0,South Africa,,,,


In [72]:
## 4 Working with pdf files

In [89]:
import PyPDF2
import tabula

In [75]:
dfList = tabula.read_pdf("https://github.com/chezou/tabula-py/raw/master/tests/resources/data.pdf", pages='all', stream=True)
len(dfList)


4

In [76]:
dfList[0]

Unnamed: 0.1,Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
0,Mazda RX4,21.0,6,160.0,110,3.9,2.62,16.46,0,1,4,4
1,Mazda RX4 Wag,21.0,6,160.0,110,3.9,2.875,17.02,0,1,4,4
2,Datsun 710,22.8,4,108.0,93,3.85,2.32,18.61,1,1,4,1
3,Hornet 4 Drive,21.4,6,258.0,110,3.08,3.215,19.44,1,0,3,1
4,Hornet Sportabout,18.7,8,360.0,175,3.15,3.44,17.02,0,0,3,2
5,Valiant,18.1,6,225.0,105,2.76,3.46,20.22,1,0,3,1
6,Duster 360,14.3,8,360.0,245,3.21,3.57,15.84,0,0,3,4
7,Merc 240D,24.4,4,146.7,62,3.69,3.19,20.0,1,0,4,2
8,Merc 230,22.8,4,140.8,95,3.92,3.15,22.9,1,0,4,2
9,Merc 280,19.2,6,167.6,123,3.92,3.44,18.3,1,0,4,4


In [82]:
df =pd.DataFrame(dfList[0])

In [87]:
df = df.rename(columns = {'Unnamed: 0':'Models'})
df

Unnamed: 0,Models,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
0,Mazda RX4,21.0,6,160.0,110,3.9,2.62,16.46,0,1,4,4
1,Mazda RX4 Wag,21.0,6,160.0,110,3.9,2.875,17.02,0,1,4,4
2,Datsun 710,22.8,4,108.0,93,3.85,2.32,18.61,1,1,4,1
3,Hornet 4 Drive,21.4,6,258.0,110,3.08,3.215,19.44,1,0,3,1
4,Hornet Sportabout,18.7,8,360.0,175,3.15,3.44,17.02,0,0,3,2
5,Valiant,18.1,6,225.0,105,2.76,3.46,20.22,1,0,3,1
6,Duster 360,14.3,8,360.0,245,3.21,3.57,15.84,0,0,3,4
7,Merc 240D,24.4,4,146.7,62,3.69,3.19,20.0,1,0,4,2
8,Merc 230,22.8,4,140.8,95,3.92,3.15,22.9,1,0,4,2
9,Merc 280,19.2,6,167.6,123,3.92,3.44,18.3,1,0,4,4


In [97]:
pdfFile = open('data.pdf', 'rb')
pdfReader = PyPDF2.PdfFileReader(pdfFile)
print(pdfReader.numPages)

3


In [96]:
pages = pdfReader.getPage(0)
page1= pages.extractText()
page1

'mpg\ncyl\ndisp\nhp\ndrat\nwt\nqsec\nvs\nam\ngear\ncarb\nMazdaRX4\n21.0\n6\n160.0\n110\n3.90\n2.620\n16.46\n0\n1\n4\n4\nMazdaRX4Wag\n21.0\n6\n160.0\n110\n3.90\n2.875\n17.02\n0\n1\n4\n4\nDatsun710\n22.8\n4\n108.0\n93\n3.85\n2.320\n18.61\n1\n1\n4\n1\nHornet4Drive\n21.4\n6\n258.0\n110\n3.08\n3.215\n19.44\n1\n0\n3\n1\nHornetSportabout\n18.7\n8\n360.0\n175\n3.15\n3.440\n17.02\n0\n0\n3\n2\nValiant\n18.1\n6\n225.0\n105\n2.76\n3.460\n20.22\n1\n0\n3\n1\nDuster360\n14.3\n8\n360.0\n245\n3.21\n3.570\n15.84\n0\n0\n3\n4\nMerc240D\n24.4\n4\n146.7\n62\n3.69\n3.190\n20.00\n1\n0\n4\n2\nMerc230\n22.8\n4\n140.8\n95\n3.92\n3.150\n22.90\n1\n0\n4\n2\nMerc280\n19.2\n6\n167.6\n123\n3.92\n3.440\n18.30\n1\n0\n4\n4\nMerc280C\n17.8\n6\n167.6\n123\n3.92\n3.440\n18.90\n1\n0\n4\n4\nMerc450SE\n16.4\n8\n275.8\n180\n3.07\n4.070\n17.40\n0\n0\n3\n3\nMerc450SL\n17.3\n8\n275.8\n180\n3.07\n3.730\n17.60\n0\n0\n3\n3\nMerc450SLC\n15.2\n8\n275.8\n180\n3.07\n3.780\n18.00\n0\n0\n3\n3\nCadillacFleetwood\n10.4\n8\n472.0\n205\n2.93\n

## 5 Working with Word files

In [4]:
from docx import Document
document = Document('data.docx')

In [75]:
dataTable = document.tables[0]
print(len(dataTable.rows),len(dataTable.columns))

33 12


In [76]:
tableCells =[]
for row in dataTable.rows:
    for cell in row.cells:
        for text in cell.paragraphs:
            tableCells.append(text.text)
len(tableCells)
tableCells[0]='Models'

In [78]:
cellsArray=np.array(tableCells)
cellsArrayReshaped=cellsArray.reshape(33,12)
df=pd.DataFrame(cellsArrayReshaped)
df.columns = df.iloc[0]
df=df.drop(df.index [ [ 0 ] ])
df

Unnamed: 0,Models,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
1,Mazda RX4,21.0,6,160.0,110,3.9,2.62,16.46,0,1,4,4
2,Mazda RX4 Wag,21.0,6,160.0,110,3.9,2.875,17.02,0,1,4,4
3,Datsun 710,22.8,4,108.0,93,3.85,2.32,18.61,1,1,4,1
4,Hornet 4 Drive,21.4,6,258.0,110,3.08,3.215,19.44,1,0,3,1
5,Hornet Sportabout,18.7,8,360.0,175,3.15,3.44,17.02,0,0,3,2
6,Valiant,18.1,6,225.0,105,2.76,3.46,20.22,1,0,3,1
7,Duster 360,14.3,8,360.0,245,3.21,3.57,15.84,0,0,3,4
8,Merc 240D,24.4,4,146.7,62,3.69,3.19,20.0,1,0,4,2
9,Merc 230,22.8,4,140.8,95,3.92,3.15,22.9,1,0,4,2
10,Merc 280,19.2,6,167.6,123,3.92,3.44,18.3,1,0,4,4


## 6 Working with ORC files

In [2]:
import findspark
from pyspark.sql import SparkSession

findspark.init()
spark = SparkSession.builder.getOrCreate()
df_spark = spark.read.orc('userdata1_orc')
df_pandas = df_spark.toPandas()

ValueError: Couldn't find Spark, make sure SPARK_HOME env is set or Spark is in an expected location (e.g. from homebrew installation).

## 6 Working with AVRO files

In [33]:
import copy
import json
import avro
from avro.datafile import DataFileReader
from avro.io import DatumReader

In [34]:
# Read data from an avro file
with open('covtypeNorm_binary.avro', 'rb') as f:
    reader = DataFileReader(f, DatumReader())
    metadata = copy.deepcopy(reader.meta)
    schema = json.loads(metadata['avro.schema'])

In [35]:
print(metadata)

{'avro.schema': b'{"type":"record","name":"Covertype","namespace":"com.yahoo.labs.samoa.avro.covertype","fields":[{"name":"Elevation","type":"double"},{"name":"Aspect","type":"double"},{"name":"Slope","type":"double"},{"name":"Horizontal_Distance_To_Hydrology","type":"double"},{"name":"Vertical_Distance_To_Hydrology","type":"double"},{"name":"Horizontal_Distance_To_Roadways","type":"double"},{"name":"Hillshade_9am","type":"double"},{"name":"Hillshade_Noon","type":"double"},{"name":"Hillshade_3pm","type":"double"},{"name":"Horizontal_Distance_To_Fire_Points","type":"double"},{"name":"Wilderness_Area1","type":"int"},{"name":"Wilderness_Area2","type":"int"},{"name":"Wilderness_Area3","type":"int"},{"name":"Wilderness_Area4","type":"int"},{"name":"Soil_Type1","type":"int"},{"name":"Soil_Type2","type":"int"},{"name":"Soil_Type3","type":"int"},{"name":"Soil_Type4","type":"int"},{"name":"Soil_Type5","type":"int"},{"name":"Soil_Type6","type":"int"},{"name":"Soil_Type7","type":"int"},{"name":"S

In [36]:
print(schema_from_file)

{'type': 'record', 'name': 'Covertype', 'namespace': 'com.yahoo.labs.samoa.avro.covertype', 'fields': [{'name': 'Elevation', 'type': 'double'}, {'name': 'Aspect', 'type': 'double'}, {'name': 'Slope', 'type': 'double'}, {'name': 'Horizontal_Distance_To_Hydrology', 'type': 'double'}, {'name': 'Vertical_Distance_To_Hydrology', 'type': 'double'}, {'name': 'Horizontal_Distance_To_Roadways', 'type': 'double'}, {'name': 'Hillshade_9am', 'type': 'double'}, {'name': 'Hillshade_Noon', 'type': 'double'}, {'name': 'Hillshade_3pm', 'type': 'double'}, {'name': 'Horizontal_Distance_To_Fire_Points', 'type': 'double'}, {'name': 'Wilderness_Area1', 'type': 'int'}, {'name': 'Wilderness_Area2', 'type': 'int'}, {'name': 'Wilderness_Area3', 'type': 'int'}, {'name': 'Wilderness_Area4', 'type': 'int'}, {'name': 'Soil_Type1', 'type': 'int'}, {'name': 'Soil_Type2', 'type': 'int'}, {'name': 'Soil_Type3', 'type': 'int'}, {'name': 'Soil_Type4', 'type': 'int'}, {'name': 'Soil_Type5', 'type': 'int'}, {'name': 'Soil_

In [37]:
# Read data from an avro file
with open('covtypeNorm_binary.avro', 'rb') as f:
    reader = DataFileReader(f, DatumReader())
    records = [record for record in reader]
    df = pd.DataFrame.from_records(records)

In [38]:
df.head()

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,...,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40,class
0,0.368684,0.141667,0.045455,0.184681,0.223514,0.071659,0.870079,0.913386,0.582677,0.875366,...,0,0,0,0,0,0,0,0,0,e
1,0.365683,0.155556,0.030303,0.151754,0.215762,0.054798,0.866142,0.925197,0.594488,0.867838,...,0,0,0,0,0,0,0,0,0,e
2,0.472736,0.386111,0.136364,0.19184,0.307494,0.446817,0.92126,0.937008,0.531496,0.853339,...,0,0,0,0,0,0,0,0,0,b
3,0.463232,0.430556,0.272727,0.173228,0.375969,0.434172,0.937008,0.937008,0.480315,0.865886,...,0,0,0,0,0,0,0,0,0,b
4,0.368184,0.125,0.030303,0.10952,0.222222,0.054939,0.866142,0.92126,0.590551,0.860449,...,0,0,0,0,0,0,0,0,0,e


In [39]:
# Read data from an avro file
with open('covtypeNorm_binary.avro', 'rb') as f:
    reader = DataFileReader(f, DatumReader())
    metadata = copy.deepcopy(reader.meta)
    schema = json.loads(metadata['avro.schema'])

In [40]:
print(metadata)

{'avro.schema': b'{"type":"record","name":"Covertype","namespace":"com.yahoo.labs.samoa.avro.covertype","fields":[{"name":"Elevation","type":"double"},{"name":"Aspect","type":"double"},{"name":"Slope","type":"double"},{"name":"Horizontal_Distance_To_Hydrology","type":"double"},{"name":"Vertical_Distance_To_Hydrology","type":"double"},{"name":"Horizontal_Distance_To_Roadways","type":"double"},{"name":"Hillshade_9am","type":"double"},{"name":"Hillshade_Noon","type":"double"},{"name":"Hillshade_3pm","type":"double"},{"name":"Horizontal_Distance_To_Fire_Points","type":"double"},{"name":"Wilderness_Area1","type":"int"},{"name":"Wilderness_Area2","type":"int"},{"name":"Wilderness_Area3","type":"int"},{"name":"Wilderness_Area4","type":"int"},{"name":"Soil_Type1","type":"int"},{"name":"Soil_Type2","type":"int"},{"name":"Soil_Type3","type":"int"},{"name":"Soil_Type4","type":"int"},{"name":"Soil_Type5","type":"int"},{"name":"Soil_Type6","type":"int"},{"name":"Soil_Type7","type":"int"},{"name":"S

In [41]:
print(schema)

{'type': 'record', 'name': 'Covertype', 'namespace': 'com.yahoo.labs.samoa.avro.covertype', 'fields': [{'name': 'Elevation', 'type': 'double'}, {'name': 'Aspect', 'type': 'double'}, {'name': 'Slope', 'type': 'double'}, {'name': 'Horizontal_Distance_To_Hydrology', 'type': 'double'}, {'name': 'Vertical_Distance_To_Hydrology', 'type': 'double'}, {'name': 'Horizontal_Distance_To_Roadways', 'type': 'double'}, {'name': 'Hillshade_9am', 'type': 'double'}, {'name': 'Hillshade_Noon', 'type': 'double'}, {'name': 'Hillshade_3pm', 'type': 'double'}, {'name': 'Horizontal_Distance_To_Fire_Points', 'type': 'double'}, {'name': 'Wilderness_Area1', 'type': 'int'}, {'name': 'Wilderness_Area2', 'type': 'int'}, {'name': 'Wilderness_Area3', 'type': 'int'}, {'name': 'Wilderness_Area4', 'type': 'int'}, {'name': 'Soil_Type1', 'type': 'int'}, {'name': 'Soil_Type2', 'type': 'int'}, {'name': 'Soil_Type3', 'type': 'int'}, {'name': 'Soil_Type4', 'type': 'int'}, {'name': 'Soil_Type5', 'type': 'int'}, {'name': 'Soil_

In [42]:
# Read data from an avro file
with open('covtypeNorm_binary.avro', 'rb') as f:
    reader = DataFileReader(f, DatumReader())
    records = [record for record in reader]
    df = pd.DataFrame.from_records(records)

In [43]:
df.head()

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,...,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40,class
0,0.368684,0.141667,0.045455,0.184681,0.223514,0.071659,0.870079,0.913386,0.582677,0.875366,...,0,0,0,0,0,0,0,0,0,e
1,0.365683,0.155556,0.030303,0.151754,0.215762,0.054798,0.866142,0.925197,0.594488,0.867838,...,0,0,0,0,0,0,0,0,0,e
2,0.472736,0.386111,0.136364,0.19184,0.307494,0.446817,0.92126,0.937008,0.531496,0.853339,...,0,0,0,0,0,0,0,0,0,b
3,0.463232,0.430556,0.272727,0.173228,0.375969,0.434172,0.937008,0.937008,0.480315,0.865886,...,0,0,0,0,0,0,0,0,0,b
4,0.368184,0.125,0.030303,0.10952,0.222222,0.054939,0.866142,0.92126,0.590551,0.860449,...,0,0,0,0,0,0,0,0,0,e


## 6 Working with YAML files

In [58]:
import pandas as pd
import yaml

with open('Ansible_Network_Facts_Demo.yml', 'r') as f:
    df = pd.io.json.json_normalize(yaml.load(f),'tasks')

df.head(20)

  """
  """


Unnamed: 0,name,eos_facts,when,ios_facts,vyos_facts,debug.msg,debug.var,run_once,copy.content,copy.dest,register,eos_config.backup,vyos_config.backup,file.path,file.state,file.recurse,copy.src
0,Gather facts (eos),,ansible_network_os == 'eos',,,,,,,,,,,,,,
1,Gather facts (ops),,ansible_network_os == 'ios',,,,,,,,,,,,,,
2,Gather facts (vyos),,ansible_network_os == 'vyos',,,,,,,,,,,,,,
3,Display some facts,,,,,The hostname is {{ ansible_net_hostname }} and...,,,,,,,,,,,
4,Facts from a specific host,,,,,,hostvars['vyos01.example.net'],,,,,,,,,,
5,Write facts to disk using a template,,,,,,,True,#jinja2: lstrip_blocks: True\nEOS device info:...,/tmp/switch-facts,,,,,,,
6,Backup switch (eos),,ansible_network_os == 'eos',,,,,,,,backup_eos_location,True,,,,,
7,backup switch (vyos),,ansible_network_os == 'vyos',,,,,,,,backup_vyos_location,,True,,,,
8,Create backup dir,,,,,,,,,,,,,/tmp/backups/{{ inventory_hostname }},directory,True,
9,Copy backup files into /tmp/backups/ (eos),,ansible_network_os == 'eos',,,,,,,/tmp/backups/{{ inventory_hostname }}/{{ inven...,,,,,,,{{ backup_eos_location.backup_path }}


In [56]:
import yaml
from yaml.loader import SafeLoader

# Open the file and load the file
with open('Ansible_Network_Facts_Demo.yml', 'r') as f:
    data = yaml.load(f, Loader=SafeLoader)
    print(data)

[{'name': 'Demonstrate connecting to switches', 'hosts': 'switches', 'gather_facts': False, 'tasks': [{'name': 'Gather facts (eos)', 'eos_facts': None, 'when': "ansible_network_os == 'eos'"}, {'name': 'Gather facts (ops)', 'ios_facts': None, 'when': "ansible_network_os == 'ios'"}, {'name': 'Gather facts (vyos)', 'vyos_facts': None, 'when': "ansible_network_os == 'vyos'"}, {'name': 'Display some facts', 'debug': {'msg': 'The hostname is {{ ansible_net_hostname }} and the OS is {{ ansible_net_version }}'}}, {'name': 'Facts from a specific host', 'debug': {'var': "hostvars['vyos01.example.net']"}}, {'name': 'Write facts to disk using a template', 'copy': {'content': "#jinja2: lstrip_blocks: True\nEOS device info:\n  {% for host in groups['eos'] %}\n  Hostname: {{ hostvars[host].ansible_net_hostname }}\n  Version: {{ hostvars[host].ansible_net_version }}\n  Model: {{ hostvars[host].ansible_net_model }}\n  Serial: {{ hostvars[host].ansible_net_serialnum }}\n  {% endfor %}\n\nIOS device info