# XLS and XLSX files

Pandas documentation: https://pandas.pydata.org/docs/index.html

Pandas API reference: https://pandas.pydata.org/docs/reference/index.html#api

In [1]:
# import pandas library
import pandas as pd

### Educational facilities in Galicia

**Source**: Portal Open Data abert@s - Xunta de Galicia

https://abertos.xunta.gal/catalogo/ensino-formacion/-/dataset/0257/centros-educativos-galicia

In [4]:
# Load the sample dataset (educational facilities in Galicia, only Coruña
# and Lugo) from an XLS workbook into a DataFrame.
# pd.read_excel reads the first sheet of the workbook by default.
# NOTE(review): the path is the workbook used by the multi-sheet example
# further down in this notebook — confirm it is the intended sample file.
df = pd.read_excel('../datasets/centros_educativos_galicia.xls')

In [5]:
# Another way of opening the file: keep the path in a variable (later cells
# reuse it) and load the data into a new DataFrame.
# The path has to point to a real Excel workbook: read_excel cannot determine
# the file format of a plain-text file and raises ValueError.
# NOTE(review): the path is the workbook used by the multi-sheet example
# further down in this notebook — confirm it is the intended sample file.
ficheiro = '../datasets/centros_educativos_galicia.xls'
df = pd.read_excel(ficheiro)

In [None]:
# Show the dataframe: a bare expression on the last line of a cell is
# rendered as the cell output.
df

In [None]:
# First rows of the dataframe (head shows 5 rows unless told otherwise)
df.head(n=5)

In [None]:
# Optional environment setup, left commented out.
# NOTE(review): pyarrow is the Parquet/Feather backend of pandas; reading
# .xls files needs xlrd and reading/writing .xlsx files needs openpyxl —
# confirm which package this cell was meant to install.
#!conda install pyarrow

In [None]:
# Quick ways to get to know a dataframe (try them one at a time):
#   df.head()      first rows
#   df.tail()      last rows
#   df.describe()  summary statistics
# sample() with no arguments draws a single random row.
df.sample(n=1)

In [None]:
# Summary of the dataframe: column names, non-null counts and data types
df.info()

In [None]:
# Show the column labels of the dataframe
df.columns

In [None]:
# Pick five names at random from the 'Nome' column
df['Nome'].sample(n=5)

In [None]:
# Municipality name and coordinates of the first 10 rows
df.head(10)[['Concello', 'COORDENADA_X', 'COORDENADA_Y']]

In [None]:
# Selecting a single column gives back a Series (check the type!)
df['Concello']
# Calling unique() on that Series gives back an array instead (check the type!)
#df['Concello'].unique()

# Look for the unique function in the Pandas API reference

In [None]:
# Count elements
# Counting the elements of a Series is not the same as counting the elements
# of the array returned by unique(): uncomment each line and compare.

#df.Concello.count()
#len(df.Concello.unique())

In [None]:
# Keep a column (a Series) in a variable of its own
concellos = df['Concello']
concellos

In [None]:
# Comparing a Series with a value yields a boolean Series, one entry per row
df['Concello'].eq('Ames')

In [None]:
# Pick rows by index label with loc (iloc would pick them by position):
#df.iloc[[0, 1, 3, 5]]
df.loc[[0, 1, 3, 5], :]

In [None]:
# Keep only the rows where the condition holds (centers in Ames)
df.loc[df['Concello'] == 'Ames']

In [None]:
# Centers whose type ('Tipo de centro') is IES:
# build the boolean mask first, then use it to filter the rows
serie_indices = df['Tipo de centro'].eq('IES')
df.loc[serie_indices]

In [None]:
# New DataFrame holding every language school (EOI)
eois = df.loc[df['Tipo de centro'] == 'EOI']
eois

In [None]:
# New DataFrame holding every language school (EOI) or conservatory (CMUS):
# isin() keeps the rows whose type matches any value of the list
eois = df[df['Tipo de centro'].isin(['EOI', 'CMUS'])]
eois

In [None]:
# All the IES located in Santiago: both conditions must hold for a row
es_ies = df['Tipo de centro'] == 'IES'
en_santiago = df['Concello'] == 'Santiago de Compostela'
df[es_ies & en_santiago]


In [None]:
# DataFrame with the name, address and center type of every center in Ferrol.
# A single .loc call filters the rows and picks the columns at once; the
# step-by-step equivalent is:
# centros_ferrol = df[df.Concello == 'Ferrol']
# centros_ferrol = centros_ferrol[['Nome','Enderezo','Tipo de centro']]
centros_ferrol = df.loc[df['Concello'] == 'Ferrol', ['Nome', 'Enderezo', 'Tipo de centro']]
centros_ferrol

In [None]:
# Pick a row by its position (0-based), whatever its index label is
#centros_ferrol.iloc[0]
centros_ferrol.iloc[1, :]

In [None]:
# Pick a row by its index label
#centros_ferrol.loc[104]
centros_ferrol.loc[105, :]

In [None]:
# Reload the data using the center code ('Código') as the index instead of
# the default integer index.
# NOTE: this rebinds df, so earlier cells that rely on integer labels
# (e.g. df.loc[[0,1,3,5]]) may fail if they are re-run after this one.
df = pd.read_excel(ficheiro, index_col='Código')
df.head(n=5)

In [None]:
# First row, picked by position
df.iloc[0, :]
# Picking a row by its label (the center code) would look like this:
#df.loc[15000016]

In [None]:
# First four rows: positions 0 to 3 (the end of the slice is excluded)
df.iloc[:4]

In [None]:
# Last row, kept as a one-row DataFrame (note the list inside iloc)
df.iloc[[len(df) - 1]]
#df.tail(1)

In [None]:
# Last two rows
df.tail(2)

In [None]:
# Center with code 15033204, looked up through the index.
# Passing the label inside a list returns a DataFrame rather than a Series.
df.loc[[15033204], :]

In [None]:
# Several centers at once: 15033150 and 15033162
codigos = [15033150, 15033162]
df.loc[codigos]

In [None]:
# Sometimes we choose to use loc and sometimes iloc
# loc <- works with index labels
# iloc <- works with positions

In [None]:
# Export the CEIP centers to an Excel workbook
centros_ceip = df.loc[df['Tipo de centro'] == 'CEIP']
#centros_ceip.head()
# Legacy .xls target, kept for reference:
#centros_ceip.to_excel('output/ceip_ferrol.xls')
centros_ceip.to_excel(excel_writer='output/ceip_ferrol.xlsx')

In [None]:
# Same export, this time giving the worksheet a name
centros_ceip.to_excel(
    'output/ceip_ferrol.xlsx',
    sheet_name='A Coruña',
)

In [None]:
# A workbook can hold several sheets, so the sheets to load are chosen when
# the file is opened. Passing a list of sheet names returns a dict that maps
# each name to its DataFrame.
ficheiro = '../datasets/centros_educativos_galicia.xls'
follas = pd.read_excel(ficheiro, sheet_name=['A Coruña', 'Lugo'])
df_corunha = follas['A Coruña']
df_lugo = follas['Lugo']

In [None]:
# One random row from each sheet is enough to check that it loaded
df_corunha.sample(n=1)
#df_lugo.sample(n=1)

In [None]:
# Stack both provinces into one DataFrame (works like concatenating Series).
# Each frame keeps its own row labels, so labels can repeat in the result.
provincias = [df_corunha, df_lugo]
df_total = pd.concat(provincias)
df_total

In [None]:
# Save the CPR centers of both provinces into a single file, keeping only
# these fields: code, name, address, municipality, province, postal code
# and phone.
columnas_cpr = ['Código', 'Nome', 'Enderezo', 'Concello', 'Provincia', 'Cód. postal', 'Teléfono']
centros_cpr = df_total.loc[df_total['Tipo de centro'] == 'CPR', columnas_cpr]
# to_excel's `index` is a boolean (write the row labels or not), not a column
# name: a string such as 'Código' is just truthy and adds the row labels as an
# extra unnamed column. 'Código' is already exported as a regular column, so
# the row labels are left out to keep exactly the fields listed above.
centros_cpr.to_excel('output/centros_CPR.xlsx', sheet_name='CPR', index=False)