# Import libraries

In [1]:
import pandas as pd
import zipfile

# Opening a .tsv File

## In native python

In [4]:
tiny_tsv = []

# Each line of the file is one record: strip its trailing newline, then split
# it on tabs so the record becomes a list of string fields.
with open("tiny_tsv.tsv", "r") as tsv_file:
    tiny_tsv.extend(row.rstrip("\n").split("\t") for row in tsv_file)

tiny_tsv

[['rank', 'name', 'test_type'],
 ['1', 'Adam', 'IQ'],
 ['2', 'Maria', 'EQ'],
 ['3', 'Jenny', 'IQ'],
 ['4', 'Amira', 'IQ'],
 ['5', 'Ola', 'EQ']]

## In pandas

In [6]:
# The file is tab-separated, so the field delimiter has to be given explicitly
# (read_csv splits on commas by default).
df_tsv = pd.read_csv(
    filepath_or_buffer="tiny_tsv.tsv",
    delimiter="\t",
)
df_tsv

Unnamed: 0,rank,name,test_type
0,1,Adam,IQ
1,2,Maria,EQ
2,3,Jenny,IQ
3,4,Amira,IQ
4,5,Ola,EQ


# Open a .csv file from a zipped archive

## Using pandas with zip file containing a single file

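Reading the archive with pandas directly, without specifying exactly how the file should be parsed, leads to the csv file being read incorrectly: the file is semicolon-delimited, so every row ends up in a single column.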

In [10]:
# Read the single-file archive with default parsing options; pandas infers the
# zip compression from the file extension. No delimiter is given on purpose,
# to show what the default parsing produces.
direct_df = pd.read_csv(
    'tiny_csv.zip',
    compression="infer",
)
direct_df

Unnamed: 0,rank;name;test_type
0,1;Adam;IQ
1,2;Maria;EQ
2,3;Jenny;IQ
3,4;Amira;IQ
4,5;Ola;EQ
5,6;Jerry;No Test Taken
6,7;Kai;EQ


## Using pandas with zip file containing multiple files

This option will fail, because pandas can only read a ZIP archive directly when it contains exactly one file.

In [11]:
# pandas refuses to read a ZIP archive that holds more than one file and raises
# ValueError. The error is the point of this demonstration, so it is caught and
# its message shown; an uncaught exception would stop "Restart & Run All" here.
try:
    pd.read_csv("multi_tiny.zip")
except ValueError as err:
    print(f"ValueError: {err}")

ValueError: Multiple files found in ZIP file. Only one file per ZIP: ['tiny_csv.csv', 'tiny_tsv.tsv']

### Instead, the .zip file should be unzipped first

In [12]:
# Unpack every member of the archive into the ./tiny_csv/ directory so the
# files inside can be read individually.
with zipfile.ZipFile(file='./multi_tiny.zip', mode='r') as zip_ref:
    zip_ref.extractall(path="./tiny_csv/")

### Now the csv file can be read

In [13]:
# Read the extracted csv file with pandas' default parsing options
# (no delimiter, index or dtypes specified yet).
indirect_df = pd.read_csv(
    filepath_or_buffer='./tiny_csv/tiny_csv.csv',
)
indirect_df

Unnamed: 0,rank;name;test_type
0,1;Adam;IQ
1,2;Maria;EQ
2,3;Jenny;IQ
3,4;Amira;IQ
4,5;Ola;EQ


## Specifying csv options

The following needs to be done so that the data is read appropriately:

* The delimiter needs to be set to ";"
* The header row needs to be set to the first line of the file, done by specifying it as 0 (this is also the pandas default)
* The data type of each individual column needs to be specified
* The rank column should be specified as the index
*  "No Test Taken" values should be read as Not available (NaNs)

*The same options can be applied in `Using pandas with zip file containing a single file`*

In [18]:
# Parse the extracted file with explicit options instead of pandas' defaults.
indirect_df = pd.read_csv(
    './tiny_csv/tiny_csv.csv',
    sep=";",                    # fields are separated by semicolons
    header=0,                   # the first line holds the column names
    index_col='rank',           # use the rank column as the row index
    # na_values matching is case-sensitive, so the sentinel must be spelled
    # exactly as it appears in the data ("No Test Taken").
    na_values='No Test Taken',
    dtype={'rank': 'int',
           "name": "string",
           "test_type": 'string'},
)

indirect_df

Unnamed: 0_level_0,name,test_type
rank,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Adam,IQ
2,Maria,EQ
3,Jenny,IQ
4,Amira,IQ
5,Ola,EQ
