In [59]:
#Now we will look at reading data from text formats
#CSV files are the most commonly used, because their row/column layout maps directly onto a pandas DataFrame
import pandas as pd
from pandas import Series, DataFrame
import numpy as np

In [60]:
#We can use the read_csv function to load the contents of a csv file into a DataFrame
#The first line of the file is used as the column headers
df = pd.read_csv('Data Files/fields.csv')
df

Unnamed: 0,ID,Name,Field
0,1,Adam Jones,Electrical
1,2,Edward Elrich,Mechanical
2,3,Stain Steve,Computer Science
3,4,Ken Adams,Media Science
4,5,Ross Taylor,Sportsman


In [61]:
#Another option is the read_table function
#read_table assumes tab-separated values by default, so sep=',' is passed to parse a comma-separated file

df2 = pd.read_table('Data Files/fields.csv', sep=',')
df2

Unnamed: 0,ID,Name,Field
0,1,Adam Jones,Electrical
1,2,Edward Elrich,Mechanical
2,3,Stain Steve,Computer Science
3,4,Ken Adams,Media Science
4,5,Ross Taylor,Sportsman


In [62]:
#If the file has no header row, pass header=None and read_csv assigns default integer column names (0, 1, 2, ...)
df3 = pd.read_csv('Data Files/no_cols.csv',header=None) #header=None tells pandas that the first line of the csv is data, not a header
df3

Unnamed: 0,0,1,2
0,1,Adam Jones,Electrical
1,2,Edward Elrich,Mechanical
2,3,Stain Steve,Computer Science
3,4,Ken Adams,Media Science
4,5,Ross Taylor,Sportsman


In [63]:
#We can supply our own column headers for such header-less csvs using the names argument
df3 = pd.read_csv('Data Files/no_cols.csv', names=['ID','Name','Field'])
df3

Unnamed: 0,ID,Name,Field
0,1,Adam Jones,Electrical
1,2,Edward Elrich,Mechanical
2,3,Stain Steve,Computer Science
3,4,Ken Adams,Media Science
4,5,Ross Taylor,Sportsman


In [64]:
#Suppose we want the ID column to be the index of the returned dataframe; we pass its name to index_col
df4 = pd.read_csv('Data Files/fields.csv', index_col='ID')
df4

Unnamed: 0_level_0,Name,Field
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Adam Jones,Electrical
2,Edward Elrich,Mechanical
3,Stain Steve,Computer Science
4,Ken Adams,Media Science
5,Ross Taylor,Sportsman


In [65]:
#Suppose we want to present csv data as a hierarchically indexed dataframe
#Passing a list of column names to index_col uses those columns as the index levels (here Field, then ID)
df5 = pd.read_csv('Data Files/new_fields.csv', index_col=['Field','ID'])
#'Field' is an index level here, so this orders the rows by that level
df5.sort_values(by='Field')

Unnamed: 0_level_0,Unnamed: 1_level_0,Name
Field,ID,Unnamed: 2_level_1
Computer Science,3,Edward
Computer Science,6,Steve
Electrical,1,Adam
Electrical,5,Stain
Mechanical,2,Jones
Mechanical,4,Elrich


In [66]:
#Suppose the delimiter used in a particular file is not known in advance
#Inspect the raw lines first; the with-block closes the file handle as soon as the lines are read
with open('Data Files/sep_fields.txt') as f:
    raw_lines = list(f)
raw_lines

['Name|Field\n',
 '1|Adam Jones|Electrical\n',
 '2|Edward Elrich|Mechanical\n',
 '3|Stain Steve|Computer Science\n',
 '4|Ken Adams|Media Science\n',
 '5|Ross Taylor|Sportsman']

In [67]:
#Here it is clear that the values are separated by a |, thus we can pass the delimiter '|' to read_table
df6 = pd.read_table('Data Files/sep_fields.txt', delimiter='|')
df6

Unnamed: 0,Name,Field
1,Adam Jones,Electrical
2,Edward Elrich,Mechanical
3,Stain Steve,Computer Science
4,Ken Adams,Media Science
5,Ross Taylor,Sportsman


In [68]:
#In the above case, the header row had one name fewer than the number of values in each data row, so the first column of values was used as the index by default

#Now consider this dataset
#Read the raw lines inside a with-block so the file handle is closed once they are read
with open('Data Files/interupt_fields.csv') as f:
    raw_lines = list(f)
raw_lines

['Hello there!\n',
 'This dataset contains the names and fields of various students \n',
 'ID,Name,Field\n',
 'Ooops! I mean not only students but professionals as well\n',
 '1,Adam Jones,Electrical\n',
 '2,Edward Elrich,Mechanical\n',
 'Ohh!! And i forgot to mention... oops i forgot about it\n',
 '3,Stain Steve,Computer Science\n',
 'Ahh yes,i remember now, but first seek the last two entries info\n',
 '4,Ken Adams,Media Science\n',
 '5,Ross Taylor,Sportsman\n',
 'Well, how are you gonna get your data nowwwww']

In [69]:
#The given data is interrupted by meaningless lines; we can drop them by listing their line numbers in the skiprows argument
#Line numbers are 0-based: 0, 1, 3, 6, 8 and 11 are the commentary lines, leaving the header (line 2) and the five data rows
df7 = pd.read_csv('Data Files/interupt_fields.csv', skiprows=[0,1,3,6,8,11])
df7

Unnamed: 0,ID,Name,Field
0,1,Adam Jones,Electrical
1,2,Edward Elrich,Mechanical
2,3,Stain Steve,Computer Science
3,4,Ken Adams,Media Science
4,5,Ross Taylor,Sportsman


In [70]:
#One fundamental concept during file parsing is the handling of missing data
#Consider a file containing nuisance values (empty fields and a '-' placeholder)

#Read the raw lines inside a with-block so the file handle is closed once they are read
with open('Data Files/none_fields.csv') as f:
    raw_lines = list(f)
raw_lines

['ID,Name,Field\n',
 '1,Adam Jones,Electrical\n',
 '2,Edward Elrich,-\n',
 '3,Ken Adams,\n',
 '4,Ross Taylor,Sportsman\n',
 ',Stain Steve,']

In [71]:
#With default settings, empty fields are read as missing values, while the '-' placeholder is kept as an ordinary string
#The ID column is shown as floats (1.0, 2.0, ...) because it contains a missing entry
df8 = pd.read_csv('Data Files/none_fields.csv')
df8

Unnamed: 0,ID,Name,Field
0,1.0,Adam Jones,Electrical
1,2.0,Edward Elrich,-
2,3.0,Ken Adams,
3,4.0,Ross Taylor,Sportsman
4,,Stain Steve,


In [72]:
#isnull marks each missing cell with True; the '-' entry is still reported as False at this point
df8.isnull()

Unnamed: 0,ID,Name,Field
0,False,False,False
1,False,False,False
2,False,False,True
3,False,False,False
4,True,False,True


In [73]:
#In this dataset the '-' value also stands for missing data, so we tell read_csv to treat it as a missing value via na_values:
df8 = pd.read_csv('Data Files/none_fields.csv', na_values=['-'])
df8

Unnamed: 0,ID,Name,Field
0,1.0,Adam Jones,Electrical
1,2.0,Edward Elrich,
2,3.0,Ken Adams,
3,4.0,Ross Taylor,Sportsman
4,,Stain Steve,


In [75]:
#na_values also accepts a dict mapping column names to their own missing-value markers
#Here '-' is treated as missing only in the Field column, which gives the same result for this file
df8 = pd.read_csv('Data Files/none_fields.csv', na_values = {'Field': ['-']})
df8

Unnamed: 0,ID,Name,Field
0,1.0,Adam Jones,Electrical
1,2.0,Edward Elrich,
2,3.0,Ken Adams,
3,4.0,Ross Taylor,Sportsman
4,,Stain Steve,
