# Basic CSV Reading

In [1]:
import pandas as pd

df = pd.read_csv('student_scores.csv')

## Viewing the first few entries

In [2]:
df.head()

Unnamed: 0,ID,Name,Attendance,HW,Test1,Project1,Test2,Project2,Final
0,27604,Joe,0.96,0.97,87.0,98.0,92.0,93.0,95.0
1,30572,Alex,1.0,0.84,92.0,89.0,94.0,92.0,91.0
2,39203,Avery,0.84,0.74,68.0,70.0,84.0,90.0,82.0
3,28592,Kris,0.96,1.0,82.0,94.0,90.0,81.0,84.0
4,27492,Rick,0.32,0.85,98.0,100.0,73.0,82.0,88.0


# Using Different Sepration values

In [3]:
df = pd.read_csv('student_scores.csv', sep=":")
df.head()

Unnamed: 0,"ID,Name,Attendance,HW,Test1,Project1,Test2,Project2,Final"
0,"27604,Joe,0.96,0.97,87.0,98.0,92.0,93.0,95.0"
1,"30572,Alex,1.0,0.84,92.0,89.0,94.0,92.0,91.0"
2,"39203,Avery,0.84,0.74,68.0,70.0,84.0,90.0,82.0"
3,"28592,Kris,0.96,1.0,82.0,94.0,90.0,81.0,84.0"
4,"27492,Rick,0.32,0.85,98.0,100.0,73.0,82.0,88.0"


Should place everything in a single column since ":" is not used as a seperator

# Specifying the head of the file

It's usually the first line but it might be good to specify a different line, like when there is extra meta information at hte top of the file

In [5]:
df = pd.read_csv("student_scores.csv", header=2)
df.head()

Unnamed: 0,30572,Alex,1.0,0.84,92.0,89.0,94.0,92.0.1,91.0
0,39203,Avery,0.84,0.74,68.0,70.0,84.0,90.0,82.0
1,28592,Kris,0.96,1.0,82.0,94.0,90.0,81.0,84.0
2,27492,Rick,0.32,0.85,98.0,100.0,73.0,82.0,88.0


## Reading CSV without a header

In [6]:
df = pd.read_csv("student_scores.csv",header=0)
df.head()

Unnamed: 0,ID,Name,Attendance,HW,Test1,Project1,Test2,Project2,Final
0,27604,Joe,0.96,0.97,87.0,98.0,92.0,93.0,95.0
1,30572,Alex,1.0,0.84,92.0,89.0,94.0,92.0,91.0
2,39203,Avery,0.84,0.74,68.0,70.0,84.0,90.0,82.0
3,28592,Kris,0.96,1.0,82.0,94.0,90.0,81.0,84.0
4,27492,Rick,0.32,0.85,98.0,100.0,73.0,82.0,88.0


# Creating custom column names

In [7]:
labels = ['id','name','attendance','hw','test1','project1','test2','project2','final']
df = pd.read_csv("student_scores.csv",names=labels)
df.head()

Unnamed: 0,id,name,attendance,hw,test1,project1,test2,project2,final
0,ID,Name,Attendance,HW,Test1,Project1,Test2,Project2,Final
1,27604,Joe,0.96,0.97,87.0,98.0,92.0,93.0,95.0
2,30572,Alex,1.0,0.84,92.0,89.0,94.0,92.0,91.0
3,39203,Avery,0.84,0.74,68.0,70.0,84.0,90.0,82.0
4,28592,Kris,0.96,1.0,82.0,94.0,90.0,81.0,84.0


## Replacing a header information with custom column names

In [10]:
labels = ['id','name','attendance','hw','test1','project1','test2','project2','final']
df = pd.read_csv('student_scores.csv',header=0,names=labels)
df.head()

Unnamed: 0,id,name,attendance,hw,test1,project1,test2,project2,final
0,27604,Joe,0.96,0.97,87.0,98.0,92.0,93.0,95.0
1,30572,Alex,1.0,0.84,92.0,89.0,94.0,92.0,91.0
2,39203,Avery,0.84,0.74,68.0,70.0,84.0,90.0,82.0
3,28592,Kris,0.96,1.0,82.0,94.0,90.0,81.0,84.0
4,27492,Rick,0.32,0.85,98.0,100.0,73.0,82.0,88.0


# Specifying the index

The default index is integers, but it can be changed to a certain column or even multiple columns

## A single column as an index

In [12]:
df = pd.read_csv("student_scores.csv",index_col='Name')
df.head()

Unnamed: 0_level_0,ID,Attendance,HW,Test1,Project1,Test2,Project2,Final
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Joe,27604,0.96,0.97,87.0,98.0,92.0,93.0,95.0
Alex,30572,1.0,0.84,92.0,89.0,94.0,92.0,91.0
Avery,39203,0.84,0.74,68.0,70.0,84.0,90.0,82.0
Kris,28592,0.96,1.0,82.0,94.0,90.0,81.0,84.0
Rick,27492,0.32,0.85,98.0,100.0,73.0,82.0,88.0


## Two columns as an index

In [14]:
df = pd.read_csv("student_scores.csv",index_col=['Name','ID'])
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Attendance,HW,Test1,Project1,Test2,Project2,Final
Name,ID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Joe,27604,0.96,0.97,87.0,98.0,92.0,93.0,95.0
Alex,30572,1.0,0.84,92.0,89.0,94.0,92.0,91.0
Avery,39203,0.84,0.74,68.0,70.0,84.0,90.0,82.0
Kris,28592,0.96,1.0,82.0,94.0,90.0,81.0,84.0
Rick,27492,0.32,0.85,98.0,100.0,73.0,82.0,88.0


# Exercise 1

Read cancer_data.csv. Use an appropiate column as the index. Use .head() on the dataframe to see intially the columns and if the column was chosen correctly

In [15]:
df_cancer = pd.read_csv('cancer_data.csv')
df_cancer.head()

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave_points_mean,...,radius_max,texture_max,perimeter_max,area_max,smoothness_max,compactness_max,concavity_max,concave_points_max,symmetry_max,fractal_dimension_max
0,842302,M,17.99,,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,25.38,,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,,0.8663,0.6869,0.2575,0.6638,0.173
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [17]:
df_cancer = pd.read_csv('cancer_data.csv',index_col='id')
df_cancer.head()

Unnamed: 0_level_0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave_points_mean,symmetry_mean,...,radius_max,texture_max,perimeter_max,area_max,smoothness_max,compactness_max,concavity_max,concave_points_max,symmetry_max,fractal_dimension_max
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
842302,M,17.99,,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,...,25.38,,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
84348301,M,11.42,20.38,77.58,386.1,,0.2839,0.2414,0.1052,0.2597,...,14.91,26.5,98.87,567.7,,0.8663,0.6869,0.2575,0.6638,0.173
84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


# Exercise 2

Read powerplant_data.csv. Create more descriptive column names based on the features:

* AT = temperature
* V = exhaust_vacuum
* AP = pressure
* RH = humidity
* PE = energy_output

In [18]:
df_powerplant = pd.read_csv('powerplant_data.csv')
df_powerplant.head()

Unnamed: 0,AT,V,AP,RH,PE
0,8.34,40.77,1010.84,90.01,480.48
1,23.64,58.49,1011.4,74.2,445.75
2,29.74,56.9,1007.15,41.91,438.76
3,19.07,49.69,1007.22,76.79,453.09
4,11.8,40.66,1017.13,97.2,464.43


In [20]:
labels = ['temperature','exhaust_vacuum','pressure','humidity','energy_output']
df_powerplant = pd.read_csv('powerplant_data.csv',header=0,names=labels)
df_powerplant.head()

Unnamed: 0,temperature,exhaust_vacuum,pressure,humidity,energy_output
0,8.34,40.77,1010.84,90.01,480.48
1,23.64,58.49,1011.4,74.2,445.75
2,29.74,56.9,1007.15,41.91,438.76
3,19.07,49.69,1007.22,76.79,453.09
4,11.8,40.66,1017.13,97.2,464.43


# Writing CSV Files

In [21]:
df_powerplant.to_csv('powerplant_data_edited.csv')

## Check if the file has been saved correctly

In [22]:
df = pd.read_csv('powerplant_data_edited.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,temperature,exhaust_vacuum,pressure,humidity,energy_output
0,0,8.34,40.77,1010.84,90.01,480.48
1,1,23.64,58.49,1011.4,74.2,445.75
2,2,29.74,56.9,1007.15,41.91,438.76
3,3,19.07,49.69,1007.22,76.79,453.09
4,4,11.8,40.66,1017.13,97.2,464.43


## Saving without the extra index

In [25]:
df_powerplant.to_csv('powerplant_data_edited.csv',index=False)

In [26]:
df = pd.read_csv('powerplant_data_edited.csv')
df.head()

Unnamed: 0,temperature,exhaust_vacuum,pressure,humidity,energy_output
0,8.34,40.77,1010.84,90.01,480.48
1,23.64,58.49,1011.4,74.2,445.75
2,29.74,56.9,1007.15,41.91,438.76
3,19.07,49.69,1007.22,76.79,453.09
4,11.8,40.66,1017.13,97.2,464.43
