# 5. Read CSV Data using Pandas


In [1]:
import time
import pandas as pd

# Baseline: load the entire CSV in a single pd.read_csv call (default parser)
# and time only the read itself.
start_time = time.time()

data= pd.read_csv(r"D:\DANotes\MavenA\Data Playground\anime\final_animedataset.csv")

print("Read csv without chunks: ", time.time() - start_time, ' seconds')


print("\n--- HEAD ---")
print(data.head())
print("\n--- DESCRIBE ---")
print(data.describe())
print("\n--- INFO ---")
# DataFrame.info() writes its report to stdout and returns None, so wrapping
# it in print() would append a stray "None" line after the report.
data.info()

Read csv without chunks:  207.8431956768036  seconds

--- HEAD ---
   username  anime_id  my_score  user_id  gender          title type source  \
0  karthiga        21         9  2255153  Female      One Piece   TV  Manga   
1  karthiga        59         7  2255153  Female        Chobits   TV  Manga   
2  karthiga        74         7  2255153  Female   Gakuen Alice   TV  Manga   
3  karthiga       120         7  2255153  Female  Fruits Basket   TV  Manga   
4  karthiga       178         7  2255153  Female   Ultra Maniac   TV  Manga   

   score  scored_by    rank  popularity  \
0   8.54     423868    91.0          35   
1   7.53     175388  1546.0         188   
2   7.77      33244   941.0        1291   
3   7.77     167968   939.0         222   
4   7.26       9663  2594.0        2490   

                                               genre  
0  Action, Adventure, Comedy, Super Power, Drama,...  
1      Sci-Fi, Comedy, Drama, Romance, Ecchi, Seinen  
2                Comedy, School, S

## Importing data in chunks

In [5]:
start = time.time()
# Read the data in chunks of 1,000,000 rows at a time; with chunksize set,
# read_csv returns an iterator of DataFrames instead of a single DataFrame.
chunk = pd.read_csv(r"D:\DANotes\MavenA\Data Playground\anime\final_animedataset.csv",chunksize=1000000)

# Consuming the iterator is what actually parses the file, so the concat
# belongs inside the timed region.
pd_df = pd.concat(chunk)

# Stop the clock right after loading so the figure measures only the read,
# matching how the un-chunked baseline is timed (inspection is excluded).
end = time.time()
print("Read csv with chunks : ",(end-start),"sec")

print("\n--- HEAD ---")
print(pd_df.head())
print("\n--- DESCRIBE ---")
print(pd_df.describe())
print("\n--- INFO ---")
# DataFrame.info() prints its report itself and returns None; wrapping it in
# print() would emit an extra "None" line.
pd_df.info()


--- HEAD ---
   username  anime_id  my_score  user_id  gender          title type source  \
0  karthiga        21         9  2255153  Female      One Piece   TV  Manga   
1  karthiga        59         7  2255153  Female        Chobits   TV  Manga   
2  karthiga        74         7  2255153  Female   Gakuen Alice   TV  Manga   
3  karthiga       120         7  2255153  Female  Fruits Basket   TV  Manga   
4  karthiga       178         7  2255153  Female   Ultra Maniac   TV  Manga   

   score  scored_by    rank  popularity  \
0   8.54     423868    91.0          35   
1   7.53     175388  1546.0         188   
2   7.77      33244   941.0        1291   
3   7.77     167968   939.0         222   
4   7.26       9663  2594.0        2490   

                                               genre  
0  Action, Adventure, Comedy, Super Power, Drama,...  
1      Sci-Fi, Comedy, Drama, Romance, Ecchi, Seinen  
2                Comedy, School, Shoujo, Super Power  
3  Slice of Life, Comedy, Drama,

## Using Dask to emulate Pandas

In [6]:
from dask import dataframe as dd

start = time.time()
dask_df = dd.read_csv(r"D:\DANotes\MavenA\Data Playground\anime\final_animedataset.csv")
end = time.time()
# NOTE: dd.read_csv is lazy - this first figure covers building the task
# graph, not parsing the whole file, so it is not directly comparable
# with the pandas read times.
print("Read csv with dask: ",(end-start),"sec")

print("\n--- HEAD ---")
print(dask_df.head())
print("\n--- DESCRIBE ---")
# describe() on a Dask DataFrame returns another lazy collection; compute()
# is required to obtain (and print) the actual summary statistics.
print(dask_df.describe().compute())
print("\n--- INFO ---")
# info() prints its summary itself and returns None; wrapping it in print()
# would emit an extra "None" line.
dask_df.info()

end = time.time()
# Labelled separately from the first figure: this one includes the real work
# triggered by head(), describe().compute() and info().
print("Read csv with dask (including head/describe/info): ",(end-start),"sec")

Read csv with dask:  0.6047458648681641 sec

--- HEAD ---
   username  anime_id  my_score  user_id  gender          title type source  \
0  karthiga        21         9  2255153  Female      One Piece   TV  Manga   
1  karthiga        59         7  2255153  Female        Chobits   TV  Manga   
2  karthiga        74         7  2255153  Female   Gakuen Alice   TV  Manga   
3  karthiga       120         7  2255153  Female  Fruits Basket   TV  Manga   
4  karthiga       178         7  2255153  Female   Ultra Maniac   TV  Manga   

   score  scored_by    rank  popularity  \
0   8.54     423868    91.0          35   
1   7.53     175388  1546.0         188   
2   7.77      33244   941.0        1291   
3   7.77     167968   939.0         222   
4   7.26       9663  2594.0        2490   

                                               genre  
0  Action, Adventure, Comedy, Super Power, Drama,...  
1      Sci-Fi, Comedy, Drama, Romance, Ecchi, Seinen  
2                Comedy, School, Shoujo, Su

In [8]:
# Print the Dask DataFrame summary (class, column count/range, dtype counts).
# Relies on `dask_df` created by the Dask read cell earlier in the notebook.
dask_df.info()

<class 'dask.dataframe.core.DataFrame'>
Columns: 13 entries, username to genre
dtypes: object(6), float64(2), int64(5)

## PyArrow

The PyArrow engine for `pd.read_csv` (`engine="pyarrow"`) is still marked as experimental, and it doesn’t support all the features of the default parser—but it is intended to be faster.

In [3]:
import time
import pandas as pd

# Load the CSV with pandas' PyArrow parsing engine and time only the read.
start = time.time()
df_pyarrow = pd.read_csv(r"D:\DANotes\MavenA\Data Playground\anime\final_animedataset.csv", engine="pyarrow")
end = time.time()
# The label names the engine actually being benchmarked in this cell (the
# previous text was copied from the Dask cell).
print("Read csv with pyarrow: ",(end-start),"sec")

print("\n--- HEAD ---")
print(df_pyarrow.head())
print("\n--- DESCRIBE ---")
print(df_pyarrow.describe())
print("\n--- INFO ---")
# DataFrame.info() prints its report itself and returns None; wrapping it in
# print() would emit an extra "None" line.
df_pyarrow.info()

end = time.time()
# Second figure also covers the head/describe/info inspection above, so it
# gets its own label to keep it distinct from the read-only time.
print("Read csv with pyarrow (including head/describe/info): ",(end-start),"sec")

Read csv with dask:  505.8782207965851 sec

--- HEAD ---
   username  anime_id  my_score  user_id  gender          title type source  \
0  karthiga        21         9  2255153  Female      One Piece   TV  Manga   
1  karthiga        59         7  2255153  Female        Chobits   TV  Manga   
2  karthiga        74         7  2255153  Female   Gakuen Alice   TV  Manga   
3  karthiga       120         7  2255153  Female  Fruits Basket   TV  Manga   
4  karthiga       178         7  2255153  Female   Ultra Maniac   TV  Manga   

   score  scored_by    rank  popularity  \
0   8.54     423868    91.0          35   
1   7.53     175388  1546.0         188   
2   7.77      33244   941.0        1291   
3   7.77     167968   939.0         222   
4   7.26       9663  2594.0        2490   

                                               genre  
0  Action, Adventure, Comedy, Super Power, Drama,...  
1      Sci-Fi, Comedy, Drama, Romance, Ecchi, Seinen  
2                Comedy, School, Shoujo, Sup