### Creating DataFrames

In [1]:
import pandas as pd 
import numpy as np 

# Folder that holds the dataset files. File names are appended to this string
# with plain concatenation, so the trailing slash must stay.
file_dir = "./P87-S2-Dataset-Basics-Resources/"

In [3]:
# Draw from a dedicated, seeded generator so the printed array and the
# DataFrame are identical on every Restart & Run All.
rng = np.random.default_rng(seed=0)

# 5 rows x 3 columns of uniform samples from [0, 1).
data = rng.random(size=(5, 3))
print(data)

# Wrap the 2-D array in a DataFrame with explicit column labels; the row index
# defaults to 0..4.
df = pd.DataFrame(data=data, columns=["A", "B", "C"])

df

[[0.93066976 0.15450837 0.47496532]
 [0.08966814 0.87447445 0.17657516]
 [0.84085539 0.33522662 0.12893258]
 [0.34807179 0.35541736 0.9589612 ]
 [0.20416297 0.62299069 0.15235031]]


Unnamed: 0,A,B,C
0,0.93067,0.154508,0.474965
1,0.089668,0.874474,0.176575
2,0.840855,0.335227,0.128933
3,0.348072,0.355417,0.958961
4,0.204163,0.622991,0.15235


In [4]:
# Build a frame from a column-name -> values mapping: each key becomes a
# column, and the lists supply that column's rows in order.
df = pd.DataFrame.from_dict(
    {
        "A": [1, 2, 3],
        "B": ["Sam", "Alex", "John"],
    }
)

df

Unnamed: 0,A,B
0,1,Sam
1,2,Alex
2,3,John


In [10]:
# Structured dtype: an integer field "A" and a text field "B" holding up to
# 20 characters.
dtype = [("A", int), ("B", (str, 20))]
data = np.array(
    [
        (1, "Sam"),
        (2, "Alex"),
        (3, "John"),
    ],
    dtype=dtype,
)

# The record fields of the array become the DataFrame's columns.
df = pd.DataFrame.from_records(data)
df

Unnamed: 0,A,B
0,1,Sam
1,2,Alex
2,3,John


### Saving and Serialising a DataFrame

In [11]:
# Seeded generator: the frame, and therefore any file written from it, is the
# same on every run.
rng = np.random.default_rng(seed=0)

# 100000 rows x 4 columns of uniform samples from [0, 1); large enough for the
# different output formats to differ noticeably.
df = pd.DataFrame(rng.random(size=(100000, 4)), columns=["A", "B", "C", "D"])

df.head()

Unnamed: 0,A,B,C,D
0,0.37992,0.434402,0.537697,0.103735
1,0.406891,0.409102,0.48995,0.904801
2,0.024077,0.060613,0.236212,0.555531
3,0.394593,0.790248,0.411609,0.107437
4,0.858013,0.045608,0.390434,0.494062


In [13]:
# Write the frame as CSV text: no index column, and every float rendered with
# four digits after the decimal point.
df.to_csv(
    "save.csv",
    float_format="%0.4f",
    index=False,
)

In [14]:
# Serialise the whole DataFrame object to a pickle file in the working directory.
df.to_pickle("save.pkl")

In [15]:
# Store the frame in an HDF5 file under the key "data", using the "table" format.
# NOTE(review): HDF5 support presumably needs the optional PyTables package;
# confirm it is installed in this environment.
df.to_hdf("save.hdf", key="data", format="table")

In [17]:
# Load the astronauts CSV from the dataset folder. From this point on `df`
# refers to this table, not the random frame created earlier.
df = pd.read_csv(filepath_or_buffer=file_dir + "astronauts.csv")

In [19]:
# Peek at the last five rows of the table.
df.tail(n=5)

Unnamed: 0,Name,Year,Group,Status,Birth Date,Birth Place,Gender,Alma Mater,Undergraduate Major,Graduate Major,Military Rank,Military Branch,Space Flights,Space Flight (hr),Space Walks,Space Walks (hr),Missions,Death Date,Death Mission
352,David A. Wolf,1990.0,13.0,Retired,8/23/1956,"Indianapolis, IN",Male,Purdue University; Indiana University,Electrical Engineering,Medicine,,,3,4044,7,41.0,STS-58 (Columbia). STS-86/89 (Atlantis/Endeavo...,,
353,Neil W. Woodward III,1998.0,17.0,Retired,7/26/1962,"Chicago, IL",Male,MIT; University of Texas-Austin; George Washin...,Physics,Physics; Business Management,Commander,US Navy,0,0,0,0.0,,,
354,Alfred M. Worden,1966.0,5.0,Retired,2/7/1932,"Jackson, MI",Male,US Military Academy; University of Michigan,Military Science,Aeronautical & Astronautical Engineering,Colonel,US Air Force (Retired),1,295,1,0.5,Apollo 15,,
355,John W. Young,1962.0,2.0,Retired,9/24/1930,"San Francisco, CA",Male,Georgia Institute of Technology,Aeronautical Engineering,,Captain,US Navy (Retired),6,835,3,20.0,"Gemini 3, Gemini 10, Apollo 10, Apollo 16, STS...",,
356,George D. Zamka,1998.0,17.0,Retired,6/29/1962,"Jersey City, NJ",Male,US Naval Academy; Florida Institute of Technology,Mathematics,Engineering Management,Colonel,US Marine Corps (Retired),2,692,0,0.0,"STS-120 (Discovery), STS-130 (Endeavor)",,


In [20]:
# Structural overview: index range, then each column's non-null count and dtype.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 357 entries, 0 to 356
Data columns (total 19 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Name                 357 non-null    object 
 1   Year                 330 non-null    float64
 2   Group                330 non-null    float64
 3   Status               357 non-null    object 
 4   Birth Date           357 non-null    object 
 5   Birth Place          357 non-null    object 
 6   Gender               357 non-null    object 
 7   Alma Mater           356 non-null    object 
 8   Undergraduate Major  335 non-null    object 
 9   Graduate Major       298 non-null    object 
 10  Military Rank        207 non-null    object 
 11  Military Branch      211 non-null    object 
 12  Space Flights        357 non-null    int64  
 13  Space Flight (hr)    357 non-null    int64  
 14  Space Walks          357 non-null    int64  
 15  Space Walks (hr)     357 non-null    flo

In [21]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,Year,Group,Space Flights,Space Flight (hr),Space Walks,Space Walks (hr)
count,330.0,330.0,357.0,357.0,357.0,357.0
mean,1985.106061,11.409091,2.364146,1249.266106,1.246499,7.707283
std,13.216147,5.149962,1.4287,1896.759857,2.056989,13.367973
min,1959.0,1.0,0.0,0.0,0.0,0.0
25%,1978.0,8.0,1.0,289.0,0.0,0.0
50%,1987.0,12.0,2.0,590.0,0.0,0.0
75%,1996.0,16.0,3.0,1045.0,2.0,12.0
max,2009.0,20.0,7.0,12818.0,10.0,67.0


In [22]:
# Pairwise correlation between the numeric columns.
# The numeric columns are selected explicitly because pandas >= 2.0 no longer
# drops text columns (e.g. "Name") automatically and raises ValueError instead.
# On older versions the resulting matrix is unchanged.
df.select_dtypes(include="number").corr()

Unnamed: 0,Year,Group,Space Flights,Space Flight (hr),Space Walks,Space Walks (hr)
Year,1.0,0.980934,0.03642,0.331386,0.210073,0.253502
Group,0.980934,1.0,-0.011386,0.325683,0.217891,0.261384
Space Flights,0.03642,-0.011386,1.0,0.325233,0.257073,0.258642
Space Flight (hr),0.331386,0.325683,0.325233,1.0,0.472796,0.454408
Space Walks,0.210073,0.217891,0.257073,0.472796,1.0,0.985755
Space Walks (hr),0.253502,0.261384,0.258642,0.454408,0.985755,1.0


In [25]:
# Tally how many rows share each Year value, most frequent first. Rows whose
# Year is missing are not counted.
df["Year"].value_counts()

1996.0    35
1978.0    35
1998.0    25
1990.0    23
1966.0    19
1995.0    19
1992.0    19
1980.0    19
1984.0    18
2000.0    17
1987.0    15
1963.0    14
1985.0    13
2004.0    11
1967.0    11
2009.0     9
1962.0     8
1959.0     7
1969.0     7
1965.0     6
Name: Year, dtype: int64

In [26]:
# Column-wise maximum (largest number, or the alphabetically last string).
# A text column containing NaN mixes str and float values, which cannot be
# ordered. Older pandas silently skipped such columns; pandas >= 2.0 raises
# TypeError. Restrict the reduction to columns that can be compared: every
# numeric column (NaN is skipped there) plus any column with no missing values.
has_no_gaps = df.notna().all()
is_numeric = df.columns.isin(df.select_dtypes(include="number").columns)
df.loc[:, has_no_gaps | is_numeric].max()

Name                 Yvonne D. Cagle
Year                          2009.0
Group                           20.0
Status                       Retired
Birth Date                  9/9/1952
Birth Place              Yonkers, NY
Gender                          Male
Space Flights                      7
Space Flight (hr)              12818
Space Walks                       10
Space Walks (hr)                67.0
dtype: object