# DataFrame [Documentation](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html)

In [1]:
import pandas as pd

In [2]:
df = pd.DataFrame()
print(df)

Empty DataFrame
Columns: []
Index: []


In [3]:
# Create a DataFrame from a list

data = [['Alex',10],['Bod',12],['Carlos',20]]
df = pd.DataFrame(data,columns=['Name','Age'])

print(df)


     Name  Age
0    Alex   10
1     Bod   12
2  Carlos   20


In [5]:
# Create a DataFrame from Dict of ndarrays

data = {'Name':['Alex','Bod','Carlos'], 'Age':[10,12,20]}
df = pd.DataFrame(data)
print(df)


     Name  Age
0    Alex   10
1     Bod   12
2  Carlos   20


In [8]:
#Create a DataFrame from List of Dict

data = [{'a':1, 'b':2},{'a': 5, 'b': 10, 'c':7}]
df = pd.DataFrame(data, index=['first', 'second'])
print(df)

        a   b    c
first   1   2  NaN
second  5  10  7.0


# Working with CSV file

In [11]:
data = pd.read_csv('cardio_train.csv', delimiter=';')

print(type(data))

<class 'pandas.core.frame.DataFrame'>


In [16]:
# Couting the number of columns and rows
print("Number of rows and columns:({},{})".format(data.shape[0],data.shape[1]))

data.head(10)

Number of rows and columns:(70000,13)


Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0
5,8,21914,1,151,67.0,120,80,2,2,0,0,0,0
6,9,22113,1,157,93.0,130,80,3,1,0,0,1,0
7,12,22584,2,178,95.0,130,90,3,3,0,0,1,1
8,13,17668,1,158,71.0,110,70,1,1,0,0,1,0
9,14,19834,1,164,68.0,110,60,1,1,0,0,0,0


In [19]:
#printing the labels of your columns

print(data.columns)
print(data.keys())

Index(['id', 'age', 'gender', 'height', 'weight', 'ap_hi', 'ap_lo',
       'cholesterol', 'gluc', 'smoke', 'alco', 'active', 'cardio'],
      dtype='object')
Index(['id', 'age', 'gender', 'height', 'weight', 'ap_hi', 'ap_lo',
       'cholesterol', 'gluc', 'smoke', 'alco', 'active', 'cardio'],
      dtype='object')


In [23]:
#printing specific row using iloc function
data.iloc[[1]]

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1


In [33]:
# printing value for a particular position

print("Information of row {0:d}, and column {1:s}: {2:d}".format(60,"height",data.loc[60,'height']))

# printing the first 4 columns from the second row
data.iloc[[1],4:9]

# printing the first 4 columns from the second row
columns = ["age","height","cholesterol"]
data.loc[[4],columns]

Information of row 60, and column height: 159


Unnamed: 0,age,height,cholesterol
4,17474,156,1


In [34]:
#print dataFrame information
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70000 entries, 0 to 69999
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id           70000 non-null  int64  
 1   age          70000 non-null  int64  
 2   gender       70000 non-null  int64  
 3   height       70000 non-null  int64  
 4   weight       70000 non-null  float64
 5   ap_hi        70000 non-null  int64  
 6   ap_lo        70000 non-null  int64  
 7   cholesterol  70000 non-null  int64  
 8   gluc         70000 non-null  int64  
 9   smoke        70000 non-null  int64  
 10  alco         70000 non-null  int64  
 11  active       70000 non-null  int64  
 12  cardio       70000 non-null  int64  
dtypes: float64(1), int64(12)
memory usage: 6.9 MB


In [35]:
data.nunique()

id             70000
age             8076
gender             2
height           109
weight           287
ap_hi            153
ap_lo            157
cholesterol        3
gluc               3
smoke              2
alco               2
active             2
cardio             2
dtype: int64

In [36]:
columns_list = ['gender','weight','height','smoke']
row = list(range(300))
new_data = pd.DataFrame(data,index=row, columns=columns_list)
new_data.head()

Unnamed: 0,gender,weight,height,smoke
0,2,62.0,168,0
1,1,85.0,156,0
2,1,64.0,165,0
3,2,82.0,169,0
4,1,56.0,156,0


# Doing queries with DataFrame

In [40]:
n_row = range(data.shape[0])
columns_list = ["gender","smoke"]
gender_1 = pd.DataFrame(data, index=n_row, columns=columns_list).query("gender==1").count()

gender_2 = pd.DataFrame(data, index=n_row, columns=columns_list).query("gender==2").count()

gender_1_1 = pd.DataFrame(data,index=n_row, columns=columns_list).query("gender==1 and smoke ==1").count()

gender_2_1 = pd.DataFrame(data,index=n_row, columns=columns_list).query("gender==2 and smoke ==1").count()

print("Total number of samples gerder 1: ",gender_1)
print("Total number of samples gerder 2: ",gender_2)

print("Total number of samples gerder 1 and smoke=1: ",gender_1_1)
print("Total number of samples gerder 2 and smoke=1: ",gender_2_1)


Total number of samples gerder 1:  gender    45530
smoke     45530
dtype: int64
Total number of samples gerder 2:  gender    24470
smoke     24470
dtype: int64
Total number of samples gerder 1 and smoke=1:  gender    813
smoke     813
dtype: int64
Total number of samples gerder 2 and smoke=1:  gender    5356
smoke     5356
dtype: int64
