## Pandas Dataframe 

In [1]:
import pandas as pd
import numpy as np
import sys

In [4]:
df = pd.DataFrame([])
df

In [3]:
mylist = [1, 2, 3]
df = pd.DataFrame(mylist)
df

Unnamed: 0,0
0,1
1,2
2,3


In [5]:
df = pd.DataFrame((1, 2, 3))
df

Unnamed: 0,0
0,1
1,2
2,3


In [6]:
df = pd.DataFrame({1, 2, 3})
df

Unnamed: 0,0
0,1
1,2
2,3


In [7]:
df = pd.DataFrame({'a':1, 'b':2, 'c':3})
df

ValueError: If using all scalar values, you must pass an index

In [None]:
df = pd.DataFrame({'a':[1], 'b':[2], 'c':[3]}) # common
df

Unnamed: 0,a,b,c
0,1,2,3


In [9]:
df = pd.DataFrame({'a':1, 'b':2, 'c':3}, index = ['a','b','c'])
df

Unnamed: 0,a,b,c
a,1,2,3
b,1,2,3
c,1,2,3


In [None]:
df = pd.DataFrame({'a':1, 'b':2, 'c':3}, index = ['c','b','a'])
df

Unnamed: 0,a,b,c
c,1,2,3
b,1,2,3
a,1,2,3


In [11]:
df = pd.DataFrame({'a':1, 'b':2, 'c':3}, index = ['a','B','c'])
df

Unnamed: 0,a,b,c
a,1,2,3
B,1,2,3
c,1,2,3


In [19]:
my_df = pd.DataFrame(
    {
        "names": ("Ramesh", "Suresh", "Ganesh", "Mahesh"),
        "ages": [29, 54, 66, 22],
        'salary':tuple({1200, 1300, 1400, 1500})
    }
) 
my_df

Unnamed: 0,names,ages,salary
0,Ramesh,29,1200
1,Suresh,54,1500
2,Ganesh,66,1300
3,Mahesh,22,1400


In [20]:
my_list = [1, 2.33, "asdsd", True, False, None]  # non-homogenous data

df = pd.DataFrame(my_list)
df

Unnamed: 0,0
0,1
1,2.33
2,asdsd
3,True
4,False
5,


In [21]:
my_list = [99, 88, 77, 66, 44, 22]  # homogenous data

my_df = pd.DataFrame(my_list)
my_df

Unnamed: 0,0
0,99
1,88
2,77
3,66
4,44
5,22


In [22]:
type(my_df)

pandas.core.frame.DataFrame

In [23]:
my_df = pd.DataFrame(data = my_list)
my_df

Unnamed: 0,0
0,99
1,88
2,77
3,66
4,44
5,22


In [24]:
my_df = pd.DataFrame(data = my_list, columns = ("ages",))
my_df

Unnamed: 0,ages
0,99
1,88
2,77
3,66
4,44
5,22


In [25]:
my_df = pd.DataFrame({'ages':[12, 13, 14, 15, 16]})
my_df

Unnamed: 0,ages
0,12
1,13
2,14
3,15
4,16


In [35]:

my_df = pd.DataFrame(
    {
        "ages": [99, 88, 66, 22],
        "names": ("Ramesh", "suresh", "Ganesh", "Mahesh"),
        "randomData": (23 / 2, None, np.nan, True),
    }
)  # values must be of same length
my_df

Unnamed: 0,ages,names,randomData
0,99,Ramesh,11.5
1,88,suresh,
2,66,Ganesh,
3,22,Mahesh,True


In [36]:
import os
os.makedirs('output_datasets', exist_ok = True)

my_df.to_json("output_datasets\persons.json")

  my_df.to_json("output_datasets\persons.json")


In [37]:
import os
os.listdir()

['output_datasets\\persons.json',
 'b_pandas_dataframes.ipynb',
 'output_datasets',
 'a_pandas_series.ipynb']

In [44]:
my_df.to_csv("output_datasets/persons.csv")

In [45]:
! ls -ltr output_datasets

total 4
-rw-rw-rw- 1 codespace codespace 83 Feb 21 00:04 persons.csv


In [46]:
! cat output_datasets/persons.csv

,ages,names,randomData
0,99,Ramesh,11.5
1,88,suresh,
2,66,Ganesh,
3,22,Mahesh,True


In [47]:
my_df.to_csv("output_datasets/persons.csv", index = False)

In [48]:
! cat output_datasets/persons.csv

ages,names,randomData
99,Ramesh,11.5
88,suresh,
66,Ganesh,
22,Mahesh,True


In [49]:
my_df.to_csv("output_datasets/persons2.csv", index=False, header=False)
! cat output_datasets/persons2.csv

99,Ramesh,11.5
88,suresh,
66,Ganesh,
22,Mahesh,True


### Reading data

In [54]:
new_df = pd.read_csv("output_datasets/persons.csv")
new_df

Unnamed: 0,ages,names,randomData
0,99,Ramesh,11.5
1,88,suresh,
2,66,Ganesh,
3,22,Mahesh,True


In [55]:
for attribute in dir(pd):
    if 'read' in attribute:
        print(attribute)

read_clipboard
read_csv
read_excel
read_feather
read_fwf
read_gbq
read_hdf
read_html
read_json
read_orc
read_parquet
read_pickle
read_sas
read_spss
read_sql
read_sql_query
read_sql_table
read_stata
read_table
read_xml


In [56]:
new_df = pd.read_csv("output_datasets/persons.csv", header = None)
new_df

Unnamed: 0,0,1,2
0,ages,names,randomData
1,99,Ramesh,11.5
2,88,suresh,
3,66,Ganesh,
4,22,Mahesh,True


In [57]:
new_df = pd.read_csv("output_datasets/persons.csv", names = ['age', 'name', 'index'])
new_df

Unnamed: 0,age,name,index
0,ages,names,randomData
1,99,Ramesh,11.5
2,88,suresh,
3,66,Ganesh,
4,22,Mahesh,True


In [58]:
new_df.dtypes

age      object
name     object
index    object
dtype: object

In [59]:
new_df.age.dtypes

dtype('O')

#### Column selection

In [69]:

data = {
    "Name": ["Jai", "Princi", "Gaurav", "Anuj"],
    "Age": [27, 24, 22, 32],
    "Address": ["Delhi", "Kanpur", "Allahabad", "Kannauj"],
    "Qualification": ["Msc", "MA", "MCA", "Phd"]
}

df = pd.DataFrame(data)

df

Unnamed: 0,Name,Age,Address,Qualification
0,Jai,27,Delhi,Msc
1,Princi,24,Kanpur,MA
2,Gaurav,22,Allahabad,MCA
3,Anuj,32,Kannauj,Phd


In [62]:
df['Name']

0       Jai
1    Princi
2    Gaurav
3      Anuj
Name: Name, dtype: object

In [63]:
df[['Name', 'Age']]

Unnamed: 0,Name,Age
0,Jai,27
1,Princi,24
2,Gaurav,22
3,Anuj,32


#### Row selection

In [68]:
df

Unnamed: 0,Name,Age,Address,Qualification
0,Jai,27,Delhi,Msc
1,Princi,24,Kanpur,MA
2,Gaurav,22,Allahabad,MCA
3,Anuj,32,Kannauj,Phd


In [67]:
df.loc[2]

Name                Gaurav
Age                     22
Address          Allahabad
Qualification          MCA
Name: 2, dtype: object

In [70]:
df.loc[2:5]

Unnamed: 0,Name,Age,Address,Qualification
2,Gaurav,22,Allahabad,MCA
3,Anuj,32,Kannauj,Phd


In [None]:
df.loc[-1] # no negative indexing

KeyError: -1

In [75]:
df.iloc[2]

Name                Gaurav
Age                     22
Address          Allahabad
Qualification          MCA
Name: 2, dtype: object

In [76]:
df.iloc[2:5]

Unnamed: 0,Name,Age,Address,Qualification
2,Gaurav,22,Allahabad,MCA
3,Anuj,32,Kannauj,Phd


In [None]:
# q) df.loc vd df.iloc

In [77]:
! ls sample_data

ls: cannot access 'sample_data': No such file or directory
