# <font color="purple"><h3 align="center">DataFrame Basics Tutorial</h3></font>

## **Dataframe is most commonly used object in pandas. It is a table like datastructure containing rows and columns similar to excel spreadsheet**

In [1]:
import pandas as pd
weather_data = {
    'day': ['1/1/2017','1/2/2017','1/3/2017','1/4/2017','1/5/2017','1/6/2017'],
    'temperature': [32,35,28,24,32,31],
    'windspeed': [6,7,2,7,4,2],
    'event': ['Rain', 'Sunny', 'Snow','Snow','Sunny', 'Sunny']
}
df = pd.DataFrame(weather_data)
df

Unnamed: 0,day,temperature,windspeed,event
0,1/1/2017,32,6,Rain
1,1/2/2017,35,7,Sunny
2,1/3/2017,28,2,Snow
3,1/4/2017,24,7,Snow
4,1/5/2017,32,4,Sunny
5,1/6/2017,31,2,Sunny


In [2]:
df.shape # rows, columns = df.shape

(6, 4)

In [3]:
newdf = df[2:5]
newdf

Unnamed: 0,day,temperature,windspeed,event
2,1/3/2017,28,2,Snow
3,1/4/2017,24,7,Snow
4,1/5/2017,32,4,Sunny


In [6]:
newdf = df.iloc[2:5, :-1]
newdf

Unnamed: 0,day,temperature,windspeed
2,1/3/2017,28,2
3,1/4/2017,24,7
4,1/5/2017,32,4


## <font color='blue'>Rows</font>

In [9]:
df.head() # df.head(3) 

Unnamed: 0,day,temperature,windspeed,event
0,1/1/2017,32,6,Rain
1,1/2/2017,35,7,Sunny
2,1/3/2017,28,2,Snow
3,1/4/2017,24,7,Snow
4,1/5/2017,32,4,Sunny


In [11]:
df.tail(3) # df.tail(2)

Unnamed: 0,day,temperature,windspeed,event
3,1/4/2017,24,7,Snow
4,1/5/2017,32,4,Sunny
5,1/6/2017,31,2,Sunny


In [13]:
df.index[df.day ==  "1/4/2017"][0]

3

In [18]:
startindex = int(df.index[df.day ==  "1/4/2017"][0])
type(startindex)

int

In [19]:
df[ startindex : startindex + 3 ]

Unnamed: 0,day,temperature,windspeed,event
3,1/4/2017,24,7,Snow
4,1/5/2017,32,4,Sunny
5,1/6/2017,31,2,Sunny


## <font color='blue'>Columns</font>

In [12]:
print(df.columns)
columnNames = df.columns.to_list()
print(columnNames)

Index(['day', 'temperature', 'windspeed', 'event'], dtype='object')
['day', 'temperature', 'windspeed', 'event']


In [14]:
df.event

0     Rain
1    Sunny
2     Snow
3     Snow
4    Sunny
5    Sunny
Name: event, dtype: object

In [15]:
eventlist = df.event.to_list()
eventlist

['Rain', 'Sunny', 'Snow', 'Snow', 'Sunny', 'Sunny']

In [16]:
twodf = df[ ['day','event']  ]
twodf

Unnamed: 0,day,event
0,1/1/2017,Rain
1,1/2/2017,Sunny
2,1/3/2017,Snow
3,1/4/2017,Snow
4,1/5/2017,Sunny
5,1/6/2017,Sunny


In [17]:
df[["day", "event"]]

Unnamed: 0,day,event
0,1/1/2017,Rain
1,1/2/2017,Sunny
2,1/3/2017,Snow
3,1/4/2017,Snow
4,1/5/2017,Sunny
5,1/6/2017,Sunny


## <font color='blue'>Operations On DataFrame</font>

In [18]:
print(df.temperature.mean())
print(df['temperature'].std())

30.333333333333332
3.8297084310253524


In [20]:
len(df[ df['temperature'] > 30 ])

4

In [21]:
df['day'] [ df['temperature'] == df['temperature'].min() ] # Kinda doing SQL in pandas

3    1/4/2017
Name: day, dtype: object

In [None]:
df['day'][df['temperature'] == df['temperature'].min()] # Kinda doing SQL in pandas

In [24]:
data = df.temperature.sort_values()
data

3    24
2    28
5    31
0    32
4    32
1    35
Name: temperature, dtype: int64

In [18]:
df['temperature'].mean() # Kinda doing SQL in pandas

30.333333333333332

In [22]:
df['temperature'].std()

3.8297084310253524

In [24]:
df['event'].max() # But mean() won't work since data type is string

'Sunny'

In [25]:
# get max occuring element 

In [25]:
df.event.value_counts()

Sunny    3
Snow     2
Rain     1
Name: event, dtype: int64

In [26]:
df.event.value_counts().index[0]

'Sunny'

In [27]:
df.event.value_counts()[0]

3

In [27]:
print("Max Event : " + df.event.value_counts().index[0] + " and it occured " + str( df.event.value_counts()[0]) + " times" )

Max Event : Sunny and it occured 3 times


In [28]:
df.describe()

Unnamed: 0,temperature,windspeed
count,6.0,6.0
mean,30.333333,4.666667
std,3.829708,2.33809
min,24.0,2.0
25%,28.75,2.5
50%,31.5,5.0
75%,32.0,6.75
max,35.0,7.0


**Google pandas series operations to find out list of all operations**
http://pandas.pydata.org/pandas-docs/stable/generated/pandas.Series.html

## <font color='blue'>set_index</font>

In [25]:
df.set_index("day", inplace=True)

In [26]:
df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1/1/2017,32,6,Rain
1/2/2017,35,7,Sunny
1/3/2017,28,2,Snow
1/4/2017,24,7,Snow
1/5/2017,32,4,Sunny
1/6/2017,31,2,Sunny


In [27]:
df.shape

(6, 3)

In [28]:
df['day'][df['temperature'] == df['temperature'].min()] # Kinda doing SQL in pandas

KeyError: 'day'

In [29]:
df.loc["1/1/2017" : "1/4/2017"]

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1/1/2017,32,6,Rain
1/2/2017,35,7,Sunny
1/3/2017,28,2,Snow
1/4/2017,24,7,Snow


In [27]:
df.shape

(6, 3)

In [30]:
df.reset_index(inplace=True)
df

Unnamed: 0,day,temperature,windspeed,event
0,1/1/2017,32,6,Rain
1,1/2/2017,35,7,Sunny
2,1/3/2017,28,2,Snow
3,1/4/2017,24,7,Snow
4,1/5/2017,32,4,Sunny
5,1/6/2017,31,2,Sunny


In [31]:
df.shape

(6, 4)

In [32]:
df

Unnamed: 0,day,temperature,windspeed,event
0,1/1/2017,32,6,Rain
1,1/2/2017,35,7,Sunny
2,1/3/2017,28,2,Snow
3,1/4/2017,24,7,Snow
4,1/5/2017,32,4,Sunny
5,1/6/2017,31,2,Sunny


In [36]:
df.reset_index(inplace=True)

In [33]:
newdf = df.copy()
newdf.set_index("day" , inplace=True)

In [34]:
newdf

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1/1/2017,32,6,Rain
1/2/2017,35,7,Sunny
1/3/2017,28,2,Snow
1/4/2017,24,7,Snow
1/5/2017,32,4,Sunny
1/6/2017,31,2,Sunny


In [35]:
df

Unnamed: 0,day,temperature,windspeed,event
0,1/1/2017,32,6,Rain
1,1/2/2017,35,7,Sunny
2,1/3/2017,28,2,Snow
3,1/4/2017,24,7,Snow
4,1/5/2017,32,4,Sunny
5,1/6/2017,31,2,Sunny


In [42]:
newdf.set_index("day" , inplace=True)

In [43]:
newdf

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1/1/2017,32,6,Rain
1/2/2017,35,7,Sunny
1/3/2017,28,2,Snow
1/4/2017,24,7,Snow
1/5/2017,32,4,Sunny
1/6/2017,31,2,Sunny


In [36]:
newdf.reset_index(inplace=True)

In [37]:
newdf.set_index("event", inplace=True)
newdf

Unnamed: 0_level_0,day,temperature,windspeed
event,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Rain,1/1/2017,32,6
Sunny,1/2/2017,35,7
Snow,1/3/2017,28,2
Snow,1/4/2017,24,7
Sunny,1/5/2017,32,4
Sunny,1/6/2017,31,2


In [38]:
newdf.loc["Sunny"]

Unnamed: 0_level_0,day,temperature,windspeed
event,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Sunny,1/2/2017,35,7
Sunny,1/5/2017,32,4
Sunny,1/6/2017,31,2


In [41]:
newdf.loc["Snow"]

Unnamed: 0_level_0,day,temperature,windspeed
event,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Snow,1/3/2017,28,2
Snow,1/4/2017,24,7


In [39]:
df["NewData"] = np.arange(1,7) # [ 0,1,2,3,4,5]
df

Unnamed: 0,day,temperature,windspeed,event,NewData
0,1/1/2017,32,6,Rain,1
1,1/2/2017,35,7,Sunny,2
2,1/3/2017,28,2,Snow,3
3,1/4/2017,24,7,Snow,4
4,1/5/2017,32,4,Sunny,5
5,1/6/2017,31,2,Sunny,6


In [45]:
mylist = []
for i in df.temperature:
    if i == 32:
        mylist.append(True)
    else:
        mylist.append(False)
df["Check"] = mylist

In [46]:
df

Unnamed: 0,day,temperature,windspeed,event,NewData,Check
0,1/1/2017,32,6,Rain,0,True
1,1/2/2017,35,7,Sunny,1,False
2,1/3/2017,28,2,Snow,2,False
3,1/4/2017,24,7,Snow,3,False
4,1/5/2017,32,4,Sunny,4,True
5,1/6/2017,31,2,Sunny,5,False


In [40]:
df["Names"] = np.array(["Ahmed", "Ali", "Omar" , "Emad" , "Anas" , "Amr" ])
df

Unnamed: 0,day,temperature,windspeed,event,NewData,Names
0,1/1/2017,32,6,Rain,1,Ahmed
1,1/2/2017,35,7,Sunny,2,Ali
2,1/3/2017,28,2,Snow,3,Omar
3,1/4/2017,24,7,Snow,4,Emad
4,1/5/2017,32,4,Sunny,5,Anas
5,1/6/2017,31,2,Sunny,6,Amr


In [49]:
df.reset_index(inplace=True)
df.drop("event", inplace=True, axis= 1)
df

KeyError: "['event'] not found in axis"

In [43]:
df.drop("index", inplace=True, axis = 1)

In [44]:
df

Unnamed: 0,day,temperature,windspeed,NewData,Names
0,1/1/2017,32,6,0,Ahmed
1,1/2/2017,35,7,1,Ali
2,1/3/2017,28,2,2,Omar
3,1/4/2017,24,7,3,Emad
4,1/5/2017,32,4,4,Anas
5,1/6/2017,31,2,5,Amr


In [58]:
df.index

RangeIndex(start=0, stop=6, step=1)

In [59]:
#df.reset_index(inplace=True)
df.set_index("temperature", inplace=True)

In [60]:
df

Unnamed: 0_level_0,day,windspeed,Names
temperature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
32,1/1/2017,6,Ahmed
35,1/2/2017,7,Ali
28,1/3/2017,2,Omar
24,1/4/2017,7,Emad
32,1/5/2017,4,Anas
31,1/6/2017,2,Amr


In [61]:
df.loc[32]

Unnamed: 0_level_0,day,windspeed,Names
temperature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
32,1/1/2017,6,Ahmed
32,1/5/2017,4,Anas


In [62]:
df.head()

Unnamed: 0_level_0,day,windspeed,Names
temperature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
32,1/1/2017,6,Ahmed
35,1/2/2017,7,Ali
28,1/3/2017,2,Omar
24,1/4/2017,7,Emad
32,1/5/2017,4,Anas


In [41]:
weather_data = {
    'day': ['1/1/2017','1/2/2017','1/3/2017','1/4/2017','1/5/2017','1/6/2017'],
    'temperature': [32,35,28,24,32,31],
    'windspeed': [6,7,2,7,4,2],
    'event': ['Rain', 'Sunny', 'Snow','Snow','Sunny', 'Sunny']
}
df = pd.DataFrame(weather_data)
df.set_index('event',inplace=True) # this is kind of building a hash map using event as a key
df

Unnamed: 0_level_0,day,temperature,windspeed
event,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Rain,1/1/2017,32,6
Sunny,1/2/2017,35,7
Snow,1/3/2017,28,2
Snow,1/4/2017,24,7
Sunny,1/5/2017,32,4
Sunny,1/6/2017,31,2


In [42]:
df

Unnamed: 0_level_0,day,temperature,windspeed
event,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Rain,1/1/2017,32,6
Sunny,1/2/2017,35,7
Snow,1/3/2017,28,2
Snow,1/4/2017,24,7
Sunny,1/5/2017,32,4
Sunny,1/6/2017,31,2


In [43]:
columns = []
data = dict()

num = int(input("please enter the number of columns"))
while(num > 0):
    columns.append(input("please enter the column name: "))
    num -=1

for i in columns:
    data[i] = []

rows = int(input("please enter the number of rows: "))
while(rows > 0):
    for i in data:
        value = input(f"please enter the value of {i}: ")
        data[i].append(value)

    rows-=1
dataframe = pd.DataFrame(data)
dataframe

please enter the number of columns3
please enter the column name: Name
please enter the column name: Age
please enter the column name: Salary
please enter the number of rows: 3
please enter the value of Name: Saif
please enter the value of Age: 23
please enter the value of Salary: 1200
please enter the value of Name: Tasnim
please enter the value of Age: 20
please enter the value of Salary: 12.5
please enter the value of Name: Ahmed
please enter the value of Age: 80
please enter the value of Salary: 12345678675645342312454678


Unnamed: 0,Name,Age,Salary
0,Saif,23,1200
1,Tasnim,20,12.5
2,Ahmed,80,12345678675645342312454678


In [48]:
dataframe["Id"].dtype

dtype('O')

In [53]:
dataframe.id.astype(np.int8)

0    1
1    2
Name: id, dtype: int8

In [86]:
dataframe.Age = dataframe.Age.astype(np.int16)
dataframe.Age.dtype

dtype('int16')