<h3>Manipulating pandas DataFrames</h3>


In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Select seaborn's 'darkgrid' plot style for the rest of the notebook.
sns.set_style('darkgrid')

In [3]:
# Load seaborn's 'titanic' example dataset into a DataFrame and preview
# its first five rows.
titanic_data = sns.load_dataset('titanic')
titanic_data.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


<p>Selecting Data using Brackets</p>


In [4]:
# A single column name inside the brackets returns that column as a Series.
titanic_data['class']

0       Third
1       First
2       Third
3       First
4       Third
        ...  
886    Second
887     First
888     Third
889     First
890     Third
Name: class, Length: 891, dtype: category
Categories (3, object): ['First', 'Second', 'Third']

In [5]:
# A list of column names inside the brackets (note the double brackets)
# returns a DataFrame holding just those columns, in the order listed.
titanic_data[['class', 'sex', 'age']]

Unnamed: 0,class,sex,age
0,Third,male,22.0
1,First,female,38.0
2,Third,female,26.0
3,First,female,35.0
4,Third,male,35.0
...,...,...,...
886,Second,male,27.0
887,First,female,19.0
888,Third,female,
889,First,male,26.0


<p>Filtering data using a specific column value</p>


In [6]:
# The comparison produces a boolean Series; indexing the frame with it
# keeps only the rows where 'sex' equals 'male'.
my_df = titanic_data[titanic_data['sex'] == 'male']
my_df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True
5,0,3,male,,0,0,8.4583,Q,Third,man,True,,Queenstown,no,True
6,0,1,male,54.0,0,0,51.8625,S,First,man,True,E,Southampton,no,True
7,0,3,male,2.0,3,1,21.075,S,Third,child,False,,Southampton,no,False


In [8]:
# Combine two conditions element-wise with `&`. Each condition needs its own
# parentheses because `&` binds more tightly than `==` in Python.
my_df = titanic_data[(titanic_data['sex'] == 'male') &
                     (titanic_data['class'] == 'First')]
my_df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
6,0,1,male,54.0,0,0,51.8625,S,First,man,True,E,Southampton,no,True
23,1,1,male,28.0,0,0,35.5,S,First,man,True,A,Southampton,yes,True
27,0,1,male,19.0,3,2,263.0,S,First,man,True,C,Southampton,no,False
30,0,1,male,40.0,0,0,27.7208,C,First,man,True,,Cherbourg,no,True
34,0,1,male,28.0,1,0,82.1708,C,First,man,True,,Cherbourg,no,False


<p>Filtering using isin()</p>


In [9]:
# Age values to match in the isin() filter.
ages = [20, 21, 22]

In [10]:
# isin() marks rows whose 'age' equals any value in `ages`; indexing with that
# mask keeps only those rows. The integer values match the float ages (22.0 etc.).
age_dataset = titanic_data[titanic_data['age'].isin(ages)]
age_dataset.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
12,0,3,male,20.0,0,0,8.05,S,Third,man,True,,Southampton,no,True
37,0,3,male,21.0,0,0,8.05,S,Third,man,True,,Southampton,no,True
51,0,3,male,21.0,0,0,7.8,S,Third,man,True,,Southampton,no,True
56,1,2,female,21.0,0,0,10.5,S,Second,woman,False,,Southampton,yes,True


<p>Indexing and slicing using the loc indexer</p>


In [11]:
# Sample records: one dict per subject, all sharing the same four keys.
scores = [
    {'Subject': 'Mathematics', 'Score': 85, 'Grade': 'B', 'Remarks': 'Good'},
    {'Subject': 'History', 'Score': 98, 'Grade': 'A', 'Remarks': 'Excellent'},
    {'Subject': 'English', 'Score': 76, 'Grade': 'C', 'Remarks': 'Fair'},
    {'Subject': 'Science', 'Score': 72, 'Grade': 'C', 'Remarks': 'Fair'},
    {'Subject': 'Arts', 'Score': 95, 'Grade': 'A', 'Remarks': 'Excellent'},
]

In [12]:
# Build a DataFrame from the list of dicts: the dict keys become the columns
# and each dict becomes one row. No index is given, so rows are labelled 0-4.
my_df = pd.DataFrame(scores)
my_df.head()

Unnamed: 0,Subject,Score,Grade,Remarks
0,Mathematics,85,B,Good
1,History,98,A,Excellent
2,English,76,C,Fair
3,Science,72,C,Fair
4,Arts,95,A,Excellent


In [13]:
# loc selects by index label; a single label returns that row as a Series.
my_df.loc[2]

Subject    English
Score           76
Grade            C
Remarks       Fair
Name: 2, dtype: object

In [14]:
# A label slice with loc includes BOTH endpoints, so 2:4 returns rows 2, 3 and 4
# (unlike ordinary Python slicing, which would stop before 4).
my_df.loc[2:4]

Unnamed: 0,Subject,Score,Grade,Remarks
2,English,76,C,Fair
3,Science,72,C,Fair
4,Arts,95,A,Excellent


In [15]:
# The second argument to loc picks columns; they come back in the order listed.
my_df.loc[2:4, ['Grade', 'Score']]

Unnamed: 0,Grade,Score
2,C,76
3,C,72
4,A,95


In [16]:
# Sample records: one dict per subject, all sharing the same four keys.
scores = [
    {'Subject': 'Mathematics', 'Score': 85, 'Grade': 'B', 'Remarks': 'Good'},
    {'Subject': 'History', 'Score': 98, 'Grade': 'A', 'Remarks': 'Excellent'},
    {'Subject': 'English', 'Score': 76, 'Grade': 'C', 'Remarks': 'Fair'},
    {'Subject': 'Science', 'Score': 72, 'Grade': 'C', 'Remarks': 'Fair'},
    {'Subject': 'Arts', 'Score': 95, 'Grade': 'A', 'Remarks': 'Excellent'},
]

In [17]:
# Rebuild the frame with explicit string row labels in place of the default
# integer index, then display it.
my_df = pd.DataFrame(
    scores,
    index=['Student1', 'Student2', 'Student3', 'Student4', 'Student5'],
)
my_df

Unnamed: 0,Subject,Score,Grade,Remarks
Student1,Mathematics,85,B,Good
Student2,History,98,A,Excellent
Student3,English,76,C,Fair
Student4,Science,72,C,Fair
Student5,Arts,95,A,Excellent


In [18]:
# With a string index, loc looks rows up by their string label.
my_df.loc['Student1']

Subject    Mathematics
Score               85
Grade                B
Remarks           Good
Name: Student1, dtype: object

In [19]:
# Passing a list of labels to loc returns those rows as a DataFrame.
index_list = ['Student1', 'Student2']
my_df.loc[index_list]

Unnamed: 0,Subject,Score,Grade,Remarks
Student1,Mathematics,85,B,Good
Student2,History,98,A,Excellent


In [20]:
# A list of row labels plus a single column name returns a Series of that
# column's values for the chosen rows.
my_df.loc[index_list, 'Grade']

Student1    B
Student2    A
Name: Grade, dtype: object

In [21]:
# A boolean list with one entry per row keeps the rows whose entry is True;
# here only the fourth row (Student4) is selected.
my_df.loc[[False, False, False, True, False ]]

Unnamed: 0,Subject,Score,Grade,Remarks
Student4,Science,72,C,Fair


In [22]:
# Comparing a column with a scalar yields a boolean Series that shares the
# frame's index: True where Score is above 80.
my_df['Score'] > 80

Student1     True
Student2     True
Student3    False
Student4    False
Student5     True
Name: Score, dtype: bool

In [23]:
# Passing that boolean Series to loc keeps only the rows where it is True.
my_df.loc[my_df['Score'] > 80]

Unnamed: 0,Subject,Score,Grade,Remarks
Student1,Mathematics,85,B,Good
Student2,History,98,A,Excellent
Student5,Arts,95,A,Excellent


In [24]:
# Two conditions combined with `&`; each is parenthesised because `&` binds
# more tightly than the comparison operators.
my_df.loc[(my_df['Score'] > 80) & (my_df['Remarks'] == 'Excellent')]

Unnamed: 0,Subject,Score,Grade,Remarks
Student2,History,98,A,Excellent
Student5,Arts,95,A,Excellent


In [26]:
# Row filter and column selection in one loc call: rows with Score above 80,
# restricted to the 'Score' and 'Grade' columns.
my_df.loc[my_df['Score'] > 80 , ['Score', 'Grade']]

Unnamed: 0,Score,Grade
Student1,85,B
Student2,98,A
Student5,95,A


In [27]:
# Display the full frame as it stands before any values are assigned.
my_df

Unnamed: 0,Subject,Score,Grade,Remarks
Student1,Mathematics,85,B,Good
Student2,History,98,A,Excellent
Student3,English,76,C,Fair
Student4,Science,72,C,Fair
Student5,Arts,95,A,Excellent


In [28]:
# CAUTION: assigning a scalar to a row label alone overwrites EVERY column of
# that row -- Subject, Score, Grade and Remarks all become 90 -- and it
# modifies my_df in place. To change a single cell, name the column as well,
# e.g. my_df.loc['Student4', 'Score'] = 90.
my_df.loc['Student4'] = 90
my_df

Unnamed: 0,Subject,Score,Grade,Remarks
Student1,Mathematics,85,B,Good
Student2,History,98,A,Excellent
Student3,English,76,C,Fair
Student4,90,90,90,90
Student5,Arts,95,A,Excellent
