PANDAS
------------

Pandas provides high-performance data manipulation and analysis tool using its powerful data structures. 

When working with tabular data, such as data stored in spreadsheets or databases, pandas is the right tool for you.  pandas will help you to explore, clean, and process your data.

Supports the integration with many file formats or data sources out of the box (csv, excel, sql, json, xml,. . . ).

Selecting or filtering specific rows and/or columns. Filtering the data on a condition? Methods for slicing, selecting, and extracting the data you need are available in pandas.

Provides plotting your data out of the box, using the power of Matplotlib. You can pick the plot type (scatter, bar, boxplot,. . . ) corresponding to your data.

Basic statistics (mean, median, min, max, counts. . . ) are easily calculable.  These or custom aggregations can be applied on the entire data set, a sliding window of the data, or grouped by categories

Multiple tables can be concatenated both column wise and row wise as database-like join/merge operations are provided to combine multiple tables of data.

Pandas has great support for time series and has an extensive set of tools for working with dates, times, and time-indexed data.

Data sets do not only contain numerical data. pandas provides a wide range of functions to clean textual data and extract useful information from it.

Pandas can be installed via

In [None]:
! pip install pandas



In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.DataFrame(
    {
       "Name": ["Sachin Tendulkar","Rahul Dravid","Mithali Raj"],
        "Age": [22, 35, 58],
        "Sex": ["male", "male", "female"],
    }
    )
df

Unnamed: 0,Name,Age,Sex
0,Sachin Tendulkar,22,male
1,Rahul Dravid,35,male
2,Mithali Raj,58,female


In [3]:
type(df)

pandas.core.frame.DataFrame

In [4]:
df

Unnamed: 0,Name,Age,Sex
0,Sachin Tendulkar,22,male
1,Rahul Dravid,35,male
2,Mithali Raj,58,female


In [8]:
type(df[["Age"]])

pandas.core.frame.DataFrame

In [None]:
type(df["Age"])

pandas.core.series.Series

In [23]:
df["Name"]

0    Sachin Tendulkar
1        Rahul Dravid
2         Mithali Raj
Name: Name, dtype: object

In [10]:
df[["Name", "Age"]]

Unnamed: 0,Name,Age
0,Sachin Tendulkar,22
1,Rahul Dravid,35
2,Mithali Raj,58


In [11]:
ages = pd.Series([22, 35, 58], name="Age")
print(ages)
print(type(ages))

0    22
1    35
2    58
Name: Age, dtype: int64
<class 'pandas.core.series.Series'>


### Case Study1

In [32]:
import numpy as np
import pandas as pd

In [23]:
data_employee={ 'employee_id':np.arange(1,101),
                'Age':np.random.randint(25,62,size=100),
                'Basic Pay':np.random.randint(15600,67100,size=100),
                'No of Clients':np.random.randint(1,1000,size=100),
                'Years of Service':np.random.randint(0,41,size=100),
                'Performance Score':np.random.randint(0,2,size=100)
              }

#print(type(data_employee))
df=pd.DataFrame(data_employee,columns=['employee_id','Age','Basic Pay','No of Clients','Years of Service','Performance Score'])            

#print(type(df))
df

Unnamed: 0,employee_id,Age,Basic Pay,No of Clients,Years of Service,Performance Score
0,1,27,53389,308,23,1
1,2,36,21127,111,27,1
2,3,36,34687,149,25,1
3,4,52,38280,162,6,1
4,5,43,63183,506,1,0
...,...,...,...,...,...,...
95,96,56,31644,928,9,0
96,97,52,43269,963,22,0
97,98,26,45127,764,17,1
98,99,34,20897,452,11,0


In [24]:
df.to_csv('emp.csv',sep=',',index=False)

In [25]:
df=pd.read_csv('emp.csv')

In [26]:
df

Unnamed: 0,employee_id,Age,Basic Pay,No of Clients,Years of Service,Performance Score
0,1,27,53389,308,23,1
1,2,36,21127,111,27,1
2,3,36,34687,149,25,1
3,4,52,38280,162,6,1
4,5,43,63183,506,1,0
...,...,...,...,...,...,...
95,96,56,31644,928,9,0
96,97,52,43269,963,22,0
97,98,26,45127,764,17,1
98,99,34,20897,452,11,0


In [None]:
df.iloc[[0,1,5,8,27]]

Unnamed: 0,employee_id,Age,Basic Pay,No of Clients,Years of Service,Performance Score
0,1,27,53389,308,23,1
1,2,36,21127,111,27,1
5,6,46,44016,373,40,0
8,9,32,54482,944,7,0
27,28,54,48187,73,27,0


### Case Study 2

In [5]:
import numpy as np
import pandas as pd

![Untitled.jpg](attachment:Untitled.jpg)

![Untitled.jpg](attachment:Untitled.jpg)

In [34]:
titanic = pd.read_csv("titanic.csv")

In [35]:
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [37]:
titanic["mean"]=titanic[["Age","Pclass"]].sum(axis=1)
titanic

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,mean
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S,25.0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,39.0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S,29.0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S,36.0
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S,38.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S,29.0
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S,20.0
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S,3.0
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C,27.0


In [38]:
titanic["me"]=titanic[["Age","Pclass"]].mean(axis=1)
titanic

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,mean,me
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S,25.0,12.5
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,39.0,19.5
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S,29.0,14.5
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S,36.0,18.0
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S,38.0,19.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S,29.0,14.5
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S,20.0,10.0
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S,3.0,3.0
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C,27.0,13.5


In [None]:
def add(x):
    return x+100

In [None]:
titanic["add"]=titanic["me"].apply(add)

In [None]:
titanic

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,mean,me,add
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S,25.0,12.5,112.5
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,39.0,19.5,119.5
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S,29.0,14.5,114.5
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S,36.0,18.0,118.0
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S,38.0,19.0,119.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S,29.0,14.5,114.5
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S,20.0,10.0,110.0
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S,3.0,3.0,103.0
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C,27.0,13.5,113.5


In [None]:
titanic

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,mean,me
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S,25.0,12.5
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,39.0,19.5
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S,29.0,14.5
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S,36.0,18.0
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S,38.0,19.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S,29.0,14.5
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S,20.0,10.0
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S,3.0,3.0
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C,27.0,13.5


In [39]:
titanic

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,mean,me
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S,25.0,12.5
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,39.0,19.5
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S,29.0,14.5
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S,36.0,18.0
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S,38.0,19.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S,29.0,14.5
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S,20.0,10.0
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S,3.0,3.0
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C,27.0,13.5


In [40]:
titanic["Name"]=titanic.Sex

In [None]:
titanic.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [None]:
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [None]:
titanic.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [None]:
ages = titanic["Age"]

In [None]:
ages.head(10)

0    22.0
1    38.0
2    26.0
3    35.0
4    35.0
5     NaN
6    54.0
7     2.0
8    27.0
9    14.0
Name: Age, dtype: float64

In [None]:
ages.tail()

886    27.0
887    19.0
888     NaN
889    26.0
890    32.0
Name: Age, dtype: float64

In [None]:
type(titanic["Age"])

pandas.core.series.Series

In [None]:
titanic["Age"].shape

(891,)

In [None]:
age_sex = titanic[["Age", "Sex"]]
age_sex.head(10)

Unnamed: 0,Age,Sex
0,22.0,male
1,38.0,female
2,26.0,female
3,35.0,female
4,35.0,male
5,,male
6,54.0,male
7,2.0,male
8,27.0,female
9,14.0,female


In [None]:
type(titanic[["Age", "Sex"]])

pandas.core.frame.DataFrame

In [None]:
titanic[["Age", "Sex"]].shape

(891, 2)

In [None]:
#  interested in the passengers older than 35 years.

above_35 = titanic[titanic["Age"] > 35]
above_35

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
11,12,1,1,"Bonnell, Miss. Elizabeth",female,58.0,0,0,113783,26.5500,C103,S
13,14,0,3,"Andersson, Mr. Anders Johan",male,39.0,1,5,347082,31.2750,,S
15,16,1,2,"Hewlett, Mrs. (Mary D Kingcome)",female,55.0,0,0,248706,16.0000,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
865,866,1,2,"Bystrom, Mrs. (Karolina)",female,42.0,0,0,236852,13.0000,,S
871,872,1,1,"Beckwith, Mrs. Richard Leonard (Sallie Monypeny)",female,47.0,1,1,11751,52.5542,D35,S
873,874,0,3,"Vander Cruyssen, Mr. Victor",male,47.0,0,0,345765,9.0000,,S
879,880,1,1,"Potter, Mrs. Thomas Jr (Lily Alexenia Wilson)",female,56.0,0,1,11767,83.1583,C50,C


In [41]:
# interested in the Titanic passengers from cabin class 2 and 3

class_23 = titanic[titanic["Pclass"].isin([2, 3])]
class_23

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,mean,me
0,1,0,3,male,male,22.0,1,0,A/5 21171,7.2500,,S,25.0,12.5
2,3,1,3,female,female,26.0,0,0,STON/O2. 3101282,7.9250,,S,29.0,14.5
4,5,0,3,male,male,35.0,0,0,373450,8.0500,,S,38.0,19.0
5,6,0,3,male,male,,0,0,330877,8.4583,,Q,3.0,3.0
7,8,0,3,male,male,2.0,3,1,349909,21.0750,,S,5.0,2.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
884,885,0,3,male,male,25.0,0,0,SOTON/OQ 392076,7.0500,,S,28.0,14.0
885,886,0,3,female,female,39.0,0,5,382652,29.1250,,Q,42.0,21.0
886,887,0,2,male,male,27.0,0,0,211536,13.0000,,S,29.0,14.5
888,889,0,3,female,female,,1,2,W./C. 6607,23.4500,,S,3.0,3.0


above is equivalent to filtering by rows for which the class is either 2 or 3 and combining the two statements with
an | (or) operator:

In [42]:

class_23 = titanic[(titanic["Pclass"] == 2) | (titanic["Pclass"] == 3)]
class_23


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,mean,me
0,1,0,3,male,male,22.0,1,0,A/5 21171,7.2500,,S,25.0,12.5
2,3,1,3,female,female,26.0,0,0,STON/O2. 3101282,7.9250,,S,29.0,14.5
4,5,0,3,male,male,35.0,0,0,373450,8.0500,,S,38.0,19.0
5,6,0,3,male,male,,0,0,330877,8.4583,,Q,3.0,3.0
7,8,0,3,male,male,2.0,3,1,349909,21.0750,,S,5.0,2.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
884,885,0,3,male,male,25.0,0,0,SOTON/OQ 392076,7.0500,,S,28.0,14.0
885,886,0,3,female,female,39.0,0,5,382652,29.1250,,Q,42.0,21.0
886,887,0,2,male,male,27.0,0,0,211536,13.0000,,S,29.0,14.5
888,889,0,3,female,female,,1,2,W./C. 6607,23.4500,,S,3.0,3.0


In [None]:
# want to work with passenger data for which the age is known.

age_no_na = titanic[titanic["Age"].notna()]
age_no_na

# The notna() conditional function returns a True for each row the values are not an Null value.

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
885,886,0,3,"Rice, Mrs. William (Margaret Norton)",female,39.0,0,5,382652,29.1250,,Q
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [None]:
# interested in the names of the passengers older than 35 years.

adult_names = titanic.loc[titanic["Age"] > 35, "Name"]
adult_names


1      Cumings, Mrs. John Bradley (Florence Briggs Th...
6                                McCarthy, Mr. Timothy J
11                              Bonnell, Miss. Elizabeth
13                           Andersson, Mr. Anders Johan
15                      Hewlett, Mrs. (Mary D Kingcome) 
                             ...                        
865                             Bystrom, Mrs. (Karolina)
871     Beckwith, Mrs. Richard Leonard (Sallie Monypeny)
873                          Vander Cruyssen, Mr. Victor
879        Potter, Mrs. Thomas Jr (Lily Alexenia Wilson)
885                 Rice, Mrs. William (Margaret Norton)
Name: Name, Length: 217, dtype: object

In [None]:
# interested in rows 10 till 25 and columns 3 to 5.

titanic.iloc[9:25, 2:5]

Unnamed: 0,Pclass,Name,Sex
9,2,"Nasser, Mrs. Nicholas (Adele Achem)",female
10,3,"Sandstrom, Miss. Marguerite Rut",female
11,1,"Bonnell, Miss. Elizabeth",female
12,3,"Saundercock, Mr. William Henry",male
13,3,"Andersson, Mr. Anders Johan",male
14,3,"Vestrom, Miss. Hulda Amanda Adolfina",female
15,2,"Hewlett, Mrs. (Mary D Kingcome)",female
16,3,"Rice, Master. Eugene",male
17,2,"Williams, Mr. Charles Eugene",male
18,3,"Vander Planke, Mrs. Julius (Emelia Maria Vande...",female


When selecting subsets of data, square brackets [] are used.

Inside these brackets, you can use a single column/row label, a list of column/row labels, a slice of labels, a
conditional expression or a colon.

Select specific rows and/or columns using loc when using the row and column names

Select specific rows and/or columns using iloc when using the positions in the table

We can assign new values to a selection based on loc/iloc.

In [None]:
# the average age for male versus female Titanic passengers

titanic[["Sex", "Age"]].groupby("Sex").mean()


Unnamed: 0_level_0,Age
Sex,Unnamed: 1_level_1
female,27.915709
male,30.726645


In [None]:
# mean ticket fare price for each of the sex and cabin class combinations

titanic.groupby(["Sex", "Pclass"])["Fare"].mean()

Sex     Pclass
female  1         106.125798
        2          21.970121
        3          16.118810
male    1          67.226127
        2          19.741782
        3          12.661633
Name: Fare, dtype: float64

In [None]:
# number of passengers in each of the cabin classes

titanic["Pclass"].value_counts(normalize=True)

Pclass
3    0.551066
1    0.242424
2    0.206510
Name: proportion, dtype: float64

In [None]:
titanic["Survived"].value_counts()

Survived
0    549
1    342
Name: count, dtype: int64

In [None]:
titanic["Survived"].unique()

array([0, 1], dtype=int64)

In [None]:
titanic.groupby("Pclass")["Pclass"].count()

Pclass
1    216
2    184
3    491
Name: Pclass, dtype: int64

In [None]:
# sort the Titanic data according to the age of the passengers

titanic.sort_values(by="Age")

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
803,804,1,3,"Thomas, Master. Assad Alexander",male,0.42,0,1,2625,8.5167,,C
755,756,1,2,"Hamalainen, Master. Viljo",male,0.67,1,1,250649,14.5000,,S
644,645,1,3,"Baclini, Miss. Eugenie",female,0.75,2,1,2666,19.2583,,C
469,470,1,3,"Baclini, Miss. Helene Barbara",female,0.75,2,1,2666,19.2583,,C
78,79,1,2,"Caldwell, Master. Alden Gates",male,0.83,0,2,248738,29.0000,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
859,860,0,3,"Razi, Mr. Raihed",male,,0,0,2629,7.2292,,C
863,864,0,3,"Sage, Miss. Dorothy Edith ""Dolly""",female,,8,2,CA. 2343,69.5500,,S
868,869,0,3,"van Melkebeke, Mr. Philemon",male,,0,0,345777,9.5000,,S
878,879,0,3,"Laleff, Mr. Kristo",male,,0,0,349217,7.8958,,S


In [None]:
# sort the Titanic data according to the cabin class and age in descending order

titanic.sort_values(by=['Pclass', 'Age'], ascending=False)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
851,852,0,3,"Svensson, Mr. Johan",male,74.0,0,0,347060,7.7750,,S
116,117,0,3,"Connors, Mr. Patrick",male,70.5,0,0,370369,7.7500,,Q
280,281,0,3,"Duane, Mr. Frank",male,65.0,0,0,336439,7.7500,,Q
483,484,1,3,"Turkula, Mrs. (Hedwig)",female,63.0,0,0,4134,9.5875,,S
326,327,0,3,"Nysveen, Mr. Johan Hansen",male,61.0,0,0,345364,6.2375,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
766,767,0,1,"Brewe, Dr. Arthur Jackson",male,,0,0,112379,39.6000,,C
793,794,0,1,"Hoyt, Mr. William Fisher",male,,0,0,PC 17600,30.6958,,C
815,816,0,1,"Fry, Mr. Richard",male,,0,0,112058,0.0000,B102,S
839,840,1,1,"Marechal, Mr. Pierre",male,,0,0,11774,29.7000,C47,C


### Case Study3

In [103]:
import pandas as pd

df = pd.read_csv('whr.csv')
df

Unnamed: 0,Country,Happiness.Rank,Happiness.Score,Whisker.high,Whisker.low,Economy..GDP.per.Capita.,Family,Health..Life.Expectancy.,Freedom,Generosity,Trust..Government.Corruption.,Dystopia.Residual
0,Norway,1,7.537,7.594445,7.479556,1.616463,1.533524,0.796667,0.635423,0.362012,0.315964,
1,Denmark,2,7.522,7.581728,7.462272,1.482383,1.551122,0.792566,0.626007,0.355280,0.400770,2.313707
2,,3,7.504,7.622030,7.385970,1.480633,1.610574,0.833552,0.627163,0.475540,0.153527,
3,Switzerland,4,7.494,7.561772,7.426227,1.564980,1.516912,0.858131,0.620071,0.290549,0.367007,2.276716
4,Finland,5,7.469,7.527542,7.410458,1.443572,1.540247,0.809158,0.617951,0.245483,0.382612,2.430182
...,...,...,...,...,...,...,...,...,...,...,...,...
150,Rwanda,151,3.471,3.543030,3.398970,0.368746,0.945707,0.326425,0.581844,0.252756,0.455220,0.540061
151,Syria,152,3.462,3.663669,3.260331,0.777153,0.396103,0.500533,0.081539,0.493664,0.151347,1.061574
152,Tanzania,153,3.349,3.461430,3.236570,0.511136,1.041990,0.364509,0.390018,0.354256,0.066035,0.621130
153,Burundi,154,2.905,3.074690,2.735310,0.091623,0.629794,0.151611,0.059901,0.204435,0.084148,1.683024


![whi.jpg](attachment:whi.jpg)

In [104]:
df['Country']

0                        Norway
1                       Denmark
2                           NaN
3                   Switzerland
4                       Finland
                 ...           
150                      Rwanda
151                       Syria
152                    Tanzania
153                     Burundi
154    Central African Republic
Name: Country, Length: 155, dtype: object

In [105]:
df[['Country','Happiness.Rank']]

Unnamed: 0,Country,Happiness.Rank
0,Norway,1
1,Denmark,2
2,,3
3,Switzerland,4
4,Finland,5
...,...,...
150,Rwanda,151
151,Syria,152
152,Tanzania,153
153,Burundi,154


In [106]:
df[['Country','Happiness.Rank','Health..Life.Expectancy.']]

Unnamed: 0,Country,Happiness.Rank,Health..Life.Expectancy.
0,Norway,1,0.796667
1,Denmark,2,0.792566
2,,3,0.833552
3,Switzerland,4,0.858131
4,Finland,5,0.809158
...,...,...,...
150,Rwanda,151,0.326425
151,Syria,152,0.500533
152,Tanzania,153,0.364509
153,Burundi,154,0.151611


In [107]:
df[:5]

Unnamed: 0,Country,Happiness.Rank,Happiness.Score,Whisker.high,Whisker.low,Economy..GDP.per.Capita.,Family,Health..Life.Expectancy.,Freedom,Generosity,Trust..Government.Corruption.,Dystopia.Residual
0,Norway,1,7.537,7.594445,7.479556,1.616463,1.533524,0.796667,0.635423,0.362012,0.315964,
1,Denmark,2,7.522,7.581728,7.462272,1.482383,1.551122,0.792566,0.626007,0.35528,0.40077,2.313707
2,,3,7.504,7.62203,7.38597,1.480633,1.610574,0.833552,0.627163,0.47554,0.153527,
3,Switzerland,4,7.494,7.561772,7.426227,1.56498,1.516912,0.858131,0.620071,0.290549,0.367007,2.276716
4,Finland,5,7.469,7.527542,7.410458,1.443572,1.540247,0.809158,0.617951,0.245483,0.382612,2.430182


In [108]:
df.iloc[:,:2]

Unnamed: 0,Country,Happiness.Rank
0,Norway,1
1,Denmark,2
2,,3
3,Switzerland,4
4,Finland,5
...,...,...
150,Rwanda,151
151,Syria,152
152,Tanzania,153
153,Burundi,154


In [109]:
df.iloc[:5,:2]

Unnamed: 0,Country,Happiness.Rank
0,Norway,1
1,Denmark,2
2,,3
3,Switzerland,4
4,Finland,5


In [110]:
df[df['Country']=='India']

Unnamed: 0,Country,Happiness.Rank,Happiness.Score,Whisker.high,Whisker.low,Economy..GDP.per.Capita.,Family,Health..Life.Expectancy.,Freedom,Generosity,Trust..Government.Corruption.,Dystopia.Residual
121,India,122,4.315,4.371522,4.258478,0.792221,0.754373,0.455428,0.469987,0.231538,0.092227,1.519117


In [111]:
sorted_data = df.sort_values(by='Freedom')
sorted_data[:5]

Unnamed: 0,Country,Happiness.Rank,Happiness.Score,Whisker.high,Whisker.low,Economy..GDP.per.Capita.,Family,Health..Life.Expectancy.,Freedom,Generosity,Trust..Government.Corruption.,Dystopia.Residual
139,Angola,140,3.795,3.951642,3.638358,0.858428,1.104412,0.049869,0.0,0.097926,0.06972,1.614482
129,Sudan,130,4.139,4.345747,3.932253,0.659517,1.214009,0.290921,0.014996,0.182317,0.089848,1.687066
144,Haiti,145,3.603,3.734715,3.471285,0.36861,0.64045,0.277321,0.03037,0.489204,0.099872,1.697168
153,Burundi,154,2.905,3.07469,2.73531,0.091623,0.629794,0.151611,0.059901,0.204435,0.084148,1.683024
151,Syria,152,3.462,3.663669,3.260331,0.777153,0.396103,0.500533,0.081539,0.493664,0.151347,1.061574


In [59]:
df = pd.read_csv('whr.csv')
df

Unnamed: 0,Country,Happiness.Rank,Happiness.Score,Whisker.high,Whisker.low,Economy..GDP.per.Capita.,Family,Health..Life.Expectancy.,Freedom,Generosity,Trust..Government.Corruption.,Dystopia.Residual
0,Norway,1,7.537,7.594445,7.479556,1.616463,1.533524,0.796667,0.635423,0.362012,0.315964,
1,Denmark,2,7.522,7.581728,7.462272,1.482383,1.551122,0.792566,0.626007,0.355280,0.400770,2.313707
2,,3,7.504,7.622030,7.385970,1.480633,1.610574,0.833552,0.627163,0.475540,0.153527,
3,Switzerland,4,7.494,7.561772,7.426227,1.564980,1.516912,0.858131,0.620071,0.290549,0.367007,2.276716
4,Finland,5,7.469,7.527542,7.410458,1.443572,1.540247,0.809158,0.617951,0.245483,0.382612,2.430182
...,...,...,...,...,...,...,...,...,...,...,...,...
150,Rwanda,151,3.471,3.543030,3.398970,0.368746,0.945707,0.326425,0.581844,0.252756,0.455220,0.540061
151,Syria,152,3.462,3.663669,3.260331,0.777153,0.396103,0.500533,0.081539,0.493664,0.151347,1.061574
152,Tanzania,153,3.349,3.461430,3.236570,0.511136,1.041990,0.364509,0.390018,0.354256,0.066035,0.621130
153,Burundi,154,2.905,3.074690,2.735310,0.091623,0.629794,0.151611,0.059901,0.204435,0.084148,1.683024


In [60]:
df.apply(lambda x : x)

Unnamed: 0,Country,Happiness.Rank,Happiness.Score,Whisker.high,Whisker.low,Economy..GDP.per.Capita.,Family,Health..Life.Expectancy.,Freedom,Generosity,Trust..Government.Corruption.,Dystopia.Residual
0,Norway,1,7.537,7.594445,7.479556,1.616463,1.533524,0.796667,0.635423,0.362012,0.315964,
1,Denmark,2,7.522,7.581728,7.462272,1.482383,1.551122,0.792566,0.626007,0.355280,0.400770,2.313707
2,,3,7.504,7.622030,7.385970,1.480633,1.610574,0.833552,0.627163,0.475540,0.153527,
3,Switzerland,4,7.494,7.561772,7.426227,1.564980,1.516912,0.858131,0.620071,0.290549,0.367007,2.276716
4,Finland,5,7.469,7.527542,7.410458,1.443572,1.540247,0.809158,0.617951,0.245483,0.382612,2.430182
...,...,...,...,...,...,...,...,...,...,...,...,...
150,Rwanda,151,3.471,3.543030,3.398970,0.368746,0.945707,0.326425,0.581844,0.252756,0.455220,0.540061
151,Syria,152,3.462,3.663669,3.260331,0.777153,0.396103,0.500533,0.081539,0.493664,0.151347,1.061574
152,Tanzania,153,3.349,3.461430,3.236570,0.511136,1.041990,0.364509,0.390018,0.354256,0.066035,0.621130
153,Burundi,154,2.905,3.074690,2.735310,0.091623,0.629794,0.151611,0.059901,0.204435,0.084148,1.683024


In [61]:
df.apply(lambda x : x[0], axis=0)

Country                            Norway
Happiness.Rank                          1
Happiness.Score                     7.537
Whisker.high                     7.594445
Whisker.low                      7.479556
Economy..GDP.per.Capita.         1.616463
Family                           1.533524
Health..Life.Expectancy.         0.796667
Freedom                          0.635423
Generosity                       0.362012
Trust..Government.Corruption.    0.315964
Dystopia.Residual                     NaN
dtype: object

In [62]:
df.apply(lambda x : x[0], axis=1)

  df.apply(lambda x : x[0], axis=1)


0                        Norway
1                       Denmark
2                           NaN
3                   Switzerland
4                       Finland
                 ...           
150                      Rwanda
151                       Syria
152                    Tanzania
153                     Burundi
154    Central African Republic
Length: 155, dtype: object

In [63]:
df.apply(lambda x : x['Happiness.Score'], axis=1)

0      7.537
1      7.522
2      7.504
3      7.494
4      7.469
       ...  
150    3.471
151    3.462
152    3.349
153    2.905
154    2.693
Length: 155, dtype: float64

In [64]:
def clip_score(score):
    if score > 7 :
        score=7
    return score

hs = df['Happiness.Score'].apply(lambda x: clip_score(x))
hs

0      7.000
1      7.000
2      7.000
3      7.000
4      7.000
       ...  
150    3.471
151    3.462
152    3.349
153    2.905
154    2.693
Name: Happiness.Score, Length: 155, dtype: float64

### 3. Descriptive Stats using CSV data

In [112]:
x = pd.read_csv('whr.csv')
x

Unnamed: 0,Country,Happiness.Rank,Happiness.Score,Whisker.high,Whisker.low,Economy..GDP.per.Capita.,Family,Health..Life.Expectancy.,Freedom,Generosity,Trust..Government.Corruption.,Dystopia.Residual
0,Norway,1,7.537,7.594445,7.479556,1.616463,1.533524,0.796667,0.635423,0.362012,0.315964,
1,Denmark,2,7.522,7.581728,7.462272,1.482383,1.551122,0.792566,0.626007,0.355280,0.400770,2.313707
2,,3,7.504,7.622030,7.385970,1.480633,1.610574,0.833552,0.627163,0.475540,0.153527,
3,Switzerland,4,7.494,7.561772,7.426227,1.564980,1.516912,0.858131,0.620071,0.290549,0.367007,2.276716
4,Finland,5,7.469,7.527542,7.410458,1.443572,1.540247,0.809158,0.617951,0.245483,0.382612,2.430182
...,...,...,...,...,...,...,...,...,...,...,...,...
150,Rwanda,151,3.471,3.543030,3.398970,0.368746,0.945707,0.326425,0.581844,0.252756,0.455220,0.540061
151,Syria,152,3.462,3.663669,3.260331,0.777153,0.396103,0.500533,0.081539,0.493664,0.151347,1.061574
152,Tanzania,153,3.349,3.461430,3.236570,0.511136,1.041990,0.364509,0.390018,0.354256,0.066035,0.621130
153,Burundi,154,2.905,3.074690,2.735310,0.091623,0.629794,0.151611,0.059901,0.204435,0.084148,1.683024


In [113]:
print (x.index)

RangeIndex(start=0, stop=155, step=1)


In [114]:
print (x.columns)

Index(['Country', 'Happiness.Rank', 'Happiness.Score', 'Whisker.high',
       'Whisker.low', 'Economy..GDP.per.Capita.', 'Family',
       'Health..Life.Expectancy.', 'Freedom', 'Generosity',
       'Trust..Government.Corruption.', 'Dystopia.Residual'],
      dtype='object')


In [115]:
print (x.values)

[['Norway' 1 7.537000179 ... 0.362012237 0.315963835 nan]
 ['Denmark' 2 7.521999836 ... 0.355280489 0.400770068 2.313707352]
 [nan 3 7.504000187 ... 0.475540221 0.153526559 nan]
 ...
 ['Tanzania' 153 3.348999977 ... 0.354256362 0.066035107 0.621130466]
 ['Burundi' 154 2.904999971 ... 0.204435185 0.084147945 1.683024168]
 ['Central African Republic' 155 2.693000078 ... 0.280876487 0.056565076
  2.066004753]]


In [116]:
print (x.shape)

(155, 12)


In [117]:
print (x.count())

Country                          154
Happiness.Rank                   155
Happiness.Score                  155
Whisker.high                     155
Whisker.low                      155
Economy..GDP.per.Capita.         155
Family                           155
Health..Life.Expectancy.         155
Freedom                          155
Generosity                       155
Trust..Government.Corruption.    155
Dystopia.Residual                153
dtype: int64


In [118]:
print (x.describe(include=object))

       Country
count      154
unique     154
top     Norway
freq         1


In [119]:
print (x.head())

       Country  Happiness.Rank  Happiness.Score  Whisker.high  Whisker.low  \
0       Norway               1            7.537      7.594445     7.479556   
1      Denmark               2            7.522      7.581728     7.462272   
2          NaN               3            7.504      7.622030     7.385970   
3  Switzerland               4            7.494      7.561772     7.426227   
4      Finland               5            7.469      7.527542     7.410458   

   Economy..GDP.per.Capita.    Family  Health..Life.Expectancy.   Freedom  \
0                  1.616463  1.533524                  0.796667  0.635423   
1                  1.482383  1.551122                  0.792566  0.626007   
2                  1.480633  1.610574                  0.833552  0.627163   
3                  1.564980  1.516912                  0.858131  0.620071   
4                  1.443572  1.540247                  0.809158  0.617951   

   Generosity  Trust..Government.Corruption.  Dystopia.Residual  
0 

In [120]:
print (x.tail())

                      Country  Happiness.Rank  Happiness.Score  Whisker.high  \
150                    Rwanda             151            3.471      3.543030   
151                     Syria             152            3.462      3.663669   
152                  Tanzania             153            3.349      3.461430   
153                   Burundi             154            2.905      3.074690   
154  Central African Republic             155            2.693      2.864884   

     Whisker.low  Economy..GDP.per.Capita.    Family  \
150     3.398970                  0.368746  0.945707   
151     3.260331                  0.777153  0.396103   
152     3.236570                  0.511136  1.041990   
153     2.735310                  0.091623  0.629794   
154     2.521116                  0.000000  0.000000   

     Health..Life.Expectancy.   Freedom  Generosity  \
150                  0.326425  0.581844    0.252756   
151                  0.500533  0.081539    0.493664   
152              

### 4.Introd to DataFrame

In [121]:
data = [['Alex',10],['Bob',12],['Clarke',13]]
print(type(data))
df = pd.DataFrame(data,columns=['Name','Age'])
print (df)
print(type(df))

<class 'list'>
     Name  Age
0    Alex   10
1     Bob   12
2  Clarke   13
<class 'pandas.core.frame.DataFrame'>


In [122]:
data = {'Name':['Tom', 'Jack', 'Steve', 'Ricky'],'Age':[28,34,29,42]}
 
print(type(data))

df = pd.DataFrame(data)

print (df)

print(type(df))
#Create a DataFrame from Dict of ndarrays / Lists

<class 'dict'>
    Name  Age
0    Tom   28
1   Jack   34
2  Steve   29
3  Ricky   42
<class 'pandas.core.frame.DataFrame'>


In [123]:
data = {'Name':['Tom', 'Jack', 'Steve', 'Ricky'],'Age':[28,34,29,42]}
df = pd.DataFrame(data, index=['rank1','rank2','rank3','rank4'])
print (df)

        Name  Age
rank1    Tom   28
rank2   Jack   34
rank3  Steve   29
rank4  Ricky   42


In [124]:
data = [{'a': 1, 'b': 2},{'a': 5, 'b': 10, 'c': 20}]
df = pd.DataFrame(data)
print (df)

   a   b     c
0  1   2   NaN
1  5  10  20.0


In [125]:
data = [{'a': 1, 'b': 2},{'a': 5, 'b': 10, 'c': 20}]
df = pd.DataFrame(data, index=['first', 'second'])
print (df)

        a   b     c
first   1   2   NaN
second  5  10  20.0


In [126]:
import pandas as pd

d = {'one' : pd.Series([1, 2, 3], index=['a', 'b', 'c']),
      'two' : pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])}

df = pd.DataFrame(d)
print (df)

print(df['one'])

print(type(df['one']))
#Create a DataFrame from Dict of Series

   one  two
a  1.0    1
b  2.0    2
c  3.0    3
d  NaN    4
a    1.0
b    2.0
c    3.0
d    NaN
Name: one, dtype: float64
<class 'pandas.core.series.Series'>


In [78]:
d = {'one' : pd.Series([1, 2, 3], index=['a', 'b', 'c']),
      'two' : pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])}

df = pd.DataFrame(d)
print (df[['one','two']])

#Column Selection

   one  two
a  1.0    1
b  2.0    2
c  3.0    3
d  NaN    4


In [None]:
print (df['two'])

In [79]:
d = {'one' : pd.Series([1, 2, 3], index=['a', 'b', 'c']),
      'two' : pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])}

df = pd.DataFrame(d)
print (df['one'])
print()
print (df['two'])
#Column Selection

a    1.0
b    2.0
c    3.0
d    NaN
Name: one, dtype: float64

a    1
b    2
c    3
d    4
Name: two, dtype: int64


In [80]:
d = {'one' : pd.Series([1, 2, 3], index=['a', 'b', 'c']),
      'two' : pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])}
df = pd.DataFrame(d)
df
# Adding a new column to an existing DataFrame object with column label by passing new series

Unnamed: 0,one,two
a,1.0,1
b,2.0,2
c,3.0,3
d,,4


In [81]:
print ("Adding a new column by passing as Series:")
df['three']=pd.Series([10,20,30],index=['a','b','c'])
print (df)

Adding a new column by passing as Series:
   one  two  three
a  1.0    1   10.0
b  2.0    2   20.0
c  3.0    3   30.0
d  NaN    4    NaN


In [82]:
print ("Adding a new column using the existing columns in DataFrame:")
df['four']=df['one'] + df['three']
print (df)

Adding a new column using the existing columns in DataFrame:
   one  two  three  four
a  1.0    1   10.0  11.0
b  2.0    2   20.0  22.0
c  3.0    3   30.0  33.0
d  NaN    4    NaN   NaN


In [83]:
# using del function
d = {'one' : pd.Series([1, 2, 3], index=['a', 'b', 'c']), 
     'two' : pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd']), 
     'three' : pd.Series([10,20,30], index=['a','b','c'])}
df = pd.DataFrame(d)
print (df)

   one  two  three
a  1.0    1   10.0
b  2.0    2   20.0
c  3.0    3   30.0
d  NaN    4    NaN


In [84]:
# using del function
print ("Deleting the first column using DEL function:")
del df['one']
print (df)

Deleting the first column using DEL function:
   two  three
a    1   10.0
b    2   20.0
c    3   30.0
d    4    NaN


In [85]:
# using pop function
print ("Deleting another column using POP function:")
df.pop('two')
print (df)

Deleting another column using POP function:
   three
a   10.0
b   20.0
c   30.0
d    NaN


In [86]:
d = {'one' : pd.Series([1, 2, 3], index=['a', 'b', 'c']), 
     'two' : pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])}

df = pd.DataFrame(d)
print (df.loc['c'])

#Selection by Label Rows can be selected by passing row label 
# to a loc function. 

one    3.0
two    3.0
Name: c, dtype: float64


In [87]:
d = {'one' : pd.Series([1, 2, 3], index=['a', 'b', 'c']),
     'two' : pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])}

df = pd.DataFrame(d)
print (df.iloc[3])

#Rows can be selected by passing integer location to an iloc function.

one    NaN
two    4.0
Name: d, dtype: float64


In [88]:
#Multiple rows can be selected using ‘ : ’ operator.

d = {'one' : pd.Series([1, 2, 3], index=['a', 'b', 'c']), 
    'two' : pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])}

df = pd.DataFrame(d)
print (df[2:4])

   one  two
c  3.0    3
d  NaN    4


## Iterating over dataframe

In [90]:
import pandas as pd

#Create a Dictionary of series
d = {
    'Name':pd.Series(['Tom','James','Ricky','Vin','Steve']),
    'Age':pd.Series([25,26,25,23,30]),
    'Rating':pd.Series([4.23,3.24,3.98,2.56,3.20])
     }
#Create a DataFrame
df = pd.DataFrame(d)
print (df)

    Name  Age  Rating
0    Tom   25    4.23
1  James   26    3.24
2  Ricky   25    3.98
3    Vin   23    2.56
4  Steve   30    3.20


In [91]:
for row in df.iterrows():
    print (row)

(0, Name       Tom
Age         25
Rating    4.23
Name: 0, dtype: object)
(1, Name      James
Age          26
Rating     3.24
Name: 1, dtype: object)
(2, Name      Ricky
Age          25
Rating     3.98
Name: 2, dtype: object)
(3, Name       Vin
Age         23
Rating    2.56
Name: 3, dtype: object)
(4, Name      Steve
Age          30
Rating      3.2
Name: 4, dtype: object)


In [92]:
for row in df.itertuples():
    print (row)

Pandas(Index=0, Name='Tom', Age=25, Rating=4.23)
Pandas(Index=1, Name='James', Age=26, Rating=3.24)
Pandas(Index=2, Name='Ricky', Age=25, Rating=3.98)
Pandas(Index=3, Name='Vin', Age=23, Rating=2.56)
Pandas(Index=4, Name='Steve', Age=30, Rating=3.2)


In [93]:
df = pd.DataFrame({'species': ['bear', 'bear', 'marsupial'],
                  'population': [1864, 22000, 80000]},
                  index=['panda', 'polar', 'koala'])
df

Unnamed: 0,species,population
panda,bear,1864
polar,bear,22000
koala,marsupial,80000


In [94]:
for species, population in df.items():
    print(f'label: {species}')
    print(f'content: {population}', sep='\n')

label: species
content: panda         bear
polar         bear
koala    marsupial
Name: species, dtype: object
label: population
content: panda     1864
polar    22000
koala    80000
Name: population, dtype: int64


### 5. Working on Dataframes

In [86]:
import pandas as pd
import numpy as np
raw_data = {
        'first_name': ['Jason', np.nan, 'Tina', 'Jake', 'Amy'],
        'last_name': ['Miller', np.nan, 'Ali', 'Milner', 'Cooze'],
        'age': [42, np.nan, 36, 24, 73],
        'sex': ['m', np.nan, 'f', 'm', 'f'],
        'preTestScore': [4, np.nan, np.nan, 2, 3],
        'postTestScore': [25, np.nan, np.nan, 62, 70]
        }

df = pd.DataFrame(raw_data, columns = ['first_name', 'last_name', 'age', 'sex', 'preTestScore', 'postTestScore'])
print (df)

  first_name last_name   age  sex  preTestScore  postTestScore
0      Jason    Miller  42.0    m           4.0           25.0
1        NaN       NaN   NaN  NaN           NaN            NaN
2       Tina       Ali  36.0    f           NaN            NaN
3       Jake    Milner  24.0    m           2.0           62.0
4        Amy     Cooze  73.0    f           3.0           70.0


In [87]:
print('----Drop missing observations------')
df_no_missing = df.dropna()
print (df_no_missing)

----Drop missing observations------
  first_name last_name   age sex  preTestScore  postTestScore
0      Jason    Miller  42.0   m           4.0           25.0
3       Jake    Milner  24.0   m           2.0           62.0
4        Amy     Cooze  73.0   f           3.0           70.0


In [88]:
print('----Drop rows where all cells in that row is NA------')
df_cleaned = df.dropna(how='all')
print(df_cleaned)

----Drop rows where all cells in that row is NA------
  first_name last_name   age sex  preTestScore  postTestScore
0      Jason    Miller  42.0   m           4.0           25.0
2       Tina       Ali  36.0   f           NaN            NaN
3       Jake    Milner  24.0   m           2.0           62.0
4        Amy     Cooze  73.0   f           3.0           70.0


In [89]:
print('----Select some rows but ignore the missing data points------')
print (df[df['age'].notnull() & df['sex'].notnull()])

----Select some rows but ignore the missing data points------
  first_name last_name   age sex  preTestScore  postTestScore
0      Jason    Miller  42.0   m           4.0           25.0
2       Tina       Ali  36.0   f           NaN            NaN
3       Jake    Milner  24.0   m           2.0           62.0
4        Amy     Cooze  73.0   f           3.0           70.0


In [90]:
print('----Drop rows that contain less than four observations------')
print(df.dropna(thresh=4))

----Drop rows that contain less than four observations------
  first_name last_name   age sex  preTestScore  postTestScore
0      Jason    Miller  42.0   m           4.0           25.0
2       Tina       Ali  36.0   f           NaN            NaN
3       Jake    Milner  24.0   m           2.0           62.0
4        Amy     Cooze  73.0   f           3.0           70.0


In [91]:
print('----Fill in missing data with zeros------')
print(df.fillna(0))

----Fill in missing data with zeros------
  first_name last_name   age sex  preTestScore  postTestScore
0      Jason    Miller  42.0   m           4.0           25.0
1          0         0   0.0   0           0.0            0.0
2       Tina       Ali  36.0   f           0.0            0.0
3       Jake    Milner  24.0   m           2.0           62.0
4        Amy     Cooze  73.0   f           3.0           70.0


In [92]:
import warnings
warnings.filterwarnings('ignore')

In [93]:
print('--Fill in missing in preTestScore with the mean value of preTestScore--')
#inplace=True means that the changes are saved to the df right away
df_cleaned["preTestScore"].fillna(df_cleaned["preTestScore"].mean(),inplace=True)
print (df_cleaned)

--Fill in missing in preTestScore with the mean value of preTestScore--
  first_name last_name   age sex  preTestScore  postTestScore
0      Jason    Miller  42.0   m           4.0           25.0
2       Tina       Ali  36.0   f           3.0            NaN
3       Jake    Milner  24.0   m           2.0           62.0
4        Amy     Cooze  73.0   f           3.0           70.0


In [85]:
print('--Fill in missing in preTestScore with the mean value of preTestScore--')
#inplace=True means that the changes are saved to the df right away
df_cleaned["postTestScore"].fillna(df_cleaned["postTestScore"].median(),inplace=True)
df_cleaned

--Fill in missing in preTestScore with the mean value of preTestScore--


NameError: name 'df_cleaned' is not defined

In [50]:
import pandas as pd
import numpy as np

#Create a Dictionary of series
d = {'Name':pd.Series(['Tom','Tom','Ricky','Vin','Steve','Smith','Jack','Lee','David','Gasper','Betina','Andres']),
   'Age':pd.Series([25,26,25,23,30,29,23,34,40,30,51,46]),
   'Rating':pd.Series([4.23,3.24,3.98,2.56,3.20,4.6,3.8,3.78,2.98,4.80,4.10,3.65])}

#Create a DataFrame
df = pd.DataFrame(d)
print (df.describe(include=['object']))

       Name
count    12
unique   11
top     Tom
freq      2


In [None]:
print (df. describe(include='all'))

In [None]:
import pandas as pd 

df = pd.read_csv('cancer.csv')
df.head()

In [None]:
# cols names
df.columns.tolist()

In [None]:
df['diagnosis']

In [None]:
df['diagnosis'].tail(5)

In [None]:
df.isnull().sum()

In [None]:
df['diagnosis'].value_counts(normalize=True)

In [None]:
df['diagnosis'].value_counts()

In [None]:
# frequency of values
df['diagnosis'].value_counts()[1]

In [None]:
# axis=0 means Rows and axis=1 means cols
df1=df.drop(['id'],axis=1)
df1

### 6.Data frame example for practice

In [None]:
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

In [None]:
df = pd.read_csv('sales.csv', index_col='month')
df

In [None]:
df['eggs'][4]

In [None]:
df['eggs'][0:4]

In [None]:
df.salt['Jan']

In [None]:
df.loc['May', 'spam']

In [None]:
df.iloc[4, 2]

In [None]:
df_new = df[['salt','eggs']]
df_new

In [None]:
df

In [None]:
df['eggs']

In [None]:
type(df['eggs'])

In [None]:
type(df)

In [None]:
df['eggs'][1:4]

In [None]:
df['eggs'][4]

In [None]:
df.loc[:, 'eggs':'salt']

In [None]:
df.loc['Jan':'Apr',:]

In [None]:
df.loc['Mar':'May', 'salt':'spam']

In [None]:
df.iloc[2:5, 1:] 

In [None]:
df.loc['Jan':'May', ['eggs', 'spam']]

In [None]:
df

In [None]:
df.salt > 60

In [None]:
df[df.salt > 60]

In [None]:
df.loc[:, df.isnull().any()]

In [None]:
df.loc[:, df.notnull().all()]

In [None]:
df.dropna(how='any')

In [None]:
df.eggs[df.salt > 55]

In [58]:
df.eggs[df.salt > 55] += 5
df

AttributeError: 'DataFrame' object has no attribute 'eggs'

In [76]:
data=pd.DataFrame({"name":["Ramu","Raju","Rajesh"],
                   "Age":[22,33,44]})

In [80]:
data

Unnamed: 0,name,Age
0,Ramu,22
1,Raju,33
2,Rajesh,44


In [82]:
data.loc[:,"Age"]

0    22
1    33
2    44
Name: Age, dtype: int64

In [84]:
data.index

RangeIndex(start=0, stop=3, step=1)