In [1]:
import pandas as pd
import pprint

# Download the Titanic passenger table (CSV) and parse it into a DataFrame.
# Requires network access to raw.githubusercontent.com.
df = pd.read_csv("https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv")
# df.columns is an Index of the column labels; the f-string prints its repr.
print(f'Columns of the dataframe are {df.columns}')

print('\nThe DataFrame df is displayed below: ')
# A bare name on the last line lets the notebook render the frame as a table.
df

Columns of the dataframe are Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

The DataFrame df is displayed below: 


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


### In pandas, the drop method is used to remove rows or columns from a DataFrame or Series. This method is quite versatile and can be used to clean and preprocess data by eliminating unwanted or unnecessary data.

In [3]:
# drop() looks for labels along axis=0 (the row index) by default, so a column
# name is not found there. Catch the expected error and print it so the
# notebook still runs top to bottom.
try:
    df.drop('PassengerId')
except KeyError as err:
    print(f'{type(err).__name__}: {err}')

KeyError: "['PassengerId'] not found in axis"

#### <u>Dropping Columns</u>: To drop columns, you need to specify the column names and set the axis parameter to 1.

#### In pandas, the axis parameter is used to specify the direction along which an operation should be performed. It is an important part of many pandas methods, including drop, sum, mean, and others. Understanding the axis parameter helps in manipulating and analyzing data efficiently.

## Here's a breakdown of how the axis parameter works:

#### axis=0: Refers to the row labels (the index). With drop, labels are looked up among the rows, so df.drop(1, axis=0) drops the row with index 1. With aggregations such as sum or mean, axis=0 works down the rows and returns one result per column.
#### axis=1: Refers to the column labels. With drop, labels are looked up among the columns, so df.drop('B', axis=1) drops the column named 'B'. With aggregations, axis=1 works across the columns and returns one result per row.

In [5]:
# axis=1 makes drop() look up 'PassengerId' among the column labels.
# A new DataFrame is returned; df itself is left untouched.
df.drop('PassengerId', axis=1)

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [6]:
df
# 'df' remains unchanged

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


#### <u>Inplace Operation</u>: By default, drop returns a new DataFrame with the specified rows or columns removed. If you want to modify the original DataFrame directly, you can use the inplace=True parameter.

In [23]:
# Two ways to make the column removal stick. Both are left commented out;
# later cells in this notebook still select the PassengerId column.
# df = df.drop('PassengerId', axis=1)              # rebind df to the returned copy
# df.drop('PassengerId', axis=1, inplace = True)   # modify df directly, returns None

In [7]:
# Deleting rows
# Dropping Rows: To drop rows, you need to specify the index or indices of the rows you want to remove
# inplace=True modifies df directly instead of returning a copy.
# errors='ignore' keeps the cell re-runnable: once label 3 is gone, a second
# run is a no-op instead of raising KeyError.
df.drop(3, inplace=True, errors='ignore')
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


### <u>set_index</u> is a powerful method for changing the index of a DataFrame to one or more of its columns, which can be useful for data alignment, merging, and more advanced data operations.

In [9]:
# df.set_index('Name')  # non-mutating variant: returns a new frame
# Move the 'Name' column into the index, modifying df in place.
# NOTE: not re-runnable as written - after the first run 'Name' is no longer
# a column, so running this cell again raises KeyError.
df.set_index('Name', inplace = True)
df

Unnamed: 0_level_0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
"Braund, Mr. Owen Harris",1,0,3,male,22.0,1,0,A/5 21171,7.2500,,S
"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",2,1,1,female,38.0,1,0,PC 17599,71.2833,C85,C
"Heikkinen, Miss. Laina",3,1,3,female,26.0,0,0,STON/O2. 3101282,7.9250,,S
"Allen, Mr. William Henry",5,0,3,male,35.0,0,0,373450,8.0500,,S
"Moran, Mr. James",6,0,3,male,,0,0,330877,8.4583,,Q
...,...,...,...,...,...,...,...,...,...,...,...
"Montvila, Rev. Juozas",887,0,2,male,27.0,0,0,211536,13.0000,,S
"Graham, Miss. Margaret Edith",888,1,1,female,19.0,0,0,112053,30.0000,B42,S
"Johnston, Miss. Catherine Helen ""Carrie""",889,0,3,female,,1,2,W./C. 6607,23.4500,,S
"Behr, Mr. Karl Howell",890,1,1,male,26.0,0,0,111369,30.0000,C148,C


### If you want to revert to the original indexing or remove the index, you can use the reset_index method.

In [11]:
# Without inplace=True this only returns a copy in which 'Name' is a regular
# column again and the rows carry a fresh 0..n-1 index; df itself is unchanged.
df.reset_index()

Unnamed: 0,Name,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,"Braund, Mr. Owen Harris",1,0,3,male,22.0,1,0,A/5 21171,7.2500,,S
1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",2,1,1,female,38.0,1,0,PC 17599,71.2833,C85,C
2,"Heikkinen, Miss. Laina",3,1,3,female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,"Allen, Mr. William Henry",5,0,3,male,35.0,0,0,373450,8.0500,,S
4,"Moran, Mr. James",6,0,3,male,,0,0,330877,8.4583,,Q
...,...,...,...,...,...,...,...,...,...,...,...,...
885,"Montvila, Rev. Juozas",887,0,2,male,27.0,0,0,211536,13.0000,,S
886,"Graham, Miss. Margaret Edith",888,1,1,female,19.0,0,0,112053,30.0000,B42,S
887,"Johnston, Miss. Catherine Helen ""Carrie""",889,0,3,female,,1,2,W./C. 6607,23.4500,,S
888,"Behr, Mr. Karl Howell",890,1,1,male,26.0,0,0,111369,30.0000,C148,C


### The DataFrame() function in pandas is used to create a new DataFrame object. A DataFrame is a core data structure in pandas, which is essentially a two-dimensional, size-mutable, and heterogeneous tabular data structure with labeled axes (rows and columns).

In [14]:
# Every column in the dict must have the same number of values; these lists
# hold 5, 4 and 3 items, so the DataFrame constructor rejects them.
data = {
    'key1' : [3, 4, 5, 6, 7],
    'key2' : [7, 9, 2, 1],
    'key3' : [6, 4, 1]
}

# Catch and print the expected error so "Restart & Run All" can continue
# past this demonstration.
try:
    pd.DataFrame(data)
except ValueError as err:
    print(f'{type(err).__name__}: {err}')

ValueError: All arrays must be of the same length

In [16]:
# Column-oriented input: one equal-length list per field, five students each.
student = {
    'name': ['nik', 'sweat', 'dan', 'emily', 'luna'],
    'usn': [90, 414, 82, 96, 123],
    'course': ['cse', 'cse', 'ise', 'eee', 'cs-ds'],
}

# Each key becomes a column label; rows receive the default 0..4 index.
student_df = pd.DataFrame(data=student)
student_df

Unnamed: 0,name,usn,course
0,nik,90,cse
1,sweat,414,cse
2,dan,82,ise
3,emily,96,eee
4,luna,123,cs-ds


In [23]:
# Despite the .xls suffix the file is parsed with read_csv, i.e. as
# comma-separated text. The path is relative to the working directory.
df1 = pd.read_csv('taxonomy.csv.xls')
len(df1)  # row count before any cleaning

290

#### In pandas, <U>the dropna()</U> method is used to remove missing values (NaNs) from a DataFrame or Series. It is a common operation in data cleaning and preprocessing to ensure that your dataset is free of incomplete or invalid entries.

### <U>Key Uses of dropna()</U>
#### Removing Rows with Missing Values: By default, dropna() removes rows where any column contains a missing value.
#### Removing Columns with Missing Values: You can specify to remove columns with missing values instead of rows.
#### Conditional Removal: You can customize the criteria for dropping rows or columns based on the number of missing values.

### <U>Here's a breakdown of the parameters:</U>

#### 1. axis: Determines whether to drop rows or columns.
#### axis=0: Drop rows (default).
#### axis=1: Drop columns.

#### 2. how: Specifies the condition for dropping.
#### how='any': Drop the row/column if any NaNs are present (default).
#### how='all': Drop the row/column only if all values are NaNs.

#### 3. thresh: Requires that many non-NaN values to keep the row/column. For example, if thresh=2, a row or column must have at least 2 non-NaN values to be retained.

#### 4. subset: Labels along the other axis to check for missing values. When dropping rows (axis=0) this is a list of columns to inspect; when dropping columns (axis=1) it is a list of row labels.

#### 5. inplace: Whether to modify the DataFrame in place or return a new DataFrame. inplace=False (default) returns a new DataFrame, while inplace=True modifies the original DataFrame.

In [25]:
# df1.dropna()  # non-mutating variant: returns a cleaned copy
# Drop every row that contains at least one NaN, modifying df1 in place.
# Row 0 ('Emergency') is removed because it has no parent_id / parent_name.
df1.dropna(inplace = True)
df1

Unnamed: 0,taxonomy_id,name,parent_id,parent_name
1,101-01,Disaster Response,101,Emergency
2,101-02,Emergency Cash,101,Emergency
3,101-02-01,Help Pay for Food,101-02,Emergency Cash
4,101-02-02,Help Pay for Healthcare,101-02,Emergency Cash
5,101-02-03,Help Pay for Housing,101-02,Emergency Cash
...,...,...,...,...
285,111-01-07,Workplace Rights,111-01,Advocacy & Legal Aid
286,111-02,Mediation,111,Legal
287,111-03,Notary,111,Legal
288,111-04,Representation,111,Legal


In [27]:
# axis=1 drops columns that contain NaN. df1 had its incomplete rows removed
# in the preceding dropna(inplace=True) cell, so no column qualifies and all
# four columns are kept.
df1.dropna(axis=1)

Unnamed: 0,taxonomy_id,name,parent_id,parent_name
1,101-01,Disaster Response,101,Emergency
2,101-02,Emergency Cash,101,Emergency
3,101-02-01,Help Pay for Food,101-02,Emergency Cash
4,101-02-02,Help Pay for Healthcare,101-02,Emergency Cash
5,101-02-03,Help Pay for Housing,101-02,Emergency Cash
...,...,...,...,...
285,111-01-07,Workplace Rights,111-01,Advocacy & Legal Aid
286,111-02,Mediation,111,Legal
287,111-03,Notary,111,Legal
288,111-04,Representation,111,Legal


In [29]:
# Reload the raw file so the missing values are present again.
df2 = pd.read_csv('taxonomy.csv.xls')
# parent_id and parent_name contain NaN (row 0), so both columns are absent
# from the returned copy; df2 itself is not modified.
df2.dropna(axis=1) 

Unnamed: 0,taxonomy_id,name
0,101,Emergency
1,101-01,Disaster Response
2,101-02,Emergency Cash
3,101-02-01,Help Pay for Food
4,101-02-02,Help Pay for Healthcare
...,...,...
285,111-01-07,Workplace Rights
286,111-02,Mediation
287,111-03,Notary
288,111-04,Representation


In [31]:
df2

Unnamed: 0,taxonomy_id,name,parent_id,parent_name
0,101,Emergency,,
1,101-01,Disaster Response,101,Emergency
2,101-02,Emergency Cash,101,Emergency
3,101-02-01,Help Pay for Food,101-02,Emergency Cash
4,101-02-02,Help Pay for Healthcare,101-02,Emergency Cash
...,...,...,...,...
285,111-01-07,Workplace Rights,111-01,Advocacy & Legal Aid
286,111-02,Mediation,111,Legal
287,111-03,Notary,111,Legal
288,111-04,Representation,111,Legal


#### The <U>fillna()</U> method in pandas is used to fill missing values (NaNs) in a DataFrame or Series with a specified value or method. This is a common operation in data preprocessing to handle missing data and ensure that analyses or computations can be performed without errors due to NaNs.

## <U>Key Use of fillna()</U>
#### Filling Missing Values with a Specific Value: You can replace NaN values with a constant value, such as 0, the mean of the column, or any other value that makes sense in the context of your data.

In [61]:
# Replace every NaN with the string 'NULL' in the returned copy.
# No inplace=True is given, so df2 keeps its NaN values.
df2.fillna('NULL')

Unnamed: 0,taxonomy_id,name,parent_id,parent_name
0,101,Emergency,,
1,101-01,Disaster Response,101,Emergency
2,101-02,Emergency Cash,101,Emergency
3,101-02-01,Help Pay for Food,101-02,Emergency Cash
4,101-02-02,Help Pay for Healthcare,101-02,Emergency Cash
...,...,...,...,...
285,111-01-07,Workplace Rights,111-01,Advocacy & Legal Aid
286,111-02,Mediation,111,Legal
287,111-03,Notary,111,Legal
288,111-04,Representation,111,Legal


In [63]:
# Turn the current index back into an ordinary column, modifying df in place.
# NOTE(review): the recorded output contains an extra 'index' column. That is
# what reset_index produces when the index is already an unnamed integer
# index, so this cell appears to have been run after the index had already
# been reset once. Passing drop=True would discard the old index instead of
# keeping it as a column.
df.reset_index(inplace=True)
df

Unnamed: 0,index,Name,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0,"Braund, Mr. Owen Harris",1,0,3,male,22.0,1,0,A/5 21171,7.2500,,S
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",2,1,1,female,38.0,1,0,PC 17599,71.2833,C85,C
2,2,"Heikkinen, Miss. Laina",3,1,3,female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,3,"Allen, Mr. William Henry",5,0,3,male,35.0,0,0,373450,8.0500,,S
4,4,"Moran, Mr. James",6,0,3,male,,0,0,330877,8.4583,,Q
...,...,...,...,...,...,...,...,...,...,...,...,...,...
885,885,"Montvila, Rev. Juozas",887,0,2,male,27.0,0,0,211536,13.0000,,S
886,886,"Graham, Miss. Margaret Edith",888,1,1,female,19.0,0,0,112053,30.0000,B42,S
887,887,"Johnston, Miss. Catherine Helen ""Carrie""",889,0,3,female,,1,2,W./C. 6607,23.4500,,S
888,888,"Behr, Mr. Karl Howell",890,1,1,male,26.0,0,0,111369,30.0000,C148,C


#### In pandas, the groupby() method is used to group data based on one or more columns and perform operations on these groups. This is a powerful feature for data aggregation, transformation, and analysis. By grouping data, you can easily perform operations like calculating summary statistics, aggregating data, and applying functions to specific subsets of your data.

In [65]:
# Split the rows into groups by the value of 'Survived' (0 or 1).
# Nothing is aggregated yet: g is a DataFrameGroupBy object, which is why the
# cell output is only its repr.
g = df.groupby('Survived')
g

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000017CD9CA96D0>

In [67]:
# Sum every column within each group. No numeric_only filter is given, so the
# text columns (Name, Sex, Ticket, Cabin, Embarked) are "summed" by string
# concatenation, as the output shows.
g.sum()

Unnamed: 0_level_0,index,Name,PassengerId,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
Survived,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,244315,"Braund, Mr. Owen HarrisAllen, Mr. William Henr...",245412,1390,malemalemalemalemalemalemalefemalemalefemalema...,12985.5,304,181,A/5 2117137345033087717463349909A/5. 215134708...,12142.7199,E46C23 C25 C27B30C83F G73E31A5D26C110B58 B60D2...,SSQSSSSSQSSSCSSCSCSSSSSCSQCSSSCCSCSSCSSSSSCSSS...
1,151290,"Cumings, Mrs. John Bradley (Florence Briggs Th...",151970,666,femalefemalefemalefemalefemalefemalefemalemale...,8184.67,161,159,PC 17599STON/O2. 3101282347742237736PP 9549113...,16498.1294,C85G6C103D56A6B78D33C52B28F33C23 C25 C27D10 D1...,CSSCSSSSCSQSSQCQCCCQQCSSSSCSSSSSSQSSSCSSSQSCSC...


In [69]:
g.mean(numeric_only = True)

Unnamed: 0_level_0,index,PassengerId,Pclass,Age,SibSp,Parch,Fare
Survived,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,445.018215,447.016393,2.531876,30.626179,0.553734,0.32969,22.117887
1,443.665689,445.659824,1.953079,28.320657,0.472141,0.466276,48.381611


In [71]:
g1 = df.groupby('Pclass')
g1

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000017CDAE07FD0>

In [73]:
g1.sum(numeric_only = True)

Unnamed: 0_level_0,index,PassengerId,Survived,Age,SibSp,Parch,Fare
Pclass,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,99272,99701,135,7076.42,89,77,18124.3125
2,81688,82056,87,5168.83,74,70,3801.8417
3,214645,215625,119,8924.92,302,193,6714.6951


In [75]:
g1.mean(numeric_only = True)

Unnamed: 0_level_0,index,PassengerId,Survived,Age,SibSp,Parch,Fare
Pclass,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,461.730233,463.725581,0.627907,38.250919,0.413953,0.35814,84.299128
2,443.956522,445.956522,0.472826,29.87763,0.402174,0.380435,20.662183
3,437.158859,439.154786,0.242363,25.14062,0.615071,0.393075,13.67555


In [77]:
g1.max(numeric_only = True)

Unnamed: 0_level_0,index,PassengerId,Survived,Age,SibSp,Parch,Fare
Pclass,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,888,890,1,80.0,3,4,512.3292
2,885,887,1,70.0,3,3,73.5
3,889,891,1,74.0,8,6,69.55


In [54]:
g1.max(numeric_only = True)['Fare']

Pclass
1    512.3292
2     73.5000
3     69.5500
Name: Fare, dtype: float64

In [56]:
g1.max(numeric_only = True)

Unnamed: 0_level_0,PassengerId,Survived,Age,SibSp,Parch,Fare
Pclass,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,890,1,80.0,3,4,512.3292
2,887,1,70.0,3,3,73.5
3,891,1,74.0,8,6,69.55


### In pandas, the .T attribute (or transpose() method) is used to transpose a DataFrame or Series. Transposing means swapping the rows and columns of the data structure.

In [58]:
g1.max(numeric_only = True).T # transpose

Pclass,1,2,3
PassengerId,890.0,887.0,891.0
Survived,1.0,1.0,1.0
Age,80.0,70.0,74.0
SibSp,3.0,3.0,8.0
Parch,4.0,3.0,6.0
Fare,512.3292,73.5,69.55


In [142]:
g1.max(numeric_only = True).transpose()

Pclass,1,2,3
index,889.0,886.0,890.0
PassengerId,890.0,887.0,891.0
Survived,1.0,1.0,1.0
Age,80.0,70.0,74.0
SibSp,3.0,3.0,8.0
Parch,4.0,3.0,6.0
Fare,512.3292,73.5,69.55


In [144]:
df

Unnamed: 0,index,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
4,5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
...,...,...,...,...,...,...,...,...,...,...,...,...,...
885,886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
886,887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
887,888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
888,889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


#### The <u>concat()</u> function in pandas is used to concatenate or join two or more DataFrames or Series along a particular axis (either rows or columns)

In [86]:
# Syntax
# NOTE: join_axes was removed in pandas 1.0 and is no longer accepted by
# pd.concat; it appears below only as part of the older signature.
'''pandas.concat(
    objs, 
    axis=0, 
    join='outer', 
    join_axes=None, 
    ignore_index=False, 
    keys=None, 
    levels=None, 
    names=None, 
    verify_integrity=False, 
    sort=False, 
    copy=True
)'''

"pandas.concat(\n    objs, \n    axis=0, \n    join='outer', \n    join_axes=None, \n    ignore_index=False, \n    keys=None, \n    levels=None, \n    names=None, \n    verify_integrity=False, \n    sort=False, \n    copy=True\n)"

## Here's a breakdown of the parameters:

#### objs: A sequence or mapping of DataFrames or Series to concatenate.

#### axis: The axis to concatenate along.
#### axis=0: Concatenate along rows (default).
#### axis=1: Concatenate along columns.

#### join: Specifies how to handle indexes or columns that do not match.
#### 'outer': Union of all indexes/columns (default).
#### 'inner': Intersection of all indexes/columns.

#### ignore_index: Whether to reset the index in the resulting DataFrame. Default is False.

#### keys: Sequence of keys to use for constructing a MultiIndex in the resulting DataFrame.

#### verify_integrity: Whether to check for duplicate indices. Default is False.

#### sort: Whether to sort non-concatenation axis if it is not already aligned. Default is False.

#### copy: Whether to copy data. Default is True.

In [109]:
# Build the first sample here so this cell runs on a fresh kernel: df5 was
# otherwise only assigned in a later cell. First five rows, three columns.
df5 = df[['PassengerId', 'Survived', 'Name']][0:5]
df5

Unnamed: 0,PassengerId,Survived,Name
0,1,0,"Braund, Mr. Owen Harris"
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th..."
2,3,1,"Heikkinen, Miss. Laina"
3,5,0,"Allen, Mr. William Henry"
4,6,0,"Moran, Mr. James"


In [111]:
# Build the second sample here so this cell runs on a fresh kernel: df6 was
# otherwise only assigned in a later cell. Rows at positions 6-10, three columns.
df6 = df[['PassengerId', 'Survived', 'Name']][6:11]
df6

Unnamed: 0,PassengerId,Survived,Name
6,8,0,"Palsson, Master. Gosta Leonard"
7,9,1,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)"
8,10,1,"Nasser, Mrs. Nicholas (Adele Achem)"
9,11,1,"Sandstrom, Miss. Marguerite Rut"
10,12,1,"Bonnell, Miss. Elizabeth"


In [83]:
# Two small, non-overlapping row samples restricted to the same three columns.
sample_columns = ['PassengerId', 'Survived', 'Name']
df5 = df[sample_columns].iloc[0:5]
df6 = df[sample_columns].iloc[6:11]

# Default axis=0: df6's rows are stacked below df5's and keep their index labels.
pd.concat(objs=[df5, df6])

Unnamed: 0,PassengerId,Survived,Name
0,1,0,"Braund, Mr. Owen Harris"
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th..."
2,3,1,"Heikkinen, Miss. Laina"
3,5,0,"Allen, Mr. William Henry"
4,6,0,"Moran, Mr. James"
6,8,0,"Palsson, Master. Gosta Leonard"
7,9,1,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)"
8,10,1,"Nasser, Mrs. Nicholas (Adele Achem)"
9,11,1,"Sandstrom, Miss. Marguerite Rut"
10,12,1,"Bonnell, Miss. Elizabeth"


In [154]:
# axis=1 places the frames side by side and aligns rows on their index labels.
# df5 (labels 0-4) and df6 (labels 6-10) share no labels, so every row is NaN
# in one half, and the integer columns are shown as floats to hold those NaNs.
pd.concat([df5, df6], axis=1)
# Here, df6 is placed next to df5, resulting in a DataFrame with columns from both df5 and df6

Unnamed: 0,PassengerId,Survived,Name,PassengerId.1,Survived.1,Name.1
0,1.0,0.0,"Braund, Mr. Owen Harris",,,
1,2.0,1.0,"Cumings, Mrs. John Bradley (Florence Briggs Th...",,,
2,3.0,1.0,"Heikkinen, Miss. Laina",,,
3,5.0,0.0,"Allen, Mr. William Henry",,,
4,6.0,0.0,"Moran, Mr. James",,,
6,,,,8.0,0.0,"Palsson, Master. Gosta Leonard"
7,,,,9.0,1.0,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)"
8,,,,10.0,1.0,"Nasser, Mrs. Nicholas (Adele Achem)"
9,,,,11.0,1.0,"Sandstrom, Miss. Marguerite Rut"
10,,,,12.0,1.0,"Bonnell, Miss. Elizabeth"


In [113]:
# Keep the side-by-side result, then display a copy with every NaN replaced by 0.
# The fill value is also written into the text 'Name' columns, which therefore
# end up holding a mix of strings and the number 0. df7 itself keeps its NaNs.
df7 = pd.concat([df5, df6], axis=1)
df7.fillna(0)

Unnamed: 0,PassengerId,Survived,Name,PassengerId.1,Survived.1,Name.1
0,1.0,0.0,"Braund, Mr. Owen Harris",0.0,0.0,0
1,2.0,1.0,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0.0,0.0,0
2,3.0,1.0,"Heikkinen, Miss. Laina",0.0,0.0,0
3,5.0,0.0,"Allen, Mr. William Henry",0.0,0.0,0
4,6.0,0.0,"Moran, Mr. James",0.0,0.0,0
6,0.0,0.0,0,8.0,0.0,"Palsson, Master. Gosta Leonard"
7,0.0,0.0,0,9.0,1.0,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)"
8,0.0,0.0,0,10.0,1.0,"Nasser, Mrs. Nicholas (Adele Achem)"
9,0.0,0.0,0,11.0,1.0,"Sandstrom, Miss. Marguerite Rut"
10,0.0,0.0,0,12.0,1.0,"Bonnell, Miss. Elizabeth"


In [91]:
# Left-hand table for the merge examples, written one student per row.
data1 = pd.DataFrame(
    [
        ('nik', 90, 'cse'),
        ('sweat', 414, 'cse'),
        ('dan', 82, 'ise'),
        ('emily', 96, 'eee'),
        ('luna', 123, 'cs-ds'),
    ],
    columns=['name', 'usn', 'course'],
)

In [93]:
# Right-hand table for the merge examples, written one student per row.
# It shares the 'name' and 'usn' columns with data1.
data2 = pd.DataFrame(
    [
        ('nik', 90, 'moana', 3),
        ('sweat', 414, 'raya', 4.8),
        ('kavinda', 98, 'eleven', 4.95),
        ('emily', 96, 'nairobi', 4.02),
        ('luna', 123, 'natasha', 3.766),
        ('hoovi', 222, 'byers', 4.11),
    ],
    columns=['name', 'usn', 'mentors', 'discipline'],
)

In [95]:
data1

Unnamed: 0,name,usn,course
0,nik,90,cse
1,sweat,414,cse
2,dan,82,ise
3,emily,96,eee
4,luna,123,cs-ds


In [97]:
data2

Unnamed: 0,name,usn,mentors,discipline
0,nik,90,moana,3.0
1,sweat,414,raya,4.8
2,kavinda,98,eleven,4.95
3,emily,96,nairobi,4.02
4,luna,123,natasha,3.766
5,hoovi,222,byers,4.11


### In pandas, <u>the merge() function</u> is used to combine two DataFrames based on one or more keys (columns or indices) in a way similar to SQL joins. This function is crucial for merging datasets, aligning data, and performing relational operations. It allows for a variety of merge types, including inner, outer, left, and right joins.

In [None]:
# Syntax
# Signature summary kept as a string for reference only; nothing is executed.
# The defaults follow the pandas documentation - note that sort defaults to False.

'''pandas.merge(
    left, 
    right, 
    how='inner', 
    on=None, 
    left_on=None, 
    right_on=None, 
    left_index=False, 
    right_index=False, 
    sort=False, 
    suffixes=('_x', '_y'), 
    copy=True
)'''

## Here’s a breakdown of the parameters:

#### left: The first DataFrame to merge.

#### right: The second DataFrame to merge.

#### how: Type of merge to be performed.
#### 'inner': Only include rows with keys present in both DataFrames (default).
#### 'outer': Include all rows from both DataFrames, with NaNs where there is no match.
#### 'left': Include all rows from the left DataFrame, with matching rows from the right DataFrame.
#### 'right': Include all rows from the right DataFrame, with matching rows from the left DataFrame.
#### on: Column or index level names to join on. Must be present in both DataFrames.

#### left_on: Column or index level names from the left DataFrame to join on.

#### right_on: Column or index level names from the right DataFrame to join on.

#### left_index: Whether to use the index from the left DataFrame as a join key.

#### right_index: Whether to use the index from the right DataFrame as a join key.

#### sort: Whether to sort the join keys lexicographically in the result. Default is False.

#### suffixes: Tuple of string suffixes to apply to overlapping column names. Default is ('_x', '_y').

#### copy: Whether to copy the data. Default is True.

In [99]:
# With no `on` argument, merge joins on every column the two frames have in
# common (here 'name' and 'usn'). The default how='inner' keeps only the key
# pairs present in both frames, so 'dan', 'kavinda' and 'hoovi' are left out.
pd.merge(data1, data2)

Unnamed: 0,name,usn,course,mentors,discipline
0,nik,90,cse,moana,3.0
1,sweat,414,cse,raya,4.8
2,emily,96,eee,nairobi,4.02
3,luna,123,cs-ds,natasha,3.766


In [101]:
pd.merge(data1, data2, how='left')

Unnamed: 0,name,usn,course,mentors,discipline
0,nik,90,cse,moana,3.0
1,sweat,414,cse,raya,4.8
2,dan,82,ise,,
3,emily,96,eee,nairobi,4.02
4,luna,123,cs-ds,natasha,3.766


In [103]:
pd.merge(data1, data2, how='right')

Unnamed: 0,name,usn,course,mentors,discipline
0,nik,90,cse,moana,3.0
1,sweat,414,cse,raya,4.8
2,kavinda,98,,eleven,4.95
3,emily,96,eee,nairobi,4.02
4,luna,123,cs-ds,natasha,3.766
5,hoovi,222,,byers,4.11


In [105]:
pd.merge(data1, data2, how='outer')

Unnamed: 0,name,usn,course,mentors,discipline
0,nik,90,cse,moana,3.0
1,sweat,414,cse,raya,4.8
2,dan,82,ise,,
3,emily,96,eee,nairobi,4.02
4,luna,123,cs-ds,natasha,3.766
5,kavinda,98,,eleven,4.95
6,hoovi,222,,byers,4.11


In [107]:
pd.merge(data1, data2, how='cross') # cartesian product

Unnamed: 0,name_x,usn_x,course,name_y,usn_y,mentors,discipline
0,nik,90,cse,nik,90,moana,3.0
1,nik,90,cse,sweat,414,raya,4.8
2,nik,90,cse,kavinda,98,eleven,4.95
3,nik,90,cse,emily,96,nairobi,4.02
4,nik,90,cse,luna,123,natasha,3.766
5,nik,90,cse,hoovi,222,byers,4.11
6,sweat,414,cse,nik,90,moana,3.0
7,sweat,414,cse,sweat,414,raya,4.8
8,sweat,414,cse,kavinda,98,eleven,4.95
9,sweat,414,cse,emily,96,nairobi,4.02


In [208]:
# Join on 'usn' only. 'name' exists in both frames but is no longer a key, so
# it appears twice, disambiguated by the default suffixes: name_x comes from
# data1 and name_y from data2.
pd.merge(data1, data2, on='usn')

Unnamed: 0,name_x,usn,course,name_y,mentors,discipline
0,nik,90,cse,nik,moana,3.0
1,sweat,414,cse,sweat,raya,4.8
2,emily,96,eee,emily,nairobi,4.02
3,luna,123,cs-ds,luna,natasha,3.766


In [118]:
# Student table for the join examples, written one student per row.
# The index holds the labels that join() will match on.
data3 = pd.DataFrame(
    [
        ('nik', 4, 'cse'),
        ('sweat', 2, 'cse'),
        ('dan', 5, 'ise'),
        ('emily', 1, 'eee'),
        ('luna', 3, 'cs-ds'),
    ],
    columns=['name', 'rank', 'course'],
    index=[90, 414, 82, 96, 123],
)

data3

Unnamed: 0,name,rank,course
90,nik,4,cse
414,sweat,2,cse
82,dan,5,ise
96,emily,1,eee
123,luna,3,cs-ds


In [120]:
# Second student table with the same three column names as data3 and a
# partly overlapping index (90 and 82 occur in both).
data4 = pd.DataFrame(
    [
        ('nik', 4, 'cse'),
        ('neilius', 8, 'ai-ml'),
        ('dan', 5, 'ise'),
        ('steve', 6, 'eee'),
        ('kel', 7, 'cs-ds'),
    ],
    columns=['name', 'rank', 'course'],
    index=[90, 114, 82, 69, 45],
)

data4

Unnamed: 0,name,rank,course
90,nik,4,cse
114,neilius,8,ai-ml
82,dan,5,ise
69,steve,6,eee
45,kel,7,cs-ds


In [220]:
# data3 and data4 share the column names name, rank and course, and join()
# will not guess how to tell them apart. Catch and print the expected error so
# the notebook keeps running; passing lsuffix= / rsuffix= would resolve it.
try:
    data3.join(data4)
except ValueError as err:
    print(f'{type(err).__name__}: {err}')

ValueError: columns overlap but no suffix specified: Index(['name', 'rank', 'course'], dtype='object')

In [224]:
data3

Unnamed: 0,name,rank,course
90,nik,4,cse
414,sweat,2,cse
82,dan,5,ise
96,emily,1,eee
123,luna,3,cs-ds


In [122]:
# Contact/score table whose column names do not clash with data3, written one
# record per row and indexed by the same kind of label as data3.
data5 = pd.DataFrame(
    [
        ('nik@gmail.com', 1, 149),
        ('neilius@bussiness.org', 0, 60),
        ('dan@outlook.com', 1, 124),
        ('steveNancy@gamil.com', 0, 88),
        ('kel@ineuron.in', 1, 150),
        ('sweat.in@gmail.com', 1, 120),
    ],
    columns=['mail', 'mvp', 'level'],
    index=[90, 114, 82, 69, 45, 414],
)

data5

Unnamed: 0,mail,mvp,level
90,nik@gmail.com,1,149
114,neilius@bussiness.org,0,60
82,dan@outlook.com,1,124
69,steveNancy@gamil.com,0,88
45,kel@ineuron.in,1,150
414,sweat.in@gmail.com,1,120


In [124]:
# join() matches rows on the index and defaults to a left join: every row of
# data3 is kept, and labels with no partner in data5 (96 and 123) get NaN in
# the mail / mvp / level columns.
data3.join(data5)

Unnamed: 0,name,rank,course,mail,mvp,level
90,nik,4,cse,nik@gmail.com,1.0,149.0
414,sweat,2,cse,sweat.in@gmail.com,1.0,120.0
82,dan,5,ise,dan@outlook.com,1.0,124.0
96,emily,1,eee,,,
123,luna,3,cs-ds,,,


In [126]:
# how='right' keeps every index label of data5 instead; labels missing from
# data3 (114, 69 and 45) get NaN in data3's name / rank / course columns.
data3.join(data5, how='right')

Unnamed: 0,name,rank,course,mail,mvp,level
90,nik,4.0,cse,nik@gmail.com,1,149
114,,,,neilius@bussiness.org,0,60
82,dan,5.0,ise,dan@outlook.com,1,124
69,,,,steveNancy@gamil.com,0,88
45,,,,kel@ineuron.in,1,150
414,sweat,2.0,cse,sweat.in@gmail.com,1,120


In [128]:
df

Unnamed: 0,index,Name,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0,"Braund, Mr. Owen Harris",1,0,3,male,22.0,1,0,A/5 21171,7.2500,,S
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",2,1,1,female,38.0,1,0,PC 17599,71.2833,C85,C
2,2,"Heikkinen, Miss. Laina",3,1,3,female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,3,"Allen, Mr. William Henry",5,0,3,male,35.0,0,0,373450,8.0500,,S
4,4,"Moran, Mr. James",6,0,3,male,,0,0,330877,8.4583,,Q
...,...,...,...,...,...,...,...,...,...,...,...,...,...
885,885,"Montvila, Rev. Juozas",887,0,2,male,27.0,0,0,211536,13.0000,,S
886,886,"Graham, Miss. Margaret Edith",888,1,1,female,19.0,0,0,112053,30.0000,B42,S
887,887,"Johnston, Miss. Catherine Helen ""Carrie""",889,0,3,female,,1,2,W./C. 6607,23.4500,,S
888,888,"Behr, Mr. Karl Howell",890,1,1,male,26.0,0,0,111369,30.0000,C148,C


### In pandas, the apply() function is used to apply a function along an axis of a DataFrame or Series.

In [135]:
# Multiply each fare by a fixed conversion rate (92.71) and store the result
# in a new 'Fare_INR' column. apply() calls the lambda once per value; the
# vectorised expression df['Fare'] * 92.71 would produce the same numbers.
df['Fare_INR'] = df['Fare'].apply(lambda x : x * 92.71)

df

Unnamed: 0,index,Name,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Fare_INR
0,0,"Braund, Mr. Owen Harris",1,0,3,male,22.0,1,0,A/5 21171,7.2500,,S,672.147500
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",2,1,1,female,38.0,1,0,PC 17599,71.2833,C85,C,6608.674743
2,2,"Heikkinen, Miss. Laina",3,1,3,female,26.0,0,0,STON/O2. 3101282,7.9250,,S,734.726750
3,3,"Allen, Mr. William Henry",5,0,3,male,35.0,0,0,373450,8.0500,,S,746.315500
4,4,"Moran, Mr. James",6,0,3,male,,0,0,330877,8.4583,,Q,784.168993
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
885,885,"Montvila, Rev. Juozas",887,0,2,male,27.0,0,0,211536,13.0000,,S,1205.230000
886,886,"Graham, Miss. Margaret Edith",888,1,1,female,19.0,0,0,112053,30.0000,B42,S,2781.300000
887,887,"Johnston, Miss. Catherine Helen ""Carrie""",889,0,3,female,,1,2,W./C. 6607,23.4500,,S,2174.049500
888,888,"Behr, Mr. Karl Howell",890,1,1,male,26.0,0,0,111369,30.0000,C148,C,2781.300000


## Explanation

### 1. df['Fare']:
#### This accesses the 'Fare' column in the DataFrame df. This column contains numerical fare values; the conversion below treats them as euros (see the euro_to_inr helper later in this notebook).

### 2. .apply(lambda x: x * 92.71):
#### The .apply() method is used to apply a function to each element of the 'Fare' column.
#### Here, a lambda function is used as the function to apply. A lambda function is an anonymous function defined with the lambda keyword.
#### lambda x: x * 92.71 is a lambda function that takes one argument x (each value in the 'Fare' column) and multiplies it by 92.71.

### 3. df['Fare_INR'] =:
#### This assigns the result of the apply() method to a new column in the DataFrame called 'Fare_INR'.

### As a result, a new column 'Fare_INR' will be created in the DataFrame, where each value is the result of multiplying the corresponding value in the 'Fare' column by 92.71

In [138]:
def euro_to_inr(x, rate=92.71):
    """Convert an amount in euros to Indian rupees.

    Parameters
    ----------
    x : number
        Amount in euros.
    rate : float, default 92.71
        Rupees per euro. The default is the fixed rate used elsewhere in this
        notebook; pass a different value to convert at another rate.

    Returns
    -------
    number
        ``x * rate``.
    """
    return x * rate


df['Fare_INR'] = df['Fare'].apply(euro_to_inr)
df

Unnamed: 0,index,Name,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Fare_INR
0,0,"Braund, Mr. Owen Harris",1,0,3,male,22.0,1,0,A/5 21171,7.2500,,S,672.147500
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",2,1,1,female,38.0,1,0,PC 17599,71.2833,C85,C,6608.674743
2,2,"Heikkinen, Miss. Laina",3,1,3,female,26.0,0,0,STON/O2. 3101282,7.9250,,S,734.726750
3,3,"Allen, Mr. William Henry",5,0,3,male,35.0,0,0,373450,8.0500,,S,746.315500
4,4,"Moran, Mr. James",6,0,3,male,,0,0,330877,8.4583,,Q,784.168993
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
885,885,"Montvila, Rev. Juozas",887,0,2,male,27.0,0,0,211536,13.0000,,S,1205.230000
886,886,"Graham, Miss. Margaret Edith",888,1,1,female,19.0,0,0,112053,30.0000,B42,S,2781.300000
887,887,"Johnston, Miss. Catherine Helen ""Carrie""",889,0,3,female,,1,2,W./C. 6607,23.4500,,S,2174.049500
888,888,"Behr, Mr. Karl Howell",890,1,1,male,26.0,0,0,111369,30.0000,C148,C,2781.300000


In [140]:
df.head()

Unnamed: 0,index,Name,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Fare_INR
0,0,"Braund, Mr. Owen Harris",1,0,3,male,22.0,1,0,A/5 21171,7.25,,S,672.1475
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",2,1,1,female,38.0,1,0,PC 17599,71.2833,C85,C,6608.674743
2,2,"Heikkinen, Miss. Laina",3,1,3,female,26.0,0,0,STON/O2. 3101282,7.925,,S,734.72675
3,3,"Allen, Mr. William Henry",5,0,3,male,35.0,0,0,373450,8.05,,S,746.3155
4,4,"Moran, Mr. James",6,0,3,male,,0,0,330877,8.4583,,Q,784.168993


In [142]:
# apply() also accepts built-in functions: len is called on every name and the
# character counts are stored in a new 'Name_length' column.
# (df['Name'].str.len() is the vectorised equivalent.)
df['Name_length'] = df['Name'].apply(len)
df

Unnamed: 0,index,Name,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Fare_INR,Name_length
0,0,"Braund, Mr. Owen Harris",1,0,3,male,22.0,1,0,A/5 21171,7.2500,,S,672.147500,23
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",2,1,1,female,38.0,1,0,PC 17599,71.2833,C85,C,6608.674743,51
2,2,"Heikkinen, Miss. Laina",3,1,3,female,26.0,0,0,STON/O2. 3101282,7.9250,,S,734.726750,22
3,3,"Allen, Mr. William Henry",5,0,3,male,35.0,0,0,373450,8.0500,,S,746.315500,24
4,4,"Moran, Mr. James",6,0,3,male,,0,0,330877,8.4583,,Q,784.168993,16
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
885,885,"Montvila, Rev. Juozas",887,0,2,male,27.0,0,0,211536,13.0000,,S,1205.230000,21
886,886,"Graham, Miss. Margaret Edith",888,1,1,female,19.0,0,0,112053,30.0000,B42,S,2781.300000,28
887,887,"Johnston, Miss. Catherine Helen ""Carrie""",889,0,3,female,,1,2,W./C. 6607,23.4500,,S,2174.049500,40
888,888,"Behr, Mr. Karl Howell",890,1,1,male,26.0,0,0,111369,30.0000,C148,C,2781.300000,21


In [144]:
def categorize_fare(x, low=10, high=30):
    """Bucket a fare into 'Low', 'Mid' or 'High'.

    Parameters
    ----------
    x : number
        The fare to classify.
    low : number, default 10
        Fares strictly below this value are 'Low'.
    high : number, default 30
        Fares from ``low`` up to, but not including, this value are 'Mid';
        anything from ``high`` upwards is 'High'.

    Returns
    -------
    str or float
        The category label, or NaN when ``x`` is missing.
    """
    # Comparisons with NaN are always False, so a missing fare would otherwise
    # fall through to 'High'. Keep it missing instead.
    if pd.isna(x):
        return float('nan')
    if x < low:
        return 'Low'
    # Reaching this point already implies x >= low.
    if x < high:
        return 'Mid'
    return 'High'

df['Categorized_Fare'] = df['Fare'].apply(categorize_fare)
df

Unnamed: 0,index,Name,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Fare_INR,Name_length,Categorized_Fare
0,0,"Braund, Mr. Owen Harris",1,0,3,male,22.0,1,0,A/5 21171,7.2500,,S,672.147500,23,Low
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",2,1,1,female,38.0,1,0,PC 17599,71.2833,C85,C,6608.674743,51,High
2,2,"Heikkinen, Miss. Laina",3,1,3,female,26.0,0,0,STON/O2. 3101282,7.9250,,S,734.726750,22,Low
3,3,"Allen, Mr. William Henry",5,0,3,male,35.0,0,0,373450,8.0500,,S,746.315500,24,Low
4,4,"Moran, Mr. James",6,0,3,male,,0,0,330877,8.4583,,Q,784.168993,16,Low
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
885,885,"Montvila, Rev. Juozas",887,0,2,male,27.0,0,0,211536,13.0000,,S,1205.230000,21,Mid
886,886,"Graham, Miss. Margaret Edith",888,1,1,female,19.0,0,0,112053,30.0000,B42,S,2781.300000,28,High
887,887,"Johnston, Miss. Catherine Helen ""Carrie""",889,0,3,female,,1,2,W./C. 6607,23.4500,,S,2174.049500,40,Mid
888,888,"Behr, Mr. Karl Howell",890,1,1,male,26.0,0,0,111369,30.0000,C148,C,2781.300000,21,High
