In [2]:
pip install pandas

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [4]:
import pandas as pd

In [5]:
# One-dimensional labelled array named "Marks"; no index is given, so pandas
# assigns the default 0..n-1 positions.
s = pd.Series(range(10, 51, 10), name="Marks")

In [6]:
s

0    10
1    20
2    30
3    40
4    50
Name: Marks, dtype: int64

In [7]:
print(s)

0    10
1    20
2    30
3    40
4    50
Name: Marks, dtype: int64


In [9]:
# Underlying data as a NumPy array. to_numpy() is the accessor pandas
# recommends over the older .values attribute; the printed result is the same.
print(s.to_numpy())

[10 20 30 40 50]


In [10]:
# Row labels of the Series; here the default RangeIndex, since no index was
# passed when the Series was created.
print(s.index)

RangeIndex(start=0, stop=5, step=1)


In [12]:
# Column-oriented input for a DataFrame: each key is a column label and each
# list holds that column's values, one entry per row.
data = dict(
    Name=['Alice', 'Bob', 'charlie'],
    Age=[25, 30, 40],
    Salary=[50000, 70000, 80000],
)

In [13]:
# Build a table from the dict: keys become column labels, list entries become rows.
df = pd.DataFrame(data)

In [14]:
df

Unnamed: 0,Name,Age,Salary
0,Alice,25,50000
1,Bob,30,70000
2,charlie,40,80000


In [15]:
# (number of rows, number of columns)
print(df.shape)

(3, 3)


In [19]:
# Column labels of the frame.
print(df.columns)

Index(['Name', 'Age', 'Salary'], dtype='object')


In [20]:
# Row labels of the frame (default RangeIndex here).
print(df.index)

RangeIndex(start=0, stop=3, step=1)


In [21]:
# Selecting with a single label returns that column as a Series.
df['Name']

0      Alice
1        Bob
2    charlie
Name: Name, dtype: object

In [29]:
# Selecting with a list of labels returns a DataFrame holding just those columns.
df[['Name','Age']]

Unnamed: 0,Name,Age
0,Alice,25
1,Bob,30
2,charlie,40


In [31]:
df.iloc[0]  # select one row by integer position (0 = first row), returned as a Series

Name      Alice
Age          25
Salary    50000
Name: 0, dtype: object

In [32]:
# Positional slice: rows at positions 1 and 2 (the end position 3 is excluded).
df.iloc[1:3]

Unnamed: 0,Name,Age,Salary
1,Bob,30,70000
2,charlie,40,80000


In [36]:
# Boolean-mask filter: keep rows whose Age is strictly greater than 30
# (a row with Age == 30 is excluded).
print(df[df['Age'] > 30])

      Name  Age  Salary
2  charlie   40   80000


In [38]:
# Data Cleaning 

In [40]:
import numpy as np

In [70]:
# Sample records with gaps: np.nan marks the missing Age (second row) and the
# missing Salary (third row) used by the cleaning examples.
data = dict(
    Name=['Alice', 'Bob', 'charlie', 'David'],
    Age=[25, np.nan, 30, 40],
    Salary=[50000, 70000, np.nan, 80000],
)

In [71]:
# Rebind df to a new frame built from the dict that contains missing values.
df = pd.DataFrame(data)

In [72]:
print(df.isnull())  # element-wise mask: True wherever a value is missing (NaN)

    Name    Age  Salary
0  False  False   False
1  False   True   False
2  False  False    True
3  False  False   False


In [73]:
df.isnull().sum()  # True counts as 1, so the sum is the number of missing values per column

Name      0
Age       1
Salary    1
dtype: int64

In [74]:
df

Unnamed: 0,Name,Age,Salary
0,Alice,25.0,50000.0
1,Bob,,70000.0
2,charlie,30.0,
3,David,40.0,80000.0


In [75]:
# Impute the missing Age with the column mean; mean() skips NaN, so the fill
# value is the average of the ages that are present.
df['Age'] = df['Age'].fillna(df['Age'].mean())

In [76]:
print(df)

      Name        Age   Salary
0    Alice  25.000000  50000.0
1      Bob  31.666667  70000.0
2  charlie  30.000000      NaN
3    David  40.000000  80000.0


In [77]:
# Replace the remaining missing Salary with 0.
# NOTE(review): 0 is a placeholder that will pull down any later mean/sum of
# Salary; confirm this is intended rather than e.g. the column median.
df['Salary'] = df['Salary'].fillna(0)

In [78]:
df

Unnamed: 0,Name,Age,Salary
0,Alice,25.0,50000.0
1,Bob,31.666667,70000.0
2,charlie,30.0,0.0
3,David,40.0,80000.0


In [132]:
# Sample records for the dropna example: the second row lacks Age and the
# third row lacks both salary columns.
data = dict(
    Name=['Alice', 'Bob', 'charlie', 'Alice'],
    Age=[25, np.nan, 30, 40],
    Salary=[50000, 50000, np.nan, 50000],
    Salary2=[10000, 50000, np.nan, 50000],
)

In [133]:
# Rebind df to a new frame built from the four-column dict above.
df = pd.DataFrame(data)

In [134]:
df

Unnamed: 0,Name,Age,Salary,Salary2
0,Alice,25.0,50000.0,10000.0
1,Bob,,50000.0,50000.0
2,charlie,30.0,,
3,Alice,40.0,50000.0,50000.0


In [107]:
df.dropna()  # drop every row that contains at least one missing value; returns a new frame

Unnamed: 0,Name,Age,Salary,Salary2
0,Alice,25.0,50000.0,10000.0
3,Alice,40.0,50000.0,50000.0


In [151]:
# Three records where the last one repeats the first exactly, used by the
# duplicate-detection examples.
df = pd.DataFrame(
    [('Alice', 25), ('Bob', 50), ('Alice', 25)],
    columns=['Name', 'Age'],
)

In [152]:
print(df.duplicated())  # check for duplicates: True marks a row that repeats an earlier row

0    False
1    False
2     True
dtype: bool


In [153]:
df.drop_duplicates()  # returns a new frame without the repeated rows; df itself is not modified

Unnamed: 0,Name,Age
0,Alice,25
1,Bob,50


In [154]:
# Rename the 'Age' column to 'Secret'. The result is rebound to df instead of
# using inplace=True, which keeps the statement chainable and makes the
# reassignment explicit.
df = df.rename(columns={'Age': 'Secret'})

In [155]:
df

Unnamed: 0,Name,Secret
0,Alice,25
1,Bob,50
2,Alice,25


In [163]:
# Employee table used for the derived-column examples (Bonus, Total).
df = pd.DataFrame(dict(
    Name=['Alice', 'Bob', 'charlie', 'David'],
    Age=[25, 30, 30, 40],
    Salary=[50000, 60000, 70000, 80000],
))

In [164]:
# New column: bonus is 10% of Salary, computed for the whole column at once.
df['Bonus'] = df['Salary'] * 0.1

In [165]:
df

Unnamed: 0,Name,Age,Salary,Bonus
0,Alice,25,50000,5000.0
1,Bob,30,60000,6000.0
2,charlie,30,70000,7000.0
3,David,40,80000,8000.0


In [166]:
# New column: row-wise sum of Salary and Bonus.
df['Total'] = df['Salary'] + df['Bonus']

In [167]:
df

Unnamed: 0,Name,Age,Salary,Bonus,Total
0,Alice,25,50000,5000.0,55000.0
1,Bob,30,60000,6000.0,66000.0
2,charlie,30,70000,7000.0,77000.0
3,David,40,80000,8000.0,88000.0


In [168]:
# Grouping and Aggregation 

In [171]:
# Employee table with a Department column, used by the grouping and
# feature-engineering examples.
df = pd.DataFrame(dict(
    Department=['HR', 'IT', 'HR', 'IT', 'Finance', 'Marketing'],
    Age=[25, 30, 30, 40, 20, 21],
    Salary=[50000, 60000, 70000, 80000, 30000, 20000],
))

In [172]:
# Split the rows by Department, then average Salary within each group.
# The result is a Series indexed by department name.
df.groupby('Department')['Salary'].mean()

Department
Finance      30000.0
HR           60000.0
IT           70000.0
Marketing    20000.0
Name: Salary, dtype: float64

In [173]:
# Feature Engineering 

In [174]:
# Rescale Salary to lakhs (1 lakh = 100,000) as a new column.
df['Salary_in_Lakhs'] = df['Salary']/100000

In [175]:
df

Unnamed: 0,Department,Age,Salary,Salary_in_Lakhs
0,HR,25,50000,0.5
1,IT,30,60000,0.6
2,HR,30,70000,0.7
3,IT,40,80000,0.8
4,Finance,20,30000,0.3
5,Marketing,21,20000,0.2


In [176]:
# Bucket each row by age: under 30 -> 'Junior', otherwise 'Senior'.
# np.where evaluates the comparison on the whole column at once instead of
# calling a Python lambda once per row.
df['Age_Category'] = np.where(df['Age'] < 30, 'Junior', 'Senior')

In [177]:
df

Unnamed: 0,Department,Age,Salary,Salary_in_Lakhs,Age_Category
0,HR,25,50000,0.5,Junior
1,IT,30,60000,0.6,Senior
2,HR,30,70000,0.7,Senior
3,IT,40,80000,0.8,Senior
4,Finance,20,30000,0.3,Junior
5,Marketing,21,20000,0.2,Junior


In [187]:
# First frame for the concat example: two employees, including a Salary column.
df1 = pd.DataFrame(
    [(1, 'Alice', 50000), (2, 'Bob', 60000)],
    columns=['ID', 'Name', 'Salary'],
)

In [188]:
# Second frame for the concat example: three employees and no Salary column.
df2 = pd.DataFrame(
    [(3, 'Charlie'), (4, 'david'), (5, 'John')],
    columns=['ID', 'Name'],
)

In [189]:
# Stack df2's rows under df1's. Columns are aligned by name, so Salary is NaN
# for the df2 rows. Each frame keeps its own row labels, so the index values
# repeat (0, 1, 0, 1, 2); pass ignore_index=True for a fresh 0..n-1 index.
result = pd.concat([df1, df2])

In [190]:
result

Unnamed: 0,ID,Name,Salary
0,1,Alice,50000.0
1,2,Bob,60000.0
0,3,Charlie,
1,4,david,
2,5,John,


In [191]:
pip install seaborn

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [1]:
import seaborn as sn 

In [2]:
# Load seaborn's bundled "titanic" example dataset as a DataFrame.
# NOTE: seaborn fetches example datasets from its online repository and caches
# them locally, so the first call may need network access.
titanic = sn.load_dataset("titanic")

In [3]:
titanic

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True


In [5]:
# (number of rows, number of columns)
titanic.shape

(891, 15)

In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns; count reflects non-missing values only.
titanic.describe()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


In [7]:
# Per-column dtype and non-null count plus overall memory usage; a quick way
# to see which columns have missing values.
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.7+ KB


In [8]:
# Number of rows for each distinct value of the 'survived' column.
titanic['survived'].value_counts()

survived
0    549
1    342
Name: count, dtype: int64

In [9]:
# correlation

In [10]:
# Pairwise correlation matrix of age and survived (Pearson, the pandas
# default); rows with a missing value in either column are left out of the pair.
titanic[['age','survived']].corr()

Unnamed: 0,age,survived
age,1.0,-0.077221
survived,-0.077221,1.0
