In [1]:
import pandas as pd
import numpy as np


In [2]:
data = {
    'Name': ['Aarav', 'Sita', 'Ramesh', 'Sita', 'Binod', 'Rita', 'Aarav', 'Kiran', 'Laxmi', 'Sita'],
    'Age': [18, 19, 17, np.nan, 18, 20, 18, 17, 16, 19],
    'Gender': ['Male', 'Female', 'Male', np.nan, 'Male', 'Female', 'Male', 'Male', 'Female', 'Female'],
    'Math': [80, 85, 75, 90, np.nan, 70, 80, 60, 65, 85],
    'Science': [78, 82, 72, 95, 80, np.nan, 78, 58, 60, 88],
    'English': [85, 88, 70, 92, 75, 65, 85, 55, 60, 90],
    'Grade': ['A', 'A', 'B', 'A', 'B', 'C', 'A', 'C', 'C', 'A']
}

df = pd.DataFrame(data)

print(df)

     Name   Age  Gender  Math  Science  English Grade
0   Aarav  18.0    Male  80.0     78.0       85     A
1    Sita  19.0  Female  85.0     82.0       88     A
2  Ramesh  17.0    Male  75.0     72.0       70     B
3    Sita   NaN     NaN  90.0     95.0       92     A
4   Binod  18.0    Male   NaN     80.0       75     B
5    Rita  20.0  Female  70.0      NaN       65     C
6   Aarav  18.0    Male  80.0     78.0       85     A
7   Kiran  17.0    Male  60.0     58.0       55     C
8   Laxmi  16.0  Female  65.0     60.0       60     C
9    Sita  19.0  Female  85.0     88.0       90     A


In [3]:
print("Dataset columns")
print(df.columns)
print("Dataset shape")
print(df.shape)
print("Dataset types")
print(df.dtypes)
print("Dataset info")
print(df.info())

Dataset columns
Index(['Name', 'Age', 'Gender', 'Math', 'Science', 'English', 'Grade'], dtype='object')
Dataset shape
(10, 7)
Dataset types
Name        object
Age        float64
Gender      object
Math       float64
Science    float64
English      int64
Grade       object
dtype: object
Dataset info
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 7 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Name     10 non-null     object 
 1   Age      9 non-null      float64
 2   Gender   9 non-null      object 
 3   Math     9 non-null      float64
 4   Science  9 non-null      float64
 5   English  10 non-null     int64  
 6   Grade    10 non-null     object 
dtypes: float64(3), int64(1), object(3)
memory usage: 692.0+ bytes
None


In [4]:
print(df.describe())

             Age       Math    Science    English
count   9.000000   9.000000   9.000000  10.000000
mean   18.000000  76.666667  76.777778  76.500000
std     1.224745  10.000000  12.018504  13.377012
min    16.000000  60.000000  58.000000  55.000000
25%    17.000000  70.000000  72.000000  66.250000
50%    18.000000  80.000000  78.000000  80.000000
75%    19.000000  85.000000  82.000000  87.250000
max    20.000000  90.000000  95.000000  92.000000


In [5]:
print("Missing value in the column", df.isnull().sum())


Missing value in the column Name       0
Age        1
Gender     1
Math       1
Science    1
English    0
Grade      0
dtype: int64


In [6]:
mode_gender = df['Gender'].mode()[0]
df['Gender'].fillna(mode_gender, inplace=True)
print(df)

     Name   Age  Gender  Math  Science  English Grade
0   Aarav  18.0    Male  80.0     78.0       85     A
1    Sita  19.0  Female  85.0     82.0       88     A
2  Ramesh  17.0    Male  75.0     72.0       70     B
3    Sita   NaN    Male  90.0     95.0       92     A
4   Binod  18.0    Male   NaN     80.0       75     B
5    Rita  20.0  Female  70.0      NaN       65     C
6   Aarav  18.0    Male  80.0     78.0       85     A
7   Kiran  17.0    Male  60.0     58.0       55     C
8   Laxmi  16.0  Female  65.0     60.0       60     C
9    Sita  19.0  Female  85.0     88.0       90     A


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Gender'].fillna(mode_gender, inplace=True)


In [7]:
df['Age'] = df['Age'].fillna(df['Age'].mean())
df['Math'] = df['Math'].fillna(df['Math'].median())
df['Science'] = df['Science'].fillna(df['Science'].median())
print(df)

     Name   Age  Gender  Math  Science  English Grade
0   Aarav  18.0    Male  80.0     78.0       85     A
1    Sita  19.0  Female  85.0     82.0       88     A
2  Ramesh  17.0    Male  75.0     72.0       70     B
3    Sita  18.0    Male  90.0     95.0       92     A
4   Binod  18.0    Male  80.0     80.0       75     B
5    Rita  20.0  Female  70.0     78.0       65     C
6   Aarav  18.0    Male  80.0     78.0       85     A
7   Kiran  17.0    Male  60.0     58.0       55     C
8   Laxmi  16.0  Female  65.0     60.0       60     C
9    Sita  19.0  Female  85.0     88.0       90     A


In [8]:
print(df.isnull().sum())

Name       0
Age        0
Gender     0
Math       0
Science    0
English    0
Grade      0
dtype: int64


In [9]:
print("Number of duplicates:", df.duplicated().sum())
df.drop_duplicates(inplace=True)
print(df.isnull().sum())

Number of duplicates: 1
Name       0
Age        0
Gender     0
Math       0
Science    0
English    0
Grade      0
dtype: int64


In [10]:
df['Age'] = df['Age'].astype(int)
print(df.dtypes)
print(df)

Name        object
Age          int64
Gender      object
Math       float64
Science    float64
English      int64
Grade       object
dtype: object
     Name  Age  Gender  Math  Science  English Grade
0   Aarav   18    Male  80.0     78.0       85     A
1    Sita   19  Female  85.0     82.0       88     A
2  Ramesh   17    Male  75.0     72.0       70     B
3    Sita   18    Male  90.0     95.0       92     A
4   Binod   18    Male  80.0     80.0       75     B
5    Rita   20  Female  70.0     78.0       65     C
7   Kiran   17    Male  60.0     58.0       55     C
8   Laxmi   16  Female  65.0     60.0       60     C
9    Sita   19  Female  85.0     88.0       90     A


In [11]:
df['Math'] = df['Math'].astype(int)
df['Science'] = df['Science'].astype(int)
print(df.dtypes)
print(df)

Name       object
Age         int64
Gender     object
Math        int64
Science     int64
English     int64
Grade      object
dtype: object
     Name  Age  Gender  Math  Science  English Grade
0   Aarav   18    Male    80       78       85     A
1    Sita   19  Female    85       82       88     A
2  Ramesh   17    Male    75       72       70     B
3    Sita   18    Male    90       95       92     A
4   Binod   18    Male    80       80       75     B
5    Rita   20  Female    70       78       65     C
7   Kiran   17    Male    60       58       55     C
8   Laxmi   16  Female    65       60       60     C
9    Sita   19  Female    85       88       90     A


In [12]:
df['Result'] = df['Math'] + df['Science'] + df['English']
print(df)

     Name  Age  Gender  Math  Science  English Grade  Result
0   Aarav   18    Male    80       78       85     A     243
1    Sita   19  Female    85       82       88     A     255
2  Ramesh   17    Male    75       72       70     B     217
3    Sita   18    Male    90       95       92     A     277
4   Binod   18    Male    80       80       75     B     235
5    Rita   20  Female    70       78       65     C     213
7   Kiran   17    Male    60       58       55     C     173
8   Laxmi   16  Female    65       60       60     C     185
9    Sita   19  Female    85       88       90     A     263


In [13]:
df['Percentage'] = (df['Result'] / 300) * 100
print(df)

     Name  Age  Gender  Math  Science  English Grade  Result  Percentage
0   Aarav   18    Male    80       78       85     A     243   81.000000
1    Sita   19  Female    85       82       88     A     255   85.000000
2  Ramesh   17    Male    75       72       70     B     217   72.333333
3    Sita   18    Male    90       95       92     A     277   92.333333
4   Binod   18    Male    80       80       75     B     235   78.333333
5    Rita   20  Female    70       78       65     C     213   71.000000
7   Kiran   17    Male    60       58       55     C     173   57.666667
8   Laxmi   16  Female    65       60       60     C     185   61.666667
9    Sita   19  Female    85       88       90     A     263   87.666667


In [14]:
df["Percentage"]= df["Percentage"].astype(float).round(2)
print(df)

     Name  Age  Gender  Math  Science  English Grade  Result  Percentage
0   Aarav   18    Male    80       78       85     A     243       81.00
1    Sita   19  Female    85       82       88     A     255       85.00
2  Ramesh   17    Male    75       72       70     B     217       72.33
3    Sita   18    Male    90       95       92     A     277       92.33
4   Binod   18    Male    80       80       75     B     235       78.33
5    Rita   20  Female    70       78       65     C     213       71.00
7   Kiran   17    Male    60       58       55     C     173       57.67
8   Laxmi   16  Female    65       60       60     C     185       61.67
9    Sita   19  Female    85       88       90     A     263       87.67


In [15]:
df.loc[df["Name"] == "Sita", "Gender"] = "Female"
print(df)

     Name  Age  Gender  Math  Science  English Grade  Result  Percentage
0   Aarav   18    Male    80       78       85     A     243       81.00
1    Sita   19  Female    85       82       88     A     255       85.00
2  Ramesh   17    Male    75       72       70     B     217       72.33
3    Sita   18  Female    90       95       92     A     277       92.33
4   Binod   18    Male    80       80       75     B     235       78.33
5    Rita   20  Female    70       78       65     C     213       71.00
7   Kiran   17    Male    60       58       55     C     173       57.67
8   Laxmi   16  Female    65       60       60     C     185       61.67
9    Sita   19  Female    85       88       90     A     263       87.67


In [16]:
df.to_csv("result.csv", index=False)
print("Cleaned data saved to 'result.csv")

Cleaned data saved to 'result.csv
