In [1]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [46]:
data = {
    'date': ['2021-12-01', '01-12-2022', '2022/12/01', '12-01-2021'],
    'country': ['USA', 'U.S.A.', 'America', 'United States'],
    'name': ['Kaleem', 'Kalim', 'Hamza', 'Hazma'],
    'sales_2020': [100, 200, None, 200],
    'sales_2021': [None, 150, 300, 150]
}
# make pandas dataframe
df = pd.DataFrame(data)
df.head()

Unnamed: 0,date,country,name,sales_2020,sales_2021
0,2021-12-01,USA,Kaleem,100.0,
1,01-12-2022,U.S.A.,Kalim,200.0,150.0
2,2022/12/01,America,Hamza,,300.0
3,12-01-2021,United States,Hazma,200.0,150.0


In [47]:
# Standardizing the date format
df['date']= pd.to_datetime(df['date'], errors='coerce')
df['date']= df['date'].dt.strftime('%Y-%m-%d')
df.head()

Unnamed: 0,date,country,name,sales_2020,sales_2021
0,2021-12-01,USA,Kaleem,100.0,
1,,U.S.A.,Kalim,200.0,150.0
2,,America,Hamza,,300.0
3,,United States,Hazma,200.0,150.0


In [48]:
# Fill NaN values in 'date' column with a specific values
df['date'] = df['date'].fillna('2023-01-01')
df.head()

Unnamed: 0,date,country,name,sales_2020,sales_2021
0,2021-12-01,USA,Kaleem,100.0,
1,2023-01-01,U.S.A.,Kalim,200.0,150.0
2,2023-01-01,America,Hamza,,300.0
3,2023-01-01,United States,Hazma,200.0,150.0


In [49]:
# Harmonize the name of the country
country_mapping = {'USA': 'United States', 'U.S.A.': 'United States', 'America': 'United States'}
df['country']= df['country'].replace(country_mapping)
df.head()

Unnamed: 0,date,country,name,sales_2020,sales_2021
0,2021-12-01,United States,Kaleem,100.0,
1,2023-01-01,United States,Kalim,200.0,150.0
2,2023-01-01,United States,Hamza,,300.0
3,2023-01-01,United States,Hazma,200.0,150.0


In [50]:
# Correct the typographical mistakes in name
# Let's assume we want to correct 'Kalim' to 'Kaleem' and 'Hazma' to 'Hamza'
df['name'] = df['name'].replace({'Kalim':'Kaleem', 'Hazma':'Hamza'})
df.head()

Unnamed: 0,date,country,name,sales_2020,sales_2021
0,2021-12-01,United States,Kaleem,100.0,
1,2023-01-01,United States,Kaleem,200.0,150.0
2,2023-01-01,United States,Hamza,,300.0
3,2023-01-01,United States,Hamza,200.0,150.0


In [45]:
# Remove duplicates
df = df.drop_duplicates(subset= 'name')
df.head()

Unnamed: 0,date,country,name,sales_2020,sales_2021
0,2021-12-01,United States,Kaleem,100.0,
2,2023-01-01,United States,Hamza,,300.0


In [51]:
# Resolving contradictory Data
# For demonstration, let's assume sales_2021 should always be higher than sales_2020
# We'll remove rows where this condition is not met
df = df.drop(df[df['sales_2021'] <= df['sales_2020']].index)
df.head()

Unnamed: 0,date,country,name,sales_2020,sales_2021
0,2021-12-01,United States,Kaleem,100.0,
2,2023-01-01,United States,Hamza,,300.0
