## Loading the dataset

In [131]:
import pandas as pd
import numpy as np

# Location of the raw export. The file is tab-separated and is decoded as
# ISO-8859-1 (Latin-1).
DATA_PATH = 'Group_2_corrupted_data_set_group2_labour_unemployment.txt'

df = pd.read_csv(DATA_PATH, sep='\t', encoding='ISO-8859-1')
df.head()


Unnamed: 0,T17,Labour force participation and unemployment,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6
0,Region_Country/Area,,Year,Serias,Value,Foodnotes,Source
1,100,"Total, all countries or areas",62.9,Labour force participation - Total,2005,Estimate.,"International Labour Organization (ILO), Genev..."
2,100,"Total, all countries or areas",06. Mrz,Unemployment rate - Total,2005,Estimate.,"International Labour Organization (ILO), Genev..."
3,100,"Total, all countries or areas",76.1,Labour force participation - Male,2005,Estimate.,"International Labour Organization (ILO), Genev..."
4,100,"Total, all countries or areas",06. Feb,Unemployment rate - Male,2005,Estimate.,"International Labour Organization (ILO), Genev..."


#### 1. Fix incorrect column names


In [132]:
# Replacement labels for the seven columns, listed in the same left-to-right
# order as the columns read from the file (assignment is positional).
corrected_names = [
    "Region_Country_Area",
    "Labour force participation and unemployment",
    "Year",
    "Series",
    "Value",
    "Footnotes",
    "Source",
]

print("Original column names:", list(df.columns))
df.columns = corrected_names
print("Updated column names:", list(df.columns))

Original column names: ['T17', 'Labour force participation and unemployment', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4', 'Unnamed: 5', 'Unnamed: 6']
Updated column names: ['Region_Country_Area', 'Labour force participation and unemployment', 'Year', 'Series', 'Value', 'Footnotes', 'Source']


#### 2. Strip extra spaces in column names and values


In [133]:
# Normalise the column labels first so that later look-ups by name are exact.
df.columns = df.columns.str.strip()


def _strip_if_str(cell):
    """Return ``cell`` without leading/trailing whitespace if it is a string.

    Non-string cells (numbers, NaN, ...) are returned unchanged.
    """
    return cell.strip() if isinstance(cell, str) else cell


# DataFrame.applymap is deprecated in favour of DataFrame.map (pandas >= 2.1)
# and emits a FutureWarning when called. Prefer ``map`` when the installed
# pandas provides it and fall back to ``applymap`` on older versions.
_apply_elementwise = df.map if hasattr(df, "map") else df.applymap
df = _apply_elementwise(_strip_if_str)
df.head()

  df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)


Unnamed: 0,Region_Country_Area,Labour force participation and unemployment,Year,Series,Value,Footnotes,Source
0,Region_Country/Area,,Year,Serias,Value,Foodnotes,Source
1,100,"Total, all countries or areas",62.9,Labour force participation - Total,2005,Estimate.,"International Labour Organization (ILO), Genev..."
2,100,"Total, all countries or areas",06. Mrz,Unemployment rate - Total,2005,Estimate.,"International Labour Organization (ILO), Genev..."
3,100,"Total, all countries or areas",76.1,Labour force participation - Male,2005,Estimate.,"International Labour Organization (ILO), Genev..."
4,100,"Total, all countries or areas",06. Feb,Unemployment rate - Male,2005,Estimate.,"International Labour Organization (ILO), Genev..."


#### 3. Fix the format of the "Year" column by converting non-year values to NaN


In [134]:
# Show every distinct raw entry of the Year column before any cleaning.
year_values_before = df['Year'].unique()
print(year_values_before)
def clean_year(value):
    """Convert ``value`` to an integer year, or return NaN if that is impossible.

    Parameters
    ----------
    value : object
        A single cell of the "Year" column, e.g. a string such as ``'2005'``,
        a number, or a missing value.

    Returns
    -------
    int or float
        ``int(value)`` when ``value`` represents a whole number; ``np.nan``
        for anything else (decimal strings such as ``'62.9'``, text such as
        ``'06. Mrz'``, non-integral floats, NaN, ``None``).
    """
    # int() silently truncates real floats (int(62.9) == 62), which would let
    # a misplaced measurement pass as a year. Reject non-integral floats here;
    # NaN and +/-inf are also non-integral and therefore map to NaN.
    if isinstance(value, float) and not value.is_integer():
        return np.nan
    try:
        return int(value)
    except (ValueError, TypeError, OverflowError):
        # ValueError: unparsable string; TypeError: None / pd.NA and other
        # non-numeric objects; OverflowError: infinite non-``float`` numerics.
        return np.nan

# Run the cleaner over every entry of the column; entries it cannot turn into
# an integer come back as NaN.
df['Year'] = df['Year'].map(clean_year)

# Show the distinct values again to confirm that only years and NaN remain.
year_values_after = df['Year'].unique()
print(year_values_after)


['Year' '62.9' '06. Mrz' '76.1' '06. Feb' '49.8' '06. Mai' '62.0' '75.3'
 '48.9' '06. Apr' '60.7' '6.0' '73.9' '47.6' '06. Jan' '59.7' '2023'
 '2005' '2010' '2015' nan '2020' '2000' '2001' '2011' '2009' '2012' '2013'
 '2006' '2008' '2002' '2003' '2021' '2014']
[  nan 2023. 2005. 2010. 2015. 2020. 2000. 2001. 2011. 2009. 2012. 2013.
 2006. 2008. 2002. 2003. 2021. 2014.]


#### 4. Remove rows with NaN in the "Year" column

In [135]:
# Discard the rows whose Year is NaN, then store the remaining years as
# integers. While NaN was present the column had to be float64 (years showed
# up as 2023.0); once those rows are gone an integer dtype represents the
# data exactly. Re-binding ``df`` instead of using ``inplace=True`` keeps the
# cell safe to re-run.
df = df.dropna(subset=['Year']).astype({'Year': 'int64'})

#### 5. Drop duplicate rows


In [136]:
# Count fully identical rows, remove all but the first occurrence of each,
# then count again to confirm that none are left.
duplicates_before = df.duplicated().sum()
print("Duplicate rows before:", duplicates_before)

df = df.drop_duplicates()

duplicates_after = df.duplicated().sum()
print("Duplicate rows after:", duplicates_after)


Duplicate rows before: 4
Duplicate rows after: 0


#### 6. Replace month abbreviations in the "Value" column with NaN


In [137]:
# Show the distinct raw entries before conversion.
print(df['Value'].unique())

# Parse the column as numbers; anything that cannot be parsed (for example
# '06. Mrz') becomes NaN because of errors='coerce'. The result is rounded to
# two decimals.
# NOTE(review): entries such as '06. Mrz' / '12. Jul' look like decimals
# (6.3, 12.7) that a spreadsheet re-formatted as day/month dates. If so,
# coercing them to NaN discards real measurements -- confirm with the data
# owner before relying on this step.
numeric_value = pd.to_numeric(df['Value'], errors='coerce')
df['Value'] = numeric_value.round(2)
df['Value'].unique()

['05. Aug' '72.4' '05. Jul' '47.2' '64.4' '06. Jun' '74.2' '06. Mrz'
 '54.9' '06. Sep' '63.8' '06. Mai' '73.5' '54.3' '07. Apr' '62.7'
 '06. Jul' '72.0' '53.6' '07. Feb' '07. Jan' '71.8' '53.8' '07. Aug'
 '46.6' '12. Jul' '71.4' '10. Jun' '21. Jul' '19. Jun' '47.8' '10. Mai'
 '72.3' '07. Mai' '23. Jan' '20. Jan' '46.4' '13. Jan' '69.4' '10. Feb'
 '23. Feb' '21. Sep' '43.7' '11. Mrz' '67.4' '08. Mai' '19. Sep' '20. Mai'
 '69.8' '05. Mrz' '75.0' '05. Jan' '64.7' '05. Jun' '68.5' '73.9' '63.3'
 '6.0' '67.2' '05. Mai' '72.7' '05. Feb' '61.9' '67.6' '06. Apr' '73.0'
 '06. Jan' '62.3' '06. Aug' '76.3' '04. Jan' '81.1' '03. Jul' '04. Mai'
 '75.7' '04. Feb' '80.6' '71.0' '04. Jul' '75.1' '78.6' '70.6' '05. Apr'
 '4.0' '76.4' '04. Mrz' '70.8' '03. Jun' '70.2' '66.3' '05. Sep' '68.4'
 '72.9' '64.1' '68.3' '72.2' '60.0' '19. Aug' '17. Aug' '52.5' '22. Jan'
 '54.8' '22. Aug' '21. Feb' '47.7' '24. Jun' '58.0' '22. Mai' '65.3'
 '20. Aug' '51.5' '24. Mai' '58.4' '28. Aug' '64.9' '27. Mrz' '30. Mai'
 

array([        nan, 7.24000e+01, 4.72000e+01, 6.44000e+01, 7.42000e+01,
       5.49000e+01, 6.38000e+01, 7.35000e+01, 5.43000e+01, 6.27000e+01,
       7.20000e+01, 5.36000e+01, 7.18000e+01, 5.38000e+01, 4.66000e+01,
       7.14000e+01, 4.78000e+01, 7.23000e+01, 4.64000e+01, 6.94000e+01,
       4.37000e+01, 6.74000e+01, 6.98000e+01, 7.50000e+01, 6.47000e+01,
       6.85000e+01, 7.39000e+01, 6.33000e+01, 6.00000e+00, 6.72000e+01,
       7.27000e+01, 6.19000e+01, 6.76000e+01, 7.30000e+01, 6.23000e+01,
       7.63000e+01, 8.11000e+01, 7.57000e+01, 8.06000e+01, 7.10000e+01,
       7.51000e+01, 7.86000e+01, 7.06000e+01, 4.00000e+00, 7.64000e+01,
       7.08000e+01, 7.02000e+01, 6.63000e+01, 6.84000e+01, 7.29000e+01,
       6.41000e+01, 6.83000e+01, 7.22000e+01, 6.00000e+01, 5.25000e+01,
       5.48000e+01, 4.77000e+01, 5.80000e+01, 6.53000e+01, 5.15000e+01,
       5.84000e+01, 6.49000e+01, 6.97000e+01, 5.79000e+01, 6.92000e+01,
       5.74000e+01, 6.02000e+01, 6.64000e+01, 5.40000e+01, 6.800

#### 7. Cap outliers in "Value" at the mean plus three standard deviations


In [138]:
# Upper bound for plausible values: mean plus three standard deviations of
# the column (both statistics skip NaN by default).
value_mean = df['Value'].mean()
value_std = df['Value'].std()
upper_limit = value_mean + 3 * value_std

# Values above the bound are replaced by the bound itself; the rows are kept.
# NaN compares as False, so missing values are left untouched. Only the upper
# side is capped.
above_limit = df['Value'] > upper_limit
df.loc[above_limit, 'Value'] = upper_limit


#### 8. Strip leading and trailing whitespace in the Source column

In [139]:
# Remove leading and trailing whitespace from every Source entry.
# NOTE(review): the earlier cell that strips every string cell of the frame
# appears to cover this column already, so this is likely a no-op -- confirm.
stripped_source = df['Source'].str.strip()
df['Source'] = stripped_source


#### 9. Drop rows with missing Region_Country_Area


In [140]:
# Keep only the rows that have a value in Region_Country_Area.
region_present = df['Region_Country_Area'].notna()
df = df[region_present]


#### 10. Standardize Footnotes column values

In [141]:
# Replace the literal text 'Estimate.' with 'Estimated' wherever it occurs in
# a footnote. regex=False makes the '.' a plain dot rather than a wildcard.
old_label, new_label = 'Estimate.', 'Estimated'
df['Footnotes'] = df['Footnotes'].str.replace(old_label, new_label, regex=False)
