In [1]:
# Library imports and initial exploration

import pandas as pd

In [13]:
# Load the Valencia dataset into a DataFrame.
# NOTE(review): this is an absolute, Colab-style path; the file must exist at
# exactly this location for the notebook to run in another environment.
df_path_rent = '/content/mean_rent_vlc.csv'
df_rent = pd.read_csv(df_path_rent, delimiter=',')

# Preview six random rows. The fixed seed makes the preview show the same
# rows on every Restart & Run All instead of changing on each execution.
df_rent.sample(6, random_state=42)

Unnamed: 0,NUM DISTRITO,DISTRITO,PERIODO,RENTA MEDIA
29,12,Camins Al Grau,2020,35031.0
103,8,Patraix,2022,36536.0
52,6,El Pla Del Real,2019,53193.0
108,18,Poblats De L'Oest,2019,27256.0
101,8,Patraix,2020,33154.0
122,19,Poblats Del Sud,2017,27148.0


In [3]:
# Summary of the raw frame: column dtypes, non-null counts and memory usage
df_rent.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 152 entries, 0 to 151
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   NUM DISTRITO  152 non-null    int64  
 1   DISTRITO      152 non-null    object 
 2   PERIODO       152 non-null    int64  
 3   RENTA MEDIA   152 non-null    float64
dtypes: float64(1), int64(2), object(1)
memory usage: 4.9+ KB


In [None]:
# Exploration of unique values in different columns:
# distinct values of PERIODO, in ascending order
sorted(df_rent['PERIODO'].unique())

[2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022]

In [6]:
# Distinct values of DISTRITO, in sorted order
sorted(df_rent['DISTRITO'].unique())

['Algiros',
 'Benicalap',
 'Benimaclet',
 'Camins Al Grau',
 'Campanar',
 'Ciutat Vella',
 'El Pla Del Real',
 'Extramurs',
 'Jesus',
 "L'Eixample",
 "L'Olivereta",
 'La Saidia',
 'Patraix',
 "Poblats De L'Oest",
 'Poblats Del Nord',
 'Poblats Del Sud',
 'Poblats Maritims',
 'Quatre Carreres',
 'Rascanya']

In [7]:
# Exploration of minimum values: smallest RENTA MEDIA in the dataset
df_rent['RENTA MEDIA'].min()

23049.0

In [8]:
# Exploration of maximum values: largest RENTA MEDIA in the dataset
df_rent['RENTA MEDIA'].max()

58092.0

In [10]:
# Inspect the rows belonging to 'Poblats Del Sud'.
# This district does not appear in the life expectancy dataset.
df_rent.loc[df_rent['DISTRITO'].eq('Poblats Del Sud')]

Unnamed: 0,NUM DISTRITO,DISTRITO,PERIODO,RENTA MEDIA
120,19,Poblats Del Sud,2015,25374.0
121,19,Poblats Del Sud,2016,26224.0
122,19,Poblats Del Sud,2017,27148.0
123,19,Poblats Del Sud,2018,28269.0
124,19,Poblats Del Sud,2019,29526.0
125,19,Poblats Del Sud,2020,29563.0
126,19,Poblats Del Sud,2021,30634.0
127,19,Poblats Del Sud,2022,32240.0


In [11]:
# Creation of a class to clean data
class DataCleaner:
    """Apply simple cleaning steps (row filtering, column renaming) to a DataFrame.

    The cleaner works on its own copy of the input frame, so the frame passed
    in is never the same object as the cleaned result and is never modified
    through it.

    Attributes:
        df_rent: Current working DataFrame. Each cleaning step rebinds this
            attribute to a new frame.
    """

    def __init__(self, df_rent):
        """Store a copy of ``df_rent`` as the working frame.

        Args:
            df_rent: pandas DataFrame to clean.
        """
        # Defensive copy: without it, a cleaner on which no step is applied
        # would hand back the caller's own object from get_cleaned_df_rent(),
        # and in-place edits to the "cleaned" frame would alter the raw data.
        self.df_rent = df_rent.copy()

    # Remove a value from a column
    def remove_value_from_column(self, column, value_to_remove):
        """Drop every row whose ``column`` equals ``value_to_remove``.

        The surviving rows keep their original index labels (the index is
        not reset).

        Args:
            column: Label of the column to inspect.
            value_to_remove: Rows where the column equals this value are dropped.

        Returns:
            This cleaner, so that calls can be chained.

        Raises:
            KeyError: If ``column`` is not a column of the working frame.
        """
        self.df_rent = self.df_rent[self.df_rent[column] != value_to_remove]
        return self

    # Rename remaining columns for this analysis
    def rename_columns(self, new_column_names):
        """Rename columns of the working frame.

        Args:
            new_column_names: Mapping of current column label -> new label.
                Labels not present in the frame are ignored, which is the
                default behavior of ``DataFrame.rename``.

        Returns:
            This cleaner, so that calls can be chained.
        """
        self.df_rent = self.df_rent.rename(columns=new_column_names)
        return self

    def get_cleaned_df_rent(self):
        """Return the working DataFrame with all steps applied so far."""
        return self.df_rent

In [12]:
# Process of cleaning data

cleaner = DataCleaner(df_rent)

# Drop the rows excluded from this analysis: one district and one year
cleaner.remove_value_from_column(column='DISTRITO', value_to_remove='Poblats Del Sud')
cleaner.remove_value_from_column(column='PERIODO', value_to_remove=2022)

# Replace the original column headers with English snake_case names
cleaner.rename_columns(
    new_column_names={
        'NUM DISTRITO': 'id_district',
        'DISTRITO': 'district',
        'PERIODO': 'date_year',
        'RENTA MEDIA': 'mean_rent',
    }
)

# Retrieve the result and preview its first ten rows
df_rent_cleaned = cleaner.get_cleaned_df_rent()
df_rent_cleaned.head(n=10)

Unnamed: 0,id_district,district,date_year,mean_rent
0,13,Algiros,2015,32429.0
1,13,Algiros,2016,32898.0
2,13,Algiros,2017,33494.0
3,13,Algiros,2018,34742.0
4,13,Algiros,2019,36045.0
5,13,Algiros,2020,36006.0
6,13,Algiros,2021,37115.0
8,16,Benicalap,2015,25505.0
9,16,Benicalap,2016,26330.0
10,16,Benicalap,2017,26945.0


In [None]:
# Write the cleaned frame to a new CSV file; the DataFrame index is not written
df_rent_cleaned.to_csv(
    path_or_buf='rent_data_vlc_cleaned.csv',
    sep=',',
    index=False,
)
